Post category: Crawlers and Scrapy
Abstract:
import json
import re
import time
import requests
import multiprocessing

class HandleLaGou():
    def __init__(self):
        # Use a session to keep the cookies across requests
        self.lagou_session = requests.Session()...
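A minimal, self-contained sketch of the session idea in the excerpt above: requests.Session() keeps cookies between requests, so a later call can reuse whatever the first page set. The URLs and headers here are placeholders, not the ones used in the full post.

import requests

session = requests.Session()            # keeps cookies across requests
headers = {'User-Agent': 'Mozilla/5.0'}

# The first request stores any Set-Cookie values on the session object
session.get('https://www.lagou.com/', headers=headers)

# Later requests automatically send those cookies back
resp = session.get('https://www.lagou.com/jobs/allCity.html', headers=headers)
print(resp.status_code, len(session.cookies))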
Abstract:
import json
import hashlib
import time
import requests
from collections import OrderedDict
import arrow
import pandas as pd
import pandas.io.formats.excel
from xlsxwriter.utility import xl_rowcol_to...
Abstract:
# coding=utf-8
import requests
import re
from requests_html import HTMLSession
import pandas as pd
import time

session = HTMLSession()
headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone...
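As a rough sketch of how requests_html's HTMLSession is typically used (the target URL and selector below are placeholders, not the ones from the post):

from requests_html import HTMLSession

session = HTMLSession()
headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X)'}

r = session.get('https://example.com/', headers=headers)   # placeholder URL
title = r.html.find('title', first=True)    # CSS-selector lookup on the parsed page
print(title.text)
print(r.html.absolute_links)                # all absolute links found on the page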
Abstract:
# coding=utf-8
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support i...
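A small sketch of the Chrome-options-plus-explicit-wait pattern the imports above point at; the URL and locator are placeholders, and it assumes chromedriver is on the PATH:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = Options()
options.add_argument('--headless')          # run Chrome without a window
driver = webdriver.Chrome(options=options)  # assumes chromedriver is on the PATH

driver.get('https://example.com/')          # placeholder URL
# Block until the element appears instead of sleeping a fixed time
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.TAG_NAME, 'h1'))
)
print(element.text)
driver.quit()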
Abstract:
# _*_ coding=utf-8 _*_
import requests
import time
import math
import os
import pandas as pd

cookies = input('Please enter the Cookie:')
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_1...
Abstract:
# start_chrome -> input_date -> scroll_down -> find_cards_info -> save -> find_next (goto)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv
import ...
posted @ 2018-07-02 13:32 Erick-LONG
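The scroll_down step in the pipeline comment above usually boils down to something like this sketch (placeholder URL; assumes chromedriver is on the PATH):

import time
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://example.com/')

# Scroll to the bottom repeatedly until the page height stops growing,
# which is the usual way to trigger lazy-loaded cards
last_height = driver.execute_script('return document.body.scrollHeight')
while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(2)
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == last_height:
        break
    last_height = new_height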
Abstract:
from selenium import webdriver

def start_chrome():
    driver = webdriver.Chrome(executable_path='./chromedriver')
    driver.start_client()
    return driver

def find_strangers():
    btn_sel =...
Abstract:
import requests
import json
import pandas as pd
import time
import re

headers = {
    'User-Agent': 'XXXX',
    'Cookie': 'XXX'}

def get_ad(page):
    url = 'https://m.weibo.cn/api/container/getIn...
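A hedged sketch of the general shape of such a paginated JSON fetch; the endpoint, parameter names, and response keys below are placeholders, not the Weibo API from the post (its URL is truncated above):

import time
import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0', 'Cookie': 'XXX'}
rows = []

for page in range(1, 6):
    # Placeholder endpoint and parameter name
    resp = requests.get('https://example.com/api/list',
                        params={'page': page}, headers=headers)
    data = resp.json()
    rows.extend(data.get('items', []))   # placeholder response key
    time.sleep(1)                        # be polite between pages

df = pd.DataFrame(rows)
df.to_csv('result.csv', index=False)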
Abstract:
# aio crawler: fetch, de-duplicate, write to the database
import asyncio
import aiohttp
import aiomysql
import re
from pyquery import PyQuery

stoping = False
start_url = 'http://www.jobbole.com/'
waiting_urls = []
seen_urls = set()  # URL de-duplication -...
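A minimal sketch of the asyncio + aiohttp fetch-and-deduplicate core (the aiomysql write-to-database part of the post is omitted; the worker count and start URL are arbitrary):

import asyncio
import aiohttp

seen_urls = set()   # de-duplication: never fetch the same URL twice

async def fetch(session, url):
    async with session.get(url) as resp:
        return await resp.text()

async def worker(queue, session):
    while True:
        url = await queue.get()
        try:
            if url not in seen_urls:
                seen_urls.add(url)
                html = await fetch(session, url)
                print(url, len(html))
        except Exception as exc:
            print('failed', url, exc)
        finally:
            queue.task_done()

async def main():
    queue = asyncio.Queue()
    await queue.put('http://www.jobbole.com/')
    async with aiohttp.ClientSession() as session:
        workers = [asyncio.create_task(worker(queue, session)) for _ in range(3)]
        await queue.join()          # wait until every queued URL is processed
        for w in workers:
            w.cancel()

asyncio.run(main())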
Abstract:
import requests, time
from lxml import etree
import win32api, win32con
import winsound
import pyttsx3

cookies = str(input('Please enter cookies:'))

def ring():
    engine = pyttsx3.init()
    engine.say('傻逼，有户...
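The alert part of that script, roughly: beep with winsound and speak a message with pyttsx3 (Windows-only because of winsound; the message text is a placeholder):

import pyttsx3
import winsound   # Windows-only, as in the post

def ring(message='new item found'):
    winsound.Beep(1000, 500)   # 1000 Hz tone for 0.5 s
    engine = pyttsx3.init()
    engine.say(message)        # speak the alert out loud
    engine.runAndWait()

ring()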
Abstract:
import requests
import re
import pandas as pd

def get_all_date_url():
    all_url = []
    for i in range(61):
        url = 'http://club.xywy.com/keshi/{}.html'.format(str(i+1))
        res = request...
Abstract:
import requests
import json
from dateutil.parser import parse
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko)...
Abstract:
import requests
import re, json
import pandas

class base():
    def __init__(self, url):
        self.url = url

    def all_url(self):
        return [self.url + "%s" % i for i in range(1, 100)] ...
Abstract:
import trip

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36',
    'Referer': 'https://ad.weibo.com/ad/index...
Abstract:
# URL manager
import pickle
import hashlib

class UrlManager():
    def __init__(self):
        self.new_urls = self.load_progress('new_urls.txt')  # set of URLs not yet crawled
        self.old_urls = self.load_p...
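A sketch of what such a persistent URL manager can look like, assuming crawl progress is pickled to disk and crawled URLs are stored as md5 digests (the file names follow the excerpt; the rest is an assumption, not the post's exact code):

import pickle
import hashlib

class UrlManager():
    def __init__(self):
        self.new_urls = self.load_progress('new_urls.txt')   # URLs not yet crawled
        self.old_urls = self.load_progress('old_urls.txt')   # md5 digests of crawled URLs

    def load_progress(self, path):
        # Resume from a previous run if the pickle file exists
        try:
            with open(path, 'rb') as f:
                return pickle.load(f)
        except FileNotFoundError:
            return set()

    def save_progress(self, path, data):
        with open(path, 'wb') as f:
            pickle.dump(data, f)

    def add_new_url(self, url):
        digest = hashlib.md5(url.encode('utf-8')).hexdigest()
        if url not in self.new_urls and digest not in self.old_urls:
            self.new_urls.add(url)

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(hashlib.md5(url.encode('utf-8')).hexdigest())
        return url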
Abstract:
# URL manager
class UrlManager():
    def __init__(self):
        self.new_urls = set()  # set of URLs not yet crawled
        self.old_urls = set()  # set of URLs already crawled

    def has_new_url(self):
        '''
        Check whether there are any uncrawled URLs left
        :...
Abstract:
import requests
from lxml import etree
import urllib

url = 'http://www.ivsky.com/tupian/ziranfengguang/'

def Schedule(blocknum, blocksize, totolsize):
    per = 100.0 * blocknum * blocksize / totolsi...
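The Schedule function above is the classic urlretrieve progress callback. A Python 3 sketch (the image URL is a placeholder; the original post first collects the image links from the gallery pages with lxml):

import urllib.request

def schedule(blocknum, blocksize, totalsize):
    # blocknum: blocks transferred so far; blocksize: size of one block;
    # totalsize: total size of the remote file (-1 if unknown)
    if totalsize > 0:
        percent = min(100.0 * blocknum * blocksize / totalsize, 100.0)
        print('downloaded %.1f%%' % percent)

urllib.request.urlretrieve('https://example.com/some.jpg', 'some.jpg', schedule)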
Abstract: Distributed processes are supported by the managers submodule of the multiprocessing module: you can write one service process that acts as the scheduler, distribute tasks to several other processes, and coordinate them over network communication. taskManager.py
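A minimal sketch of such a taskManager.py scheduler, assuming the two queues are exposed over the network with BaseManager (the address, port, authkey, and example task URLs are all placeholders; a worker on another machine registers the same names and calls connect() instead of start()):

# taskManager.py -- scheduler process
import queue
from multiprocessing.managers import BaseManager

task_queue = queue.Queue()     # tasks waiting to be processed
result_queue = queue.Queue()   # results reported back by workers

def return_task_queue():
    return task_queue

def return_result_queue():
    return result_queue

class QueueManager(BaseManager):
    pass

# Expose the two queues so remote processes can get proxies to them
QueueManager.register('get_task_queue', callable=return_task_queue)
QueueManager.register('get_result_queue', callable=return_result_queue)

if __name__ == '__main__':
    manager = QueueManager(address=('127.0.0.1', 8001), authkey=b'crawler')
    manager.start()

    task = manager.get_task_queue()
    result = manager.get_result_queue()

    for i in range(10):
        task.put('http://example.com/page/%d' % i)   # placeholder task URLs

    for _ in range(10):
        print('result:', result.get(timeout=60))     # wait for workers' answers

    manager.shutdown()

# taskWorker.py -- on a worker, register the same two names, then:
#   m = QueueManager(address=('127.0.0.1', 8001), authkey=b'crawler')
#   m.connect()
#   task, result = m.get_task_queue(), m.get_result_queue()
#   while not task.empty():
#       result.put('done: ' + task.get())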
Abstract:
import requests
import json
import time
import random

url = 'http://cm.admin.xxxx.com/customer/aj_addcontent.php'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/2010010...
Abstract:
# _*_ coding:utf-8 _*_
from selenium import webdriver
import datetime
import time

driver = webdriver.Chrome(executable_path='chromedriver.exe')

def login(uname, pwd):
    driver.get("http://www....
