随笔分类 -  爬虫及Scrapy

摘要:import json import re import time import requests import multiprocessing class HandleLaGou(): def __init__(self): # 使用session保存cookies信息 self.lagou_session = requests.Session()... 阅读全文
posted @ 2019-08-13 19:51 Erick-LONG 阅读(503) 评论(0) 推荐(0)
摘要:import json import hashlib import time import requests from collections import OrderedDict import arrow import pandas as pd import pandas.io.formats.excel from xlsxwriter.utility import xl_rowcol_to... 阅读全文
posted @ 2019-07-01 16:32 Erick-LONG 阅读(566) 评论(0) 推荐(0)
摘要:# coding=utf-8 import requests import re from requests_html import HTMLSession import pandas as pd import time session = HTMLSession() headers = { 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone... 阅读全文
posted @ 2018-08-17 10:18 Erick-LONG 阅读(219) 评论(0) 推荐(0)
摘要:# coding=utf-8 import os import time from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By from selenium.webdriver.support i... 阅读全文
posted @ 2018-07-30 13:57 Erick-LONG 阅读(1759) 评论(0) 推荐(0)
摘要:# _*_ coding=utf-8 _*_ import requests import time import math import os import pandas as pd cookies = input('请输入Cookie:') headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_1... 阅读全文
posted @ 2018-07-30 13:55 Erick-LONG 阅读(340) 评论(0) 推荐(0)
摘要:# start_chrome -> input_date -> scroll_down-> find_cards_info -> save -> find_next (goto) from selenium import webdriver from selenium.webdriver.common.keys import Keys import time import csv import ... 阅读全文
posted @ 2018-07-02 13:32 Erick-LONG
摘要:from selenium import webdriver def start_chrome(): driver = webdriver.Chrome(executable_path = './chromedriver') driver.start_client() return driver def find_strangers(): btn_sel =... 阅读全文
posted @ 2018-06-29 13:30 Erick-LONG 阅读(179) 评论(0) 推荐(0)
摘要:import requests import json import pandas as pd import time import re headers = { 'User-Agent': 'XXXX', 'Cookie': 'XXX'} def get_ad(page): url = 'https://m.weibo.cn/api/container/getIn... 阅读全文
posted @ 2018-05-18 18:12 Erick-LONG 阅读(520) 评论(1) 推荐(0)
摘要:#aio 爬虫,去重,入库 import asyncio import aiohttp import aiomysql import re from pyquery import PyQuery stoping = False start_url = 'http://www.jobbole.com/' waiting_urls = [] seen_urls = set() # url去重 -... 阅读全文
posted @ 2018-04-25 08:10 Erick-LONG 阅读(421) 评论(0) 推荐(0)
摘要:import requests,time from lxml import etree import win32api,win32con import winsound import pyttsx3 cookies = str(input('请输入cookies:')) def ring(): engine = pyttsx3.init() engine.say('傻逼,有户... 阅读全文
posted @ 2018-04-20 14:23 Erick-LONG 阅读(409) 评论(0) 推荐(0)
摘要:import requests import re import pandas as pd def get_all_date_url(): all_url=[] for i in range(61): url = 'http://club.xywy.com/keshi/{}.html'.format(str(i+1)) res = request... 阅读全文
posted @ 2018-01-18 18:51 Erick-LONG 阅读(613) 评论(0) 推荐(0)
摘要:import requests import json from dateutil.parser import parse import time headers = { 'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko)... 阅读全文
posted @ 2018-01-08 11:51 Erick-LONG 阅读(537) 评论(1) 推荐(1)
摘要:import requests import re,json import pandas class base(): def __init__(self,url): self.url = url def all_url(self): return [self.url + "%s" % i for i in range(1,100)] ... 阅读全文
posted @ 2017-12-07 19:13 Erick-LONG 阅读(1565) 评论(0) 推荐(0)
摘要:import trip headers = { 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36', 'Referer':'https://ad.weibo.com/ad/index... 阅读全文
posted @ 2017-11-10 11:52 Erick-LONG 阅读(228) 评论(0) 推荐(0)
摘要:# url管理器 # url管理器 import pickle import hashlib class UrlManager(): def __init__(self): self.new_urls = self.load_progress('new_urls.txt') # 未爬取url集合 self.old_urls = self.load_p... 阅读全文
posted @ 2017-10-23 22:06 Erick-LONG 阅读(482) 评论(0) 推荐(0)
摘要:# url管理器 class UrlManager(): def __init__(self): self.new_urls = set() #未爬取集合 self.old_urls = set() # 已爬取集合 def has_new_url(self): ''' 判断是否有未爬取的URL :... 阅读全文
posted @ 2017-10-22 18:32 Erick-LONG 阅读(287) 评论(0) 推荐(0)
摘要:import requests from lxml import etree import urllib url = 'http://www.ivsky.com/tupian/ziranfengguang/' def Schedule(blocknum,blocksize,totolsize): per = 100.0 * blocknum * blocksize / totolsi... 阅读全文
posted @ 2017-10-22 16:44 Erick-LONG 阅读(506) 评论(0) 推荐(0)
摘要:分布式进程可以由multiprocessing模块的managers子模块支持,可以写一个服务进程作为调度者,将任务分布到其他多个进程中,依靠网络通信进行管理 taskManager.py taskManager.py 阅读全文
posted @ 2017-10-22 11:39 Erick-LONG 阅读(580) 评论(0) 推荐(0)
摘要:import requests import json import time import random url = 'http://cm.admin.xxxx.com/customer/aj_addcontent.php' headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/2010010... 阅读全文
posted @ 2017-08-18 17:56 Erick-LONG 阅读(316) 评论(0) 推荐(0)
摘要:# _*_coding:utf-8_*_ from selenium import webdriver import datetime import time driver = webdriver.Chrome(executable_path='chromedriver.exe') def login(uname, pwd): driver.get("http://www.... 阅读全文
posted @ 2017-08-18 17:54 Erick-LONG 阅读(5407) 评论(1) 推荐(0)