随笔分类 - 爬虫实战模块

摘要:maoyan.py import scrapy class MaoyanSpider(scrapy.Spider): name = 'maoyan' allowed_domains = ['maoyan.com'] start_urls = ['https://maoyan.com/films?sh 阅读全文
posted @ 2020-08-09 11:53 kuanleung 阅读(7) 评论(0) 推荐(0)
摘要:python 非官方包下载地址 点击加载地址 https://www.lfd.uci.edu/~gohlke/pythonlibs/ 阅读全文
posted @ 2020-08-08 10:55 kuanleung 阅读(10) 评论(0) 推荐(0)
摘要:import requests from fake_useragent import UserAgent from lxml import etree #url管理 class URLManger(object): def __init__(self): self.new_url=[] self.o 阅读全文
posted @ 2020-08-08 10:38 kuanleung 阅读(9) 评论(0) 推荐(0)
摘要:import requests from fake_useragent import UserAgent from lxml import etree url='' header={ 'User-Agent' : UserAgent().Chrome } response= requests.get 阅读全文
posted @ 2020-08-08 10:37 kuanleung 阅读(7) 评论(0) 推荐(0)
摘要:from selenium import webdriver from lxml import etree from time import sleep from random import randint url ='https://search.jd.com/Search?keyword=%E6 阅读全文
posted @ 2020-08-02 18:12 kuanleung 阅读(12) 评论(0) 推荐(0)
摘要:from selenium import webdriver from time import sleep from lxml import etree from random import randint edge = webdriver.Edge() edge.get("https://www. 阅读全文
posted @ 2020-08-02 14:13 kuanleung 阅读(6) 评论(0) 推荐(0)
摘要:参考文章 下载的exe文件修改名字为MicrosoftWebDriver.exe 阅读全文
posted @ 2020-08-01 09:36 kuanleung 阅读(12) 评论(0) 推荐(0)
摘要:import requests from fake_useragent import UserAgent from time import sleep from random import randint from pyquery import PyQuery def get_html(url): 阅读全文
posted @ 2020-07-19 15:20 kuanleung 阅读(7) 评论(0) 推荐(0)
摘要:import requests from fake_useragent import UserAgent from time import sleep from random import randint import re def get_html(url): headers={ 'User-Ag 阅读全文
posted @ 2020-07-19 12:50 kuanleung 阅读(6) 评论(0) 推荐(0)
摘要:import requests from fake_useragent import UserAgent from time import sleep from random import randint from bs4 import BeautifulSoup def get_html(url) 阅读全文
posted @ 2020-07-19 11:40 kuanleung 阅读(7) 评论(0) 推荐(0)
摘要:import requests from fake_useragent import UserAgent from lxml import etree from time import sleep from random import randint def get_html(url): heade 阅读全文
posted @ 2020-07-18 19:22 kuanleung 阅读(7) 评论(0) 推荐(0)
摘要:from selenium import webdriver d=webdriver.Edge() d.get('https://www.baidu.com/') 阅读全文
posted @ 2020-07-17 21:15 kuanleung 阅读(8) 评论(0) 推荐(0)
摘要:import requests from lxml import etree from fake_useragent import UserAgent url = 'https://tech.163.com/20/0716/07/FHL0LPK300097U7T.html' headers={ 'U 阅读全文
posted @ 2020-07-17 12:49 kuanleung 阅读(14) 评论(0) 推荐(0)
摘要:from threading import Thread from queue import Queue from fake_useragent import UserAgent import requests from lxml import etree # 爬虫类 class CrawlInfo 阅读全文
posted @ 2020-07-16 11:01 kuanleung 阅读(8) 评论(0) 推荐(0)
摘要:from jsonpath import jsonpath import requests url='' headers={ 'User-Agent':'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGec 阅读全文
posted @ 2020-07-15 16:45 kuanleung 阅读(10) 评论(0) 推荐(0)
摘要:from pyquery import PyQuery as pq import requests url='https://ip.jiangxianli.com/?page=1' headers={ 'User-Agent':'Mozilla/5.0(Windows;U;WindowsNT6.1; 阅读全文
posted @ 2020-07-15 12:14 kuanleung 阅读(11) 评论(0) 推荐(0)
摘要:from lxml import etree import requests url = 'https://www.qidian.com/all' headers={ 'User-Agent':'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit 阅读全文
posted @ 2020-07-14 21:53 kuanleung 阅读(7) 评论(0) 推荐(0)
摘要:import re str = 'I Study python3.7 everyday' print('--'*50) m1 = re.match(r'.',str) print(m1.group()) #m2=re.search(r'S\w+',str) #print(m2.group()) m3 阅读全文
posted @ 2020-07-14 18:25 kuanleung 阅读(4) 评论(0) 推荐(0)
摘要:from bs4 import BeautifulSoup str = ''' <title>尚学堂</title> <div class='info' float='left'>Welcome to SXT</div> <div class='info' float='right'> <span> 阅读全文
posted @ 2020-07-14 18:21 kuanleung 阅读(8) 评论(0) 推荐(0)
摘要:import requests session=requests.Session() header = { 'User-Agent':'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Versio 阅读全文
posted @ 2020-07-09 12:40 kuanleung 阅读(7) 评论(0) 推荐(0)