13天搞定Python分布式爬虫
谷歌无头浏览器
# Launch headless Chrome, search Bing for "python", and print the result page HTML.
from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run Chrome without a visible window

# BUG FIX: the original built `options` but then called webdriver.Chrome() with
# no arguments, so the browser was never actually headless. The commented-out
# `chrome_options=` keyword is also deprecated — Selenium 4 uses `options=`.
chrome = webdriver.Chrome(options=options)
try:
    chrome.get('https://cn.bing.com/')
    # find_element_by_id() was removed in Selenium 4; use find_element(By.ID, ...).
    chrome.find_element(By.ID, 'sb_form_q').send_keys('python')
    chrome.find_element(By.ID, 'sb_form_go').click()
    html = chrome.page_source
    print(html)
finally:
    chrome.quit()  # always release the browser process (was commented out)
robots.txt查看是否可爬:
https://www.taobao.com/robots.txt
http://www.sogou.com/robots.txt
selenium控制浏览器滚动条的使用:
# Scroll a JD search page to the bottom to trigger lazy-loading of products,
# then scrape product names and prices with XPath.
from selenium import webdriver
from lxml import etree
from time import sleep

url = ('https://search.jd.com/Search?keyword=mac&enc=utf-8&wq=mac'
       '&pvid=9862d03c24e741c6a58079d004f5aabf')
chrome = webdriver.Chrome()
try:
    chrome.get(url)
    # Jump far past the bottom so the page fetches its lazily loaded items.
    chrome.execute_script('document.documentElement.scrollTop=100000')
    sleep(3)  # give the extra items time to load before grabbing the HTML
    e = etree.HTML(chrome.page_source)
    prices = e.xpath('//div[@class="gl-i-wrap"]/div[@class="p-price"]/strong/i/text()')
    names = e.xpath('//div[@class="gl-i-wrap"]/div[@class="p-name p-name-type-2"]/a/em')
    print(len(names))
    # zip() stops at the shorter list, so an item missing a price is skipped.
    for name, price in zip(names, prices):
        print(name.xpath('string(.)'), ":", price)
finally:
    # BUG FIX: quit() was only reached on success, leaking the browser
    # process if any step above raised.
    chrome.quit()
保存网页图片
# Download every image in a Tuchong photo article and save them under img/.
import os

import requests
from fake_useragent import UserAgent
from lxml import etree

page_url = "https://tuchong.com/1485770/19399344/#image351010920"
headers = {"User-Agent": UserAgent().chrome}

response = requests.get(page_url, headers=headers)
e = etree.HTML(response.text)
img_urls = e.xpath('//article/img/@src')
print(img_urls)

# BUG FIX: the original crashed with FileNotFoundError when the img/
# directory did not already exist.
os.makedirs('img', exist_ok=True)

# Renamed the loop variable: the original reused `url`, shadowing the page URL.
for img_url in img_urls:
    img_response = requests.get(img_url, headers=headers)
    img_name = img_url[img_url.rfind('/') + 1:]  # everything after the last '/'
    with open('img/' + img_name, 'wb') as f:
        f.write(img_response.content)
37-双色球练习数据下载
XPath练习写反,排除,不含某个属性的xpath写法:
e.xpath('//tbody[@id="tdata"]/tr[not(@class)]') 说明:不含class属性的tr
插入数据库表时如果属性为自增,可以直接写0,如:
sql = 'insert into t_ball values(0,%s,%s,%s)' #注，0表示此属性列是自增
cursor.execute(sql, [data_time, red_ball, blue_ball])，在插入数据时自增属性列空着就好。
……