13天搞定Python分布式爬虫

谷歌无头浏览器

from selenium import webdriver
from selenium.webdriver.common.by import By

# Configure Chrome to run without opening a visible window.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# The options object has to be handed to the driver: a bare
# webdriver.Chrome() ignores it and starts a regular, headed browser.
chrome = webdriver.Chrome(options=options)
try:
    chrome.get('https://cn.bing.com/')
    # Type the query into the search box, then click the submit button.
    # find_element(By.ID, ...) replaces find_element_by_id, which newer
    # Selenium releases no longer provide.
    chrome.find_element(By.ID, 'sb_form_q').send_keys('python')
    chrome.find_element(By.ID, 'sb_form_go').click()

    # Dump the HTML of whatever page the browser is showing after the click.
    html = chrome.page_source
    print(html)
finally:
    # Always shut the browser down, even when a step above fails, so that
    # no Chrome/chromedriver process is left running in the background.
    chrome.quit()

robots.txt查看是否可爬:

https://www.taobao.com/robots.txt

http://www.sogo.com/robots.txt

http://baidu.com/robots.txt

 

selenium控制浏览器滚动条的使用:

from selenium import webdriver
from lxml import etree
from time import sleep

url = 'https://search.jd.com/Search?keyword=mac&enc=utf-8&wq=mac&pvid=9862d03c24e741c6a58079d004f5aabf'

chrome = webdriver.Chrome()
try:
    chrome.get(url)

    # Move the scroll bar far down the page via JavaScript, then pause before
    # taking the snapshot -- presumably so that content which only loads on
    # scroll has time to appear (TODO confirm the 3 s wait is sufficient).
    js = 'document.documentElement.scrollTop=100000'
    chrome.execute_script(js)
    sleep(3)
    html = chrome.page_source
finally:
    # The HTML snapshot is a plain string, so the browser can be closed
    # right away; doing it in `finally` also covers failures above.
    chrome.quit()

# Parse the snapshot and pull the price text and product-name elements
# out of each result card.
e = etree.HTML(html)
prices = e.xpath('//div[@class="gl-i-wrap"]/div[@class="p-price"]/strong/i/text()')
names = e.xpath('//div[@class="gl-i-wrap"]/div[@class="p-name p-name-type-2"]/a/em')

print(len(names))
# zip() stops at the shorter list, so a card missing either field shortens
# the output rather than raising.
for name, price in zip(names, prices):
    # string(.) concatenates all text nested inside the <em> element.
    print(name.xpath('string(.)'), ":", price)

 

保存网页图片

import os

import requests
from fake_useragent import UserAgent
from lxml import etree

url = "https://tuchong.com/1485770/19399344/#image351010920"

# Build the UserAgent helper once; `.chrome` is still read per request,
# exactly as before, but the object is no longer rebuilt in the loop.
ua = UserAgent()

response = requests.get(url, headers={"User-Agent": ua.chrome})
e = etree.HTML(response.text)
# Collect the src attribute of every <img> directly under an <article>.
img_urls = e.xpath('//article/img/@src')

print(img_urls)

# Make sure the output directory exists; without this, open() below raises
# FileNotFoundError when 'img' has not been created by hand.
os.makedirs('img', exist_ok=True)

# Use dedicated loop names so the page `url` and `response` are not clobbered.
for img_url in img_urls:
    img_response = requests.get(img_url, headers={"User-Agent": ua.chrome})
    # File name = everything after the last '/' in the image URL.
    img_name = img_url[img_url.rfind('/')+1:]
    with open(os.path.join('img', img_name), 'wb') as f:
        f.write(img_response.content)

 

37-双色球练习数据下载

XPath练习:取反(排除),即不含某个属性的XPath写法:

e.xpath('//tbody[@id="tdata"]/tr[not(@class)]') 说明:不含class属性的tr

 

插入数据库表时如果属性为自增,可以直接写0,如:

sql = 'insert into t_ball values(0,%s,%s,%s)'  # 注:0表示此属性列是自增

cursor.execute(sql, [data_time, red_ball, blue_ball]),在插入数据时自增属性列不用提供实际值(用0占位即可)。

……

posted @ 2020-02-11 13:13  xiongjiawei  阅读(218)  评论(0)    收藏  举报