Python Development: Web Scraping in Practice (Continued)
Scraping Taobao Products
Target Site Analysis
- Search for a keyword: drive the browser with Selenium to submit the keyword and obtain the resulting product list
- Handle pagination: read the total number of result pages, simulate turning pages, and obtain the product list for each subsequent page
- Extract product details: parse the page source with PyQuery and pull out each product's fields
- Store in MongoDB: save the extracted product records to a MongoDB database.
Spider Code
spider.py
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from pyquery import PyQuery as pq
from config import *
import pymongo

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)


def search():
    try:
        browser.get('https://www.taobao.com/')
        # Wait until the search box has loaded
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        # Submit button
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
        input.send_keys(KEY_WORD)
        submit.click()
        # Wait for the result page to load, then read the total page count
        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        # Parse the first page of results once it has loaded
        get_products()
        return total.text
    except TimeoutException:
        # On timeout, retry the request
        return search()


# Pagination
def next_page(page_number):
    try:
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        # Submit button
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        input.clear()  # Clear the page-number input first
        input.send_keys(page_number)
        submit.click()
        # Confirm that the highlighted page number matches the requested page
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
        # Parse the page once pagination has succeeded
        get_products()
    except TimeoutException:
        next_page(page_number)


# Parsing
def get_products():
    # Wait until the item list has loaded
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    html = browser.page_source  # Grab the page source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        # print(product)
        save_to_mongo(product)


# Save to MongoDB
def save_to_mongo(result):
    try:
        if db[MONGO_TABLE].insert_one(result):
            print('Saved to MongoDB', result)
    except Exception:
        print('Failed to save to MongoDB', result)


def main():
    try:
        total = search()
        # Extract the total page count from the text, e.g. "共 100 页,"
        total = int(re.findall(r'\d+', total)[0])
        # print(total)
        # Loop from page 2 onwards
        for i in range(2, total + 1):
            next_page(i)
    except Exception:
        print('Something went wrong')
    finally:
        # Close the browser when main() finishes
        browser.close()


if __name__ == '__main__':
    main()
Configuration file: config.py
MONGO_URL = '192.168.8.128'
MONGO_DB = 'taobao'
MONGO_TABLE = 'product'
KEY_WORD = '美食'
Screenshot: records successfully saved to MongoDB

Maintaining a Proxy Pool with Flask and Redis
Why use a proxy pool?
Many sites deploy dedicated anti-scraping measures, so a single IP is easily banned. At the same time, large numbers of free proxies are published on the internet, and they are worth exploiting: by checking them on a schedule and keeping only the ones that work, we can maintain a set of usable proxies at all times.
Requirements for the proxy pool
- Fetch proxies from multiple sources and check them asynchronously (see the checker sketch after this list)
- Filter on a schedule so the pool stays continuously up to date
- Expose an API so that proxies are easy to fetch
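As a rough illustration of the "check asynchronously" requirement, the sketch below tests a batch of candidate proxies concurrently with aiohttp against a test URL. The candidate list, the test URL (httpbin.org) and the timeout are placeholder assumptions for illustration, not part of the original project.

import asyncio
import aiohttp

TEST_URL = 'http://httpbin.org/get'  # assumed test target; any stable page works


async def check_proxy(session, proxy):
    # A proxy is considered usable if the test request succeeds through it
    try:
        async with session.get(TEST_URL, proxy='http://' + proxy,
                               timeout=aiohttp.ClientTimeout(total=10)) as resp:
            return proxy if resp.status == 200 else None
    except Exception:
        return None


async def check_all(proxies):
    # Fire all checks concurrently and keep only the proxies that responded
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(check_proxy(session, p) for p in proxies))
    return [p for p in results if p]


if __name__ == '__main__':
    candidates = ['127.0.0.1:8888', '10.0.0.2:3128']  # placeholder candidates
    print(asyncio.run(check_all(candidates)))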
Proxy pool architecture

The full source of a working proxy pool is available on GitHub; see https://github.com/Python3WebSpider/ProxyPool
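The WeChat spider below pulls a proxy from http://127.0.0.1:5000/get, so the pool only needs to expose a tiny Flask endpoint backed by Redis. The following is a minimal sketch of that interface, assuming usable proxies are stored in a Redis set named 'proxies'; the real ProxyPool project linked above structures this differently.

import redis
from flask import Flask

app = Flask(__name__)
# Assumed storage: usable proxies kept in a Redis set named 'proxies'
conn = redis.StrictRedis(host='localhost', port=6379, decode_responses=True)


@app.route('/get')
def get_proxy():
    # Return one random usable proxy as plain text, e.g. "12.34.56.78:8888"
    proxy = conn.srandmember('proxies')
    return proxy or 'no proxy available'


if __name__ == '__main__':
    app.run(host='127.0.0.1', port=5000)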
Scraping WeChat Articles Through Proxies to Handle Anti-Scraping
Workflow:
- Fetch the index pages: request the target site with requests and return the HTML of each index page
- Proxy handling: a 302 status code means the IP has been blocked, so switch to a proxy and retry
- Parse the detail pages: request each article page and extract the title, body text and other fields
- Save the data to MongoDB
import requests
from urllib.parse import urlencode
from lxml.etree import XMLSyntaxError
from requests.exceptions import ConnectionError
from pyquery import PyQuery as pq
import pymongo

base_url = 'http://weixin.sogou.com/weixin?'

client = pymongo.MongoClient('localhost')
db = client['weixin']

headers = {
    'Cookie': 'ABTEST=0|1534587111|v1; SNUID=C97E266C1D186DAFFAB826CE1E760218; IPLOC=CN4601; SUID=D4633B716E2F940A000000005B77F0E7; SUID=D4633B713320910A000000005B77F0E7; JSESSIONID=aaaNclsqE7zYShQvbDcvw; SUV=00E025A2713B63D45B77F0E8F8053932',
    'Host': 'weixin.sogou.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}

keyword = '风景'
proxy_pool_url = 'http://127.0.0.1:5000/get'
proxy = None
max_count = 5


def get_proxy():
    # Fetch one proxy from the local proxy pool API
    try:
        response = requests.get(proxy_pool_url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None


def get_html(url, count=1):
    # Debug output
    print('Crawling', url)
    print('Count', count)
    global proxy
    if count >= max_count:
        print('Tried Too Many Counts')
        return None
    try:
        if proxy:
            proxies = {
                'http': 'http://' + proxy
            }
            response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
        else:
            response = requests.get(url, allow_redirects=False, headers=headers)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # 302 means the IP is blocked: fetch a proxy and retry
            print(302)
            proxy = get_proxy()
            if proxy:
                print('Using Proxy', proxy)
                return get_html(url)
            else:
                print('get proxy failed')
                return None
    except ConnectionError as e:
        print('Error Occurred', e.args)
        proxy = get_proxy()
        count += 1
        return get_html(url, count)


def get_index(keyword, page):
    data = {
        'query': keyword,
        'type': 2,
        'page': page
    }
    queries = urlencode(data)  # Encode data as GET query parameters
    url = base_url + queries
    html = get_html(url)
    return html


# Parse the index page and yield article URLs
def parse_index(html):
    doc = pq(html)
    items = doc('.news-box .news-list li .txt-box h3 a').items()
    for item in items:
        yield item.attr('href')


def get_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None


def parse_detail(html):
    try:
        doc = pq(html)
        title = doc('.rich_media_title').text()
        content = doc('.rich_media_content').text()
        date = doc('#post-date').text()
        nickname = doc('.profile_nickname').text()
        wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        return {
            'title': title,
            'content': content,
            'date': date,
            'nickname': nickname,
            'wechat': wechat
        }
    except XMLSyntaxError:
        return None


def save_to_mongo(data):
    # Upsert by title so re-crawled articles are updated rather than duplicated
    if db['articles'].update_one({'title': data['title']}, {'$set': data}, upsert=True):
        print('Saved to Mongo', data['title'])
    else:
        print('Saved to Mongo Failed', data['title'])


def main():
    for page in range(1, 101):
        html = get_index(keyword, page)
        if html:
            article_urls = parse_index(html)
            for article_url in article_urls:
                article_html = get_detail(article_url)
                if article_html:
                    article_data = parse_detail(article_html)
                    print(article_data)
                    if article_data:
                        save_to_mongo(article_data)


if __name__ == '__main__':
    main()