# Scraping WeChat articles via Sogou WeChat search, using proxies to handle anti-crawling
# Part 1: Module imports
from pyquery import PyQuery as pq
import requests
from urllib.parse import urlencode
import pymongo
from config import *
# Request headers, MongoDB connection and proxy state
# The Cookie below is the author's logged-in Sogou session; it is session-specific and
# will expire, so replace it with a fresh Cookie copied from your own browser before running.
headers = {
    'Cookie': 'IPLOC=CN3100; SUID=3F9A2D651620940A00000000593501AD; SUV=1496646179483423; ABTEST=2|1496646063|v1; SNUID=63C771395C590C6C4DA62B9E5DA843E5; weixinIndexVisited=1; ppinf=5|1496647654|1497857254|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTclOEUlOEIlRTUlODUlODklRTYlOUQlQjB8Y3J0OjEwOjE0OTY2NDc2NTR8cmVmbmljazoyNzolRTclOEUlOEIlRTUlODUlODklRTYlOUQlQjB8dXNlcmlkOjQ0Om85dDJsdUJzVzdIYXl4M2tlWFpjZTdWWmNNX2NAd2VpeGluLnNvaHUuY29tfA; pprdig=xFuTU5F3rYPr-GxEdzubwrQZ7jX7ifkrTXkYt2AR7gz17xFKLcIlD5r91dsYOnH_RDub9VxG8vNpHf5fwEjxAs4qFEJTqW96oVvr1UZq3qXq-AhGxJEDqlo8g5O3ZXy_F80B8YndLpUVbWeQDfJFlrwBlQ-3PXME0lxEDeguSyY; sgid=08-28925681-AVk1BibbKicbvQn77cbUV9RKo; SUIR=63C771395C590C6C4DA62B9E5DA843E5; pgv_pvi=3861214208; pgv_si=s8617661440; PHPSESSID=3rs7b393svg890cc66tv5kp942; sct=3; JSESSIONID=aaaTpSE3q_s21beotLIXv; ppmdig=14967350130000006401a6de8aa6584a3ed50839343b064b; seccodeRight=success; successCount=2|Tue, 06 Jun 2017 07:50:44 GMT',
    'Host': 'weixin.sogou.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
proxy = None  # current proxy; set via get_proxy() once Sogou starts answering with 302
# Part 2: Requesting and parsing
# 1. Build the Sogou WeChat search URL for a keyword and page number
def get_url(keyword, page_num):
    base_url = 'http://weixin.sogou.com/weixin?'
    data = {
        'query': keyword,
        'type': '2',  # 2 = article search on weixin.sogou.com
        'page': page_num
    }
    return base_url + urlencode(data)
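# For example (hypothetical call, not from the original post): with keyword '风景' and
# page_num 1, get_url returns
#   http://weixin.sogou.com/weixin?query=%E9%A3%8E%E6%99%AF&type=2&page=1
# since urlencode percent-encodes the Chinese keyword (parameter order may differ on
# older Python versions where dict order is not preserved).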
# 2. Request the URL and return the search-result (index) page HTML
def get_index_html(url):
    print('Crawling', url)
    global proxy
    try:
        if proxy:
            print('Using proxy', proxy)
            proxies = {
                'http': 'http://' + proxy
            }
            response = requests.get(url, headers=headers, allow_redirects=False, proxies=proxies)
        else:
            response = requests.get(url, headers=headers, allow_redirects=False)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # A 302 redirect means Sogou has flagged this IP; switch to a proxy and retry
            print('302')
            proxy = get_proxy()
            if proxy:
                return get_index_html(url)
            else:
                print('Failed to obtain a proxy')
                return None
    except Exception:
        # On any request error, grab a fresh proxy and retry
        proxy = get_proxy()
        return get_index_html(url)
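# Note (addition, not in the original code): get_index_html retries recursively with no
# depth limit, so a proxy pool that keeps handing out dead proxies can recurse forever.
# A minimal sketch of a bounded variant, reusing the same globals; the function name and
# the retries parameter are illustrative assumptions:
def get_index_html_bounded(url, retries=3):
    global proxy
    for _ in range(retries):
        try:
            proxies = {'http': 'http://' + proxy} if proxy else None
            response = requests.get(url, headers=headers,
                                    allow_redirects=False, proxies=proxies)
            if response.status_code == 200:
                return response.text
            if response.status_code == 302:
                proxy = get_proxy()  # flagged by Sogou: switch to a fresh proxy
        except requests.exceptions.RequestException:
            proxy = get_proxy()
    return None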
# 3. Anti-crawling measures may be triggered while requesting; when that happens, fetch a proxy from the local proxy pool
def get_proxy():
    print('Requesting a proxy')
    try:
        response = requests.get(POOL_PROXY_URL)
        if response.status_code == 200:
            return response.text
        else:
            print('Failed to obtain a proxy')
            return None
    except Exception:
        return None
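# Assumption (not stated in the original): the proxy pool service behind POOL_PROXY_URL
# returns a bare "ip:port" string as plain text (e.g. '122.114.31.177:808', a hypothetical
# value), which is why get_index_html simply prefixes the returned text with 'http://'.
# A loose sanity check of that format could look like this (helper name is illustrative):
import re  # for this sketch only; in practice the import belongs at the top of the file

def looks_like_proxy(text):
    return bool(re.match(r'^\d{1,3}(?:\.\d{1,3}){3}:\d+$', text or ''))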
# 4. Parse the index-page HTML and yield the URL of each article detail page
def get_article_url(index_html):
    doc = pq(index_html)
    lis = doc('.news-box .news-list li').items()
    for item in lis:
        yield item.find('h3 a').attr('href')
# 5. Request the article detail-page URL and return its HTML
def get_article_html(article_url):
    try:
        response = requests.get(article_url)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except requests.exceptions.RequestException:
        # the built-in ConnectionError would not catch requests' connection failures
        return None
# 6. Parse the detail-page HTML for the title, official-account name, publish date and article content
def parse_article_html(article_html):
    doc = pq(article_html)
    article_data = {
        'title': doc('#img-content .rich_media_title').text(),
        'nickname': doc('.rich_media_meta_list .rich_media_meta_nickname').text(),
        'date': doc('.rich_media_meta_list #post-date').text(),
        'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text(),
        'content': doc('.rich_media_content').text()
    }
    # print(article_data)
    return article_data
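# Example of the structure handed to the storage layer (field values hypothetical):
# {'title': '...', 'nickname': '...', 'date': '2017-06-06',
#  'wechat': '...', 'content': '...'}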
# Part 3: Storage
# Save the article data to MongoDB
def save_to_mongo(article_data):
    try:
        if db[MONGO_TABLE].insert(article_data):
            print('Saved to MongoDB', article_data)
    except Exception:
        print('Failed to save to MongoDB!', article_data)
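# Note: Collection.insert() is the legacy PyMongo API and is deprecated since PyMongo 3.0.
# On newer PyMongo the equivalent call is insert_one(); a minimal sketch (the function
# name here is chosen for illustration):
def save_to_mongo_v3(article_data):
    try:
        result = db[MONGO_TABLE].insert_one(article_data)
        print('Saved to MongoDB', result.inserted_id)
    except Exception:
        print('Failed to save to MongoDB!', article_data)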
# Part 4: Driver (debug run)
def main():
    for page_num in range(1, 101):
        index_url = get_url(KEYWORDS, page_num)
        # print(index_url)
        index_html = get_index_html(index_url)
        if index_html:
            article_urls = get_article_url(index_html)
            for article_url in article_urls:
                # print(article_url)
                article_html = get_article_html(article_url)
                if article_html:
                    article_data = parse_article_html(article_html)
                    print(article_data)
                    # save_to_mongo(article_data)  # uncomment to persist results to MongoDB

if __name__ == '__main__':
    main()
# Configuration file (config.py), imported above via "from config import *"
MONGO_URL = 'localhost'
MONGO_DB = 'weixin'
MONGO_TABLE = 'article_data'
KEYWORDS = '风景'  # search keyword ("scenery")
POOL_PROXY_URL = 'http://127.0.0.1:5000/get'
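# To run this (assumptions about file layout, not stated in the original): save the block
# above as config.py next to the crawler script, make sure MongoDB and the proxy pool
# service on 127.0.0.1:5000 are running, then start the crawler, e.g. `python spider.py`
# (the script name is hypothetical).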