分析Ajax请求并抓取今日头条街拍美图图集(进程池、MongoDB、二进制流文件、正则、requests)

流程如下:

#1 配置好MongoDB的依赖库 

#2 模拟搜索街拍的请求信息

#3 从请求返回的json中提取url地址,再次发起爬取

#4 爬取新的url地址,并爬取相关的图片地址

#5 获取url地址,并将爬取数据写至MongoDB,且通过二进制流下载下来,若文件相同,则通过md5判断

 

 

 

 

 

1、相关配置文件如下:

config.py

1 MONGO_URL = 'localhost:27017'
2 MONGO_DB = 'toutiao'
3 MONGO_TABLE = 'toutiao1'
4 
5 GROUP_START = 0
6 GROUP_END = 19
7 KEYWORD='街拍'

2、爬虫代码如下:

  1 import json
  2 
  3 from urllib.parse import urlencode
  4 from hashlib import md5  #导入MD5判断
  5 
  6 import os
  7 import pymongo
  8 import requests
  9 from requests.exceptions import RequestException
 10 import re
 11 from bs4 import BeautifulSoup as bs
 12 from config import *
 13 from multiprocessing import Pool
 14 
# MongoDB connection shared by every pool worker.
# MONGO_URL and MONGO_DB are read from config.py via `from config import *`.
client=pymongo.MongoClient(MONGO_URL)
db=client[MONGO_DB]
 18 
 19 #保存在mongodb上
 20 def save_to_mongo(result):
 21     if db[MONGO_TABLE].insert(result):
 22         print('存储到MongoDB成功',result)
 23         return True
 24     return False
 25 
 26 
 27 #01
 28 def get_page_index(offset,keyword):#获取索引页的json
 29     data={
 30         'offset': offset,
 31         'format': 'json',
 32         'keyword':keyword,
 33         'autoload':'true',
 34         'count':20,
 35         'cur_tab':3
 36     }
 37     url='https://www.toutiao.com/search_content/?'+urlencode(data)#将字典对象转为请求参数
 38     try:
 39         response=requests.get(url)
 40         if response.status_code==200:
 41             return response.text
 42         return None
 43     except RequestException:
 44         print('请求索引页出错')
 45         return None
 46 
 47 #02
 48 def parse_page_index(html):#获取索引页过来的json中的url地址
 49     data=json.loads(html)#json字符串转换为json对象
 50     if 'data' in data.keys():
 51         for item in data.get('data'):
 52             yield item.get('article_url')
 53 
 54 #03
 55 def get_page_detail(url):#获取详情
 56     try:
 57         response = requests.get(url)
 58         if response.status_code == 200:
 59             return response.text
 60         return None
 61     except RequestException:
 62         print('请求详情页出错',url)
 63         return None
 64 
 65 #04
 66 def parse_page_detail(html,url):
 67     soup=bs(html,'lxml')
 68     title=soup.title.string
 69 
 70     image_pattren = re.search(r'gallery:.*?parse\("(.*?)"\),', html, re.S) #这里一定要加r,表示不转义
 71     image_pattren = re.sub(r'\\','',image_pattren.group(1))
 72     try:
 73         data=json.loads(image_pattren)
 74     except: #有些json需要修复
 75         image_pattren = "{" + re.search(r'("sub_images":\[\{.*?\}\]),"max', image_pattren).group(1) + "}"
 76         data=json.loads(image_pattren)
 77 
 78     if data and 'sub_images' in data.keys():
 79         sub_images = data.get('sub_images')
 80         images = [item.get('url') for item in sub_images]
 81         for img in images:
 82             download_img(img)
 83         return {
 84             'title':title,
 85             'url':url,
 86             'images':images
 87         }
 88 
 89 #05
 90 def download_img(url):#通过地址下载图片的二进制流
 91     print('正在下载',url)
 92     try:
 93         response=requests.get(url)
 94         if response.status_code==200:
 95             save_img(response.content)  #保存二进制流为图片
 96         return None
 97     except RequestException:
 98         print('请求图片出错',url)
 99         return None
100 
101 
#06
def save_img(content, save_dir='d:/123'):
    """Write image bytes to disk, deduplicated by content hash.

    The MD5 of the bytes is used as the filename, so identical images
    are stored exactly once.

    Args:
        content: raw image bytes.
        save_dir: target directory (defaults to the original hard-coded
            path for backward compatibility).
    """
    # The original code silently failed if the directory did not exist.
    os.makedirs(save_dir, exist_ok=True)
    file_path = '{0}/{1}.{2}'.format(save_dir, md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
108 
109 
110 
def main(offset):
    """One pool task: crawl every gallery on a single index page.

    Args:
        offset: paging offset passed to the search API.
    """
    html = get_page_index(offset, KEYWORD)
    if not html:
        # Index request failed; nothing to parse for this offset.
        return
    for url in parse_page_index(html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue  # skip detail pages that failed to load
        result = parse_page_detail(detail_html, url)
        if result:  # only store pages that actually had a gallery
            save_to_mongo(result)
118 
119 
120 
121 
if __name__ == '__main__':
    # One offset per page of 20 search results, crawled in parallel.
    group = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, group)
    pool.close()  # no more tasks: let workers exit instead of leaking
    pool.join()

 

posted @ 2017-12-03 00:31  来呀快活吧  阅读(621)  评论(0编辑  收藏  举报
cs