通过分析Ajax请求抓取今日头条街拍美图

1，首先找到我们要查看的网址

https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D

打开F12 的network 清除所有的请求，再点击'图集' 可以看到

于是我们可以知道此处是通过 Ajax 请求加载数据的。分析 Preview 可知返回的是 JSON 数据，其中我们需要的字段是 article_url：先解析返回的数据拿到 article_url，再对这个链接发送请求，拿到我们想要的图片列表

2. 拿到文章链接 url 后，我们对此链接进行访问，会发现返回的 HTML 页面标签中并没有我们想要的图片列表 url

于是我们对页面进行分析（搜索连接后的那串数字，或标题），得到

经分析，我们要的 url 列表数据就在这串数据中。因为它不在 HTML 标签里，所以我们要舍弃 BeautifulSoup、xpath 等解析方式，选用正则来获取目标数据

分析之后我们可以拿到文章链接和图片列表链接，之后就可以对链接和图片进行保存和下载的操作了。数据比较少，所以选用脚本来完成；考虑到 scrapy 是异步处理的，我们也给脚本加上多线程

下面开始用代码来完成以上流程

1.获取分析的json数据

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# __author__ jingqi

import requests #首先导入requests 库
import urllib    #导入urllib 库

# HTTP headers for the crawler's requests: flag the call as an Ajax
# (XMLHttpRequest) request and present a desktop Chrome user agent.
headers = {
    'x-requested-with': 'XMLHttpRequest',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/64.0.3282.167 Safari/537.36'
    ),
}

def get_page_index(offset, keyword):
    """Fetch one page of gallery search results as raw JSON text.

    NOTE: this is Python 2 code; ``urllib.urlencode`` lives in
    ``urllib.parse`` on Python 3.

    Args:
        offset: Value sent as the ``offset`` query parameter (paging position).
        keyword: Search keyword sent as the ``keyword`` query parameter.

    Returns:
        The response body as text when the server answers 200, otherwise
        None (non-200 status, or the request itself failed).
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery'
    }
    # urlencode percent-encodes the parameters into a query string.
    url = 'https://www.toutiao.com/search_content/?' + urllib.urlencode(data)

    try:
        # Send browser-like headers; the timeout keeps a stalled connection
        # from blocking the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException:
        # Referenced through the requests package so this snippet does not
        # depend on an import that only appears further down.
        print ('请求失败', url)
        return None

    if response.status_code == 200:
        return response.text
    return None

2，对返回的数据进行解析，得到我们需要的数据

from json import loads

def parse_page_index(content):
    """Yield the article URLs contained in a search-result JSON document.

    Args:
        content: JSON text returned by the search request. May be None or
            empty (e.g. when the request failed), in which case nothing is
            yielded.

    Yields:
        The ``article_url`` of every entry in the top-level ``data`` list
        except the first one; entries without an ``article_url`` are skipped.
    """
    if not content:
        return

    try:
        data = loads(content)
    except ValueError:
        # Not valid JSON (e.g. an HTML error page): nothing to yield.
        return

    if not isinstance(data, dict):
        return
    items = data.get('data')
    if not isinstance(items, list):
        # The key may be missing or null.
        return

    # NOTE(review): the first entry has always been skipped here; presumably
    # it is not a gallery article - confirm against a live response.
    for item in items[1:]:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url

3.拿到文章链接 article_url，发送请求获得返回数据，对数据进行正则匹配获得目标数据

from requests.exceptions import RequestException  #导入请求异常

def get_page_detail(url):
    """Fetch an article page and return its HTML text.

    Args:
        url: Address of the article page.

    Returns:
        The response body as text when the server answers 200, otherwise
        None (non-200 status, or the request itself failed).
    """
    try:
        # The timeout keeps one stalled page from blocking the crawl forever;
        # requests raises a RequestException subclass when it expires.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print ('请求详情失败',url)
        return None

    if response.status_code == 200:
        return response.text
    return None

4，拿到页面数据进行正则匹配获得目标数据

from bs4 import BeautifulSoup as bs  #导入库并起别名

def parse_page_detail(html, url):
    """Extract the gallery image URLs from an article page and download them.

    The page embeds its gallery as a line of the form
    ``gallery: JSON.parse("<JSON text as a JavaScript string literal>"),``.

    Args:
        html: Text of the article page.
        url: Address of the article page; copied into the result.

    Returns:
        A dict with keys ``title`` (text of the page's <title>, or None),
        ``images`` (list of image URLs) and ``url``; None when the page has
        no usable gallery data.
    """
    # Local import: none of the tutorial snippets up to this point import re.
    import re

    soup = bs(html, 'lxml')
    title_tag = soup.title
    # Keep the title as plain text; the Tag objects returned by
    # soup.select('title') cannot be stored or serialized as-is.
    title = title_tag.get_text() if title_tag is not None else None

    # re.S lets '.' match newlines; the non-greedy group stops at the first
    # newline after "gallery:".
    found = re.search('gallery:(.*?)\n', html, re.S)
    if found is None:
        return None

    # Reduce  JSON.parse("...") ,  to the bare string literal  "..."
    literal = found.group(1).replace('JSON.parse', '').strip()
    literal = literal.rstrip(',').strip()
    if literal.startswith('(') and literal.endswith(')'):
        literal = literal[1:-1].strip()
    if 'sub_images' not in literal:
        return None

    try:
        # SECURITY: the page content is untrusted, so it is decoded with the
        # JSON parser instead of eval(): once to unwrap the string literal,
        # once more to parse the JSON document inside it.
        gallery = loads(literal)
        if not isinstance(gallery, dict):
            gallery = loads(gallery)
        images = [item.get('url') for item in gallery['sub_images']]
    except (ValueError, TypeError, KeyError, AttributeError):
        # Malformed gallery data: treat the page as having no gallery.
        return None

    for image in images:
        try:
            download_image(image)
        except EnvironmentError:
            # A failed write of one image should not discard the whole record.
            print ('保存图片失败', image)

    return {
        'title': title,
        'images': images,
        'url': url,
    }

以上就是获取街拍图片列表链接的全部步骤

下面附上完整代码

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# __author__ jingqi

import requests
from requests.exceptions import RequestException
import re
import urllib
from json import loads
import os
import hashlib
import pymongo
from multiprocessing.dummy import Pool   # thread pool: multiprocessing.dummy offers the Pool API on top of threads, not processes
from config import *


# Create the MongoDB client and select the database named by MONGO_DB.
# Both statements sit at module level, so they run when the module is loaded.
# (The bare string below is the author's original note: "create the Mongo
# object".)
'''
创建Mongo对象
'''
client = pymongo.MongoClient(MONGO_URL, MONGO_PORT)
db = client[MONGO_DB]

# HTTP headers for the crawler's requests: flag the call as an Ajax
# (XMLHttpRequest) request and present a desktop Chrome user agent.
headers = {
    'x-requested-with': 'XMLHttpRequest',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/64.0.3282.167 Safari/537.36'
    ),
}

def get_page_index(offset, keyword):
    """Fetch one page of gallery search results as raw JSON text.

    NOTE: this is Python 2 code; ``urllib.urlencode`` lives in
    ``urllib.parse`` on Python 3.

    Args:
        offset: Value sent as the ``offset`` query parameter (paging position).
        keyword: Search keyword sent as the ``keyword`` query parameter.

    Returns:
        The response body as text when the server answers 200, otherwise
        None (non-200 status, or the request itself failed).
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery'
    }
    # urlencode percent-encodes the parameters into a query string.
    url = 'https://www.toutiao.com/search_content/?' + urllib.urlencode(data)

    try:
        # The timeout keeps a stalled connection from blocking a worker
        # forever; requests raises a RequestException subclass on expiry.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print ('请求失败', url)
        return None

    if response.status_code == 200:
        return response.text
    return None

def parse_page_index(content):
    """Yield the article URLs contained in a search-result JSON document.

    Args:
        content: JSON text returned by the search request. May be None or
            empty (e.g. when the request failed), in which case nothing is
            yielded.

    Yields:
        The ``article_url`` of every entry in the top-level ``data`` list
        except the first one; entries without an ``article_url`` are skipped.
    """
    if not content:
        return

    try:
        data = loads(content)
    except ValueError:
        # Not valid JSON (e.g. an HTML error page): nothing to yield.
        return

    if not isinstance(data, dict):
        return
    items = data.get('data')
    if not isinstance(items, list):
        # The key may be missing or null.
        return

    # NOTE(review): the first entry has always been skipped here; presumably
    # it is not a gallery article - confirm against a live response.
    for item in items[1:]:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url

def get_page_detail(url):
    """Fetch an article page and return its HTML text.

    Args:
        url: Address of the article page.

    Returns:
        The response body as text when the server answers 200, otherwise
        None (non-200 status, or the request itself failed).
    """
    try:
        # The timeout keeps one stalled page from blocking a worker forever;
        # requests raises a RequestException subclass when it expires.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print ('请求详情失败',url)
        return None

    if response.status_code == 200:
        return response.text
    return None

def parse_page_detail(html, url):
    """Extract the gallery image URLs from an article page and download them.

    The page embeds its gallery as a line of the form
    ``gallery: JSON.parse("<JSON text as a JavaScript string literal>"),``.

    Args:
        html: Text of the article page.
        url: Address of the article page; copied into the result.

    Returns:
        A dict with keys ``images`` (list of image URLs) and ``url``; None
        when the page has no usable gallery data.
    """
    # re.S lets '.' match newlines; the non-greedy group stops at the first
    # newline after "gallery:".
    found = re.search('gallery:(.*?)\n', html, re.S)
    if found is None:
        return None

    # Reduce  JSON.parse("...") ,  to the bare string literal  "..."
    literal = found.group(1).replace('JSON.parse', '').strip()
    literal = literal.rstrip(',').strip()
    if literal.startswith('(') and literal.endswith(')'):
        literal = literal[1:-1].strip()
    if 'sub_images' not in literal:
        return None

    try:
        # SECURITY: the page content is untrusted, so it is decoded with the
        # JSON parser instead of eval(): once to unwrap the string literal,
        # once more to parse the JSON document inside it.
        gallery = loads(literal)
        if not isinstance(gallery, dict):
            gallery = loads(gallery)
        images = [item.get('url') for item in gallery['sub_images']]
    except (ValueError, TypeError, KeyError, AttributeError):
        # Malformed gallery data: treat the page as having no gallery.
        return None

    for image in images:
        try:
            download_image(image)
        except EnvironmentError:
            # A failed write of one image should not discard the whole record.
            print ('保存图片失败', image)

    return {
        'images': images,
        'url': url,
    }

def download_image(url):
    """Download one image and pass its bytes to save_image.

    Args:
        url: Address of the image.

    Returns:
        Always None. A failed request is reported on stdout; a non-200
        answer is ignored.
    """
    try:
        # The timeout keeps one stalled download from blocking a worker
        # forever; requests raises a RequestException subclass on expiry.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print ('请求图片详情失败',url)
        return None

    if response.status_code == 200:
        # response.content is the raw body (bytes), as needed for an image.
        save_image(response.content, url)
    return None

def save_image(content,url):

    print '正在下载',url
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), hashlib.md5(content).hexdigest(), 'jpg') #os.getcwd()当前位置，

    if not os.path.exists(file_path):
         with open(file_path, 'wb') as f:
             f.write(content)
             f.close() 
def save_to_mongo(result): 
    print '正在准备存储到MongoDB中'
    print result 

    if db[MONGO_TABLE].insert(result): 
        print '正在存储到MongoDB中',result 
            return True
    return False 
def main(offset): 
    html = get_page_index(offset, KEYWORD)
    for url in parse_page_index(html):
        html = get_page_detail(url)
        if html:
            result = parse_page_detail(html, url) 
            if result is None:
                print 'result is None' 
            else: save_to_mongo(result) 
if __name__ == '__main__':
    # Crawl the first page (offset 0) directly, then the remaining pages
    # through the pool.
    main(0)
    # One offset per page index from GROUP_START to GROUP_END inclusive,
    # in steps of 20.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # multiprocessing.dummy.Pool is a pool of threads, not processes.
    pool = Pool()
    try:
        pool.map(main, groups)
    finally:
        # Release the worker threads even if a page raised.
        pool.close()
        pool.join()

下面是配置文件

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# __author__ jingqi

# Host and port handed to pymongo.MongoClient by the crawler script.
MONGO_URL = 'localhost'
MONGO_PORT = 27017
# Name of the database, and of the collection the results are inserted into.
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# First and last page index (inclusive); the crawler multiplies each index
# by 20 to obtain the request offset.
GROUP_START = 1
GROUP_END = 20

# Search keyword sent with every index request.
KEYWORD = '街拍'

展示部分图片

部分mongo中数据展示

posted @ 2018-02-27 20:07 随风无义阅读(403) 评论(0) 收藏举报

随风无义

通过分析Ajax请求抓取今日头条街拍美图

公告