Scraping Toutiao (今日头条) images with Python 3, requests, and a multiprocessing process pool

For learning and exchange only. The script below queries Toutiao's search API for the keyword 街拍 ("street snaps"), follows each returned article URL, extracts the gallery JSON embedded in the page, and downloads every image using a pool of worker processes.

#coding=utf-8

import json
import requests
import re
import os
from multiprocessing import Pool
from urllib.parse import urlencode
from fake_useragent import UserAgent
from hashlib import md5
from bs4 import BeautifulSoup

ua=UserAgent()  # random browser User-Agent strings

keyword="街拍"  # search keyword ("street snaps")

# Request one page of search results from the Toutiao search API
# and return the article URLs it contains
def get_page(offset):
    param={
        'offset': offset,
        'format':'json',
        'keyword':keyword,
        'autoload':'true',
        'count': 20
    }
    base="https://www.toutiao.com/api/search/content/?"
    url=base+urlencode(param)
    content=get_content(url)
    if content is None:
        return None
    data=json.loads(content)
    if data and "data" in data:
        article_list=data.get('data')
        return [item.get('article_url') for item in article_list]
    return None

# Append one JSON record per line to res.txt
def write_to_file(content):
    with open("res.txt","a",encoding="utf-8") as f:
        f.write(content)
# Parse an article page: extract the title and the gallery image URLs
def parse_page_image(url):
    content=get_content(url)
    if content is not None:
        # Extract the page title
        soup=BeautifulSoup(content,'lxml')
        res=soup.select('title')
        title=res[0].get_text()

        # The gallery data is embedded in the page as an escaped JSON string: gallery: JSON.parse("...")
        pattern=re.compile(r'gallery: JSON.parse\("(.*?)"\),',re.S)
        items=pattern.findall(content)
        for item in items:
            # Un-escape the captured string (e.g. \" -> ") before parsing it as JSON
            item=eval("'{}'".format(item))
            data=json.loads(item)
            if data and "sub_images" in data:
                img_urls=[img.get("url") for img in data.get('sub_images')]
                res={
                    'title':title,
                    'imgList':img_urls,
                    'url':url
                }
                write_to_file(json.dumps(res,ensure_ascii=False)+"\n")
                for img_url in img_urls:
                    get_img(img_url)

# Save image bytes under ./img, using the MD5 of the content as the file name
def save_img(content):
    img_dir=os.path.join(os.getcwd(),"img")
    os.makedirs(img_dir,exist_ok=True)  # create the img directory if it does not exist
    path_file="{0}/{1}.{2}".format(img_dir,md5(content).hexdigest(),"jpg")
    print(path_file)
    with open(path_file,"wb") as f:
        f.write(content)
# Download a remote image and save it
def get_img(url):
    try:
        headers={'User-Agent':ua.chrome}
        response=requests.get(url,headers=headers,timeout=10)
        if response.status_code==200:
            save_img(response.content)
    except requests.RequestException:
        pass
# Fetch a URL and return the response body as text, or None on failure
def get_content(url):
    try:
        headers={'User-Agent':ua.chrome}
        response=requests.get(url,headers=headers,timeout=10)
        if response.status_code==200:
            return response.text
        return None
    except requests.RequestException:
        return None

# Crawl one page of search results
def main(offset):
    items=get_page(offset)
    if items is not None:
        for item in items:
            parse_page_image(item)

if __name__=='__main__':
    pool=Pool()
    # each request returns count=20 results, so step the offset by 20
    pool.map(main,[i*20 for i in range(10)])
    pool.close()
    pool.join()
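For reference, the search URL that get_page builds with urlencode looks like the one below; the percent-escaped bytes are the UTF-8 encoding of the keyword 街拍. Whether the endpoint still returns usable JSON depends on Toutiao's current anti-crawling measures, so treat this as an illustration only.

from urllib.parse import urlencode

param={'offset': 0,'format':'json','keyword':'街拍','autoload':'true','count': 20}
print("https://www.toutiao.com/api/search/content/?"+urlencode(param))
# https://www.toutiao.com/api/search/content/?offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20

Each line that write_to_file appends to res.txt is a standalone JSON object with title, imgList and url keys, so the crawl results can be read back line by line; a minimal sketch:

import json

with open("res.txt",encoding="utf-8") as f:
    for line in f:
        record=json.loads(line)
        print(record['title'],len(record['imgList']),"images from",record['url'])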

Original article: https://rumenz.com/rumenbiji/python-requests-multiprocessing.html

Posted on 2019-10-21 22:22 by 入门小站