# Web crawler (爬虫): fetches a Maoyan movie board page by page and prints each entry.

import requests
from requests.exceptions import RequestException
import re
from multiprocessing import Pool


def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled connection
            would block the caller indefinitely.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or any ``RequestException`` such as a
        connection error or timeout).
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Best-effort fetch: network failures are reported as "no page".
        return None
    if response.status_code != 200:
        return None
    return response.text


# Compiled once at import time. re.S lets '.' match newlines so one pattern
# can span a whole multi-line <dd> entry. Capture groups, in order:
# board index, title attribute, data-src attribute, "star" paragraph,
# and the paragraph whose class ends in "time".
_MOVIE_PATTERN = re.compile(
    '<dd>.*?board-index.*?>(.*?)</i>.*?title="(.*?)".*?data-src="(.*?)".*?"star">(.*?)</p>.*?time">(.*?)</p>', re.S)


def prase_one_page(html):
    """Yield one dict per ``<dd>`` movie entry found in *html*.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example after a failed download) yields nothing instead of
            raising.

    Yields:
        Dicts with the keys ``index``, ``image``, ``title``, ``actor`` and
        ``time``, all string values taken from the regex capture groups.
    """
    if not html:
        return
    for index, title, image, star, release_time in _MOVIE_PATTERN.findall(html):
        yield {
            'index': index,
            # The pattern captures title="..." before data-src="...", so the
            # image URL is the third group and the title the second.
            'image': image,
            'title': title,
            # Drops the first 3 characters after stripping whitespace
            # (presumably a leading label such as "主演：" - confirm
            # against the live page markup).
            'actor': star.strip()[3:],
            # Drops the first 5 characters after stripping whitespace
            # (presumably a leading label such as "上映时间：" - confirm
            # against the live page markup).
            'time': release_time.strip()[5:]
        }


def main(offset):
    """Fetch one board page and print every movie entry parsed from it.

    Args:
        offset: Value placed in the ``offset`` query parameter of the board
            URL (the script entry point passes multiples of 10).
    """
    url = 'https://maoyan.com/board/6?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on a failed download; passing that to
        # the parser would raise TypeError, so skip this page instead.
        return
    for item in prase_one_page(html):
        print(item)


if __name__ == '__main__':
    # The context manager shuts the worker pool down once map() has returned,
    # so no worker processes are left behind.
    with Pool() as pool:
        # One main() call per offset: 0, 10, ..., 90.
        pool.map(main, range(0, 100, 10))

 

# Posted 2019-10-05 20:15 by 希望の曙光 (views: 96, comments: 0)