Web Crawler
import re
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException


def get_one_page(url):
    # Fetch one page of the board; return the HTML text, or None on any failure.
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    # Pull index, title, poster URL, actors and release time out of each <dd> block.
    pattern = re.compile(
        '<dd>.*?board-index.*?>(.*?)</i>.*?title="(.*?)".*?data-src="(.*?)".*?"star">(.*?)</p>.*?time">(.*?)</p>',
        re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'title': item[1],              # group 2 captures the title="" attribute
            'image': item[2],              # group 3 captures the lazy-loaded data-src URL
            'actor': item[3].strip()[3:],  # drop the leading "主演：" label
            'time': item[4].strip()[5:]    # drop the leading "上映时间：" label
        }


def main(offset):
    url = 'https://maoyan.com/board/6?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        return
    for item in parse_one_page(html):
        print(item)


if __name__ == '__main__':
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
    pool.close()
    pool.join()
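The regular expression does all of the extraction work, so it is worth checking the capture groups in isolation before hitting the network. The sketch below runs parse_one_page (as defined above) on a hand-written <dd> fragment; the fragment is only an approximation of the Maoyan board markup written for illustration, so treat its exact attribute layout as an assumption.

# Offline check of parse_one_page using a made-up <dd> fragment (not live Maoyan HTML).
sample_html = '''
<dd>
    <i class="board-index board-index-1">1</i>
    <a href="/films/1" title="某部电影" class="image-link">
        <img data-src="https://example.com/poster.jpg" alt="某部电影" class="board-img" />
    </a>
    <p class="star">主演：张三,李四</p>
    <p class="releasetime">上映时间：2019-10-01</p>
</dd>
'''

for item in parse_one_page(sample_html):
    print(item)
# Expected output for the assumed fragment:
# {'index': '1', 'title': '某部电影', 'image': 'https://example.com/poster.jpg',
#  'actor': '张三,李四', 'time': '2019-10-01'}

Because re.S makes '.' match newlines, each non-greedy '.*?' walks forward only as far as the next literal anchor ('</i>', 'title="', 'data-src="', and so on), which is why the five groups line up with index, title, image, star and time.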
