13天搞定Python分布式爬虫(第四天)
21-多线程的使用(源码在百度网盘对应目录):以下代码运行正常。
"""Multi-threaded crawler: fetch qiushibaike joke pages and save their text.

CrawlInfo worker threads pull URLs from url_queue and push page HTML onto
html_queue; ParseInfo worker threads extract the joke text and append it
to 段子.txt.
"""
from queue import Queue, Empty
from threading import Thread

import requests
from fake_useragent import UserAgent
from lxml import etree


class CrawlInfo(Thread):
    """Worker thread: download each URL from url_queue into html_queue."""

    def __init__(self, url_queue, html_queue):
        super().__init__()
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        headers = {'User-Agent': UserAgent().random}
        while True:
            # get_nowait() + Empty fixes the check-then-get race in the
            # original `while not empty(): get()` loop: a sibling worker
            # could drain the queue between the two calls, leaving get()
            # blocked forever.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            # A timeout keeps one hung connection from stalling the thread.
            resp = requests.get(url, headers=headers, timeout=10)
            self.html_queue.put(resp.text)


class ParseInfo(Thread):
    """Worker thread: parse HTML from html_queue and append joke text to a file."""

    def __init__(self, html_queue):
        super().__init__()
        self.html_queue = html_queue

    def run(self):
        while True:
            # Same race-free drain pattern as CrawlInfo.
            try:
                html = self.html_queue.get_nowait()
            except Empty:
                break
            e = etree.HTML(html)
            span_contents = e.xpath('//div[@class="content"]/span[1]')
            with open('段子.txt', 'a', encoding='utf-8') as f:
                for span in span_contents:
                    info = span.xpath('string(.)')
                    f.write(info + '\n')


def main():
    """Queue pages 1-9, crawl them with 3 threads, then parse with 3 threads."""
    url_queue = Queue()
    html_queue = Queue()
    base_url = 'https://www.qiushibaike.com/text/page/{}/'
    for i in range(1, 10):
        url_queue.put(base_url.format(i))

    crawl_list = [CrawlInfo(url_queue, html_queue) for _ in range(3)]
    for crawl in crawl_list:
        crawl.start()
    for crawl in crawl_list:
        crawl.join()

    # All crawlers have joined, so html_queue is fully populated before the
    # parsers start; an Empty queue is therefore a reliable stop signal.
    parse_list = [ParseInfo(html_queue) for _ in range(3)]
    for parse in parse_list:
        parse.start()
    for parse in parse_list:
        parse.join()


if __name__ == '__main__':
    main()
22-tesseract的使用
# OCR demo: recognize the characters in a captcha image with Tesseract.
from PIL import Image

import pytesseract

captcha = Image.open('yzm1.png')
# captcha = Image.open('yzm2.jpg')
recognized = pytesseract.image_to_string(captcha)
print(recognized)
运行结果:

C:\Users\xiongjiawei\PycharmProjects\Spider\venv\Scripts\python.exe C:/Users/xiongjiawei/PycharmProjects/Spider/13天搞定Python分布式爬虫/第04天/v02-tesseract的使用.py 7572 Process finished with exit code 0
23-云打码平台的使用
24-云打码登录
25-爬取图文并茂文章方法
26-selenium的使用
……