Python 多线程爬取案例

import csv
import re
import threading
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree


# Matches the backslash and forward-slash characters removed from every cell.
_SLASH_PATTERN = re.compile(r'\\|/')

# The same csv writer is handed to several pool threads (see main), and text
# file objects are not thread-safe, so rows are only written under this lock.
_WRITER_LOCK = threading.Lock()


def getOnePageData(url, writer, timeout=10):
	"""Download one listing page and append its table rows to a CSV writer.

	Every ``<tr>`` of the price table except the first one is turned into a
	list of its text nodes with ``\\`` and ``/`` characters removed. Each list
	is written with ``writer.writerow`` and echoed with ``print``.

	Args:
		url: Address of the page to download.
		writer: Object with a ``writerow`` method (e.g. ``csv.writer``); it
			may be shared with other threads calling this function.
		timeout: Seconds to wait for the server before giving up, so a
			stalled connection cannot block a worker thread forever.

	Raises:
		requests.RequestException: On connection errors, timeouts, or a
			non-2xx HTTP status.
	"""
	resp = requests.get(url, timeout=timeout)
	# Fail loudly on 4xx/5xx instead of parsing an error page as data.
	resp.raise_for_status()

	html = etree.HTML(resp.text)
	if html is None:
		# Nothing parseable came back (e.g. an empty body): no rows to write.
		return

	# [1:] skips the first row of the table (presumably the header row).
	trs = html.xpath('/html/body/div[2]/div[4]/div[1]/table//tr')[1:]
	rows = [
		[_SLASH_PATTERN.sub('', item) for item in tr.xpath('.//text()')]
		for tr in trs
	]

	# Write the whole page while holding the lock so concurrent workers cannot
	# interleave their output inside the shared writer / file.
	with _WRITER_LOCK:
		for row in rows:
			writer.writerow(row)
			print(row)


def main():
	"""Scrape listing pages 1-10 concurrently and save the rows to data.csv.

	One task per page is submitted to a pool of 10 threads; every task writes
	through the same csv writer. The output file is closed only after all
	tasks have finished, and any page that failed is reported on stdout.
	"""
	# The context manager guarantees data.csv is flushed and closed. The
	# explicit encoding keeps non-ASCII text in the scraped cells from
	# depending on the platform's default locale encoding.
	with open('data.csv', 'w', newline='', encoding='utf-8') as f:
		writer = csv.writer(f)
		futures = {}
		with ThreadPoolExecutor(10) as t:
			for i in range(10):
				url = "http://xinfadi.com.cn/marketanalysis/0/list/{0}.shtml".format(i+1)
				futures[t.submit(getOnePageData, url, writer)] = url

		# Leaving the executor's with-block waits for every task, so all
		# futures are done here. Surface worker errors instead of silently
		# dropping them, without aborting the pages that succeeded.
		for future, url in futures.items():
			error = future.exception()
			if error is not None:
				print("failed to fetch {0}: {1!r}".format(url, error))


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
	main()
posted @ 2021-04-16 17:36  程序员陈师兄cxycsx  阅读(77)  评论(0)  编辑  收藏  举报