import os
import re
import threading
import time
from queue import Empty, Queue

import pymysql
import requests
from bs4 import BeautifulSoup
class Producer(threading.Thread):
    """Worker thread that downloads listing pages and extracts their entries.

    A producer repeatedly takes a page URL from ``page_queue``, scrapes every
    ``dl.xhlist`` element on that page, and puts one
    ``('text', title, center, zan, bs)`` tuple per entry onto
    ``server_queue``. The thread finishes once ``page_queue`` is drained.
    """

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400"
    }

    def __init__(self, page_queue, server_queue, *args, **kwargs):
        """Store the input queue of URLs and the output queue of records.

        Extra positional/keyword arguments are forwarded to
        ``threading.Thread``.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.server_queue = server_queue

    def run(self):
        """Scrape pages until ``page_queue`` has no more URLs."""
        while True:
            # get_nowait() instead of empty()+get(): with several producers
            # another thread can take the last URL between the two calls,
            # which would leave this thread blocked in get() forever.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            try:
                self.sepider_duanzi(url)
            except requests.RequestException as exc:
                # One unreachable page must not stop this worker from
                # processing the remaining URLs.
                print('download failed %s: %s' % (url, exc))

    @staticmethod
    def _clean_title(title):
        """Return ``title`` with the punctuation stripped by the scraper removed."""
        return re.sub(r'[/??\.。!!·]', '', title)

    @staticmethod
    def _first_string(entry, selector):
        """Return ``.string`` of the first match of ``selector``, or 0 if none."""
        matches = entry.select(selector)
        if not matches:
            return 0
        return matches[0].string

    def sepider_duanzi(self, url):
        """Download ``url`` and queue one record per ``dl.xhlist`` entry.

        Raises:
            requests.RequestException: if the page cannot be downloaded.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        for entry in soup.select("dl[class='xhlist']"):
            strong_tags = entry.select("strong")
            dd_tags = entry.select("dd")
            if not strong_tags or len(dd_tags) < 2:
                # Entry lacks the title or the content cell; skip it rather
                # than abort the whole page with an IndexError.
                continue
            # get_text() rather than .string: .string is None when the tag
            # has nested markup, which would make re.sub() raise.
            title = self._clean_title(strong_tags[0].get_text())
            raw_center = dd_tags[1].get_text()
            center = raw_center.strip('\t') if raw_center else '没有内容!'
            zan = self._first_string(entry, "p[class='zan']")
            bs = self._first_string(entry, "p[class='bs']")
            # The first element is the directory name the consumer writes to.
            self.server_queue.put(('text', title, center, zan, bs))
class Consumer(threading.Thread):
    """Worker thread that persists scraped records to disk and to MySQL.

    A consumer takes ``(filename, title, center, zan, bs)`` tuples from
    ``server_queue``, writes ``center`` to ``<filename>/<title>.txt`` and
    inserts the record into the ``nr`` table.
    """

    # Seconds to wait for a new record before checking whether any pages are
    # still pending. This is a heuristic: it should exceed the time a
    # producer needs to fetch and parse one page.
    idle_timeout = 15

    def __init__(self, page_queue, server_queue, *args, **kwargs):
        """Store the queue of pending URLs and the queue of scraped records.

        Extra positional/keyword arguments are forwarded to
        ``threading.Thread``.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.server_queue = server_queue

    def run(self):
        """Persist records until both queues stay empty for ``idle_timeout``."""
        while True:
            # A timed get() replaces the empty()-then-blocking-get() pattern:
            # several consumers could pass the empty() check for the same
            # last record and all but one would then block forever.
            try:
                record = self.server_queue.get(timeout=self.idle_timeout)
            except Empty:
                if self.page_queue.empty():
                    break
                continue
            filename, title, center, zan, bs = record
            self.file_name(filename, title, center)
            self.server_mysql(title, center, zan, bs)
            time.sleep(2)

    def file_name(self, filename, title, center):
        """Write ``center`` to ``<filename>/<title>.txt`` (UTF-8).

        ``filename`` is the target directory; it is created when missing.
        """
        # exist_ok avoids FileExistsError when two consumer threads try to
        # create the directory at the same time.
        os.makedirs(filename, exist_ok=True)
        wj_name = os.path.join(filename, title) + '.txt'
        with open(wj_name, 'w', encoding='utf-8') as f:
            f.write(center)
        print('(%s)文件写入成功!!!' % title)

    def server_mysql(self, title, center, zan, bs):
        """Insert one record into the ``nr`` table of the ``duanzi`` database.

        Errors raised while executing or committing the statement are
        reported on stdout and the transaction is rolled back; a failure to
        connect propagates to the caller.
        """
        # Kept because the original method exposed these as instance attributes.
        self.title = title
        self.center = center
        self.zan = zan
        self.bs = bs
        # Placeholders let the driver escape the values; scraped text
        # routinely contains quotes that would break a hand-built statement.
        sql = "INSERT INTO nr(title,center,bs,zan) VALUES(%s,%s,%s,%s)"
        # Value order follows the column list (bs before zan).
        params = (title, center, bs, zan)
        # Keyword arguments: PyMySQL 1.0+ no longer accepts positional ones.
        con = pymysql.connect(host='127.0.0.1', user='root', password='root',
                              database='duanzi', charset='utf8')
        try:
            with con.cursor() as cursor:
                cursor.execute(sql, params)
            con.commit()
        except Exception as exc:
            # Best effort: report and carry on with the next record.
            print(sql, params, exc)
            con.rollback()
            print('执行错误error.....')
        finally:
            con.close()
def main():
    """Queue the listing page URLs and start the scraper threads.

    Pages 1 through 99 are enqueued first; then five ``Producer`` threads
    and five ``Consumer`` threads are started, all sharing the same pair of
    queues.
    """
    pending_pages = Queue(100)
    scraped_records = Queue()

    for page_no in range(1, 100):
        print('正在下载%s....' % page_no)
        pending_pages.put('https://www.biedoul.com/index/%d/' % page_no)

    # Producers are started before consumers, five of each.
    for worker_cls in (Producer, Consumer):
        for _ in range(5):
            worker_cls(pending_pages, scraped_records).start()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()