# Multi-threaded scraper for a jokes site (biedoul.com): saves each joke to a
# txt file and inserts it into a MySQL database.

import threading
import time, os, re, requests, pymysql
from bs4 import BeautifulSoup
from queue import Queue

class Producer(threading.Thread):
    """Worker thread that pulls listing-page URLs from ``page_queue``,
    scrapes each page, and pushes parsed joke records onto ``server_queue``
    as ``('text', title, center, zan, bs)`` tuples for the consumers.
    """

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400"
    }

    def __init__(self, page_queue, server_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue    # URLs still to scrape
        self.server_queue = server_queue  # parsed records for Consumer threads

    def run(self):
        # Drain the page queue; exit once no pages remain.
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.sepider_duanzi(url)

    def sepider_duanzi(self, url):
        """Fetch one listing page and enqueue every joke entry found on it.

        Network/HTTP errors are reported and skipped instead of killing the
        whole worker thread (the original let the exception propagate).
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('请求失败: %s (%s)' % (url, exc))
            return
        soup = BeautifulSoup(response.text, 'html.parser')

        for link in soup.select("dl[class='xhlist']"):
            def first_string(selector, link=link):
                # Text of the first match inside this entry, or 0 when absent
                # (0 kept for backward compatibility with the DB schema).
                found = link.select(selector)
                return found[0].string if found else 0

            strongs = link.select("strong")
            # Skip malformed entries: original crashed in re.sub when the
            # <strong> tag was missing or had no string content.
            if not strongs or strongs[0].string is None:
                continue
            # Strip characters that are illegal/awkward in filenames.
            title = re.sub(r'[/??\.。!!·]', '', strongs[0].string)

            dd = link.select("dd")
            center = dd[1].get_text().strip('\t') if len(dd) > 1 else ''
            if not center:
                center = '没有内容!'
            zan = first_string("p[class='zan']")
            bs = first_string("p[class='bs']")
            # BUG FIX: original put() targeted self.img_queue, an attribute
            # that never exists (AttributeError on every page).
            self.server_queue.put(('text', title, center, zan, bs))


class Consumer(threading.Thread):
    """Worker thread that drains ``server_queue``, writing each record to a
    txt file and inserting it into the MySQL ``duanzi.nr`` table.
    """

    def __init__(self, page_queue, server_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue      # watched only to decide when to stop
        self.server_queue = server_queue  # records produced by Producer threads

    def run(self):
        while True:
            # NOTE(review): this polling protocol can exit early if both
            # queues are momentarily empty while a producer is mid-request,
            # and get() can block forever if all producers died. A sentinel
            # value per producer would be more robust — original protocol
            # kept here to avoid changing the Producer contract.
            if self.server_queue.empty() and self.page_queue.empty():
                break
            filename, title, center, zan, bs = self.server_queue.get(block=True)
            self.file_name(filename, title, center)
            self.server_mysql(title, center, zan, bs)
            time.sleep(2)  # throttle DB/file writes

    # Write one joke to <filename>/<title>.txt
    def file_name(self, filename, title, center):
        """Create ``filename`` (if needed) and write ``center`` into
        ``<filename>/<title>.txt`` as UTF-8."""
        # exist_ok avoids the check-then-create race between worker threads.
        os.makedirs(filename, exist_ok=True)
        wj_name = os.path.join(filename, title) + '.txt'
        with open(wj_name, 'w', encoding='utf-8') as f:
            f.write(center)
            print('(%s)文件写入成功!!!' % title)

    # Insert one joke record into MySQL
    def server_mysql(self, title, center, zan, bs):
        """Insert one record into ``nr``; rolls back and reports on failure.

        Fixes over the original:
        - column/value order: original bound ``zan`` to the ``bs`` column and
          vice versa;
        - parameterized query instead of %-interpolation (SQL injection and
          quoting bugs on titles containing quotes);
        - connection is always closed (original leaked one per record);
        - catches pymysql errors only, not ``BaseException``.
        """
        con = pymysql.connect(host='127.0.0.1', user='root', password='root',
                              database='duanzi', charset='utf8')
        try:
            with con.cursor() as cursor:
                sql = "INSERT INTO nr(title,center,bs,zan) VALUES(%s,%s,%s,%s)"
                cursor.execute(sql, (title, center, bs, zan))
            con.commit()
        except pymysql.MySQLError as exc:
            con.rollback()
            print('执行错误error.....', exc)
        finally:
            con.close()


def main():
    """Queue the 99 listing-page URLs, then run 5 producer and 5 consumer
    threads and wait for them all to finish."""
    page_queue = Queue(100)   # bounded: holds the 99 listing URLs
    server_queue = Queue()    # unbounded: parsed records awaiting storage
    for i in range(1, 100):
        url = 'https://www.biedoul.com/index/%d/' % i
        print('正在下载%s....' % i)
        page_queue.put(url)

    threads = []
    for _ in range(5):
        t = Producer(page_queue, server_queue)
        t.start()
        threads.append(t)
    for _ in range(5):
        t = Consumer(page_queue, server_queue)
        t.start()
        threads.append(t)

    # BUG FIX: original discarded the thread handles and returned
    # immediately; join so main() only finishes when the scrape is done.
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()

  

# posted @ 2021-05-13 18:53  少喝点酒  阅读(144)  评论(0)