Crawl Baidu Tieba thread titles and save them to a database

The script below builds the list-page URL for each page of a given Tieba forum, pulls the thread titles out of the HTML with a regular expression, and writes them to MySQL through pymysql.

import requests
import re
import time
import pymysql


class bdspider:
    def __init__(self, tiebaName, pages_Num):
        self.tiebaName = tiebaName
        self.pages_Num = pages_Num
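        # Tieba shows 50 threads per list page, so the pn offset advances in steps of 50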
        self.base_url = 'https://tieba.baidu.com/f?kw='+tiebaName+'&ie=utf-8&pn={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
        }
    
    # Build the URL of every list page to crawl
    def getlink(self):
        url_list = []
        for i in range(self.pages_Num):
            url_list.append(self.base_url.format(i*50))
        return url_list
    
    # Fetch one list page and return its parsed titles
    def get_pagesinfo(self, url):
        response = requests.get(url=url, headers=self.headers)
        return self.parse_pageInfo(response.content.decode('utf-8'))

    # Extract thread titles from the list-page HTML; the pattern targets
    # Tieba's thread-list markup and will break if the page layout changes
    # (a quick offline check of the pattern is sketched after the listing)
    def parse_pageInfo(self, html):
        pattern = re.compile('<li class=" j_thread_list clearfix".*?<a rel="noreferrer".*?title="(.*?)".*?</a>', re.S)
        return pattern.findall(html)
    
    # Main loop: build the URLs, then fetch, parse, print, and store each page
    def run(self):
        url_list = self.getlink()
        for url in url_list:
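            # pause between requests so we do not hammer the server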
            time.sleep(1)
            page_Info = self.get_pagesinfo(url)
            print(page_Info)
            self.save_to_mysql(page_Info)
            
    # Save the extracted titles to MySQL
    def save_to_mysql(self, page_Info):
        # connect to the database (the schema is sketched after the listing);
        # utf8mb4 keeps Chinese titles intact
        conn = pymysql.connect(host='localhost', user='root', passwd='root123',
                               db='baidu', port=3306, charset='utf8mb4')

        # cursor object
        cursor = conn.cursor()

        # insert each title with a parameterized query so that quotes in a
        # title cannot break the statement or inject SQL
        for title in page_Info:
            cursor.execute("insert into title(title) values(%s)", (title,))
        conn.commit()

        # close the cursor and the connection
        cursor.close()
        conn.close()

if __name__ == "__main__":
    spider = bdspider("lol", 5)
    spider.run()
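
To run this, the baidu database needs a title table. A minimal one-time setup sketch, assuming the same root credentials as above and a schema invented here to match the INSERT statement (a single auto-increment key plus the title column):

import pymysql

# one-time setup: create the database and table the spider writes to
conn = pymysql.connect(host='localhost', user='root', passwd='root123',
                       port=3306, charset='utf8mb4')
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS baidu DEFAULT CHARACTER SET utf8mb4")
cursor.execute(
    "CREATE TABLE IF NOT EXISTS baidu.title ("
    "  id INT AUTO_INCREMENT PRIMARY KEY,"
    "  title VARCHAR(255) NOT NULL"
    ") DEFAULT CHARSET=utf8mb4"
)
conn.commit()
cursor.close()
conn.close()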

 

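Since the parsing hinges on one regular expression, it helps to check it offline before crawling. A minimal sketch against a hypothetical fragment shaped like Tieba's markup (the href and title here are made up):

import re

# hypothetical fragment mimicking one entry of Tieba's thread list
html = ('<li class=" j_thread_list clearfix" data-field="{}">'
        '<a rel="noreferrer" href="/p/123456" title="example thread title">'
        'example thread title</a></li>')

pattern = re.compile('<li class=" j_thread_list clearfix".*?<a rel="noreferrer".*?title="(.*?)".*?</a>', re.S)
print(pattern.findall(html))  # ['example thread title']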