import requests
import re
import time
import pymysql
class bdspider:
    def __init__(self, tiebaName, pages_Num):
        self.tiebaName = tiebaName
        self.pages_Num = pages_Num
        self.base_url = 'https://tieba.baidu.com/f?kw=' + tiebaName + '&ie=utf-8&pn={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
        }
    # Build the URL for each listing page (each page shows 50 threads, hence pn = i * 50)
    def getlink(self):
        url_list = []
        for i in range(self.pages_Num):
            url_list.append(self.base_url.format(i * 50))
        return url_list
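    # For example, with tiebaName='lol' and pages_Num=2 this yields:
    #   https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=0
    #   https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=50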
    # Fetch one listing page and hand the decoded HTML to the parser
    def get_pagesinfo(self, url):
        response = requests.get(url=url, headers=self.headers)
        return self.parse_pageInfo(response.content.decode('utf-8'))
    # Extract every thread title from the listing page
    def parse_pageInfo(self, html):
        pattern = re.compile('<li class=" j_thread_list clearfix".*?<a rel="noreferrer".*?title="(.*?)".*?</a>', re.S)
        return re.findall(pattern, html)
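    # The pattern targets markup roughly like the following (an assumption about
    # Tieba's listing HTML, inferred from the regex itself), capturing the title
    # attribute of each thread link:
    #   <li class=" j_thread_list clearfix" ...>
    #       ... <a rel="noreferrer" ... title="Thread title here" ...>...</a>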
    # Main crawl loop: fetch each page, print the titles, then persist them
    def run(self):
        url_list = self.getlink()
        for url in url_list:
            time.sleep(1)  # be polite: pause between requests
            page_Info = self.get_pagesinfo(url)
            print(page_Info)
            self.save_to_mysql(page_Info)
    # Save the titles to MySQL
    def save_to_mysql(self, page_Info):
        # Connect to the database (charset ensures Chinese titles survive the round trip)
        conn = pymysql.connect(host='localhost', user='root', passwd='root123',
                               db='baidu', port=3306, charset='utf8mb4')
        # Cursor object
        cursor = conn.cursor()
        # Insert rows; a parameterized query avoids SQL injection and broken quoting
        for title in page_Info:
            cursor.execute("INSERT INTO title(title) VALUES (%s)", (title,))
        conn.commit()
        # Close the cursor and connection
        cursor.close()
        conn.close()
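# A minimal sketch of the table this script writes to, assuming the `baidu`
# database already exists (the schema is not part of the original script):
#   CREATE TABLE title (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       title VARCHAR(255) NOT NULL
#   ) CHARACTER SET utf8mb4;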
if __name__ == "__main__":
    spider = bdspider("lol", 5)
    spider.run()