#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import json
class BtcSpider(object):
    """Scrape thread titles and links from the 8btc.com forum-61 listing.

    Listing pages are fetched one by one, every ``<a class="s xst">`` anchor
    is turned into a ``{'name': ..., 'url': ...}`` record appended to
    ``data_list``, and the accumulated records are written to ``05btc.json``.
    """

    # Seconds to wait on the server before giving up, so that a stalled
    # connection cannot hang the crawl indefinitely.
    timeout = 10

    def __init__(self):
        """Set up the listing URL prefix, request headers and result list."""
        # Page URLs are built as base_url + <page number> + '.html'.
        self.base_url = 'https://8btc.com/forum-61-'
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/63.0.3239.26 Safari/537.36 '
                'Core/1.63.6823.400 QQBrowser/10.3.3117.400'
            )
        }
        # Records collected so far, one dict per thread link.
        self.data_list = []

    # Send the request
    def get_response(self, url):
        """Fetch *url* and return the response body decoded as GBK text.

        Args:
            url: Absolute URL of the listing page to download.

        Returns:
            The page body as ``str``.

        Raises:
            requests.exceptions.RequestException: on connection errors or
                when the server does not answer within ``timeout`` seconds.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.timeout
        )
        # The original author noted the pages declare GBK in their
        # <head>/<meta charset>.  Undecodable bytes are replaced instead of
        # raising, so one stray byte does not abort the whole crawl.
        return response.content.decode('gbk', errors='replace')

    # Parse the page
    def parse_data(self, data):
        """Extract thread links from one page and append them to ``data_list``.

        Each ``<a class="s xst">`` element yields one record whose ``'name'``
        is the anchor's own text and whose ``'url'`` is its ``href``
        attribute (``None`` when the attribute is absent).  Title and link
        are read from the same element, so they can never get out of step.

        Args:
            data: HTML source of a listing page as ``str``.
        """
        # Nothing to parse for an empty or whitespace-only page.
        if not data or not data.strip():
            return

        tree = etree.HTML(data)
        # Guard in case the parser produced no root element.
        if tree is None:
            return

        for anchor in tree.xpath('//a[@class="s xst"]'):
            # Join the anchor's direct text nodes so that exactly one title
            # is produced per anchor, even if it has zero or several nodes.
            title = ''.join(anchor.xpath('./text()'))
            self.data_list.append({
                'name': title,
                'url': anchor.get('href'),
            })

    # Save the data
    def save_data(self):
        """Write ``data_list`` to ``05btc.json`` as UTF-8 encoded JSON."""
        # ensure_ascii=False keeps non-ASCII titles human-readable; the
        # explicit encoding makes the output independent of the platform's
        # default locale encoding.
        data_str = json.dumps(self.data_list, ensure_ascii=False)
        with open('05btc.json', 'w', encoding='utf-8') as f:
            f.write(data_str)

    # Entry point
    def run(self):
        """Crawl listing pages 1 through 9, then save everything collected."""
        for page in range(1, 10):
            # Build the full page URL, e.g. .../forum-61-3.html
            url = self.base_url + str(page) + '.html'
            data = self.get_response(url)
            self.parse_data(data)
        # Persist the accumulated records once all pages are processed.
        self.save_data()
# Start the crawl only when this file is executed as a script, so that
# importing the module (e.g. to reuse BtcSpider) does not trigger network
# requests and a file write as a side effect.
if __name__ == '__main__':
    BtcSpider().run()