from lxml import etree
import time
import json
import urllib.request
item_list = []  # Accumulates one dict per scraped article; filled by parse_content, dumped by main
# Build the Request object for one listing page.
def handler_request(url, page):
    """Return a urllib Request for ``url + str(page)`` with a browser User-Agent.

    Args:
        url: Base URL, ending where the page number should be appended.
        page: Page number to fetch (converted to ``str`` and appended).

    Returns:
        urllib.request.Request carrying a spoofed desktop-Chrome User-Agent
        header so the site does not reject the default urllib agent.
    """
    headers = {
        # Implicit adjacent-string concatenation instead of a backslash
        # line-continuation inside the literal: the continuation leaked the
        # next line's leading whitespace into the middle of the header value.
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) Apple"
                      "WebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/75.0.3770.100 Safari/537.36"
    }
    get_url = url + str(page)
    return urllib.request.Request(url=get_url, headers=headers)
# Parse one page of fetched HTML.
def parse_content(content):
    """Extract the title and body of every article on one listing page.

    Args:
        content: Decoded HTML source of a listing page.

    Side effects:
        Appends one ``{'标题': ..., '内容': ...}`` dict per article to the
        module-level ``item_list``.
    """
    tree = etree.HTML(content)
    article_list = tree.xpath('//main[@class="col-md-8 main-content"]/article')
    for article in article_list:
        # xpath() returns a (possibly empty) list; skip articles without a
        # title node instead of crashing with IndexError on [0].
        titles = article.xpath('.//div[@class="post-head"]/h1/a/text()')
        if not titles:
            continue
        # Join the paragraph text fragments into a single body string.
        paragraphs = article.xpath('.//div[@class="post-content"]/p/text()')
        item_list.append({
            '标题': titles[0],
            '内容': '\n'.join(paragraphs),
        })
def main():
    """Crawl the requested page range and dump the results to duanzi.txt.

    Prompts for a start and end page, fetches each listing page, parses it
    into ``item_list``, then writes the whole list as one JSON string.
    """
    start_page = int(input("请输入查询起始页面:"))
    end_page = int(input("查询结束页面:"))
    url = "http://duanziwang.com/page/"
    for page in range(start_page, end_page + 1):
        request = handler_request(url, page)
        try:
            # `with` closes the response deterministically (a bare urlopen
            # leaks the socket until GC); the timeout keeps one stalled page
            # from hanging the whole crawl.
            with urllib.request.urlopen(request, timeout=10) as response:
                content = response.read().decode()
            parse_content(content)
        except Exception:
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; best-effort per-page behavior
            # is kept — one failed page does not abort the crawl.
            print("第%d页面爬取失败" % page)
    string = json.dumps(item_list, ensure_ascii=False)
    with open('duanzi.txt', "w", encoding='utf-8') as f:
        f.write(string)
# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()