import os

import requests
from lxml import etree
def create_request(page):
    """Fetch one listing page of the news section and return the Response.

    Page 1 lives at the section root; later pages follow the site's
    ``index_<page>.html`` naming scheme.

    :param page: 1-based listing-page number.
    :return: the ``requests.Response`` for the listing page.
    """
    if page == 1:
        url = 'http://www.zhb.org.cn/hbzx/news_2'
    else:
        url = 'http://www.zhb.org.cn/hbzx/news_2/index_' + str(page) + '.html'
    headers = {
        'Host': 'www.zhb.org.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    # timeout keeps a stalled connection from hanging the whole crawl
    return requests.get(url, headers=headers, timeout=30)
def get_content(request):
    """Return the raw response body (bytes) of a fetched page."""
    return request.content
def get_hreflist(content):
    """Extract article links from a listing page and make them absolute.

    Placeholder ``javascript:;`` anchors are dropped; the remaining
    site-relative hrefs are prefixed with the site origin.

    :param content: raw HTML bytes of a listing page.
    :return: list of absolute article URLs.
    """
    tree = etree.HTML(content)
    href_list = tree.xpath('//div[@class="newsbox_2"]//li/a/@href')
    base = 'http://www.zhb.org.cn'
    # one pass: filter dead javascript anchors and absolutize in a comprehension
    # (replaces the original filter pass + index loop + append)
    return [base + href for href in href_list if href != "javascript:;"]
def download_text(url_list):
    """Download each news article in *url_list* and save its text to disk.

    The article title becomes the file name (with ``/`` stripped so it is a
    valid path component); the body paragraphs are written newline-separated.
    Failures are counted and reported but never abort the batch.

    :param url_list: iterable of absolute article URLs.
    :return: None.
    """
    save_dir = './中国环境保护协会/新闻/'
    # create the target directory up front instead of crashing on the first write
    os.makedirs(save_dir, exist_ok=True)
    # headers are loop-invariant — build them once.
    # NOTE: the original also sent hard-coded If-Modified-Since/If-None-Match
    # headers; those can elicit an empty 304 response that breaks parsing,
    # so they are deliberately omitted here.
    headers = {
        'Host': 'www.zhb.org.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    failed_page_num = 0
    for url in url_list:
        try:
            response = requests.get(url, headers=headers, timeout=30)
            tree = etree.HTML(response.content)
            name = tree.xpath('//div[@class="news_titlenr"]/text()')[0] + '.txt'
            name = name.replace("/", "")  # '/' would act as a path separator
            text = tree.xpath('//div[@class="news_nrbox"]//p/text()')
            # each paragraph prefixed with '\n', matching the original output format
            result = ''.join('\n' + t for t in text)
            with open(save_dir + name, 'w', encoding='utf-8') as fp:
                fp.write(result)
        except Exception:  # was a bare except: — don't swallow KeyboardInterrupt
            failed_page_num += 1
            print("{} pages failed in this page".format(failed_page_num))
if __name__ == '__main__':
    # Crawl listing pages 2..263 inclusive, saving every article found.
    FIRST_PAGE, LAST_PAGE = 2, 263
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        listing = create_request(page)        # fetch listing page `page`
        html = get_content(listing)           # raw HTML of that listing
        article_urls = get_hreflist(html)     # every article link on the page
        download_text(article_urls)           # save each article's text
        print('第' + str(page) + '页下载完成')