import os
import urllib.request

from lxml import etree
def create_request(page):
    """Build a ``urllib.request.Request`` for one listing page of the
    domestic-news section (国内新闻) of www.chinaeol.net.

    Page 1 lives at the section root; later pages follow the site's
    ``index_<page>.shtml`` naming scheme.

    :param page: 1-based listing-page number.
    :return: a ``Request`` carrying browser-like headers.
    """
    if page == 1:
        url = 'http://www.chinaeol.net/hjxw/gnxw'
    else:
        url = f'http://www.chinaeol.net/hjxw/gnxw/index_{page}.shtml'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        # Accept-Encoding is deliberately NOT sent: advertising gzip would
        # make the server compress the body, which response.read() would
        # return undecoded.
        'Cookie': 'Hm_lvt_0960aaf0c90823ef3de3f164788e264f=1695037542; Hm_lpvt_0960aaf0c90823ef3de3f164788e264f=1695037542; Hm_lvt_fb9e17abd59ff9a2c324890c5a701eca=1695037543; Hm_lvt_2ed05369c38555b813edc07a4dc8e126=1695037543; Hm_lpvt_fb9e17abd59ff9a2c324890c5a701eca=1695038268; Hm_lpvt_2ed05369c38555b813edc07a4dc8e126=1695038268',
        'Host': 'www.chinaeol.net',
        'Proxy-Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    }
    return urllib.request.Request(url=url, headers=headers)
def get_content(request):
    """Fetch *request* and return the raw response body as ``bytes``.

    The response is wrapped in a context manager so the underlying
    socket is always closed — the original left it open (leak).

    :param request: a prepared ``urllib.request.Request``.
    :return: undecoded response body.
    """
    with urllib.request.urlopen(request) as response:
        return response.read()
def get_hreflist(content):
    """Extract article URLs from one listing page.

    :param content: raw HTML bytes of a listing page.
    :return: list of absolute article URLs; ``javascript:;`` placeholder
        anchors are filtered out.
    """
    tree = etree.HTML(content)
    hrefs = tree.xpath('//ul[@class="cj_tianlibu"]//a/@href')
    base = 'http://www.chinaeol.net/hjxw/gnxw/'
    # Plain concatenation (not urljoin), matching the original: relative
    # hrefs such as './2023/....shtml' keep their leading './'.
    # NOTE(review): presumably the server tolerates that form — confirm.
    return [base + href for href in hrefs if href != "javascript:;"]
def download_text(url_list):
    """Download every article in *url_list* and save its text to disk.

    For each URL the page is fetched, the title (span ``toptitle``)
    becomes the file name, and the body text (spans inside the
    ``TRS_Editor`` div) is written to
    ``./生态环境部宣传教育中心/国内新闻/<title>.txt``.

    A failure on one article is counted and reported rather than
    aborting the whole batch (this restores the intent of the
    commented-out try/except in the original).

    :param url_list: absolute article URLs, e.g. from get_hreflist().
    """
    save_dir = './生态环境部宣传教育中心/国内新闻/'
    # The original crashed on open() if the directory did not exist.
    os.makedirs(save_dir, exist_ok=True)
    # Hoisted out of the loop: the headers are identical for every request.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        # Accept-Encoding deliberately omitted — see create_request().
        'Cookie': 'Hm_lvt_0960aaf0c90823ef3de3f164788e264f=1695037542; Hm_lpvt_0960aaf0c90823ef3de3f164788e264f=1695037542; Hm_lvt_fb9e17abd59ff9a2c324890c5a701eca=1695037543; Hm_lvt_2ed05369c38555b813edc07a4dc8e126=1695037543; Hm_lpvt_fb9e17abd59ff9a2c324890c5a701eca=1695038268; Hm_lpvt_2ed05369c38555b813edc07a4dc8e126=1695038268',
        'Host': 'www.chinaeol.net',
        'Proxy-Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    }
    failed_page_num = 0
    for url in url_list:
        try:
            request = urllib.request.Request(url=url, headers=headers)
            with urllib.request.urlopen(request) as response:
                content = response.read()
            tree = etree.HTML(content)
            # '/' in a title would be read as a path separator — strip it.
            # IndexError here (no title found) is handled as a failed page.
            name = tree.xpath('//span[@class="toptitle"]/text()')[0].replace('/', '') + '.txt'
            text = tree.xpath('//div[@class="TRS_Editor"]//span/text()')
            # Same output as the original accumulation loop ('\n' before
            # every fragment), but linear instead of quadratic.
            result = ''.join('\n' + t for t in text)
            # Explicit UTF-8: the locale default may not be able to encode
            # the Chinese article text.
            with open(save_dir + name, 'w', encoding='utf-8') as fp:
                fp.write(result)
        except (OSError, IndexError) as exc:  # URLError/HTTPError subclass OSError
            failed_page_num += 1
            print('{} pages failed in this batch ({}: {})'.format(failed_page_num, url, exc))
if __name__ == '__main__':
    # Inclusive page range to crawl.
    start_page = 1
    end_page = 1
    for page_num in range(start_page, end_page + 1):
        # Request the listing page, grab its HTML, then collect the
        # article links it contains.
        listing_request = create_request(page_num)
        listing_html = get_content(listing_request)
        article_urls = get_hreflist(listing_html)
        # download_text(article_urls)  # download every article on this page
        print('第' + str(page_num) + '页下载完成')