from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from lxml import etree
import urllib.request
import os
import time

def gethreflist(url):
    # Set up Chrome options: run headless and send a browser-like User-Agent.
    # (Selenium does not take a headers dict; the User-Agent has to be passed
    # through the browser options instead.)
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
    )
    # Initialize the WebDriver and open the URL passed in by the caller
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url)
    # Scroll to the end of the page
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
    # Click the "more content" button until it can no longer be found
    while True:
        try:
            more_content_button = driver.find_element(By.XPATH, '//a[@class="getMore"]')
            more_content_button.click()
            time.sleep(1)  # give the newly requested items a moment to render
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        except NoSuchElementException:
            break
    # Grab the final page source after all "more content" clicks
    content = driver.page_source
    # Close the WebDriver
    driver.quit()
    # Parse out the article links, drop the javascript placeholders,
    # and prepend the site root to turn them into absolute URLs
    tree = etree.HTML(content)
    href_list = tree.xpath('//div[@class="sideL fl"]//a[@class="title"]/@href')
    href_list = [item for item in href_list if item != "javascript:;"]
    base_url = 'http://www.chinaenvironment.com'
    url_list = [base_url + href for href in href_list]
    return url_list
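
# Optional refinement (not in the original script): the bare find/click loop in
# gethreflist() can race the page's asynchronous loading. Below is a sketch using
# Selenium's explicit waits instead; the "getMore" XPath is carried over from
# above, and the 10-second timeout is an assumed value.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def click_all_more_buttons(driver, timeout=10):
    """Click the "getMore" link until it stops appearing within `timeout` seconds."""
    while True:
        try:
            button = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable((By.XPATH, '//a[@class="getMore"]'))
            )
            button.click()
        except TimeoutException:
            break  # no clickable "more" link turned up in time; assume we are done
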
def download_text(url_list):
    failed_page_num = 0
    for url in url_list:
        try:
            # Note: 'Accept-Encoding: gzip, deflate' is omitted on purpose;
            # urllib does not decompress responses automatically.
            headers = {
                'Accept': 'text/html, */*; q=0.01',
                'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
                'Cookie': 'ASP.NET_SessionId=ycrocaebez3wg5fvn30v1mjv',
                'Host': 'www.chinaenvironment.com',
                'Proxy-Connection': 'keep-alive',
                'Referer': 'http://www.chinaenvironment.com/zxxwlb/index_123_114250.html',
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest'
            }
            request = urllib.request.Request(url=url, headers=headers)
            response = urllib.request.urlopen(request)
            content = response.read()
            tree = etree.HTML(content)
            # Use the article title (slashes stripped) as the file name
            name = tree.xpath('//div[@class="articleTit"]/text()')[0] + '.txt'
            name = name.replace("/", "")
            save_path = './环保网/新闻/' + name
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            # Join the article body spans into one newline-separated string
            text = tree.xpath('//div[@class="edits"]//span/text()')
            result = '\n'.join(text)
            with open(save_path, 'w', encoding='utf-8') as fp:
                fp.write(result)
        except Exception:
            failed_page_num += 1
            print("{} pages failed so far".format(failed_page_num))
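
# Hypothetical wrapper (an assumption, not part of the original flow): space the
# requests out so the server is not hit back to back. download_text() already
# swallows per-page failures, so reusing it one URL at a time is safe.
def download_text_politely(url_list, delay_seconds=1.0):
    for url in url_list:
        download_text([url])       # reuse the existing downloader for a single URL
        time.sleep(delay_seconds)  # fixed pause between requests
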
if __name__ == '__main__':
    url = "http://www.chinaenvironment.com/zxxwlb/index_123.html"
    url_list = gethreflist(url)   # collect every news link on the main page
    download_text(url_list)       # download the news text behind each link
    print('download complete!!')