Python爬虫:使用Selenium爬取指定上市公司(如浦发银行)的今年公告信息

 

 

 1 from selenium import webdriver#导入库
 2 from selenium.webdriver.common.keys import Keys
 3 from bs4 import BeautifulSoup
 4 import csv,time
 5 import os,re
 6 import requests
 7 import selenium.webdriver.support.ui as ui
 8 import  urllib
 9 
# Configure Chrome so that every file fetched during the crawl is saved to
# the D:\pufa directory instead of the default download location.
chromeOptions = webdriver.ChromeOptions()
prefs = {"download.default_directory": "D:\\pufa"}
chromeOptions.add_experimental_option("prefs", prefs)
# Pass the options via `options=`: the older `chrome_options=` keyword is a
# deprecated alias that recent Selenium releases no longer accept.
browser = webdriver.Chrome(options=chromeOptions)  # shared browser session

# Maps announcement title -> absolute detail-page URL; filled in by GainPage().
positon = {}
def enterinfo(keyword='浦发银行', start_date=None, end_date=None):
    """Open the cninfo disclosure search page and submit a filtered query.

    Types ``keyword`` into the title-keyword box, replaces the preset date
    range with ``start_date`` .. ``end_date`` and clicks the query button,
    leaving the result list loaded in the module-level ``browser``.

    Args:
        keyword: Text typed into the title-keyword field.
        start_date: First day of the range, formatted ``YYYY-MM-DD``.
            Defaults to 1 January of the current year, so the query keeps
            returning "this year's" announcements instead of being pinned
            to a fixed year.
        end_date: Last day of the range, formatted ``YYYY-MM-DD``.
            Defaults to today's local date.
    """
    # Local import: `By` is not imported at file level, and the legacy
    # find_element_by_* helpers are gone from current Selenium releases.
    from selenium.webdriver.common.by import By

    today = time.localtime()
    if start_date is None:
        start_date = time.strftime("%Y-01-01", today)
    if end_date is None:
        end_date = time.strftime("%Y-%m-%d", today)

    url = 'http://www.cninfo.com.cn/new/commonUrl?url=disclosure/list/search'
    browser.get(url)  # load the search page

    keyword_box = browser.find_element(
        By.CSS_SELECTOR, 'input[placeholder $= "标题关键字"]')
    keyword_box.send_keys(keyword)

    # Clear the date range the page pre-fills before typing our own.
    browser.find_element(By.CLASS_NAME, "el-range__close-icon").click()

    start_box = browser.find_element(
        By.CSS_SELECTOR, 'input[placeholder $= "开始日期"]')
    start_box.send_keys(start_date)

    end_box = browser.find_element(
        By.CSS_SELECTOR, 'input[placeholder $= "结束日期"]')
    end_box.send_keys(end_date)

    # Presumably gives the date picker time to accept the typed values
    # before the query is submitted.
    time.sleep(2)

    # Click the button in the filter bar (presumably the search/query button).
    browser.find_elements(
        By.XPATH,
        '//*[@id="main"]/div[2]/div[1]/div[2]/div[1]/div[2]/div[1]/button/span',
    )[0].click()

    time.sleep(2)  # wait two seconds for the result list to load
35 
def GainPage():
    """Collect announcement links from the result page currently displayed.

    Parses the page source of the module-level ``browser`` and, for every
    row of the result table, stores ``title -> absolute URL`` in the
    module-level ``positon`` dict (a later row with the same title
    overwrites the earlier entry). Rows that lack the expected link or an
    ``href`` are skipped instead of aborting the crawl. Sleeps two seconds
    before returning.
    """
    from urllib.parse import urljoin

    base_url = 'http://www.cninfo.com.cn'
    soup = BeautifulSoup(browser.page_source, 'lxml')

    table = soup.select_one('div.el-table__body-wrapper')
    if table is None:
        # The table has not rendered (or the layout changed); report it
        # rather than failing with an IndexError.
        print('result table not found on the current page')
        rows = []
    else:
        rows = table.select('tr.el-table__row')

    for row in rows:
        # The third table column holds the announcement title link.
        link = row.select_one('td.el-table_1_column_3 span.ahover a')
        href = link.get('href') if link is not None else None
        if not href:
            continue  # no usable link in this row
        # urljoin also copes with hrefs that lack a leading slash or are
        # already absolute, which plain concatenation would corrupt.
        positon[link.text] = urljoin(base_url, href)

    time.sleep(2)
# --- Crawl driver ------------------------------------------------------------
# 1. Submit the search, 2. walk the result pages collecting links into
# `positon`, 3. open every collected detail page and click its button
# (presumably the download button, given the download directory configured
# for the browser), 4. wait for downloads and close the browser.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By

PAGE_COUNT = 8  # result pages to scan (the original counter loop ran 8 times)
NEXT_PAGE_XPATH = '//*[@id="main"]/div[2]/div[1]/div[1]/div[3]/div/button[2]/i'
DOWNLOAD_XPATH = '//*[@id="noticeDetail"]/div/div[1]/div[3]/div[1]/button/span'


def _wait_for_downloads(directory, timeout=60):
    """Wait until ``directory`` holds no partial Chrome downloads.

    Chrome names in-progress downloads ``*.crdownload``; poll once per
    second until none remain or ``timeout`` seconds have passed. Returns
    immediately if the directory cannot be listed.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            names = os.listdir(directory)
        except OSError:
            return
        if not any(name.endswith('.crdownload') for name in names):
            return
        time.sleep(1)


try:
    enterinfo()

    for page in range(PAGE_COUNT):
        GainPage()
        if page == PAGE_COUNT - 1:
            break  # last page scanned; no need to advance further
        browser.find_elements(By.XPATH, NEXT_PAGE_XPATH)[0].click()
        # Give the table time to re-render; otherwise the next GainPage()
        # may parse the rows of the page we just left.
        time.sleep(2)

    print(len(positon))

    for it in positon.items():
        print(it)

    for title, url in positon.items():
        browser.get(url)  # open the announcement detail page
        try:
            # Wait for the button to exist instead of indexing an empty
            # list when the page has not finished rendering.
            buttons = ui.WebDriverWait(browser, 10).until(
                lambda drv: drv.find_elements(By.XPATH, DOWNLOAD_XPATH))
        except TimeoutException:
            print('download button not found, skipped:', title, url)
            continue
        buttons[0].click()
        time.sleep(1)  # let the download start before navigating away

    _wait_for_downloads(prefs["download.default_directory"])
finally:
    # Always release the browser and its driver process.
    browser.quit()

 

posted @ 2020-11-14 13:31  AsunaLGLL  阅读(616)  评论(0)  编辑  收藏  举报