from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from lxml import etree
import time
# Chrome launch options: pass a fixed desktop-Chrome user-agent string
# (Chrome 99 on 64-bit Windows 10) as a command-line argument.
option = ChromeOptions()
option.add_argument(
'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36"'
)
# A single Chrome session is started as soon as this module is loaded;
# get_content() drives it through this module-level name.
browser = webdriver.Chrome(options=option)
browser.maximize_window() # Maximize the browser window.
def get_content(keyword):
    """Search the corpus site for *keyword* and save every page of results.

    Loads the search page in the module-level ``browser``, types *keyword*
    into the search box, selects the ``RadioButtonLIKE`` option and submits
    with ENTER. It then walks the result pages: each page's rows are passed
    to ``save_keyword_info`` and the '下一页' (next page) link is clicked
    until the page no longer has one.

    If the very first result page contains no rows, the keyword is passed to
    ``invalid_keyword`` instead.

    Args:
        keyword: Text typed into the search box.

    Raises:
        selenium.common.exceptions.WebDriverException: if the search form
            cannot be driven, or the results panel does not appear within
            8 seconds of submitting. Errors raised later, while paging, are
            reported on stdout and end the scrape of this keyword only.
    """
    url = 'http://corpus.zhonghuayuwen.org/ACindex.aspx'
    browser.get(url)
    # find_element(By.ID, ...) is used instead of find_element_by_id, which
    # no longer exists in Selenium 4.3 and later.
    input_tag = browser.find_element(By.ID, 'TextBoxACkeywords')
    input_tag.send_keys(keyword)
    browser.find_element(By.ID, 'RadioButtonLIKE').click()
    input_tag.send_keys(Keys.ENTER)
    # Explicit wait: block until the results panel is present in the DOM.
    WebDriverWait(browser, 8).until(
        EC.presence_of_element_located((By.ID, 'PanellSResults'))
    )

    current_page = 0
    while True:
        try:
            rows = _result_rows(browser.page_source)
            if not rows:
                # Only an empty FIRST page means the keyword has no corpus
                # entries; an empty later page must not mark a keyword whose
                # earlier pages were already saved as "not found".
                if current_page == 0:
                    print('检索不到:《{}》关键字的语料信息。\n'.format(keyword))
                    invalid_keyword(keyword)
                break
            current_page += 1
            print('\n------------------------------当前关键字:《{}》,当前页码:{}------------------------------。'.format(keyword, current_page))
            for row in rows:
                save_keyword_info(row)
            time.sleep(2)  # pause two seconds before asking for the next page
            if not _open_next_page():
                break  # no next-page link: this was the last page
        except Exception as exc:
            # Per-keyword boundary: keep the run going with the next keyword,
            # but say what went wrong. Unlike a bare ``except:``, this lets
            # KeyboardInterrupt / SystemExit stop the script.
            print('抓取中断:《{}》第 {} 页之后出错:{!r}'.format(keyword, current_page, exc))
            break


def _result_rows(page_source):
    """Return the result rows found in *page_source*, one string per row."""
    tree = etree.HTML(page_source)
    # position()>3 drops the first three <span> children of each <div>
    # (presumably header cells -- confirm against the live page).
    spans = tree.xpath('//*[@id="PanellSResults"]/div/span[position()>3]')
    cells = [''.join(span.xpath('.//text()')) for span in spans]
    step = 3  # every three consecutive spans are joined into one output row
    return [''.join(cells[k:k + step]) for k in range(0, len(cells), step)]


def _open_next_page():
    """Click the '下一页' (next page) link; return False if there is none."""
    # Imported locally so this block brings its own dependency into scope.
    from selenium.common.exceptions import NoSuchElementException

    try:
        next_button = browser.find_element(By.LINK_TEXT, '下一页')
    except NoSuchElementException:
        return False
    next_button.click()
    # Explicit wait: block until the results panel is present in the DOM.
    WebDriverWait(browser, 8).until(
        EC.presence_of_element_located((By.ID, 'PanellSResults'))
    )
    return True
def invalid_keyword(keyword):
    """Record a keyword that produced no search results.

    Appends *keyword* followed by a newline to 'invalid_data.txt' in the
    current working directory, creating the file if it does not exist.

    Args:
        keyword: The keyword string to record.
    """
    # Explicit UTF-8, matching save_keyword_info(): with the platform default
    # encoding (e.g. cp936 on Windows) characters outside that code page
    # raise UnicodeEncodeError and the keyword is never recorded.
    with open('invalid_data.txt', 'a+', encoding='utf-8') as f:
        f.write(keyword + '\n')
def save_keyword_info(text_info):
    """Store one corpus row and echo it.

    Appends *text_info* plus a trailing newline to 'corpus_data_01.txt'
    (UTF-8, created on first use) and then prints *text_info* to stdout.

    Args:
        text_info: The row text to save.
    """
    target = 'corpus_data_01.txt'
    with open(target, mode='a+', encoding='utf-8') as corpus_file:
        corpus_file.write(text_info + '\n')
    print(text_info)
def read_text():
    """Scrape every keyword listed in the keyword file.

    Reads '生僻字++.txt' (one keyword per line), and for each non-blank line
    prints a progress message, calls ``get_content`` with the stripped
    keyword, then sleeps six seconds before the next one.
    """
    # 'utf-8-sig' reads plain UTF-8 as before and additionally drops a leading
    # byte-order mark, which str.strip() would leave glued to the first keyword.
    with open('生僻字++.txt', 'r', encoding='utf-8-sig') as f:
        keywords = [line.strip() for line in f]

    for keyword in keywords:
        if not keyword:
            # A blank line would otherwise be submitted as an empty search.
            continue
        print('\n开始抓取关键字:《{}》。'.format(keyword))
        get_content(keyword)
        time.sleep(6)
if __name__ == '__main__':
    # Script entry point: scrape all keywords, then always close the browser.
    try:
        read_text()
    finally:
        # quit() ends the WebDriver session started at module load; without
        # it the Chrome and chromedriver processes are left running after
        # the script finishes or fails.
        browser.quit()