import _thread
import html
import time

import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec  # wait conditions
from selenium.webdriver.support.ui import WebDriverWait  # explicit wait
9
def ceil(x, y):
    """Return the ceiling of ``x / y`` as an int.

    The result is computed with floor division on the negated numerator
    instead of float division, so it stays exact for integers too large to
    be represented as a float and rounds toward positive infinity for
    negative operands as well.

    Args:
        x: Dividend.
        y: Divisor (must be non-zero).

    Returns:
        The smallest integer that is greater than or equal to ``x / y``.

    Raises:
        ZeroDivisionError: If ``y`` is zero.
    """
    # -(-x // y) is the standard exact "ceiling division" identity;
    # int() keeps the return type an int even when floats are passed in.
    return int(-(-x // y))
15
16
17
18
# Create a headless Chrome browser.
chrome_options = Options()
chrome_options.add_argument('--headless')
# `options=` is the supported keyword; `chrome_options=` was deprecated and
# later removed from the Chrome driver constructor.
dr = webdriver.Chrome(options=chrome_options)

try:
    # Open the chapter index page.
    dr.get('https://doupocangqiong1.com/1/list_piaotian/')

    # Collect every chapter link (<a> tag) in the directory list.
    a = dr.find_elements(By.CSS_SELECTOR, '.dirlist > li > a')

    # Connect to the database (keyword arguments work on old and new PyMySQL).
    db = pymysql.connect(host="localhost", user="root", password="root",
                         database="selenium", charset='utf8')
    # Get a cursor; it is reused later in the script.
    cursor = db.cursor()

    # Parameterised statement: the link text and URL come from a web page,
    # so the driver must do the quoting rather than string formatting.
    sql = "INSERT INTO novel (name,href,content) VALUES (%s,%s,%s)"
    for i in a:
        name = i.text
        href = i.get_attribute('href')
        cursor.execute(sql, (name, href, ''))
        db.commit()
finally:
    # quit() ends the whole browser session (close() only closes the window),
    # and the finally clause releases it even if scraping or the insert fails.
    dr.quit()
42
43
44
def line(lineName, start, count):
    """Scrape the chapter text for one slice of the ``novel`` table.

    Opens its own headless browser and database connection, selects up to
    ``count`` rows beginning at offset ``start``, loads each row's ``href``,
    waits until the transcoding placeholder is gone from ``#chaptercontent``
    and stores the HTML-escaped text in that row's ``content`` column.

    Args:
        lineName: Label printed in the progress messages.
        start: Row offset of the slice (used as the LIMIT offset).
        count: Maximum number of rows in the slice (used as the LIMIT count).

    Raises:
        selenium.common.exceptions.TimeoutException: If the placeholder text
            is still present after 5 seconds. The browser and the database
            connection are released before the exception propagates.
    """
    dr = webdriver.Chrome(options=chrome_options)  # one browser per worker
    try:
        # Each worker uses its own connection.
        db = pymysql.connect(host="localhost", user="root", password="root",
                             database="selenium", charset='utf8')
        try:
            with db.cursor() as cursor:
                # ORDER BY makes the LIMIT windows of the different workers
                # deterministic and non-overlapping; without it the row
                # order of each query is unspecified.
                cursor.execute(
                    "SELECT id,href FROM novel ORDER BY id LIMIT %s, %s",
                    (int(start), int(count)))
                data = cursor.fetchall()
                for i in data:
                    dr.get(i[1])
                    # Wait up to 5 seconds, polling every 0.1 seconds, until
                    # the placeholder text is no longer in the content node.
                    WebDriverWait(dr, 5, 0.1).until_not(
                        ec.text_to_be_present_in_element(
                            (By.CSS_SELECTOR, '#chaptercontent'),
                            U'正在转码,请稍后......'))
                    # escape() encodes special characters as HTML entities;
                    # html.unescape() turns them back when reading.
                    content = html.escape(
                        dr.find_element(By.CSS_SELECTOR,
                                        '#chaptercontent').text)
                    # Parameterised: the chapter text is scraped input and
                    # may contain characters that would break a literal.
                    cursor.execute(
                        "UPDATE novel SET content = %s WHERE id = %s",
                        (content, i[0]))
                    db.commit()
                    print(lineName, '完成了', i[0], '的采集')
        finally:
            db.close()
    finally:
        dr.quit()  # shuts down the window and the driver process
    print(lineName, '完成了采集')
71
72
def productLine(func, total, lineCount):
    """Start ``lineCount`` threads that each run ``func`` on one slice.

    Args:
        func: Callable invoked in every thread as
            ``func(name, offset, size)``.
        total: Nested sequence whose ``[0][0]`` element is the number of
            rows to split up.
        lineCount: Number of threads to start.
    """
    row_total = total[0][0]
    every = ceil(row_total, lineCount)  # rows per thread, rounded up
    print('every', every)
    for idx in range(lineCount):
        print('-------------', idx)
        worker_name = 'line-{}'.format(idx)
        offset = idx * every
        thread_id = _thread.start_new_thread(
            func, (worker_name, offset, every))
        print(thread_id)
79
80
try:
    # Count the stored chapters so the work can be split between threads.
    sql = 'SELECT COUNT(*) FROM novel'
    cursor.execute(sql)
    total = cursor.fetchall()
    print(total)

    productLine(line, total, 5)

except Exception as e:
    # Catch Exception rather than everything, so Ctrl-C / SystemExit still
    # work, and show the cause instead of hiding it.
    print("Error: unable to start thread", e)


# Threads started through _thread end when the main thread exits, so keep the
# main thread alive. Sleeping avoids the 100% CPU use of an empty busy loop.
while True:
    time.sleep(1)