1 # -*- coding: utf-8 -*-
2 # @Time : 2019/11/12 21:22
3 # @Author : AForever
4 # @Site :
5 # @File : cnblog_002.py
6 # @Software: PyCharm
7
8 from urllib import request
9 from bs4 import BeautifulSoup
10 import os
11 import pymysql
12
13
# Download raw search-result pages
def get_data(pages=7, out_dir='E:\\python_project\\Spider\\cnblogs\\data\\'):
    """Fetch 51job search-result pages for "python开发" and save them to disk.

    Args:
        pages: number of result pages to download (default 7, as before).
        out_dir: directory the HTML snapshots are written into
            (default matches the original hard-coded path).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
    }
    # Make sure the target directory exists before writing snapshots.
    os.makedirs(out_dir, exist_ok=True)

    for page in range(1, pages + 1):
        url = ("https://search.51job.com/list/040000,000000,0000,00,9,99,"
               "python%25E5%25BC%2580%25E5%258F%2591,2," + str(page) + ".html")
        file_name = os.path.join(out_dir, 'cnblog_pythonjob' + str(page) + '.html')
        print(file_name, url)
        req = request.Request(url, headers=headers)
        # Context manager guarantees the HTTP response is closed (the
        # original leaked one socket per page).
        with request.urlopen(req) as response:
            if response.getcode() == 200:
                # gb18030 is a strict superset of gbk: identical for valid
                # gbk pages, but tolerates the occasional stray byte.
                data = response.read().decode('gb18030')
                with open(file_name, mode="w", encoding="gbk") as f:
                    f.write(data)
    print("*" * 15, "get data success", "*" * 15)
33
34
# Parse the saved pages
def parse_data(path='E:\\python_project\\Spider\\cnblogs\\data\\'):
    """Parse every saved 51job HTML page under *path* into job records.

    Args:
        path: directory containing the HTML snapshots written by get_data()
            (default matches the original hard-coded path).

    Returns:
        list[dict]: one dict per job row with keys
        'title', 'company', 'addr', 'salary', 'pubdate'.
    """
    result = []
    for filename in os.listdir(path):
        with open(os.path.join(path, filename), mode="r", encoding="gbk") as f:
            html = f.read()
        bs = BeautifulSoup(html, 'html.parser')
        # The first '.el' element is the table header row, so skip it.
        for div in bs.select('#resultList .el')[1:]:
            cells = [div.select_one(sel) for sel in ('.t1', '.t2', '.t3', '.t4', '.t5')]
            if any(cell is None for cell in cells):
                # Malformed row (e.g. an ad block) — skip instead of crashing
                # with an IndexError as the original did.
                continue
            title, company, addr, salary, pubdate = (
                cell.get_text(strip=True) for cell in cells
            )
            result.append({
                'title': title,
                'company': company,
                'addr': addr,
                'salary': salary,
                'pubdate': pubdate
            })

    print('*' * 15, 'parse data success, ,Congratulations!', '*' * 15)
    return result
65
66
# (Re)create the target table
def create_table():
    """Drop any existing `t_job` table and create a fresh, empty one.

    Connects to the local MySQL `python` database; the connection is always
    closed, even when a statement fails.
    """
    # NOTE(review): this password ('123456') differs from the one used in
    # save_to_mysql ('lem600@HW') — confirm which credential is correct.
    config = {
        'host': 'localhost',
        'port': 3306,
        'user': 'root',
        'password': '123456',
        'database': 'python',
        'charset': 'utf8'
    }
    conn = pymysql.connect(**config)
    try:
        with conn.cursor() as cursor:
            # `DROP TABLE IF EXISTS` never raises for a missing table, so the
            # original try/except (with a bare except) was dead code.
            cursor.execute('DROP TABLE IF EXISTS `t_job`;')
            print('*' * 15, "drop table success", '*' * 15)

            # Renamed from `create_table`, which shadowed this function's name.
            ddl = '''
                create table t_job(
                    id int primary key auto_increment,
                    title varchar(200),
                    company varchar(200),
                    addr varchar(200),
                    salary varchar(200),
                    pubdate varchar(200)
                )engine=Innodb charset utf8;
            '''
            cursor.execute(ddl)
        # DDL auto-commits in MySQL; explicit commit keeps intent clear and
        # matches the style used after the DROP in the original.
        conn.commit()
    finally:
        conn.close()
    print('*' * 15, 'create tables success,Congratulations!', '*' * 15)
102
103
# Persist parsed rows to MySQL
def save_to_mysql(data):
    """Bulk-insert parsed job records into the `t_job` table.

    Args:
        data: iterable of dicts with keys 'title', 'company', 'addr',
            'salary', 'pubdate' (as produced by parse_data()).
    """
    # NOTE(review): this password differs from the one in create_table —
    # confirm which credential is correct.
    config = {
        'host': 'localhost',
        'port': 3306,
        'user': 'root',
        'password': 'lem600@HW',
        'database': 'python',
        'charset': 'utf8'
    }

    conn = pymysql.connect(**config)
    try:
        sql = '''
        insert into t_job(title, company, addr, salary, pubdate)
        values(%(title)s,%(company)s,%(addr)s,%(salary)s,%(pubdate)s)
        '''
        # Cursor context manager + try/finally guarantee cleanup; the
        # original leaked the connection if executemany raised.
        with conn.cursor() as cursor:
            cursor.executemany(sql, data)
        conn.commit()
    finally:
        conn.close()
    print('*' * 15, 'save data to mysql success ,Congratulations !', '*' * 15)
126
127
if __name__ == "__main__":
    get_data()                    # download the search-result pages to disk
    create_table()                # reset the t_job table in MySQL
    save_to_mysql(parse_data())   # parse the saved pages and persist the rows