from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql
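
# The script assumes a pre-existing MySQL table roughly like the sketch
# below. The column types are an assumption inferred from the INSERT
# statement further down, not confirmed by the original source:
#
#   CREATE TABLE tc_basic (
#       ID      INT PRIMARY KEY,
#       tc      VARCHAR(64),    -- committee reference, e.g. 'ISO/TC 1'
#       title   VARCHAR(255),   -- committee title
#       address VARCHAR(512)    -- absolute URL of the committee page
#   );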
conn = pymysql.connect(host='127.0.0.1', user='root', password='password',
                       port=3306, db='ISO', charset='utf8')
cursor = conn.cursor()
# pymysql.connect() raises on failure, so reaching this line already means
# the connection succeeded; no truthiness check on conn is needed.
print('database[ISO] connected successfully!')

def catch(url):
    address_url = 'https://www.iso.org/'
    text = urlopen(url).read()
    soup = BeautifulSoup(text, 'html.parser')
    # The top-level catalogue page carries id='datatable-committees', while
    # each committee's own page carries id='datatable-committee-children'.
    table = (soup.find('table', id='datatable-committees')
             or soup.find('table', id='datatable-committee-children'))
    if table is None:
        # Neither table id matched; nothing to parse on this page.
        print('No committee table found:', url)
        return
    tbody = table.find('tbody')
    i = 0  # rows newly inserted
    j = 0  # duplicate rows skipped
    k = 0  # total rows seen
    # Iterate over actual <tr> elements; iterating over tbody directly would
    # also yield whitespace text nodes between the rows.
    for tr in tbody.find_all('tr'):
        k += 1
        td = tr.find_all('td')
        tc = tr.a.string.strip()       # committee reference
        title = td[1].string.strip()   # committee title
        # hrefs are site-relative ('/committee/...'), so drop the trailing
        # slash from address_url before joining.
        address = address_url[:-1] + tr.a['href']
        # Skip rows already stored (matched on committee reference + URL).
        sql_select = 'SELECT COUNT(ID) FROM tc_basic WHERE tc = %s AND address = %s'
        cursor.execute(sql_select, (tc, address))
        select_count = cursor.fetchone()
        if select_count[0] == 0:
            i += 1
            # Allocate the next ID as MAX(ID) + 1. This assumes a single
            # writer; it is not safe under concurrent inserts.
            cursor.execute('SELECT MAX(ID) FROM tc_basic')
            row = cursor.fetchone()
            if row[0] is None:
                new_id = 1  # table is empty: this is the first row
            else:
                new_id = int(row[0]) + 1
            sql = 'INSERT INTO tc_basic VALUES (%s, %s, %s, %s)'
            cursor.execute(sql, (new_id, tc, title, address))
            conn.commit()
        else:
            # Row already in the database; count it as a duplicate.
            j += 1
    print('Crawl finished! Total', k, 'rows:', i, 'new,', j, 'duplicates.')


if __name__ == '__main__':
    url = 'https://www.iso.org/standards-catalogue/browse-by-tc.html'
    print('Crawling:', url)
    catch(url)
    # Second pass: visit every address stored by the first pass. The upper
    # bound is fixed before the loop, so rows inserted during this pass are
    # not themselves re-crawled.
    cursor.execute('SELECT MAX(ID) FROM tc_basic')
    row = cursor.fetchone()
    max_id = row[0] or 0  # guard against an empty table
    x = 1
    while x <= max_id:
        sql_addr = 'SELECT address FROM tc_basic WHERE ID = %s'
        cursor.execute(sql_addr, (x,))  # parameters must be a tuple
        addr = cursor.fetchone()
        address_url = addr[0].strip()
        print('Second pass, row', x, 'addr:', address_url)
        catch(address_url)
        x += 1
    cursor.close()
    conn.close()
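
# Usage sketch (assumptions: a local MySQL server with an 'ISO' database and
# the tc_basic table above; dependencies installed with
# 'pip install pymysql beautifulsoup4'). Save the file under any name, e.g.
# the hypothetical iso_tc_crawler.py, and run:
#
#   python iso_tc_crawler.py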