〈2022-2-14〉使用<beautifulsoup>爬取ISO标准网站:基础入库

from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql

# Scrape the ISO "browse by technical committee" catalogue page and store one
# row per committee (sequence number, committee reference, title, absolute
# URL) in the `tc_basic` table of the local `ISO` MySQL database.

# pymysql.connect raises on failure, so reaching the print below means the
# connection was established.
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='password',
                       port=3306, db='ISO', charset='utf8')
cursor = conn.cursor()
print("database[ISO] 连接成功!")

url = 'https://www.iso.org/standards-catalogue/browse-by-tc.html'
address_url = 'https://www.iso.org'
TC = set()  # not referenced by the code below; kept so the module-level name still exists

try:
    # Close the HTTP response deterministically once the body has been read.
    with urlopen(url) as response:
        text = response.read()

    soup = BeautifulSoup(text, 'html.parser')
    table = soup.find('table', id='datatable-committees')
    if table is None:
        raise RuntimeError(
            "table 'datatable-committees' not found at " + url)
    # html.parser does not synthesise a <tbody>; fall back to the table itself
    # if the page does not contain one.
    tbody = table.find('tbody') or table

    rows = []
    # find_all('tr') yields only <tr> tags. Iterating `tbody` directly would
    # also yield the whitespace text nodes between rows, which have no
    # find_all and would crash the loop.
    for tr in tbody.find_all('tr'):
        cells = tr.find_all('td')
        link = tr.find('a', href=True)
        # Skip header/placeholder rows that carry no committee link or title.
        if link is None or len(cells) < 2:
            continue
        # get_text copes with nested markup, where `.string` would be None.
        tc = link.get_text(' ', strip=True)
        title = cells[1].get_text(' ', strip=True)
        address = address_url + link['href']
        # The first column is a 1-based running number over inserted rows.
        rows.append((len(rows) + 1, tc, title, address))

    sql = 'insert into tc_basic values (%s,%s,%s,%s)'
    if rows:
        # One batched insert and a single commit: either every scraped row is
        # stored or, on failure, none is.
        cursor.executemany(sql, rows)
        conn.commit()
    print('采集完毕!共计', len(rows), '条数据。')
finally:
    # Always release the database resources, even when scraping fails.
    cursor.close()
    conn.close()

 

posted @ 2022-02-14 17:07  一只路过的红  阅读(56)  评论(0)    收藏  举报