from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql
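
# The script assumes a pre-existing MySQL table roughly like the sketch
# below. The column types are an assumption inferred from the INSERT
# statement further down, not confirmed by the original source:
#
#   CREATE TABLE tc_basic (
#       ID      INT PRIMARY KEY,
#       tc      VARCHAR(64),    -- committee reference, e.g. 'ISO/TC 1'
#       title   VARCHAR(255),   -- committee title
#       address VARCHAR(512)    -- absolute URL of the committee page
#   );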
conn = pymysql.connect(host='127.0.0.1', user='root', password='password',
                       port=3306, db='ISO', charset='utf8')
cursor = conn.cursor()
# pymysql.connect() raises on failure, so reaching this line already means
# the connection succeeded; no truthiness check on conn is needed.
print('database[ISO] connected successfully!')

def catch(url):
    address_url = 'https://www.iso.org/'
    text = urlopen(url).read()
    soup = BeautifulSoup(text, 'html.parser')
    # The top-level catalogue page carries id='datatable-committees', while
    # each committee's own page carries id='datatable-committee-children'.
    table = (soup.find('table', id='datatable-committees')
             or soup.find('table', id='datatable-committee-children'))
    if table is None:
        # Neither table id matched; nothing to parse on this page.
        print('No committee table found:', url)
        return
    tbody = table.find('tbody')
    i = 0  # rows newly inserted
    j = 0  # duplicate rows skipped
    k = 0  # total rows seen
    # Iterate over actual <tr> elements; iterating over tbody directly would
    # also yield whitespace text nodes between the rows.
    for tr in tbody.find_all('tr'):
        k += 1
        td = tr.find_all('td')
        tc = tr.a.string.strip()       # committee reference
        title = td[1].string.strip()   # committee title
        # hrefs are site-relative ('/committee/...'), so drop the trailing
        # slash from address_url before joining.
        address = address_url[:-1] + tr.a['href']
        # Skip rows already stored (matched on committee reference + URL).
        sql_select = 'SELECT COUNT(ID) FROM tc_basic WHERE tc = %s AND address = %s'
        cursor.execute(sql_select, (tc, address))
        select_count = cursor.fetchone()
        if select_count[0] == 0:
            i += 1
            # Allocate the next ID as MAX(ID) + 1. This assumes a single
            # writer; it is not safe under concurrent inserts.
            cursor.execute('SELECT MAX(ID) FROM tc_basic')
            row = cursor.fetchone()
            if row[0] is None:
                new_id = 1  # table is empty: this is the first row
            else:
                new_id = int(row[0]) + 1
            sql = 'INSERT INTO tc_basic VALUES (%s, %s, %s, %s)'
            cursor.execute(sql, (new_id, tc, title, address))
            conn.commit()
        else:
            # Row already in the database; count it as a duplicate.
            j += 1
    print('Crawl finished! Total', k, 'rows:', i, 'new,', j, 'duplicates.')


if __name__ == '__main__':
    url = 'https://www.iso.org/standards-catalogue/browse-by-tc.html'
    print('Crawling:', url)
    catch(url)
    # Second pass: visit every address stored by the first pass. The upper
    # bound is fixed before the loop, so rows inserted during this pass are
    # not themselves re-crawled.
    cursor.execute('SELECT MAX(ID) FROM tc_basic')
    row = cursor.fetchone()
    max_id = row[0] or 0  # guard against an empty table
    x = 1
    while x <= max_id:
        sql_addr = 'SELECT address FROM tc_basic WHERE ID = %s'
        cursor.execute(sql_addr, (x,))  # parameters must be a tuple
        addr = cursor.fetchone()
        address_url = addr[0].strip()
        print('Second pass, row', x, 'addr:', address_url)
        catch(address_url)
        x += 1
    cursor.close()
    conn.close()
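
# Usage sketch (assumptions: a local MySQL server with an 'ISO' database and
# the tc_basic table above; dependencies installed with
# 'pip install pymysql beautifulsoup4'). Save the file under any name, e.g.
# the hypothetical iso_tc_crawler.py, and run:
#
#   python iso_tc_crawler.py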