〈2022-3-9〉 Scraping the ISO standards site with BeautifulSoup: finished; code kept for the record (plenty of problems along the way, the data just barely got collected in full, so leaving it at that)
TC:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='******',
                       port=3306, db='ISO', charset='utf8')
cursor = conn.cursor()
if conn:
    print('database [ISO] connected!')

def catch(url):
    address_url = 'https://www.iso.org/'
    text = urlopen(url).read()
    soup = BeautifulSoup(text, 'html.parser')
    # the top-level catalogue page uses id 'datatable-committees';
    # each committee's own page uses 'datatable-committee-children'
    table = (soup.find('table', id='datatable-committees')
             or soup.find('table', id='datatable-committee-children'))
    if table is None:
        return  # no committee table on this page
    tbody = table.find('tbody')
    i = j = k = 0
    for tr in tbody:
        if len(tr) > 1:  # skip the bare '\n' strings between rows
            k += 1
            td = tr.findAll('td')
            tc = tr.a.string.strip()
            title = td[1].string.strip()
            address = address_url[:-1] + tr.a['href']
            # only insert rows that are not in the table yet
            sql_select = 'select count(ID) from tc_basic where tc = %s and address = %s'
            cursor.execute(sql_select, (tc, address))
            if cursor.fetchall()[0][0] == 0:
                i += 1
                # hand-rolled auto-increment: next ID = MAX(ID) + 1
                cursor.execute('SELECT MAX(ID) FROM tc_basic')
                row = cursor.fetchall()
                new_id = 1 if row[0][0] is None else int(row[0][0]) + 1
                sql = 'insert into tc_basic values (%s,%s,%s,%s)'
                cursor.execute(sql, (new_id, tc, title, address))
                conn.commit()
            else:
                j += 1  # already in the database, skip
    print('Done! Total', k, 'rows:', i, 'new,', j, 'duplicates.')

if __name__ == '__main__':
    url = 'https://www.iso.org/standards-catalogue/browse-by-tc.html'
    print('Fetching:', url)
    catch(url)
    # second pass: visit every committee page to pick up subcommittees
    x = 1
    cursor.execute('SELECT MAX(ID) FROM tc_basic')
    row = cursor.fetchall()
    while x <= row[0][0]:
        cursor.execute('SELECT address FROM tc_basic WHERE id = %s', (x,))
        addr = cursor.fetchall()
        address_url = addr[0][0].strip()
        print('Second pass, row', x, 'addr:', address_url)
        catch(address_url)
        x += 1
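Every script in this note repeats the same hand-rolled dedupe pattern: SELECT COUNT(...) to test for an existing row, then MAX(ID) + 1 to make the next key. If the schema may be changed, MySQL can do both jobs itself. The sketch below is an assumption, not the code I ran; it presumes tc_basic is redefined with an AUTO_INCREMENT key and a UNIQUE index:

import pymysql

# Assumes tc_basic is redefined as:
#   CREATE TABLE tc_basic (ID INT PRIMARY KEY AUTO_INCREMENT,
#                          TC VARCHAR(50), TITLE VARCHAR(255),
#                          ADDRESS VARCHAR(255),
#                          UNIQUE KEY uk_tc_addr (TC, ADDRESS));
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='******',
                       port=3306, db='ISO', charset='utf8')
cursor = conn.cursor()

def insert_tc(tc, title, address):
    # INSERT IGNORE skips rows that hit the UNIQUE key, so no SELECT COUNT
    # or MAX(ID) round-trips are needed; cursor.execute returns the affected
    # row count, i.e. 1 for a new row and 0 for a duplicate
    sql = 'INSERT IGNORE INTO tc_basic (TC, TITLE, ADDRESS) VALUES (%s,%s,%s)'
    inserted = cursor.execute(sql, (tc, title, address))
    conn.commit()
    return inserted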
Stage:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='******',
                       port=3306, db='ISO', charset='utf8')
cursor = conn.cursor()
if conn:
    print('database [ISO] connected!')

def catch(url):
    text = urlopen(url).read()
    soup = BeautifulSoup(text, 'html.parser')
    table = soup.find('table', attrs={'class': 'table table-condensed responsive-table'})
    tbody = table.find('tbody')
    sid = 1
    for tr in tbody:
        if len(tr) > 1:  # skip the bare '\n' strings between rows
            num_td = 0
            for td in tr:
                if td.text.strip():
                    if num_td == 0:
                        num_td += 1  # first cell is the row label, skip it
                    else:
                        stage_id = sid
                        stage_mix = td.text.strip()
                        stage_no = stage_mix[:5]        # e.g. '10.99'
                        print('stage:', stage_no)
                        interpretation = stage_mix[5:]  # the rest is the description
                        print('interpretation:', interpretation)
                        num_td += 1
                        sid += 1
                        sql = 'insert into stage values (%s,%s,%s)'
                        cursor.execute(sql, (stage_id, stage_no, interpretation))
                        conn.commit()
    print('Done! Total', sid - 1, 'rows.')

if __name__ == '__main__':
    url = 'https://www.iso.org/stage-codes.html'
    print('Fetching:', url)
    catch(url)
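The stage_mix[:5] slice leans on every harmonized stage code having the fixed NN.NN shape (e.g. 10.99). A slightly more defensive split could anchor on a regex instead; a small sketch, not part of the script above:

import re

def split_stage(stage_mix):
    # match a leading NN.NN code; fall back to the fixed slice otherwise
    m = re.match(r'(\d{2}\.\d{2})\s*(.*)', stage_mix, re.S)
    if m:
        return m.group(1), m.group(2).strip()
    return stage_mix[:5], stage_mix[5:]

print(split_stage('10.99 New project approved'))
# ('10.99', 'New project approved')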
ICS: level 1
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='******',
                       port=3306, db='ISO', charset='utf8')
cursor = conn.cursor()
if conn:
    print('database [ISO] connected!')

def catch(url):
    address_url = 'https://www.iso.org/'
    text = urlopen(url).read()
    soup = BeautifulSoup(text, 'html.parser')
    table = soup.find('table', id='datatable-ics')
    tbody = table.find('tbody')
    ics_id = 1
    for tr in tbody:
        if len(tr) > 1:  # skip the bare '\n' strings between rows
            ics = tr.a.string.strip()
            print('ics:', ics)
            address = address_url[:-1] + tr.a['href']
            print('address:', address)
            FIELD = tr.find('td', attrs={'data-title': 'Field'})
            field = FIELD.text.strip()
            print('field:', field)
            sql = 'insert into ics_basic values (%s,%s,%s,%s)'
            cursor.execute(sql, (ics_id, ics, field, address))
            conn.commit()
            ics_id += 1
    print('Done! Total', ics_id - 1, 'rows.')

if __name__ == '__main__':
    url = 'https://www.iso.org/standards-catalogue/browse-by-ics.html'
    print('Fetching:', url)
    catch(url)
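A side note on link building: all of these scripts produce absolute links with address_url[:-1] + tr.a['href'], which assumes the href is always site-relative. urllib.parse.urljoin from the standard library handles relative and absolute hrefs alike:

from urllib.parse import urljoin

base = 'https://www.iso.org/'
print(urljoin(base, '/committee/45020.html'))        # site-relative href
# https://www.iso.org/committee/45020.html
print(urljoin(base, 'https://www.iso.org/x.html'))   # absolute href, left alone
# https://www.iso.org/x.html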
ICS: level 2
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql
import re

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='******',
                       port=3306, db='ISO', charset='utf8')
cursor = conn.cursor()
if conn:
    print('database [ISO] connected!')

address_list = []  # filled on the first pass, replayed on the second

def catch(url, B_id, mark):
    address_url = 'https://www.iso.org/'
    text = urlopen(url).read()
    soup = BeautifulSoup(text, 'html.parser')
    table = soup.find('table', id='datatable-ics-children')
    if table:
        tbody = table.find('tbody')
        i = j = k = 0
        for tr in tbody:
            if len(tr) > 1:  # skip the bare '\n' strings between rows
                k += 1
                if tr.find('a'):
                    ics = tr.a.string.strip()
                    address = address_url[:-1] + tr.a['href']
                    if mark == 1:
                        address_list.append(address)
                    else:
                        print('Second pass, address not appended again.')
                else:
                    td_ics = tr.find('td', attrs={'data-title': 'ICS'})
                    ics = td_ics.text.strip()
                    address = 'null'
                    if mark == 1:
                        # placeholder keeps address_list aligned with ICS_ID;
                        # the home page has no children table, so the second
                        # pass simply skips it
                        address_list.append(address_url)
                    else:
                        print('Second pass, address not appended again.')
                print('ics:', ics)
                print('address:', address)
                FIELD = tr.find('td', attrs={'data-title': 'Field'})
                field_all = FIELD.text.strip()
                # collapse newlines to ';' and squeeze repeated spaces
                field = re.sub(' +', ' ', re.sub('\n+', ';', field_all))
                print('field:', field)
                sql_select = 'select count(ICS_ID) from ics where ICS = %s'
                cursor.execute(sql_select, (ics,))
                if cursor.fetchall()[0][0] == 0:
                    i += 1
                    # hand-rolled auto-increment: next ICS_ID = MAX + 1
                    cursor.execute('SELECT MAX(ICS_ID) FROM ics')
                    row = cursor.fetchall()
                    new_id = 1 if row[0][0] is None else int(row[0][0]) + 1
                    sql = 'insert into ics values (%s,%s,%s,%s)'
                    cursor.execute(sql, (new_id, ics, field, B_id))
                    conn.commit()
                else:
                    j += 1  # already in the database, skip
        print('Done! Total', k, 'rows:', i, 'new,', j, 'duplicates.')

if __name__ == '__main__':
    # first pass: walk every first-level ICS page
    x = 1
    cursor.execute('SELECT MAX(ICS_B_ID) FROM ics_basic')
    row = cursor.fetchall()
    while x <= row[0][0]:
        cursor.execute('SELECT address FROM ics_basic WHERE ICS_B_ID = %s', (x,))
        addr = cursor.fetchall()
        url = addr[0][0].strip()
        print('First pass, row', x, 'address:', url)
        catch(url, x, 1)
        x += 1
    print('address_list:', len(address_list))
    # second pass: visit the collected child pages; their parent key is the
    # ICS_B_ID of the ics row they came from
    y = 1
    while y <= len(address_list):
        addr = address_list[y - 1]
        cursor.execute('SELECT ICS_B_ID FROM ics WHERE ICS_ID = %s', (y,))
        B_id = cursor.fetchall()[0][0]
        print('Second pass, row', y, 'address:', addr)
        catch(addr, B_id, 2)
        y += 1
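The second pass above recovers each child page's parent key by assuming address_list[y-1] lines up with ICS_ID == y, which only holds while nothing else writes to the ics table in between. Carrying the parent key together with the link removes that coupling. A minimal sketch of the idea, with a stub standing in for the real scrape (fetch_children and the sample URLs are made up, not the script above):

def fetch_children(url):
    # stand-in for the real scrape; the real version would parse
    # datatable-ics-children on `url` and return the child links it finds
    return ['https://www.iso.org/ics/01.020.html',
            'https://www.iso.org/ics/01.040.html']

first_level = [(1, 'https://www.iso.org/ics/01.html')]  # (ICS_B_ID, address)
pairs = [(b_id, child)                 # parent key travels with each link
         for b_id, url in first_level
         for child in fetch_children(url)]
for b_id, child in pairs:              # second pass needs no index arithmetic
    print('crawl', child, 'with parent', b_id)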
Standard:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='******',
                       port=3306, db='ISO', charset='utf8')
cursor = conn.cursor()
if conn:
    print('database [ISO] connected!')

def catch(url, TC_ID):
    # span_kind carries the current life-cycle group across rows: the group
    # header rows set it, the data rows below inherit it
    global span_kind
    address_url = 'https://www.iso.org/'
    text = urlopen(url).read()
    soup = BeautifulSoup(text, 'html.parser')
    table = soup.find('table', id='datatable-tc-projects')
    if table:
        tbody = table.find('tbody')
        i = j = k = 0
        for tr in tbody:
            if len(tr) > 1:  # skip the bare '\n' strings between rows
                k += 1
                span = tr.find('span')
                kinds = {'ok-circle': 'Published standards',
                         'record': 'Standards under development',
                         'ban-circle': 'Withdrawn standards',
                         'remove-circle': 'Projects deleted'}
                for kind in kinds:
                    if kind in str(span):
                        span_kind = kinds[kind]
                        print('span_kind:', span_kind)
                        break
                td_Standard = tr.find('td', attrs={'data-title': 'Standard and/or project'})
                standard = td_Standard.a.text.strip()
                print('standard:', standard)
                summary = td_Standard.find('div', attrs={'class': 'entry-summary'}).text.strip()
                print('summary:', summary)
                address = address_url[:-1] + td_Standard.a['href']
                print('address:', address)
                td_Stage = tr.find('td', attrs={'data-title': 'Stage'})
                stage = td_Stage.a.string.strip()
                print('stage:', stage)
                sql_select = 'select count(STANDARD_ID) from standard where STANDARD = %s and address = %s'
                cursor.execute(sql_select, (standard, address))
                if cursor.fetchall()[0][0] == 0:
                    i += 1
                    # hand-rolled auto-increment: next STANDARD_ID = MAX + 1
                    cursor.execute('SELECT MAX(STANDARD_ID) FROM standard')
                    row = cursor.fetchall()
                    STANDARD_ID = 1 if row[0][0] is None else int(row[0][0]) + 1
                    # write this standard's ICS codes into the standard_ics table
                    li = tr.select('li')
                    num_li = 0
                    while num_li < len(li):
                        ICS_string = li[num_li].string.strip()
                        num_li += 1  # ordinal of the ICS code within the row, from 1
                        cursor.execute('select ICS_ID from ics where ICS = %s', (ICS_string,))
                        select_ics = cursor.fetchall()[0][0]
                        print(ICS_string, ':', select_ics)
                        sql_standard_ics = 'insert into standard_ics values (%s,%s,%s)'
                        cursor.execute(sql_standard_ics, (STANDARD_ID, num_li, select_ics))
                        conn.commit()
                    # look up the matching STAGE_ID in the stage table
                    cursor.execute('select STAGE_ID from stage where STAGE = %s', (stage,))
                    STAGE = cursor.fetchall()[0][0]
                    # insert the standard itself
                    sql = 'insert into standard values (%s,%s,%s,%s,%s,%s,%s)'
                    cursor.execute(sql, (TC_ID, STANDARD_ID, standard, summary,
                                         STAGE, span_kind, address))
                    conn.commit()
                else:
                    j += 1  # already in the database, skip
        print('TC_ID(', TC_ID, ') done! Total', k, 'rows:', i, 'new,', j, 'duplicates.')

if __name__ == '__main__':
    # resume from the last TC already written to the standard table
    cursor.execute('SELECT MAX(TC_ID) FROM standard')
    row_sta = cursor.fetchall()
    startID = 1 if row_sta[0][0] is None else row_sta[0][0]
    cursor.execute('SELECT MAX(ID) FROM tc_basic')
    endID = cursor.fetchall()[0][0]
    print(startID, ':', endID)
    while startID <= endID:  # tc_basic holds 751 rows in total
        cursor.execute('SELECT address FROM tc_basic WHERE id = %s', (startID,))
        addr = cursor.fetchall()
        url = addr[0][0].strip()
        print('Fetching:', url)
        catch(url, startID)
        startID += 1
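With 751 committee pages fetched back to back, one dropped connection kills the run, which is why the main block resumes from MAX(TC_ID). A retry wrapper around urlopen is another cheap layer of defense; a sketch, not part of the original script:

import time
from urllib.request import urlopen
from urllib.error import URLError

def fetch(url, retries=3, delay=5):
    # retry transient network failures with a flat back-off;
    # re-raise on the final attempt so real errors still surface
    for attempt in range(retries):
        try:
            return urlopen(url, timeout=30).read()
        except URLError:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

print(len(fetch('https://www.iso.org/stage-codes.html')))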
Database model:
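The model itself was a diagram in the original post. Below is a reconstruction of the six tables from the INSERT and SELECT statements above; column names and order follow the code, while all types, lengths, and the KIND column name are guesses:

import pymysql

# A sketch of the schema implied by the scripts above; only the column
# names and order come from the SQL in the code, the rest is assumed.
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='******',
                       port=3306, db='ISO', charset='utf8')
cursor = conn.cursor()
schema = """
CREATE TABLE tc_basic     (ID INT PRIMARY KEY, TC VARCHAR(50),
                           TITLE VARCHAR(255), ADDRESS VARCHAR(255));
CREATE TABLE stage        (STAGE_ID INT PRIMARY KEY, STAGE VARCHAR(10),
                           INTERPRETATION VARCHAR(255));
CREATE TABLE ics_basic    (ICS_B_ID INT PRIMARY KEY, ICS VARCHAR(20),
                           FIELD VARCHAR(255), ADDRESS VARCHAR(255));
CREATE TABLE ics          (ICS_ID INT PRIMARY KEY, ICS VARCHAR(20),
                           FIELD VARCHAR(255), ICS_B_ID INT);
CREATE TABLE standard     (TC_ID INT, STANDARD_ID INT, STANDARD VARCHAR(100),
                           SUMMARY TEXT, STAGE_ID INT, KIND VARCHAR(50),
                           ADDRESS VARCHAR(255));
CREATE TABLE standard_ics (STANDARD_ID INT, ICS_NO INT, ICS_ID INT);
"""
for stmt in schema.split(';'):
    if stmt.strip():
        cursor.execute(stmt)
conn.commit()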

