〈2022-2-14〉使用<beautifulsoup>爬取ISO标准网站:基础爬取

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Scrape the ISO "browse by technical committee" catalogue page and print one
# line per committee: its reference, its title and the absolute URL of its
# page. Lines are de-duplicated and sorted case-insensitively.
url = 'https://www.iso.org/standards-catalogue/browse-by-tc.html'
addrss_url = 'https://www.iso.org'

# The context manager closes the HTTP response even if reading fails.
with urlopen(url) as response:
    text = response.read()
soup = BeautifulSoup(text, 'html.parser')

table = soup.find('table', id='datatable-committees')
if table is None:
    raise SystemExit('table "datatable-committees" not found at ' + url)

# html.parser does not synthesise a <tbody>; fall back to the table itself
# when the markup has none.
tbody = table.find('tbody')
container = tbody if tbody is not None else table

TC = set()
# Ask explicitly for the direct <tr> children: iterating the tag itself also
# yields the whitespace text nodes between rows, which have no find_all().
for tr in container.find_all('tr', recursive=False):
    cells = tr.find_all('td')
    link = tr.a
    if len(cells) < 2 or link is None:
        # Not a committee row (e.g. a header or spacer row).
        continue
    href = link.get('href')
    if href is None:
        continue
    # get_text() also copes with cells that contain nested markup, where
    # .string would be None.
    title = cells[1].get_text(strip=True)
    label = '{}(title:{})'.format(link.get_text(), title)
    # Each output line historically starts with the repr of a one-element
    # set ({'...'}); '{{{!r}}}' reproduces that text without building a set.
    TC.add('{{{!r}}}(address:{})'.format(label, addrss_url + href))

print('\n'.join(sorted(TC, key=str.lower)))
print('采集完毕!')

 

posted @ 2022-02-14 16:37  一只路过的红  阅读(63)  评论(0)    收藏  举报