Scraping CVPR 2021 papers with Python
import pymysql  # database connection
import requests
from bs4 import BeautifulSoup


def vall(morau, k):
    # Pull the k-th "key = {value}" field out of a BibTeX block,
    # counting fields from the end (k = -1 is the last field).
    page = morau.text.split('=')[k]
    page = page.split('}')
    page = page[0].split('{')
    return page[1]


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36 Edg/89.0.774.77'
}

conn = pymysql.connect(
    host='127.0.0.1',   # host, defaults to 127.0.0.1
    user='root',        # username
    passwd='12345',     # password
    port=3306,          # port, defaults to 3306
    db='arcticle',      # database name
    charset='utf8'      # character set
)
cur = conn.cursor()  # create a cursor object

url = "http://openaccess.thecvf.com/WACV2021"
html = requests.get(url, headers=headers)
soup = BeautifulSoup(html.content, 'html.parser')

# All "pdf" links and all BibTeX blocks on the index page.
pdfs = soup.find_all(name="a", string="pdf")
morau = soup.find_all(name='div', attrs={'class': 'bibref pre-white-space'})

# Parse the BibTeX block of every paper on the index page.
ls1 = []
for entry in morau:
    info = {}  # create a fresh dict inside the loop
    info['page'] = vall(morau=entry, k=-1)
    info['year'] = vall(morau=entry, k=-2)
    info['month'] = vall(morau=entry, k=-3)
    info['booktitle'] = vall(morau=entry, k=-4)
    info['title'] = vall(morau=entry, k=-5)
    info['author'] = vall(morau=entry, k=-6)
    ls1.append(info)

for item in ls1:
    print(item)
    print('-------------------------------------------------------------')

# Visit each paper's detail page and scrape its abstract.
lis = []
jianjie = ""  # abstract text
n = 0
for i, pdf in enumerate(pdfs):
    pdf_name = pdf["href"].split('/')[-1]
    name = pdf_name.split('.')[0].replace("_WACV_2021_paper", "")
    link = "http://openaccess.thecvf.com/content/WACV2021/html/" + name + "_WACV_2021_paper.html"
    html1 = requests.get(link, headers=headers)
    soup1 = BeautifulSoup(html1.content, 'html.parser')
    weizhi = soup1.find('div', attrs={'id': 'abstract'})
    if weizhi:
        jianjie = weizhi.get_text()
    print("Record #" + str(i))
    # Use the underscore-separated words of the file name as keywords.
    keywords = ','.join(str(name).split('_'))
    info = dict(ls1[n])  # merge with the BibTeX fields parsed above
    n += 1
    info['longtitle'] = name
    info['link'] = link
    info['abstract'] = jianjie
    info['keywords'] = keywords
    print(info)
    lis.append(info)

print(lis)

# Build an INSERT statement from each dict's keys and write the row to the
# `lunwen` table, e.g. "insert into lunwen(`page`, ...) values(%(page)s, ...)".
cursor = conn.cursor()
for row in lis:
    cols = ", ".join('`{}`'.format(k) for k in row.keys())
    val_cols = ', '.join('%({})s'.format(k) for k in row.keys())
    res_sql = "insert into lunwen(%s) values(%s)" % (cols, val_cols)
    cursor.execute(res_sql, row)  # pass the dict as named parameters
conn.commit()
print("Success")
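The index page renders one BibTeX block per paper with its fields in a fixed order (author, title, booktitle, month, year, pages), which is why vall can index fields from the end of the '='-split text. Below is a minimal sketch with a made-up entry; the SimpleNamespace stands in for a BeautifulSoup tag, since vall only reads its .text attribute.

from types import SimpleNamespace


def vall(morau, k):  # same helper as in the script above
    page = morau.text.split('=')[k]
    page = page.split('}')
    page = page[0].split('{')
    return page[1]


# Hypothetical BibTeX block shaped like the ones on the index page.
sample = SimpleNamespace(text="""@InProceedings{Doe_2021_WACV,
    author = {Doe, Jane},
    title = {An Example Paper},
    booktitle = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)},
    month = {January},
    year = {2021},
    pages = {1-10}
}""")

print(vall(sample, k=-5))  # -> An Example Paper
print(vall(sample, k=-1))  # -> 1-10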
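The script assumes a lunwen table already exists in the arcticle database, with one column per dictionary key it inserts. The original post does not show the schema, so the following is a minimal sketch of a table definition that would accept the inserted rows; every column type below is an assumption.

import pymysql

# Hypothetical schema for the `lunwen` table; the post does not include
# one, so all column names mirror the dict keys and all types are assumed.
DDL = """
CREATE TABLE IF NOT EXISTS lunwen (
    id INT AUTO_INCREMENT PRIMARY KEY,
    `page` VARCHAR(64),
    `year` VARCHAR(8),
    `month` VARCHAR(16),
    `booktitle` VARCHAR(255),
    `title` VARCHAR(512),
    `author` TEXT,
    `longtitle` VARCHAR(255),
    `link` VARCHAR(512),
    `abstract` TEXT,
    `keywords` VARCHAR(512)
) DEFAULT CHARSET = utf8
"""

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='12345',
                       port=3306, db='arcticle', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(DDL)
conn.commit()
conn.close()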