学习进度条100
"""Crawl CVPR open-access paper pages and store their metadata in MySQL.

For every paper listed on a CVPR "day" index page the script downloads the
paper's detail page, extracts title, authors, abstract and PDF link, derives
keywords from the title (all words that are not stop words) and inserts the
result into the ``papers`` and ``keywords`` tables.
"""
import requests
from lxml import etree
import pymysql

# NOTE(review): database credentials are hard-coded in source. They are kept
# unchanged here so the script keeps working, but they should be moved to
# configuration / environment variables and rotated.
_DB_CONFIG = {
    "host": "gongyunlong.mysql.rds.aliyuncs.com",
    "user": "g2431",
    "password": "Gg12512544",
    "database": "cvpr1",
}

# Prefix used to turn the relative links found on the site into absolute URLs.
_SITE_ROOT = "https://openaccess.thecvf.com/"

# Seconds to wait for an HTTP response; without a timeout requests can block
# forever on a stalled connection.
_REQUEST_TIMEOUT = 30

# Words that are never stored as keywords. All entries are lower-case because
# checkword() lower-cases its argument before the lookup. The original list
# contained fused entries caused by missing commas ("analthough", "anyup",
# "untilbefore", "thenthrough"); they are split into the intended words here.
_STOP_WORDS = frozenset({
    "the", "a", "an", "and", "by", "of", "in", "on", "is", "to", "as",
    "from", "for", "with", "that", "have", "upon", "about", "above",
    "across", "among", "ahead", "after", "although", "at", "also", "along",
    "around", "always", "away", "any", "up", "under", "until", "before",
    "between", "beyond", "behind", "because", "what", "when", "would",
    "could", "who", "whom", "whose", "which", "where", "why", "without",
    "whether", "down", "during", "despite", "over", "off", "only", "other",
    "out", "than", "then", "through", "throughout", "these", "this",
    "those", "there", "therefore", "some", "such", "since", "so", "can",
    "many", "much", "more", "may", "might", "must", "ever", "even", "every",
    "each", "question", "questions",
})


def getdata(url, year):
    """Crawl one CVPR index page and store every paper it links to.

    Args:
        url: URL of a CVPR open-access index page (one conference day).
        year: Conference year stored with every paper row.

    Papers whose detail page has no title are skipped with a message. A
    missing author, abstract or PDF link is stored as an empty string
    instead of aborting the whole crawl. The collected values are printed
    at the end, as in the original script.
    """
    parser = etree.HTMLParser(encoding="utf-8")

    # Fetch the index page and collect the links to the paper detail pages.
    index_html = requests.get(url, timeout=_REQUEST_TIMEOUT).text
    tree = etree.HTML(index_html, parser=parser)
    hrefs = tree.xpath('//dt[@class="ptitle"]/a/@href')
    print(len(hrefs))

    titles = []
    pdfs = []
    abstracts = []
    authors = []
    keywords = []

    for href in hrefs:
        link = _SITE_ROOT + href
        paper_html = requests.get(link, timeout=_REQUEST_TIMEOUT).text
        paper = etree.HTML(paper_html, parser=parser)

        title_nodes = paper.xpath(
            '/html/body/div/dl/dd/div[@id="papertitle"]/text()')
        if not title_nodes:
            # Without a title there is nothing meaningful to store.
            print("skipping %s: no title found" % link)
            continue
        title_nodes[0] = title_nodes[0].strip()
        titles.extend(title_nodes)

        # NOTE(review): the colon-free title is what gets stored in the
        # database (original behaviour); the printed list keeps the colons.
        db_title = title_nodes[0].replace(":", "")

        kept_words = [word for word in db_title.split() if checkword(word)]
        # Every keyword is followed by a single space, including the last.
        keyword = "".join(word + " " for word in kept_words)
        keywords.append(keyword)

        pdf_nodes = paper.xpath(
            '/html/body/div/dl/dd/a[contains(text(),"pdf")]/@href')
        if pdf_nodes:
            pdf_nodes[0] = pdf_nodes[0].replace("../../", _SITE_ROOT)
        pdfs.extend(pdf_nodes)

        abstract_nodes = paper.xpath(
            '/html/body/div/dl/dd/div[@id="abstract"]/text()')
        if abstract_nodes:
            abstract_nodes[0] = abstract_nodes[0].strip()
        abstracts.extend(abstract_nodes)

        author_nodes = paper.xpath('/html/body/div/dl/dd/div/b/i/text()')
        authors.extend(author_nodes)

        abstract = abstract_nodes[0] if abstract_nodes else ""
        author = author_nodes[0] if author_nodes else ""

        # One connection per paper, shared by the keyword inserts and the
        # paper insert, and always closed even if an insert raises.
        db = pymysql.connect(**_DB_CONFIG)
        try:
            for word in kept_words:
                save_keywords(db, word, close=False)
            save(db, db_title, author, abstract, link, keyword, year,
                 close=False)
        finally:
            db.close()

    print(titles)
    print(hrefs)
    print(authors)
    print(abstracts)
    print(pdfs)


def save(db, title, author, abstract, link, keyword, year, close=True):
    """Insert one paper row into the ``papers`` table.

    Args:
        db: Open pymysql connection.
        title: Paper title.
        author: Author string.
        abstract: Abstract text.
        link: URL of the paper's detail page.
        keyword: Space-separated keywords derived from the title.
        year: Conference year.
        close: When true (the default, matching the original behaviour) the
            connection is closed before returning.

    Database errors are rolled back and reported instead of being raised.
    """
    # Placeholders are filled in by the driver, which escapes the values;
    # titles and abstracts routinely contain quotes that broke the previous
    # string-formatted statement.
    sql = (
        "INSERT INTO papers(title, authors, abstract_text, original_link, "
        "keywords, year) VALUES (%s, %s, %s, %s, %s, %s)"
    )
    try:
        with db.cursor() as cursor:
            cursor.execute(
                sql, (title, author, abstract, link, keyword, year))
        db.commit()
    except pymysql.MySQLError as exc:
        # Roll back on failure, but make the failure visible.
        db.rollback()
        print("failed to save paper %r: %s" % (title, exc))
    finally:
        if close:
            db.close()


def save_keywords(db, keyword, close=True):
    """Insert one keyword into the ``keywords`` table.

    Args:
        db: Open pymysql connection.
        keyword: Keyword to store.
        close: When true (the default, matching the original behaviour) the
            connection is closed before returning.

    Database errors are rolled back and reported instead of being raised.
    """
    sql = "INSERT INTO keywords(keyword) VALUES (%s)"
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, (keyword,))
        db.commit()
    except pymysql.MySQLError as exc:
        db.rollback()
        print("failed to save keyword %r: %s" % (keyword, exc))
    finally:
        if close:
            db.close()


def checkword(word):
    """Return True if *word* may be used as a keyword.

    The comparison is case-insensitive: a word is rejected when its
    lower-case form is one of the stop words.
    """
    return word.lower() not in _STOP_WORDS


if __name__ == '__main__':
    # (index page URL, conference year) for every conference day to crawl.
    for page_url, page_year in (
        ("https://openaccess.thecvf.com/CVPR2018?day=2018-06-19", 2018),
        ("https://openaccess.thecvf.com/CVPR2018?day=2018-06-20", 2018),
        ("https://openaccess.thecvf.com/CVPR2018?day=2018-06-21", 2018),
        ("https://openaccess.thecvf.com/CVPR2019?day=2019-06-18", 2019),
        ("https://openaccess.thecvf.com/CVPR2019?day=2019-06-19", 2019),
        ("https://openaccess.thecvf.com/CVPR2019?day=2019-06-20", 2019),
        ("https://openaccess.thecvf.com/CVPR2020?day=2020-06-16", 2020),
        ("https://openaccess.thecvf.com/CVPR2020?day=2020-06-17", 2020),
        ("https://openaccess.thecvf.com/CVPR2020?day=2020-06-18", 2020),
    ):
        getdata(page_url, page_year)
提取热词
 
                     
                    
                 
                    
                
 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号