Python 有选择地爬取 AAAI-19 中的文章

抓取文章,下载到指定目录

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import re


# Titles must be sanitized before they are used as file names.
def recorrect_title(title):
    """Return *title* with file-name-unsafe characters replaced by underscores.

    The replaced characters are: slash, backslash, colon, asterisk,
    question mark, double quote, less-than, greater-than and pipe.
    Every occurrence is replaced by a single ``_``; all other characters
    are left untouched.
    """
    forbidden = re.compile(r"[\/\\\:\*\?\"\<\>\|]")
    return forbidden.sub("_", title)


# Script settings: where the PDFs are stored, the contents page to scan, and
# the keyword an entry must contain in order to be downloaded.
save_path = 'E://文档//AAAI2019//'
url = 'http://www.aaai.org/Library/AAAI/aaai19contents.php'
find_text = 'Segmentation'

# Create the target directory up front so open() below cannot fail on a
# missing folder.
os.makedirs(save_path, exist_ok=True)

# Fetch the contents page; fail loudly on HTTP errors instead of parsing an
# error page.
resp = requests.get(url, timeout=30)
resp.raise_for_status()
html_doc = resp.text
soup = BeautifulSoup(html_doc, 'html.parser')
content = soup.find(class_='content')
if content is None:
    raise RuntimeError('no element with class "content" found at ' + url)

# The prettified fragment is re-parsed exactly as before so the extracted
# text (and therefore the generated file names) stays the same.
soup1 = BeautifulSoup(content.prettify(), 'html.parser')
text_arr = soup1.find_all(class_='left')
find_text_arr = [x for x in text_arr if find_text in x.text]

# Each item is [sanitized title, text of the <i> tag, href of the last link].
down_url_arr = []
for x in find_text_arr:
    links = x.find_all('a')
    if not links:
        continue  # entry without any link: nothing to name or download
    href = links[-1].get('href')
    if not href:
        continue  # last link carries no target
    italic = x.find('i')
    down_url_arr.append([
        recorrect_title(links[0].text.replace('\n', '').strip()),
        italic.text.replace('\n', '').strip() if italic is not None else '',
        href,
    ])
print(down_url_arr)

for i in tqdm(down_url_arr):
    pdf_path = save_path + i[0] + '.pdf'
    # Decide whether to skip BEFORE opening the file: opening with "wb"
    # creates/truncates it, which previously made the existence check always
    # true and left every PDF empty. Zero-byte leftovers are fetched again.
    if os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0:
        continue
    r = requests.get(i[-1], timeout=60)
    if not r.ok:
        # Do not store an HTTP error page under a .pdf name.
        print('skip %s: HTTP %s' % (i[-1], r.status_code))
        continue
    with open(pdf_path, "wb") as code:
        code.write(r.content)

posted @ 2019-11-07 10:38  赫凯  阅读(37)  评论(0)    收藏  举报