爱思想(list)
代码
"""Scrape article URLs from aisixiang.com author pages, expanding pagination.

Fetches an author's think-tank listing page, rewrites article links to their
first pagination page ('-1.html'), then optionally expands each article into
its full set of pagination URLs.
"""
import logging
import os
import re
import time

import pdfkit
import requests
from bs4 import BeautifulSoup
from chardet import detect

DOMAIN = 'http://www.aisixiang.com'

# Marker text ("click here to read the next page") whose presence on a page
# indicates the article is split across multiple pagination pages.
aa = '点击此处阅读下一页'


def _decode(response):
    """Decode a GBK page that requests mis-detected as latin-1.

    The site serves GBK without a proper charset header, so requests decodes
    it as latin-1; re-encode and decode to recover the Chinese text.
    """
    return response.text.encode('latin-1').decode('gbk', 'ignore')


def get_url_list_3(urllist):
    """Expand each article URL in *urllist* into its pagination URLs.

    For every URL, fetch the page and, if the "next page" marker is present,
    collect the pagination links. Returns the discovered URLs plus the input
    URLs, deduplicated in order, keeping only URLs containing '-' (real
    article pages rather than javascript:/index links).
    """
    found = []
    for url in urllist:
        res = requests.get(url)
        time.sleep(2)  # throttle: be polite to the server
        body = BeautifulSoup(_decode(res), 'html.parser')
        if aa in str(body):
            # BUG FIX: the original selected '.list_page' but discarded the
            # result and then walked every <a> on the whole page. Scope the
            # scan to the pagination block when it exists; fall back to the
            # original whole-page behavior otherwise.
            pager = body.select('.list_page')
            anchors = pager[0].select('a') if pager else body.select('a')
            for wenz2 in anchors:
                found.append(DOMAIN + wenz2['href'])
    # Deduplicate while preserving order (discovered pages first, as before).
    news_urls = []
    for u in found + urllist:
        if u not in news_urls:
            news_urls.append(u)
    return [i for i in news_urls if '-' in i]


def get_url_list():
    """Collect first-pagination-page article URLs from Zhou Qiren's page.

    Rewrites each short article link ('NNNN.html') to its first pagination
    page ('NNNN-1.html') and returns the absolute URLs.
    """
    res = requests.get('http://www.aisixiang.com/thinktank/zhouqiren.html')
    soup = BeautifulSoup(_decode(res), 'html.parser')
    urls = []
    for wenz in soup.select('a[target="_blank"]'):
        # Short hrefs are article links; longer ones are external/site chrome.
        # TODO(review): the 36-char cutoff is a heuristic — confirm it still
        # matches the site's markup.
        if len(wenz['href']) < 36:
            urls.append(DOMAIN + wenz['href'].replace('.html', '-1.html'))
    # Keep only rewritten article URLs (they contain '-').
    # NOTE: the original had an unreachable print() after this return; removed.
    return [i for i in urls if '-' in i]


#urls = get_url_list_3(get_url_list())
urls = get_url_list()
print(urls)
代码
# Expand every first-page article URL gathered above into its full set of
# pagination URLs (network-bound: one request plus a 2s sleep per URL),
# then print the combined, deduplicated list.
urlll = get_url_list_3(urls)
print(urlll)
公众号请关注:侠之大者

浙公网安备 33010602011771号