爱思想(search url)
代码:
import pdfkit
import requests
from bs4 import BeautifulSoup
import requests
from chardet import detect
from bs4 import BeautifulSoup
def get_url_list_3(urllist, timeout=30):
    """Expand a list of page URLs with the links found on those pages.

    Each URL in *urllist* is downloaded.  When the externally defined marker
    ``aa`` occurs in the parsed page, the ``href`` of every ``<a>`` element on
    that page is prefixed with the site domain and collected.  The collected
    links and the input URLs are then merged, de-duplicated (first occurrence
    wins) and reduced to the entries that contain a ``-``.

    Args:
        urllist: Iterable of page URLs to visit.
        timeout: Seconds to wait for each HTTP response before
            ``requests`` raises a timeout error.

    Returns:
        A list of unique URLs containing ``-``: links discovered on the
        pages first, followed by the input URLs, in first-seen order.
    """
    # Local import: the file's import block does not bring ``time`` into scope.
    import time

    domain = 'http://www.aisixiang.com'
    pages = list(urllist)
    found = []
    for page_url in pages:
        res = requests.get(page_url, timeout=timeout)
        time.sleep(2)  # pause between consecutive requests
        # Decode the raw bytes as GBK directly.  Round-tripping ``res.text``
        # through latin-1 only works when requests happened to guess
        # ISO-8859-1 and raises UnicodeEncodeError otherwise.
        body = BeautifulSoup(res.content.decode('gbk', 'ignore'), 'html.parser')
        # NOTE(review): ``aa`` is not defined in the visible code; it is
        # presumably a marker string set elsewhere - confirm before running.
        if aa in str(body):
            # NOTE(review): every anchor on the page is scanned.  The page has
            # a ``.list_page`` element that may have been the intended scope -
            # confirm.
            for anchor in body.select('a'):
                href = anchor.get('href')
                if href is not None:  # anchors without href carry no link
                    found.append(domain + href)

    # dict.fromkeys drops duplicates while keeping first-seen order.
    unique_urls = list(dict.fromkeys(found + pages))
    # Only URLs containing '-' are kept; this also discards entries such as
    # 'http://www.aisixiang.comjavascript:void();'.
    return [candidate for candidate in unique_urls if '-' in candidate]
def get_url_list(url, timeout=30):
    """Collect article URLs from one search-result page.

    The page at *url* is downloaded and decoded as GBK.  For every
    ``<a target="_blank">`` inside an element with class ``search_list`` whose
    ``href`` is shorter than 36 characters, ``.html`` is replaced with
    ``-1.html`` and the site domain is prepended.

    Args:
        url: Address of the search-result page to scrape.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error.

    Returns:
        A list of the rewritten absolute URLs that contain a ``-``, in page
        order.
    """
    domain = 'http://www.aisixiang.com'
    res = requests.get(url, timeout=timeout)
    # Decode the raw bytes as GBK directly.  Round-tripping ``res.text``
    # through latin-1 only works when requests happened to guess ISO-8859-1
    # and raises UnicodeEncodeError otherwise.
    soup = BeautifulSoup(res.content.decode('gbk', 'ignore'), 'html.parser')

    article_urls = []
    for anchor in soup.select('.search_list a[target="_blank"]'):
        href = anchor.get('href')
        # Skip anchors without a link and hrefs of 36+ characters.
        # NOTE(review): the length cut-off presumably filters out
        # non-article links - confirm.
        if href is None or len(href) >= 36:
            continue
        article_urls.append(domain + href.replace('.html', '-1.html'))

    # Keep only URLs where the '-1.html' rewrite (or another '-') is present.
    return [candidate for candidate in article_urls if '-' in candidate]
# Author-search result pages (page=1 and page=2) whose article links are
# gathered into ``urlslist``.
search_pages = [
    'http://www.aisixiang.com/data/search.php?keyWords=%C3%AB%D4%F3%B6%AB&searchfield=author&page=1',
    'http://www.aisixiang.com/data/search.php?keyWords=%C3%AB%D4%F3%B6%AB&searchfield=author&page=2',
]

urlslist = []
for url in search_pages:
    # Append this page's article URLs to the running list.
    urlslist.extend(get_url_list(url))

# Show the combined list and how many URLs were collected.
print(urlslist)
print(len(urlslist))
结果:
['http://www.aisixiang.com/data/99053-1.html', 'http://www.aisixiang.com/data/87555-1.html', 'http://www.aisixiang.com/data/84553-1.html', 'http://www.aisixiang.com/data/59290-1.html', 'http://www.aisixiang.com/data/59286-1.html', 'http://www.aisixiang.com/data/38686-1.html', 'http://www.aisixiang.com/data/33501-1.html', 'http://www.aisixiang.com/data/33461-1.html', 'http://www.aisixiang.com/data/25897-1.html', 'http://www.aisixiang.com/data/23238-1.html', 'http://www.aisixiang.com/data/3250-1.html', 'http://www.aisixiang.com/data/2579-1.html', 'http://www.aisixiang.com/data/2578-1.html', 'http://www.aisixiang.com/data/2577-1.html', 'http://www.aisixiang.com/data/2576-1.html', 'http://www.aisixiang.com/data/2575-1.html', 'http://www.aisixiang.com/data/2574-1.html', 'http://www.aisixiang.com/data/2573-1.html', 'http://www.aisixiang.com/data/2572-1.html', 'http://www.aisixiang.com/data/2571-1.html', 'http://www.aisixiang.com/data/2570-1.html', 'http://www.aisixiang.com/data/2568-1.html', 'http://www.aisixiang.com/data/2567-1.html', 'http://www.aisixiang.com/data/2566-1.html', 'http://www.aisixiang.com/data/2565-1.html', 'http://www.aisixiang.com/data/2564-1.html', 'http://www.aisixiang.com/data/2548-1.html', 'http://www.aisixiang.com/data/2399-1.html', 'http://www.aisixiang.com/data/2384-1.html', 'http://www.aisixiang.com/data/1622-1.html', 'http://www.aisixiang.com/data/1618-1.html', 'http://www.aisixiang.com/data/1617-1.html']
排序:
# Expand the collected article URLs with the links found on each page, then
# sort the result.
urlll = get_url_list_3(urlslist)
urlll.sort()  # in-place sort; URLs are strings, so the order is lexicographic
urlll  # bare expression: echoes the list in an interactive session, no-op in a script
公众号请关注:侠之大者

浙公网安备 33010602011771号