爱思想(search url)

代码:

 

import pdfkit
import requests
from bs4 import BeautifulSoup
import requests
from chardet import detect
from bs4 import BeautifulSoup
def get_url_list_3(urllist):
    """Collect additional links found on each page listed in *urllist*.

    Every URL in ``urllist`` is fetched and its body decoded as GBK.  When the
    externally defined marker ``aa`` occurs in the parsed page and the page
    has a ``.list_page`` element, the ``href`` of every ``<a>`` on the page is
    prefixed with the site domain and collected.

    Args:
        urllist: Page URLs to fetch.

    Returns:
        A list without duplicates, in first-seen order (links discovered on
        the pages first, then the input URLs), restricted to URLs that
        contain a ``"-"``.
    """
    # Imported here because the file's import block does not bring in `time`.
    import time

    domain = 'http://www.aisixiang.com'
    discovered = []
    for page_url in urllist:
        res = requests.get(page_url)
        time.sleep(2)  # pause after every request
        # Decode the raw bytes directly: round-tripping res.text through
        # latin-1 only works when requests happened to guess ISO-8859-1.
        html = res.content.decode('gbk', "ignore")
        body = BeautifulSoup(html, 'html.parser')
        # NOTE(review): `aa` is not defined in the visible part of this file;
        # presumably a marker string set elsewhere -- confirm.
        if aa not in str(body):
            continue
        # Pages without a `.list_page` element are skipped instead of
        # raising IndexError.
        if not body.select('.list_page'):
            continue
        # NOTE(review): anchors are taken from the whole page, not only from
        # `.list_page`; the final "-" filter is what discards unrelated links.
        for anchor in body.select('a[href]'):
            discovered.append(domain + anchor['href'])

    # Order-preserving de-duplication with O(1) membership checks.
    seen = set()
    unique_urls = []
    for candidate in discovered + list(urllist):
        if candidate not in seen:
            seen.add(candidate)
            unique_urls.append(candidate)
    return [u for u in unique_urls if "-" in u]

def get_url_list(url):
    """Return first-page article URLs linked from one search-result page.

    The page at ``url`` is fetched, decoded as GBK and parsed.  For every
    ``a[target="_blank"]`` inside ``.search_list`` whose ``href`` is shorter
    than 36 characters, ``.html`` is rewritten to ``-1.html`` and the site
    domain is prepended.

    Args:
        url: URL of the search-result page to fetch.

    Returns:
        The rewritten URLs, in page order, that contain a ``"-"``.
    """
    domain = 'http://www.aisixiang.com'
    res = requests.get(url)
    # Decode the raw bytes directly: round-tripping res.text through latin-1
    # only works when requests happened to guess ISO-8859-1.
    html = res.content.decode('gbk', "ignore")
    soup = BeautifulSoup(html, 'html.parser')

    article_urls = []
    for anchor in soup.select('.search_list a[target="_blank"]'):
        href = anchor.get('href')
        # NOTE(review): the 36-character cut-off presumably separates article
        # links from longer, unrelated hrefs -- confirm against the site.
        if not href or len(href) >= 36:
            continue
        article_urls.append(domain + href.replace('.html', '-1.html'))

    return [u for u in article_urls if "-" in u]

# Gather the article URLs from both search-result pages into one list.
search_pages = [
    'http://www.aisixiang.com/data/search.php?keyWords=%C3%AB%D4%F3%B6%AB&searchfield=author&page=1',
    'http://www.aisixiang.com/data/search.php?keyWords=%C3%AB%D4%F3%B6%AB&searchfield=author&page=2',
]
urlslist = []
for url in search_pages:
    urls = get_url_list(url)
    urlslist.extend(urls)
# Show the collected URLs and how many there are.
print(urlslist)
print(len(urlslist))

  结果:

['http://www.aisixiang.com/data/99053-1.html', 'http://www.aisixiang.com/data/87555-1.html', 'http://www.aisixiang.com/data/84553-1.html', 'http://www.aisixiang.com/data/59290-1.html', 'http://www.aisixiang.com/data/59286-1.html', 'http://www.aisixiang.com/data/38686-1.html', 'http://www.aisixiang.com/data/33501-1.html', 'http://www.aisixiang.com/data/33461-1.html', 'http://www.aisixiang.com/data/25897-1.html', 'http://www.aisixiang.com/data/23238-1.html', 'http://www.aisixiang.com/data/3250-1.html', 'http://www.aisixiang.com/data/2579-1.html', 'http://www.aisixiang.com/data/2578-1.html', 'http://www.aisixiang.com/data/2577-1.html', 'http://www.aisixiang.com/data/2576-1.html', 'http://www.aisixiang.com/data/2575-1.html', 'http://www.aisixiang.com/data/2574-1.html', 'http://www.aisixiang.com/data/2573-1.html', 'http://www.aisixiang.com/data/2572-1.html', 'http://www.aisixiang.com/data/2571-1.html', 'http://www.aisixiang.com/data/2570-1.html', 'http://www.aisixiang.com/data/2568-1.html', 'http://www.aisixiang.com/data/2567-1.html', 'http://www.aisixiang.com/data/2566-1.html', 'http://www.aisixiang.com/data/2565-1.html', 'http://www.aisixiang.com/data/2564-1.html', 'http://www.aisixiang.com/data/2548-1.html', 'http://www.aisixiang.com/data/2399-1.html', 'http://www.aisixiang.com/data/2384-1.html', 'http://www.aisixiang.com/data/1622-1.html', 'http://www.aisixiang.com/data/1618-1.html', 'http://www.aisixiang.com/data/1617-1.html']

  抓取分页链接并排序:

# Expand the collected URLs via get_url_list_3 and keep them in sorted order.
urlll = sorted(get_url_list_3(urlslist))
# Bare expression: displays the list when run in an interactive session.
urlll

  

posted @ 2017-08-01 17:23  侠之大者kamil  阅读(224)  评论(0)    收藏  举报