爱思想(list)

代码

# Standard library
import logging
import os
import re
import time

# Third-party
import pdfkit
import requests
from bs4 import BeautifulSoup
from chardet import detect

# Marker text ("click here to read the next page") that aisixiang.com
# inserts into multi-page articles; used to detect pagination.
aa = '点击此处阅读下一页'
def get_url_list_3(urllist):
    """Expand article URLs with their pagination sub-pages.

    For each URL in *urllist*, fetch the page; when the "next page"
    marker (module-level ``aa``) is present, collect the pagination
    links from the page's ``.list_page`` container.

    Parameters
    ----------
    urllist : list[str]
        Article page URLs to inspect.

    Returns
    -------
    list[str]
        De-duplicated URLs (pagination pages first, then the input
        URLs), restricted to real article links containing ``'-'``.
    """
    domain = 'http://www.aisixiang.com'
    page_urls = []
    for url in urllist:
        res = requests.get(url)
        time.sleep(2)  # throttle: be polite to the server
        # Pages are GBK-encoded but requests decodes them as latin-1,
        # hence the round-trip re-decode.
        body = BeautifulSoup(
            res.text.encode('latin-1').decode('gbk', "ignore"),
            'html.parser')
        if aa in str(body):  # multi-page article
            # BUG FIX: scope link collection to the pagination
            # container.  The original discarded
            # body.select('.list_page')[0] and then iterated every
            # <a> on the whole page, which pulled in junk links
            # (e.g. javascript:void()) and raised KeyError on
            # anchors without an href attribute.
            pagers = body.select('.list_page')
            anchors = pagers[0].select('a') if pagers else []
            for anchor in anchors:
                href = anchor.get('href')
                if href:
                    page_urls.append(domain + href)
    # De-duplicate while preserving order (pagination pages first).
    deduped = []
    for candidate in page_urls + urllist:
        if candidate not in deduped:
            deduped.append(candidate)
    # Keep only article URLs; they carry a '-<page>' suffix.
    return [u for u in deduped if "-" in u]

def get_url_list(listing_url='http://www.aisixiang.com/thinktank/zhouqiren.html'):
    """Scrape article URLs from an aisixiang.com author listing page.

    Parameters
    ----------
    listing_url : str, optional
        Author/listing page to scrape.  Defaults to the original
        hard-coded page so existing callers are unaffected.

    Returns
    -------
    list[str]
        Absolute article URLs pointing at the first pagination page
        (``-1.html``); only links containing ``'-'`` are kept.
    """
    domain = 'http://www.aisixiang.com'
    res = requests.get(listing_url)
    # Pages are GBK-encoded but requests decodes them as latin-1,
    # hence the round-trip re-decode.
    html = res.text.encode('latin-1').decode('gbk', "ignore")
    soup = BeautifulSoup(html, 'html.parser')
    urls = []
    for anchor in soup.select('a[target="_blank"]'):
        href = anchor.get('href', '')
        # Article links are short relative paths; longer hrefs are
        # navigation/external links (threshold kept from original).
        if href and len(href) < 36:
            # Point straight at the first pagination page.
            urls.append(domain + href.replace('.html', '-1.html'))
    return [u for u in urls if "-" in u]

#urls = get_url_list_3(get_url_list())
# Seed list: article URLs scraped from the author's index page.
# NOTE: runs at import time and performs network I/O.
urls = get_url_list()
print(urls)

代码

# Expand the seed URLs with pagination sub-pages for multi-page
# articles (one throttled HTTP request per URL).
urlll = get_url_list_3(urls)
print(urlll)

 

posted @ 2017-08-01 17:27  侠之大者kamil  阅读(187)  评论(0)    收藏  举报