Scraping Qiushibaike (糗事百科) with a Simple Python Crawler
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup

def getContentOrComment(argurl):
    # Send a browser User-Agent so the site's anti-crawler checks don't reject us
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url=argurl, headers=headers)
    try:
        response = urllib2.urlopen(req)  # open the URL
        content = response.read()        # read the page source
    except Exception:
        content = None  # signal failure; callers check for None
    return content
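
With the fetch helper defined, it can be smoke-tested on its own before wiring up the main loop; the page-1 URL below is just an illustration:

html = getContentOrComment('http://www.qiushibaike.com/textnew/page/1')
print html is not None  # prints True if the download succeeded

The rest of the script sets up the two URL templates and then pages through articles on demand: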
articleUrl = 'http://www.qiushibaike.com/textnew/page/%d'  # article list URL (one page per number)
commentUrl = 'http://www.qiushibaike.com/article/%s'       # comment page URL (keyed by article id)
page = 0
while True:
    raw = raw_input('Press Enter to view the next page, or type exit to quit: ')
    if raw == 'exit':
        break
    page += 1
    Url = articleUrl % page
    print Url
    articlePage = getContentOrComment(Url)
    if articlePage is None:  # download failed, try the next page
        continue
    articleFloor = 1
    soup = BeautifulSoup(articlePage, 'html.parser')  # parse the article list page
    for string in soup.find_all(attrs='article block untagged mb15'):
        # ids take the form 'qiushi_tag_<article id>'; keep the numeric part
        commentId = str(string.get('id')).strip().split('_')[2]
        print '\n'
        print articleFloor, '.', string.find(attrs='content').get_text().strip()
        articleFloor += 1
        # fetch the comment page for this article
        commentPage = getContentOrComment(commentUrl % commentId)
        if commentPage is None:
            continue
        soupComment = BeautifulSoup(commentPage, 'html.parser')
        commentFloor = 1
        for comment in soupComment.find_all(attrs='body'):
            print '    Reply', commentFloor, ':', comment.get_text().strip()
            commentFloor += 1
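
Note that urllib2 and raw_input exist only in Python 2, so the script above will not run under Python 3. As a rough porting sketch (assuming requests and beautifulsoup4 are installed, and that the CSS classes used above still match the live site), the same fetch-and-parse step could look like this:

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/57.0.2987.133 Safari/537.36'}

def get_content(url):
    # Return the page body as text, or None if the request fails
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return None

html = get_content('http://www.qiushibaike.com/textnew/page/1')
if html:
    soup = BeautifulSoup(html, 'html.parser')
    for article in soup.find_all(attrs='article block untagged mb15'):
        print(article.find(attrs='content').get_text().strip())

The try/except mirrors the Python 2 helper: any network or HTTP error maps to None so the caller can skip that page, exactly as the main loop above does.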