Using requests to do a simple crawl of the articles under teacher Wupeiqi's Python directory

#coding:utf-8

import requests
import xlrd
from xlutils.copy import copy
import time
from bs4 import BeautifulSoup


# Define a (minimal) request header
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
# Starting URL
start_url = "http://www.cnblogs.com/wupeiqi/articles/4938499.html"



def run(start_url):
    # Send the GET request
    html = requests.get(url=start_url, headers=headers, timeout=30)
    # Parse the response into a DOM tree
    html = BeautifulSoup(html.text, 'html.parser')
    # Extract all article links
    urls = []
    for item in html.select("#cnblogs_post_body a"):
        href = item.attrs.get("href")
        if href:   # skip anchors that have no href attribute
            urls.append(href)
    # Loop over the link list and request each article detail page
    print("Found %s articles in total" % len(urls))
    count = 1
    for url in urls:
        print("------- Downloading article %s/%s --------" % (count, len(urls)))
        count += 1
        article = requests.get(url=url, headers=headers, timeout=30)
        process_article(article, url)
    print("Download complete, thanks for using!".center(80, "-"))


def save_article(title,content,url):
    # Save the data to the spreadsheet
    workbook = xlrd.open_workbook('article.xls')   # Open the spreadsheet (the file must already exist; see the sketch below)
    nrows = int(workbook.sheets()[0].nrows)   # Current number of rows in the sheet
    workbooknew = copy(workbook)
    ws = workbooknew.get_sheet(0)   # Get the first worksheet
    ws.write(nrows, 0, str(title))   # Write the title
    ws.write(nrows, 1, str(url))   # Only the title and link are saved; the content is too long for a cell
    workbooknew.save('article.xls')   # Save the updated spreadsheet

def process_article(article,url):
    # Parse into a DOM tree
    article = BeautifulSoup(article.text, 'html.parser')
    # Extract the article title and content
    title = article.select("#cb_post_title_url")[0].text
    print("Downloading article ----->> %s" % title)
    content = article.select("#topics .blogpost-body")   # list of matched Tag elements
    save_article(title,content,url)
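
# Optional sketch (not part of the original script): since only the title and
# link fit in the spreadsheet, the body text could be saved to a .txt file
# instead. Assumes Python 3 and that `content` is the list returned by
# select() above; the filename handling here is a naive guess.
def save_article_text(title, content):
    if not content:   # the selector matched nothing
        return
    text = content[0].get_text()   # plain text of the first matched element
    filename = "%s.txt" % title.replace("/", "_")   # keep path separators out of the name
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text)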



if __name__ == "__main__":
    run(start_url)
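
The spreadsheet must exist before the first run, since xlrd.open_workbook raises an error when article.xls is missing. Below is a minimal one-off sketch for creating it, assuming the xlwt package is installed; the sheet name and header row are arbitrary choices, not part of the original script.

import xlwt

workbook = xlwt.Workbook()
sheet = workbook.add_sheet("articles")   # arbitrary sheet name
sheet.write(0, 0, "title")   # header row: article title
sheet.write(0, 1, "url")     # header row: article link
workbook.save("article.xls")

With this header row in place, nrows starts at 1, so the crawler appends data from the second row onward.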

  
