Building a Personal Website Step by Step (11): Scraping cnblogs Data with Python

At first I planned to use the IronPython library to run Python methods directly from C#, but package imports kept throwing errors. So instead I'll stand up a Python web service and have C# call its API directly.
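As a taste of that plan, here is a minimal sketch of such a service using Flask; the framework choice, route name, and port are my assumptions, and Request/disfeilei are the scraping functions defined later in this post.

from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/categories')
def categories():
    # Hypothetical endpoint: scrape the sidebar and return the parsed categories
    html = Request("https://www.cnblogs.com/ruogu/mvc/blog/sidecolumn.aspx",
                   {'blogApp': 'ruogu'})
    return jsonify(disfeilei(html))

if __name__ == '__main__':
    app.run(port=5000)

C# could then hit http://localhost:5000/categories with HttpClient instead of embedding a Python runtime.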

Now, on to scraping the cnblogs data.

Scraping cnblogs is easy, since the pages are all static data.

The approach:

1. Scrape the essay categories and get their URLs

2. Scrape each category to get the list of post URLs

3. Scrape each article's details and download its images

4. Replace the image links in the article body

 

Here's the code:

import requests
import os
from pyquery import PyQuery as pq

def Request(url, data=""):
    # Spoof a browser User-Agent so cnblogs serves the page normally
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}
    req = requests.get(url, headers=headers, params=data)
    if req.status_code == 200:
        return req.text
    else:
        return 0



def disfeilei(html):
    # Parse the category list from the sidebar; each link text looks like "name(count)"
    doc = pq(html)
    listfls = doc('#sidebar_postcategory').find('li a').items()
    categories = []
    for fl in listfls:
        classfly = {
            'url': fl.attr('href'),
            'name': fl.text().split('(')[0],
            'count': fl.text().split('(')[1][0:-1]  # strip the trailing ')'
        }
        categories.append(classfly)
    return categories
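To see what disfeilei produces, here is a quick check against a hand-written sidebar fragment (the HTML below is illustrative, not the real cnblogs markup):

sample = '''
<div id="sidebar_postcategory">
  <ul>
    <li><a href="https://www.cnblogs.com/ruogu/category/1.html">C#(12)</a></li>
    <li><a href="https://www.cnblogs.com/ruogu/category/2.html">Python(5)</a></li>
  </ul>
</div>
'''
print(disfeilei(sample))
# [{'url': '...category/1.html', 'name': 'C#', 'count': '12'},
#  {'url': '...category/2.html', 'name': 'Python', 'count': '5'}]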

def Getwzcon(url):
    # Fetch an article page and extract the post body
    html = Request(url)
    doc = pq(html)
    con = doc('#main').find('#cnblogs_post_body')
    imglist = con.find("img").items()
    for i in imglist:
        img_url = i.attr('src')
        # '797834' is this blog's image-folder id; +7 skips past "797834/"
        index = img_url.find('797834') + 7
        flit = img_url[index:]
        # Save to the project's images folder
        path = 'h:/。net学习/blogs/BLOGS/WebApplication1/images/blogs/' + flit
        #dowimg(img_url, path)
        # Rewrite the image src to point at the local copy
        i.attr('src', path)
    return con.html()
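One fragile spot is the hard-coded '797834' offset. A variation I would consider (my own addition, not part of the original code) cuts on the marker string itself, so the offset can't drift if the id changes length:

from urllib.parse import urlparse

def local_name(img_url, marker='797834/'):
    # Take everything after the marker folder; fall back to the file name
    path = urlparse(img_url).path
    pos = path.find(marker)
    return path[pos + len(marker):] if pos != -1 else path.rsplit('/', 1)[-1]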
def dowimg(url, path):
    # Create the target directory if it doesn't exist yet
    paths = os.path.dirname(path)
    if not os.path.exists(paths):
        os.makedirs(paths)
    response = requests.get(url).content
    with open(path, 'wb') as f:
        f.write(response)
    print("image downloaded")


def Getwenz(classfly):
    # Fetch a category page and parse its list of posts
    html = Request(classfly)
    doc = pq(html)
    listwzs = doc('#main').find('.entrylist>.entrylistItem').items()
    articles = []
    for i in listwzs:
        title = i.find('.entrylistItemTitle').text()
        url = i.find('.entrylistItemTitle').attr('href')
        # Drop the trailing "阅读全文" (read more) link text from the summary
        desc = i.find('.c_b_p_desc').text()[0:-4]
        # entry looks like ['posted', '@', date, time, author, '阅读(n)', '评论(m)']
        entry = i.find('.entrylistItemPostDesc').text().split(" ")
        datatime = entry[2] + " " + entry[3]
        readcount = entry[5][3:-1]
        # Fetch the full article body
        content = Getwzcon(url)
        art = {
            'title': title,
            'url': url,
            'desc': desc,
            'datatime': datatime,
            'readcount': readcount,
            'body': content
        }
        articles.append(art)
    return articles
if __name__ == '__main__':
    url = "https://www.cnblogs.com/ruogu/mvc/blog/sidecolumn.aspx"
    data1 = {'blogApp': 'ruogu'}
    textfeilei = Request(url, data1)
    if textfeilei != 0:
        # Get all categories
        list_fly = disfeilei(textfeilei)
        # Walk through each category
        for item in list_fly:
            # TODO: insert into the database here
            # Fetch the article details for this category
            articles = Getwenz(item['url'])
            print(articles)
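For the "insert into the database" step above, a sqlite3 sketch could look like this; the table name and column layout are my own guesses, not from the project:

import sqlite3

def save_articles(articles, db='blogs.db'):
    # Hypothetical persistence step; schema is assumed
    conn = sqlite3.connect(db)
    conn.execute('''CREATE TABLE IF NOT EXISTS article
                    (title TEXT, url TEXT, summary TEXT,
                     datatime TEXT, readcount TEXT, body TEXT)''')
    conn.executemany(
        'INSERT INTO article VALUES (?,?,?,?,?,?)',
        [(a['title'], a['url'], a['desc'],
          a['datatime'], a['readcount'], a['body']) for a in articles])
    conn.commit()
    conn.close()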

  
