Crawling data from a RESTful API (the cnblogs homepage)

What this does

Fetch the posts from the www.cnblogs.com homepage.

1. Install the package

Use bs4.BeautifulSoup instead of regular expressions. Install it with:

pip install beautifulsoup4

(pip install bs4 also works: the bs4 package on PyPI is just a stub that pulls in beautifulsoup4.)
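
To confirm the install worked, here is a quick sanity check that parses a one-tag document and looks the tag up by class, mirroring what the crawler does later; it is only a sketch, not part of the crawler:

from bs4 import BeautifulSoup

# find a tag by its class attribute, just as the crawler will do below
soup = BeautifulSoup('<p class="post_item_body">hello</p>', 'html.parser')
print(soup.find('p', attrs={'class': 'post_item_body'}).text)  # prints: hello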

2. Crawl the API data

The endpoint and its parameters can be inspected in the browser's DevTools Network panel:

crawl_html.py

import urllib.parse
import urllib.request

def getHtml(url, values):
    # user_agent='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    # headers = {'User-Agent':user_agent}
    # build the query string, issue a plain GET request, and decode the body as UTF-8
    data = urllib.parse.urlencode(values)
    response_result = urllib.request.urlopen(url + '?' + data).read()
    html = response_result.decode('utf-8')
    return html

# Fetch the posts on the www.cnblogs.com homepage; the parameter values all come from the DevTools Network panel
def requestCnblogs(index):
    print('requesting data')
    url = 'http://www.cnblogs.com/mvc/AggSite/PostList.aspx'
    value = {
        'CategoryId': 808,
        'CategoryType': 'SiteHome',
        'ItemListActionName': 'PostList',
        'PageIndex': index,
        'ParentCategoryId': 0,
        'TotalPostCount': 4000
    }
    result = getHtml(url, value)
    return result

if __name__ == "__main__":
    result_html = requestCnblogs(1)
    print(result_html)
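
Some servers reject the default urllib User-Agent, which is what the commented-out header lines above hint at. A minimal sketch of a header-sending variant (getHtmlWithHeaders is a hypothetical name, not part of the script above):

import urllib.parse
import urllib.request

def getHtmlWithHeaders(url, values):
    # same GET as getHtml, but with a browser-like User-Agent attached
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(values)
    request = urllib.request.Request(url + '?' + data, headers=headers)
    return urllib.request.urlopen(request).read().decode('utf-8')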

3. Parse the data

content_parser.py

from bs4 import BeautifulSoup
import crawl_html
import re

# Parse the outer layer: one div.post_item_body per post
def blogParser(index):
    cnblogs = crawl_html.requestCnblogs(index)
    soup = BeautifulSoup(cnblogs, 'html.parser')
    all_div = soup.find_all('div', attrs={'class': 'post_item_body'}, limit=20)

    blogs = []
    # loop over the divs and extract the details of each post
    for item in all_div:
        blog = analyzeBlog(item)
        blogs.append(blog)

    return blogs

# Parse one post entry
def analyzeBlog(item):
    result = {}
    a_title = find_all(item, 'a', 'titlelnk')
    if a_title:  # find_all returns a list, so test for emptiness rather than None
        # post title
        result["title"] = a_title[0].string
        # post link
        result["href"] = a_title[0]['href']
    p_summary = find_all(item, 'p', 'post_item_summary')
    if p_summary:
        # summary
        result["summary"] = p_summary[0].text
    footers = find_all(item, 'div', 'post_item_foot')
    footer = footers[0]
    # author
    result["author"] = footer.a.string
    # author URL
    result["author_url"] = footer.a['href']
    footer_text = footer.text  # avoid shadowing the built-in str
    # '发布于' is the literal "published at" prefix in the page text
    time = re.findall(r"发布于 .+? .+? ", footer_text)
    result["create_time"] = time[0].replace('发布于 ', '')

    comment_str = find_all(footer, 'span', 'article_comment')[0].a.string
    result["comment_num"] = re.search(r'\d+', comment_str).group()

    view_str = find_all(footer, 'span', 'article_view')[0].a.string
    result["view_num"] = re.search(r'\d+', view_str).group()

    return result

# helper: find at most one descendant of item with the given tag name and CSS class
def find_all(item, attr, c):
    return item.find_all(attr, attrs={'class': c}, limit=1)

if __name__ == "__main__":
    result_cnblogs = blogParser(1)
    print(result_cnblogs)
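
To eyeball the parsed structure, here is a minimal sketch that pretty-prints the result as JSON (assuming content_parser.py above is importable; json is used only for display):

import json
import content_parser

# fetch and parse page 1, then dump the list of dicts in readable form
blogs = content_parser.blogParser(1)
print(json.dumps(blogs, ensure_ascii=False, indent=2))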

4. Store the data

Store the results as JSON files; write2json.py:

import content_parser
import os
import datetime
import json

def saveBlogs():
    for i in range(1, 3):
        print('request for ' + str(i) + '...')
        # fetch and parse the data
        blogs = content_parser.blogParser(i)
        # create the output directory
        path = createFile()
        # save to file
        writeToTxt(blogs, path + '/blog_' + str(i) + '.json')
        print('page ' + str(i) + ' done')
    return 'success'

def createFile():
    date = datetime.datetime.now().strftime('%Y-%m-%d')
    # date-named output directory, relative to the current working directory
    path = './' + date
    if os.path.exists(path):
        return path
    else:
        os.mkdir(path)
        return path

def writeToTxt(list_name, file_path):
    try:
        # open the file and serialize the list of dicts as valid JSON
        fp = open(file_path, "w+", encoding='utf-8')
        json.dump(list_name, fp, ensure_ascii=False)
        fp.close()
    except IOError:
        print("fail to open file")

if __name__ == "__main__":
    result_save = saveBlogs()
    print(result_save)
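
As a quick check, here is a sketch that loads one of the saved files back; it assumes the date-named directory and blog_1.json produced by saveBlogs above:

import datetime
import json

# rebuild today's output path the same way createFile does
date = datetime.datetime.now().strftime('%Y-%m-%d')
with open('./' + date + '/blog_1.json', encoding='utf-8') as fp:
    blogs = json.load(fp)
print(str(len(blogs)) + ' posts loaded')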

 
