Crawling data from a RESTful API (the cnblogs home page)
What it does
Fetch the posts shown on the www.cnblogs.com home page.

1. Install the dependency
We use bs4.BeautifulSoup for parsing instead of regular expressions. Install it with:
pip install beautifulsoup4
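A quick sanity check that the install worked (a minimal sketch; the class names mirror the ones this post parses later):

# Parse a trivial snippet with BeautifulSoup
from bs4 import BeautifulSoup

html = '<div class="post_item_body"><a class="titlelnk" href="#">hello</a></div>'
soup = BeautifulSoup(html, 'html.parser')
print(soup.find('a', class_='titlelnk').string)  # prints: hello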
2. Fetch the API data
The request parameters can be inspected in the browser developer tools (Network panel); they map one-to-one onto the value dict in the script below.

crawl_html.py
import urllib.parse
import urllib.request

def getHtml(url, values):
    # user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    # headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(values)
    response_result = urllib.request.urlopen(url + '?' + data).read()
    html = response_result.decode('utf-8')
    return html

# Fetch posts from the www.cnblogs.com home page;
# all parameters were taken from the Network panel of the browser dev tools
def requestCnblogs(index):
    print('requesting data')
    url = 'http://www.cnblogs.com/mvc/AggSite/PostList.aspx'
    value = {
        'CategoryId': 808,
        'CategoryType': 'SiteHome',
        'ItemListActionName': 'PostList',
        'PageIndex': index,
        'ParentCategoryId': 0,
        'TotalPostCount': 4000
    }
    result = getHtml(url, value)
    return result

if __name__ == "__main__":
    result_html = requestCnblogs(1)
    print(result_html)
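getHtml keeps its User-Agent lines commented out. If cnblogs ever rejects the default urllib agent, the header can be attached via urllib.request.Request. This is a sketch using the same agent string, not part of the original script:

import urllib.parse
import urllib.request

def getHtmlWithHeaders(url, values):
    # The same agent string that is commented out in getHtml above
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    data = urllib.parse.urlencode(values)
    # urllib.request.Request accepts a headers dict, unlike a bare urlopen(url)
    request = urllib.request.Request(url + '?' + data, headers={'User-Agent': user_agent})
    return urllib.request.urlopen(request).read().decode('utf-8')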
3、解析数据
content_parser.py
from bs4 import BeautifulSoup
import crawl_html
import re

# Parse the outer list: one div.post_item_body per post
def blogParser(index):
    cnblogs = crawl_html.requestCnblogs(index)
    soup = BeautifulSoup(cnblogs, 'html.parser')
    all_div = soup.find_all('div', attrs={'class': 'post_item_body'}, limit=20)
    blogs = []
    # Walk the divs and extract the details of each post
    for item in all_div:
        blog = analyzeBlog(item)
        blogs.append(blog)
    return blogs

# Parse a single post entry
def analyzeBlog(item):
    result = {}
    # find_all returns a (possibly empty) list, never None,
    # so test for emptiness rather than None
    a_title = find_all(item, 'a', 'titlelnk')
    if a_title:
        # post title
        result["title"] = a_title[0].string
        # post link
        result["href"] = a_title[0]['href']
    p_summary = find_all(item, 'p', 'post_item_summary')
    if p_summary:
        # summary
        result["summary"] = p_summary[0].text
    footers = find_all(item, 'div', 'post_item_foot')
    footer = footers[0]
    # author
    result["author"] = footer.a.string
    # author url
    result["author_url"] = footer.a['href']
    footer_text = footer.text  # renamed from `str`, which shadowed the builtin
    # '发布于' is the literal "posted at" text on the cnblogs page
    time = re.findall(r"发布于 .+? .+? ", footer_text)
    result["create_time"] = time[0].replace('发布于 ', '')
    comment_str = find_all(footer, 'span', 'article_comment')[0].a.string
    result["comment_num"] = re.search(r'\d+', comment_str).group()
    view_str = find_all(footer, 'span', 'article_view')[0].a.string
    result["view_num"] = re.search(r'\d+', view_str).group()
    return result

def find_all(item, attr, c):
    return item.find_all(attr, attrs={'class': c}, limit=1)

if __name__ == "__main__":
    result_cnblogs = blogParser(1)
    print(result_cnblogs)
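analyzeBlog can also be exercised without touching the network. The snippet below feeds it hand-written markup that mimics a simplified cnblogs list item; the markup is an assumption for illustration only (the real page has more wrapping elements):

# Offline check of analyzeBlog with simplified, hand-written markup
from bs4 import BeautifulSoup
import content_parser

sample = '''
<div class="post_item_body">
  <a class="titlelnk" href="http://example.com/post">Sample title</a>
  <p class="post_item_summary">A short summary.</p>
  <div class="post_item_foot">
    <a href="http://example.com/author">someauthor</a>
    发布于 2017-01-01 12:00 <span class="article_comment"><a>评论(3)</a></span><span class="article_view"><a>阅读(42)</a></span>
  </div>
</div>
'''

item = BeautifulSoup(sample, 'html.parser').find('div', attrs={'class': 'post_item_body'})
print(content_parser.analyzeBlog(item))
# expected keys: title, href, summary, author, author_url,
# create_time, comment_num ('3'), view_num ('42')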
4. Store the data
Save the results as JSON files. write2json.py:
import content_parser
import os
import datetime
import json

def saveBlogs():
    for i in range(1, 3):
        print('request for ' + str(i) + '...')
        # fetch and parse the data
        blogs = content_parser.blogParser(i)
        # create the output directory
        path = createFile()
        # save to file
        writeToTxt(blogs, path + '/blog_' + str(i) + '.json')
        print('page ' + str(i) + ' done')
    return 'success'

def createFile():
    date = datetime.datetime.now().strftime('%Y-%m-%d')
    # a relative path; the original used '/' + date, which tries to
    # create the directory at the filesystem root and usually fails
    path = './' + date
    if os.path.exists(path):
        return path
    else:
        os.mkdir(path)
        return path

def writeToTxt(list_name, file_path):
    try:
        # open the file in w+ mode
        with open(file_path, "w+", encoding='utf-8') as fp:
            # json.dump writes valid JSON; the original used str(list_name),
            # which produces a Python repr that JSON parsers cannot read
            json.dump(list_name, fp, ensure_ascii=False)
    except IOError:
        print("fail to open file")

if __name__ == "__main__":
    result_save = saveBlogs()
    print(result_save)
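Once saveBlogs has run, each page can be loaded back with the standard json module (a sketch assuming the script ran today, since the output directory is date-stamped):

# Read one of the saved files back
import datetime
import json

date = datetime.datetime.now().strftime('%Y-%m-%d')
with open('./' + date + '/blog_1.json', encoding='utf-8') as fp:
    blogs = json.load(fp)
print(len(blogs), 'posts loaded from page 1')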
