Scraping the blog posts of 王琳杰 on 博客园 (cnblogs.com)

  1. Fetch each listing page and extract data from it with XPath (a standalone sketch of this step follows the list)

  2. From each blog post, get the title, body, and read-count information

  3. Save the results to a JSON file (this step is sketched at the end of the post)
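
Before the full script, here is step 1 in isolation: fetch one listing page and pull out the post links with XPath. This is a minimal Python 3 sketch using the same XPath expression as the script below; the ?page=1 URL is assumed to be the blog's first listing page.

import requests
from lxml import etree

# Fetch the first listing page of the blog
url = "http://www.cnblogs.com/wanglinjie/default.html?page=1"
html = requests.get(url).content

# Each post title on the listing page is an <a> inside a
# <div class="postTitle">; @href pulls out the link targets
content = etree.HTML(html)
links = content.xpath('//div[@class="postTitle"]/a/@href')
print(links)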

# -*- coding:utf-8 -*-

import urllib2
from lxml import etree

def loadPage(url):
    """
Purpose: send a request to the given url and fetch the server's response.
        url: the url of the blog list page to crawl
    """
    #print url
    #headers = {"User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}

    request = urllib2.Request(url)
    html = urllib2.urlopen(request).read()
    # Parse the HTML document into an element tree (DOM)
    content = etree.HTML(html)
    #print content
    # XPath returns a list of all matching post links
    link_list = content.xpath('//div[@class="postTitle"]/a/@href')
    for link in link_list:
        #print link
        loadpage(link)

# Fetch a single post page and extract its title
def loadpage(link):
    headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    request = urllib2.Request(link, headers = headers)
    html = urllib2.urlopen(request).read()
    # Parse the response
    content = etree.HTML(html)
    # The first <a> inside the post div holds the post title
    title = content.xpath('//div[@class="post"]//a')[0].text
    print title
    # Extracting the title, body, and read count is still to be done
    #for link in link_list:
    #   print link
    #   write(link)

def blogSpider(url, beginPage, endPage):
    """
Purpose: crawler scheduler; builds and processes the url of each listing page.
        url : the fixed prefix of the blog's listing-page url
        beginPage : first page to crawl
        endPage : last page to crawl
    """
    for page in range(beginPage, endPage + 1):
        fullurl = url + str(page)
        #print fullurl
        loadPage(fullurl)

    print "Thank you for using this crawler."

if __name__ == "__main__":
    beginPage = int(raw_input("Enter the start page: "))
    endPage = int(raw_input("Enter the end page: "))

    url = "http://www.cnblogs.com/wanglinjie/default.html?page="
    blogSpider(url, beginPage, endPage)

 To be continued... (a sketch of the missing extract-and-save step follows the Python 3 version below.)

In Python 3 (using requests instead of urllib2):

import requests
from lxml import etree

def loadPage(url):
    """
        Purpose: send a request to the given url and fetch the server's response.
        url: the url of the blog list page to crawl
    """
    #headers = {"User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}

    #request = urllib2.Request(url)
    #html = urllib2.urlopen(request).read()
    response = requests.get(url)
    html = response.content
    # Parse the HTML document into an element tree (DOM)
    content = etree.HTML(html)
    #content = response.text
    #print(content)
    # XPath returns a list of all matching post links
    link_list = content.xpath('//div[@class="postTitle"]/a/@href')
    for link in link_list:
        #print(link)
        loadpage(link)

# Fetch a single post page and extract its title
def loadpage(link):
    headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    #request = urllib2.Request(link, headers = headers)
    #html = urllib2.urlopen(request).read()
    response = requests.get(link, headers = headers)
    html = response.content
    # Parse the response
    content = etree.HTML(html)
    # The first <a> inside the post div holds the post title
    title = content.xpath('//div[@class="post"]//a')[0].text
    print(title)
    # Extracting the title, body, and read count is still to be done
    #for link in link_list:
    #   print(link)
    #   write(link)

def blogSpider(url, beginPage, endPage):
    """
        Purpose: crawler scheduler; builds and processes the url of each listing page.
        url : the fixed prefix of the blog's listing-page url
        beginPage : first page to crawl
        endPage : last page to crawl
    """
    for page in range(beginPage, endPage + 1):
        fullurl = url + str(page)
        #print(fullurl)
        loadPage(fullurl)

    print("Thank you for using this crawler.")

if __name__ == "__main__":
    beginPage = int(input("Enter the start page: "))
    endPage = int(input("Enter the end page: "))

    url = "http://www.cnblogs.com/wanglinjie/default.html?page="
    blogSpider(url, beginPage, endPage)
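
Neither version above implements steps 2 and 3 yet: extracting the body and read count, and saving everything to a JSON file. Below is a minimal sketch of that missing piece, building on the Python 3 version. The selectors for the body (div id "cnblogs_post_body") and read count (span id "post_view_count") are assumptions about the cnblogs markup, not confirmed by this post, and may need adjusting against the live HTML; the read count in particular may be filled in by JavaScript and therefore absent from the raw page.

import json
import requests
from lxml import etree

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

def parsePost(link):
    """Fetch one post page and return its title, body, and read count.

    NOTE: the body and read-count selectors below are assumptions
    about the cnblogs markup, not confirmed by the original post.
    """
    response = requests.get(link, headers=HEADERS)
    content = etree.HTML(response.content)
    title = content.xpath('//div[@class="post"]//a')[0].text
    # string() flattens the whole (assumed) body div into plain text
    body = content.xpath('string(//div[@id="cnblogs_post_body"])')
    # Assumed id for the read-count span; empty if rendered by JS
    views = content.xpath('//span[@id="post_view_count"]/text()')
    return {
        "url": link,
        "title": title,
        "body": body,
        "views": views[0] if views else None,
    }

def savePosts(links, filename="blog.json"):
    """Parse every post link and dump the results to one JSON file."""
    items = [parsePost(link) for link in links]
    with open(filename, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps Chinese text readable in the file
        json.dump(items, f, ensure_ascii=False, indent=2)

loadPage could collect its link_list into a list and hand it to savePosts once per run, so the JSON file is written whole instead of being appended to post by post.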

 
