Python 爬虫笔记（四）：使用基本的 XPath 获取特定信息

这部分使用基本的 XPath 表达式获取页面中的特定信息。下面的示例代码基于 Python 2，按输入的贴吧名称和页码范围抓取百度贴吧列表页，并打印帖子列表中的链接。

import requests
from lxml import etree
import urllib2
import urllib

class Tieba_spider(object):
    """Interactive crawler that prints links found on Baidu Tieba listing pages.

    The forum name and an inclusive page range are read from standard input
    when the object is created. ``spider`` then walks that range and passes
    each listing URL to ``loadpage``.
    """

    # Seconds to wait on the server before a request is abandoned; without a
    # timeout ``requests`` may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Prompt for the forum name and the first and last page to crawl.

        Raises:
            ValueError: if a page number entered is not a valid integer.
        """
        # ``raw_input`` exists only on Python 2; Python 3 renamed it ``input``.
        # It must be preferred when available because Python 2's ``input``
        # evaluates what the user types.
        try:
            read_line = raw_input
        except NameError:
            read_line = input

        self.tieba_name = read_line('please the input the tieba: ')
        self.start_page = int(read_line('input the start page: '))
        self.end_page = int(read_line('input the end page: '))

        self.start_url = 'http://tieba.baidu.com/f?'

    def _page_url(self, page):
        """Return the listing URL for the 1-based page number *page*."""
        try:
            from urllib.parse import urlencode  # Python 3
        except ImportError:
            from urllib import urlencode  # Python 2

        # ``pn`` advances by 50 per page, so page 1 maps to pn=0.
        offset = (page - 1) * 50
        # A list of pairs keeps the parameter order stable on every
        # interpreter (dict ordering is arbitrary on Python 2).
        query = urlencode([('pn', offset), ('kw', self.tieba_name)])
        return self.start_url + query

    def spider(self):
        """Load every page from ``start_page`` to ``end_page`` inclusive."""
        for page in range(self.start_page, self.end_page + 1):
            self.loadpage(self._page_url(page))

    def loadpage(self, url):
        """Download *url* and print each link matched by the XPath query.

        Args:
            url: address of the listing page to fetch.

        Returns:
            The list of ``href`` values that were printed.

        Raises:
            requests.RequestException: on timeout, connection failure, or
                an HTTP error status.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly instead of silently parsing an error page.
        response.raise_for_status()

        selector = etree.HTML(response.content)
        links = selector.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
        # Marker line emitted once per page before its links.
        print('yes')
        for link in links:
            print(link)
        return links


# Run the interactive crawl only when this file is executed as a script, so
# that importing it does not prompt for input or touch the network.
if __name__ == '__main__':
    tieba_spider = Tieba_spider()
    tieba_spider.spider()

完成撒。

posted @ 2017-12-13 15:48  抽象Java  阅读(189)  评论(0)    收藏  举报