Python爬虫笔记(四):使用基本的XPath获取特定信息
这部分使用基本的 XPath 表达式获取页面中的特定信息。
import requests
from lxml import etree

try:  # Python 2
    import urllib2  # noqa: F401 -- unused below; retained from the original script
    from urllib import urlencode
    read_input = raw_input  # noqa: F821 -- only defined on Python 2
except ImportError:  # Python 3
    from urllib.parse import urlencode
    read_input = input


class Tieba_spider(object):
    """Interactive crawler that prints links found on Baidu Tieba list pages.

    The forum name and an inclusive page range are read from standard input
    when the object is created; ``spider()`` then fetches every page in that
    range and prints each matching ``href`` on its own line.
    """

    # Step applied to the ``pn`` query parameter for each successive page
    # (presumably the number of threads per list page -- confirm with the site).
    PAGE_STEP = 50

    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the crawler forever.
    REQUEST_TIMEOUT = 10

    # XPath selecting the ``href`` attributes that ``loadpage`` prints.
    LINK_XPATH = '//div[@class="threadlist_lz clearfix"]/div/a/@href'

    def __init__(self):
        """Prompt for the forum name and the first/last page numbers.

        Raises:
            ValueError: if a page number entered is not an integer.
        """
        self.tieba_name = read_input('please the input the tieba: ')
        self.start_page = int(read_input('input the start page: '))
        self.end_page = int(read_input('input the end page: '))
        self.start_url = 'http://tieba.baidu.com/f?'

    def spider(self):
        """Fetch every page from ``start_page`` to ``end_page`` inclusive."""
        for page in range(self.start_page, self.end_page + 1):
            self.loadpage(self._page_url(page))

    def _page_url(self, page):
        """Return the list-page URL for the 1-based page number ``page``."""
        query = urlencode({
            'pn': (page - 1) * self.PAGE_STEP,
            'kw': self.tieba_name,
        })
        return self.start_url + query

    def loadpage(self, url):
        """Download ``url`` and print every link matched by ``LINK_XPATH``.

        Args:
            url: address of the list page to fetch.

        Raises:
            requests.RequestException: if the request fails or times out.
        """
        html = requests.get(url, timeout=self.REQUEST_TIMEOUT).content
        selector = etree.HTML(html)
        links = selector.xpath(self.LINK_XPATH)
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print('yes')
        for link in links:
            print(link)


# Only start the interactive crawl when executed as a script, so the class can
# be imported without prompting for input or touching the network.
if __name__ == '__main__':
    tieba_spider = Tieba_spider()
    tieba_spider.spider()
完成撒。

浙公网安备 33010602011771号