Python 爬虫笔记（三）：使用基本正则表达式抓取特定信息

这次我写一个用正则表达式提取页面特定部分的爬虫（代码基于 Python 2 和 requests 库）：

import requests
import re
import sys
# Python 2 idiom: re-import sys so that setdefaultencoding() is callable, then
# make UTF-8 the process-wide default for implicit str/unicode conversions.
# NOTE(review): there is no Python 3 equivalent (reload is not a builtin there).
reload(sys)
sys.setdefaultencoding('utf-8')

class Spider(object):
    """Scrape the ``<li>`` entries of one listing page, print and save them.

    The class attributes below may be overridden on a subclass or on an
    instance to point the spider at another URL pattern or output file.
    """

    # Page URL; ``{}`` is replaced by the page number.
    URL_TEMPLATE = 'http://www.zaojuzi.com/nirenju/17554_{}.html'
    # File that save_data() appends to.
    OUTPUT_FILE = 'duanzi.txt'
    # Seconds to wait for the server before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    # Compiled once at class creation instead of on every call / loop pass.
    _ITEM_RE = re.compile(r'<li>(.*?)</li>', re.S)
    _PATH_RE = re.compile(r'/\w+/')
    _DIGIT_RE = re.compile(r'\d')
    # Link markup fragments removed from every item, in this order.
    _LINK_FRAGMENTS = ('<a>', '</a>', '<a href="', '.html">')

    def loadpage(self, page):
        """Download listing page *page* and return the body of every <li>.

        Args:
            page: page number substituted into ``URL_TEMPLATE``.

        Returns:
            A list with the text found between each ``<li>`` and ``</li>``
            (inner markup is not stripped yet).

        Raises:
            requests.RequestException: on timeout, connection failure or an
                HTTP error status.
        """
        url = self.URL_TEMPLATE.format(page)
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly rather than scraping list items out of an error page.
        response.raise_for_status()
        html = response.content.decode('utf-8')
        # re.S lets an item span several lines of HTML.
        return self._ITEM_RE.findall(html)

    def save_data(self, text):
        """Append *text* and a separator line marker to ``OUTPUT_FILE``.

        The file is always written as UTF-8, independent of the interpreter's
        default encoding.  Byte strings are assumed to be UTF-8 encoded.
        """
        # Function-scope import keeps this block self-contained.
        import io

        if isinstance(text, bytes):
            text = text.decode('utf-8')
        with io.open(self.OUTPUT_FILE, 'a', encoding='utf-8') as f:
            f.write(text)
            f.write(u'==============')

    def _clean_item(self, item):
        """Return *item* without link markup, ``/word/`` segments and digits."""
        for fragment in self._LINK_FRAGMENTS:
            item = item.replace(fragment, '')
        # Drop path segments such as "/nirenju/" left over from the href.
        item = self._PATH_RE.sub('', item)
        # Drop every digit (e.g. the numeric page id left over from the href).
        return self._DIGIT_RE.sub('', item)

    def printpage(self, items, page):
        """Print each cleaned entry of *items* and append it to the output file.

        Args:
            items: raw ``<li>`` bodies as returned by loadpage().
            page: page number, used only for the printed header.
        """
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print(u'----- NO.{} Page ------'.format(page))
        for item in items:
            print('-----')
            cleaned = self._clean_item(item)
            print(cleaned)
            self.save_data(cleaned)

    def main(self, page):
        """Fetch listing page *page*, then print and save its entries."""
        items = self.loadpage(page)
        self.printpage(items, page)

# Script entry: runs at module level, scraping page 2 as soon as the file executes.
spider = Spider()
spider.main(2)

完成。

posted @ 2017-12-13 15:46 抽象Java 阅读(189) 评论(0) 收藏举报

Jomini

python爬虫笔记（三）使用基本正则表达式抓取特定信息

公告