urllib2抓取HTML存入Excel

通过urllib2抓取HTML网页，然后过滤出包含特定字符的行，并将其中的文本写入Excel文件（以下代码基于Python 2，并需要安装xlwt库）：

# -*- coding: utf-8 -*-

import sys
#import urllib
import urllib2

from xlwt import Workbook

def getdata(keywords, line):
    """Extract the first tag-enclosed text of *line* if the line mentions *keywords*.

    The keyword may occur anywhere in the line (not necessarily inside the
    extracted text).  The extracted text is whatever lies between the first
    ``>`` and the next ``</`` that follows it.

    Args:
        keywords: Substring that must be present in ``line``.
        line: One line of HTML source.

    Returns:
        The extracted text (possibly the empty string), or ``False`` when
        the keyword is absent or the line has no ``>`` ... ``</`` pair.
        Callers must compare against ``False`` explicitly, because an empty
        string is a valid result.
    """
    if keywords not in line:
        return False

    start = line.find('>')
    if start == -1:
        # No opening delimiter: slicing from find()'s -1 would silently
        # start at the beginning of the line.
        return False

    end = line.find('</', start)
    if end == -1:
        # No closing delimiter: slicing up to -1 would silently drop the
        # last character of the line.
        return False

    return line[start + 1:end]

def FetchDataByUrllib(checkUrl, pageEncoding='UTF-8'):
    """Download *checkUrl* and store the text of matching lines in ``simple.xls``.

    Every line of the response is decoded to unicode, passed to ``getdata``
    with the keyword ``u'体育'``, and each result that is not ``False`` is
    written to column 1 of sheet ``mySheet``, starting at row 1 (row 0 and
    column 0 are left empty).  The workbook is then saved as ``simple.xls``
    in the current directory and ``finish!`` is printed.

    Args:
        checkUrl: URL of the page to fetch.
        pageEncoding: Character encoding of the page (e.g. ``'GBK'`` for
            pages that are not UTF-8).  Bytes that cannot be decoded are
            replaced instead of aborting the whole run.

    Returns:
        None.  If the URL cannot be opened, the error is printed and the
        function returns without creating the file.
    """
    # NOTE(review): the workbook encoding should only matter for byte
    # strings; the cell values written below are unicode -- confirm against
    # the xlwt documentation.
    book = Workbook(encoding='gbk')
    # add_sheet creates a sheet whose cells cannot be overwritten by
    # default; cell_overwrite_ok=True explicitly allows it.
    sheet = book.add_sheet('mySheet', cell_overwrite_ok=True)

    try:
        checkFile = urllib2.urlopen(checkUrl)
    except Exception as e:
        print(e)
        return

    # Work in unicode from here on: the keyword and the decoded line are
    # then compared character-by-character, independent of the encoding of
    # the page, the source file, or the local filesystem.
    keyword = u'体育'

    row = 1
    try:
        for line in checkFile:
            # Decode once at the I/O boundary.
            text = line.decode(pageEncoding, 'replace')

            # To dump every line instead of filtering, write `text` to the
            # sheet unconditionally here.
            targetStr = getdata(keyword, text)  # lines containing the keyword
            if targetStr is not False:  # '' is a valid (empty) result
                sheet.write(row, 1, targetStr)
                row += 1
    finally:
        # Always release the HTTP connection, even if processing fails.
        checkFile.close()

    book.save('simple.xls')
    print('finish!')

# Script entry: scrape the Sina home page and export the matches to Excel.
myUrl = 'http://www.sina.com.cn'

print('开始...')
FetchDataByUrllib(myUrl)

输出结果：

posted on 2016-09-17 16:17 ruanchao 阅读(188) 评论(0) 收藏举报

刷新页面返回顶部

ruanchao

公告