功能:
1.列举一个目录下的文件
2.利用BeautifulSoup简单解析正文内容,然后保存
待完善:
1.多线程支持
2.适配器支持(for雷锋网和36氪两个网站网页)
"""
parser
for parsing html file from leiphone.com and 36kr.com
contact xiaoyang
"""
#
# @author: xiaoyang
# @contact: hityixiaoyang@gmail.com
# @version:
# @describe: parse an HTML file from leiphone.com
# @log:
# 1.2012-11-22 create
# 2.2012-11-23 add FileCollect and ParseTask class
#
import sys
import urllib2
import codecs
import os
from bs4 import BeautifulSoup
# Global definitions.
# Output naming settings; neither name is referenced in the code visible in
# this file.
OUT_FILE_PREFIX = "out"
OUT_CNT = 0
# Debug switches read by the driver statements at the bottom of the file:
# FileCollectDBG is checked first and runs the FileCollect demo; only when it
# is false does ParseTaskDbg get checked to run the ParseTask demo.
FileCollectDBG=False
ParseTaskDbg=True
def errPrint(code, msg=''):
    """Print the module usage text and an optional message to stderr, then exit.

    Args:
        code: Exit status handed to ``sys.exit``.
        msg: Optional detail (a string or an exception object) printed after
            the usage text; skipped when falsy.

    Raises:
        SystemExit: Always, carrying ``code``.
    """
    # ``__doc__`` resolves to the *module* docstring (a global name), not to
    # this function's own docstring.
    usage = __doc__
    if usage is not None:
        try:
            usage = usage % globals()
        except (TypeError, ValueError, KeyError):
            # A stray '%' in the module docstring must not mask the error that
            # is being reported; fall back to the unformatted text.
            pass
        sys.stderr.write("%s\n" % (usage,))
    if msg:
        sys.stderr.write("%s\n" % (msg,))
    sys.exit(code)
# for LeiPhone.com
def SaveResLP(doc, filename):
    """Write ``doc`` to ``filename``, replacing any existing file.

    The "!LOCK!" / "!UNLOCK!" lines are printed around the write; no actual
    locking is performed here.

    Args:
        doc: Text to write.
        filename: Path of the output file, opened in "w" mode.

    Returns:
        True once the file has been written.

    Raises:
        SystemExit: Via ``errPrint(1, ...)`` when the file cannot be opened
            or written.
    """
    print("!LOCK!")
    try:
        # The context manager closes the file only if open() succeeded, so a
        # failed open no longer trips over closing a handle that is None.
        with open(filename, "w") as fp:
            fp.write(doc)
    except IOError as errStr:
        errPrint(1, errStr)
    print("!UNLOCK!")
    return True
# for 36kr.com
def SaveRes36K(doc, filename):
    """Placeholder saver: print the lock markers and report success.

    ``doc`` and ``filename`` are accepted but not used, and nothing is
    written to disk.

    Returns:
        True, always.
    """
    for marker in ("!LOCK!", "!UNLOCK!"):
        print(marker)
    return True
class FileCollect:
    """Collect the directories and files found beneath a root directory.

    The result lists are filled by ``init()``, not by the constructor.
    """

    def __init__(self, root):
        """Remember ``root`` and start with empty result lists."""
        self.root = root
        # Bare directory names (not full paths), in os.walk order.
        self.dlist = []
        # File paths, each joined onto the directory it was found in.
        self.flist = []

    def init(self):
        """Walk ``self.root`` and append what is found to ``dlist``/``flist``.

        Results are appended, so calling this again adds the entries again.

        Returns:
            True, always.
        """
        for dirpath, dirnames, filenames in os.walk(self.root):
            self.dlist.extend(dirnames)
            # os.walk reports sub-directories without a trailing separator, so
            # the file name must be joined rather than concatenated.
            self.flist.extend(
                os.path.join(dirpath, name) for name in filenames
            )
        return True
class ParseTask:
    """Extract the ``id="content_main"`` element of an HTML file and save it."""

    def __init__(self, savedFileName):
        """Set up a task whose result is written to ``savedFileName``."""
        # Parse tree of the most recently parsed file.
        self.soup = None
        # Save threshold: a save happens once doneCnt reaches this value.
        # It is 0 unless a caller changes it, so every parse() call saves.
        self.savedCnt = 0
        # Files parsed since the last save.
        self.doneCnt = 0
        self.savedFileName = savedFileName

    def parse(self, readFileName):
        """Parse ``readFileName`` and save its first ``content_main`` element.

        Args:
            readFileName: Path of the HTML file to read.

        Raises:
            SystemExit: Via ``errPrint(1, ...)`` when the file cannot be read
                or contains no element with ``id="content_main"``.
        """
        try:
            # The whole file is read up front, so the handle can be released
            # before the parse tree is searched.
            with open(readFileName, "r") as fp:
                self.soup = BeautifulSoup(fp.read())
            content = self.soup.find_all(id="content_main")
            if not content:
                # Report the missing element instead of failing later on
                # content[0] with an uncaught IndexError.
                errPrint(
                    1,
                    'no element with id="content_main" in %s' % readFileName,
                )
            self.doneCnt += 1
            if self.doneCnt >= self.savedCnt:
                SaveResLP(str(content[0]), self.savedFileName)
                self.doneCnt = 0
        except IOError as errStr:
            errPrint(1, errStr)
# Debug drivers, executed when the module is loaded. FileCollectDBG is tested
# first; the ParseTask demo runs only when it is false and ParseTaskDbg is set.
if FileCollectDBG:
    fc = FileCollect("/opt/project/")
    fc.init()
    # Each label ends in a newline, so the Python 2 print statement emits no
    # separating space before the list; concatenation gives the same bytes.
    print("dlist:\r\n" + str(fc.dlist))
    print("flist:\r\n" + str(fc.flist))
elif ParseTaskDbg:
    newTask = ParseTask("out.html")
    newTask.parse("1119-vv-dolby.html")
    print("saved OK!\r\n")
浙公网安备 33010602011771号