python入门（2）-目录文件列举和Beautiful Soup简单解析

Posted on 2012-11-23 00:12 晓彻 阅读(356) 评论(0) 收藏 举报

功能：

　　1.列举一个目录下的文件

　　2.利用BeautifulSoup简单解析正文内容，然后保存

待完善：

　　1.多线程支持

　　2.适配器支持（用于雷锋网和 36氪 两个网站的网页）

"""
parser
    for parsing html file from leiphone.com and 36kr.com
    contact xiaoyang
"""

#
# @author:  xiaoyang
# @contact: hityixiaoyang@gmail.com
# @version:
# @describ: parse a html file from leiphone.com
# @log:
#           1.2012-11-22 create
#           2.2012-11-23 add FileCollect and ParseTask class
#

import sys
import urllib2
import codecs
import os
from bs4 import BeautifulSoup

# Module-level settings.
# OUT_FILE_PREFIX / OUT_CNT are not referenced anywhere else in the visible
# code; presumably a prefix and running counter for generated output file
# names -- TODO confirm before relying on them.
OUT_FILE_PREFIX = "out"
OUT_CNT = 0

# Debug switches read by the driver at the bottom of the file:
# FileCollectDBG is tested first and runs the FileCollect demo; only when it
# is false does ParseTaskDbg get checked to run the ParseTask demo.
FileCollectDBG=False
ParseTaskDbg=True

def errPrint(code, msg=''):
    """Write the module usage text and an optional message to stderr, then exit.

    Args:
        code: Exit status handed to ``sys.exit``.
        msg: Optional extra detail (a string or an exception object) written
            on its own line after the usage text.

    Raises:
        SystemExit: Always, carrying ``code``.
    """
    # The usage text is the module docstring with any ``%(name)s``
    # placeholders filled in from the module globals.  ``__doc__`` is None
    # when the module has no docstring, so fall back to an empty string
    # rather than raising TypeError while reporting some other error.
    usage = (__doc__ or '') % globals()
    if usage:
        sys.stderr.write("%s\n" % usage)
    if msg:
        # Wrap in a tuple so a tuple/dict message is formatted as one value.
        sys.stderr.write("%s\n" % (msg,))
    # Exit unconditionally.  The exit used to sit inside ``if msg`` so a call
    # without a message returned and let the caller continue after an error.
    sys.exit(code)

# for LeiPhone.com
def SaveResLP(doc,filename):
    """Write an extracted LeiPhone.com document to ``filename``.

    Prints "!LOCK!" before and "!UNLOCK!" after the write attempt.

    Args:
        doc: Text to write.
        filename: Path of the output file; it is opened in "w" mode, so an
            existing file is overwritten.

    Returns:
        True once the document has been written.

    An IOError while opening or writing is passed to ``errPrint`` with exit
    status 1.
    """
    print("!LOCK!")
    try:
        # ``with`` closes the file only if it was actually opened.  The old
        # ``finally: fp.close()`` ran with ``fp`` still None when open()
        # failed and raised AttributeError on top of the real error.
        with open(filename, "w") as fp:
            fp.write(doc)
    except IOError as errStr:
        errPrint(1, errStr)
    finally:
        print("!UNLOCK!")
    return True

# for 36kr.com
def SaveRes36K(doc,filename):
    """Placeholder saver for 36kr.com pages.

    Prints the "!LOCK!" and "!UNLOCK!" trace lines and reports success, but
    writes nothing: ``doc`` and ``filename`` are accepted and ignored.

    Returns:
        True, always.
    """
    for marker in ("!LOCK!", "!UNLOCK!"):
        print(marker)
    return True

class FileCollect:
    """Collect the directories and files found below a root directory.

    The lists start empty and are filled by ``init()``:
        dlist: bare names (not paths) of every sub-directory os.walk reports.
        flist: path of every file, built from the directory os.walk reported
            it in and the file name.
    """

    def __init__(self, root):
        """Remember ``root``; nothing is scanned until ``init()`` is called."""
        self.root = root
        self.dlist = []
        self.flist = []

    def init(self):
        """Walk ``self.root`` and append what is found to dlist and flist.

        Returns:
            True, always.  A root that does not exist simply yields nothing.
        """
        for dirpath, dirnames, filenames in os.walk(self.root):
            self.dlist.extend(dirnames)
            # os.walk yields directory paths without a trailing separator, so
            # plain string concatenation glued the directory and file names
            # together ("/a/sub" + "f.txt" -> "/a/subf.txt").
            self.flist.extend(
                os.path.join(dirpath, name) for name in filenames)
        return True

class ParseTask:
    """Extract the ``content_main`` element of an HTML file and save it.

    Attributes:
        soup: BeautifulSoup tree of the most recently parsed file, or None
            before the first successful parse.
        savedCnt: Number of parsed files required before a save is triggered.
            It is 0 and never changed in this class, so every parse saves.
        doneCnt: Files parsed since the last save.
        savedFileName: Path the extracted content is written to.
    """

    def __init__(self, savedFileName):
        """Create a task that saves its results to ``savedFileName``."""
        self.soup = None
        self.savedCnt = 0
        self.doneCnt = 0
        self.savedFileName = savedFileName

    def parse(self, readFileName):
        """Parse ``readFileName`` and save its first ``content_main`` element.

        Args:
            readFileName: Path of the HTML file to read.

        An IOError while reading the file, or a page without an element whose
        id is "content_main", is reported through ``errPrint`` with exit
        status 1.
        """
        try:
            # open() raises on failure instead of returning None, so the old
            # ``fp is not None`` test could never take its error branch.
            with open(readFileName, "r") as fp:
                self.soup = BeautifulSoup(fp.read())
        except IOError as errStr:
            errPrint(1, errStr)
            return

        matches = self.soup.find_all(id="content_main")
        self.doneCnt += 1

        if self.doneCnt >= self.savedCnt:
            if not matches:
                # Previously this case surfaced as a bare IndexError from
                # ``content[0]`` with no hint about which file was at fault.
                errPrint(1, 'no element with id "content_main" in %s'
                         % (readFileName,))
                return
            SaveResLP(str(matches[0]), self.savedFileName)
            self.doneCnt = 0

# Debug driver, run at module level: the first enabled flag picks the demo.
if FileCollectDBG:
    # List everything below a fixed directory and dump both result lists.
    fc = FileCollect("/opt/project/")
    fc.init()
    # Each header ends with a newline, so the list follows it directly.
    print("dlist:\r\n%s" % (fc.dlist,))
    print("flist:\r\n%s" % (fc.flist,))
elif ParseTaskDbg:
    # Parse one sample page and write its main content to out.html.
    newTask = ParseTask("out.html")
    newTask.parse("1119-vv-dolby.html")
    print("saved OK!\r\n")

刷新页面返回顶部

暴走的指压师

公告

python入门（2）-目录文件列举和Beautiful Soup简单解析