python实例

# encoding:utf-8
'''
Created on 2014年7月14日

@author: caoshouxin
'''
import os
import re
import os.path
from lxml import etree
from sogou import offdb,docid
import traceback
import struct
import logging as L
from time import localtime,strftime
# Configure the root logger: INFO and above, each record prefixed with a
# timestamp and a left-aligned level name.
L.basicConfig(
    format='[%(asctime)s] %(levelname)-8s %(message)s',
    level=L.INFO,
)

# Name of an XML dump file; not referenced by the class defined below.
filename = "baike_soso_upload_20140717-160704.26523.xml"

# Print the current working directory (the prefix reads "file operation").
# A parenthesised single argument prints the same text under Python 2.
print("文件操作" + os.getcwd())

class sosobaikeProcess():
    """Process one Soso-baike XML file on construction.

    Constructing an instance:
      1. opens a connection to the offline DB at ``ip``/``port``;
      2. extracts the lemma id and action type from the head of the file;
      3. appends the entry's baike URL to ``sosobaike_<date>_<type>`` in the
         current working directory;
      4. stores the file content in the offline DB under the lemma id, then
         closes the connection.

    If the file is missing or contains no ``<lemmaId>`` element, steps 3-4
    are skipped.
    """

    # Number of bytes/characters read from the head of the file when looking
    # for the lemma id; keeps memory use and matching time small.
    _HEAD_SIZE = 1024

    # NOTE(review): re.M only changes how ^ and $ behave, and neither pattern
    # uses them, so "." still does not cross line breaks. If <lemmaId> and
    # <action> sit on different lines the first pattern never matches and the
    # type falls back to "update". re.S may have been intended - confirm
    # against real input before changing.
    _LEMMA_ACTION_RE = re.compile(
        "<lemmaId>(.*?)</lemmaId>.*?<action>(.*?)</action>", re.M)
    _LEMMA_RE = re.compile("<lemmaId>(.*?)</lemmaId>", re.M)

    def __init__(self,filename,ip="127.0.0.1",port="9999"):
        """Open the DB connection and process ``filename`` immediately.

        :param filename: path of the XML file to process.
        :param ip: offline DB host, passed to ``QuickAdapter.open``.
        :param port: offline DB port, passed to ``QuickAdapter.open``.
        """
        url_beg = "http://baike.sogou.com/v"
        url_end = ".htm\n"
        self.file_name = filename
        self.offdb_rand = offdb.QuickAdapter()
        self.offdb_rand.open(ip, port, 5)

        result_tup = self.getlemmaId_type()
        if result_tup is None:
            # Nothing to record. NOTE(review): the connection stays open on
            # this path, exactly as before; only put_qdb() closes it.
            return

        # getlemmaId_type() returns a 2-tuple; unpacking three names here
        # raised ValueError on every successful parse.
        lemmaId, baike_type = result_tup

        # NOTE(review): the payload stored in the DB was never defined in
        # the original code. The whole file content is used here, matching
        # the "put file" wording of the log messages - confirm.
        with open(self.file_name) as src:
            value = src.read()

        now_time = strftime("%Y-%m-%d", localtime())
        outputFile = "sosobaike_" + now_time + "_" + baike_type
        with open(outputFile, 'a') as outf:
            # The lemma id belongs between prefix and suffix; without it
            # every line was the same constant "http://baike.sogou.com/v.htm".
            outf.write(url_beg + lemmaId + url_end)

        self.put_qdb(lemmaId, value)

    def put_qdb(self,lemmaId,value):
        """Store ``value`` in the offline DB under ``lemmaId``, then close.

        The key is ``lemmaId`` converted to int and packed as a native C int.
        Return codes 0 and 1 from ``put`` are treated as success. Any
        exception is logged with its traceback and not re-raised. The DB
        connection is always closed afterwards, so an instance can perform
        only one put.

        :param lemmaId: lemma id; must be convertible with ``int()``.
        :param value: payload handed to ``QuickAdapter.put``.
        """
        try:
            key = struct.pack('i', int(lemmaId))
            ret = self.offdb_rand.put(key, value, 0, 5)
            if ret in (0, 1):
                L.info("put file %s/%s success %d", self.file_name, lemmaId, 1)
            else:
                # This class defines no offdb_reconnect(); calling it
                # unconditionally raised AttributeError, which the handler
                # below swallowed, hiding the real return code. Report the
                # code, and only invoke the hook if a subclass supplies one.
                L.error("put file %s/%s failed, ret=%s",
                        self.file_name, lemmaId, ret)
                reconnect = getattr(self, "offdb_reconnect", None)
                if callable(reconnect):
                    reconnect(5, 3)
        except Exception:
            L.error("put file %s/%s err %d because:%s",
                    self.file_name, lemmaId, 1, traceback.format_exc())
        finally:
            self.offdb_rand.close()

    def getlemmaId_type(self):
        """Extract the lemma id and action type from the head of the file.

        Only the first ``_HEAD_SIZE`` bytes/characters of ``self.file_name``
        are read, to save memory and speed up matching.

        :return: ``(lemmaId, baike_type)`` as strings. ``baike_type`` is the
            ``<action>`` text when it follows ``<lemmaId>``, otherwise
            ``"update"``. Returns ``None`` if the file does not exist or
            holds no ``<lemmaId>`` element in the part that was read.
        """
        if not os.path.isfile(self.file_name):
            L.info("put file%s not found", self.file_name)
            return None

        with open(self.file_name) as f:
            filecontent = f.read(self._HEAD_SIZE)

        m = self._LEMMA_ACTION_RE.search(filecontent)
        if m is not None:
            lemmaId = m.group(1)
            baike_type = m.group(2)
        else:
            m_1 = self._LEMMA_RE.search(filecontent)
            if m_1 is None:
                L.info("put file %s not found result", self.file_name)
                return None
            # Take the id from the fallback match; the first match object is
            # None on this branch.
            lemmaId = m_1.group(1)
            # No <action> element found: treat the entry as an update.
            baike_type = "update"

        L.info("put file %s lemmaId=%s, type=%s ",
               self.file_name, lemmaId, baike_type)
        return (lemmaId, baike_type)
posted @ 2014-07-24 15:09  曹守鑫  阅读(342)  评论(0编辑  收藏  举报