[python] Model 模块把从文件里读取一行英文后就调用对应的各种handler 翻译出中文,并回写到对应的文件中

#!/usr/bin/python
# -*- coding: utf-8 -*-

from translate import Translator
import pdb,re,time,json,glob
import sys,requests
import execjs

# Python 2 only: sys.setdefaultencoding is hidden after interpreter start-up,
# so sys is reloaded to get it back, and the implicit str/unicode codec is
# switched to UTF-8. NOTE(review): presumably needed for the Chinese text this
# script reads and writes; neither call exists on Python 3.
reload(sys)
sys.setdefaultencoding('utf8')


#http://www.itdadao.com/articles/c15a852432p0.html
#get_tk

class Py4Js():
    """Compute the ``tk`` token sent with Google Translate web requests.

    The token algorithm is kept as JavaScript and executed through ``execjs``;
    this class only compiles it and exposes a Python entry point.
    """

    def __init__(self):
        """Compile the JavaScript token routine and keep the context in ``self.ctx``."""
        # The script defines TL(a), which hashes the UTF-8 bytes of the text
        # with the helper RL(a, b) and returns a string of the form
        # "<number>.<number>". The source must stay byte-for-byte unchanged.
        self.ctx = execjs.compile("""
        function TL(a) {
        var k = "";
        var b = 406644;
        var b1 = 3293161072;

        var jd = ".";
        var $b = "+-a^+6";
        var Zb = "+-3^+b+-f";
    
        for (var e = [], f = 0, g = 0; g < a.length; g++) {
            var m = a.charCodeAt(g);
            128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
            e[f++] = m >> 18 | 240,
            e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
            e[f++] = m >> 6 & 63 | 128),
            e[f++] = m & 63 | 128)
        }
        a = b;
        for (f = 0; f < e.length; f++) a += e[f],
        a = RL(a, $b);
        a = RL(a, Zb);
        a ^= b1 || 0;
        0 > a && (a = (a & 2147483647) + 2147483648);
        a %= 1E6;
        return a.toString() + jd + (a ^ b)
    };
    
    function RL(a, b) {
        var t = "a";
        var Yb = "+";
        for (var c = 0; c < b.length - 2; c += 3) {
            var d = b.charAt(c + 2),
            d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
            d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
            a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
        }
        return a
    }
    """)
        
    def getTk(self,text):
        """Return the token produced by the JavaScript function ``TL`` for ``text``."""
        return self.ctx.call("TL",text)



# ***** ***** ***** ***** ***** ***** ***** ***** *****
# Translation handlers: four are defined below, each wrapping one translation backend
#
#
#
def translate_hanlder_Google(strSrc):
    """Translate ``strSrc`` to Chinese with the ``translate`` package.

    The package must be installed first: pip install translate
    (https://github.com/terryyin/google-translate-python).

    Yields:
        A single one-element list holding the translated text.
    """
    to_chinese = Translator(to_lang="zh")
    yield [to_chinese.translate(strSrc)]

def translate_hanlder_Google_web(strSrc):
    """Translate English ``strSrc`` through the Google Translate web endpoint.

    A ``tk`` token is computed for the text with ``Py4Js`` and sent together
    with the text as query parameters; the raw response text is reduced to
    the translated string by ``response_string_to_list``.

    Args:
        strSrc: the English text to translate.

    Yields:
        Exactly one one-element list: the translated text, or ``""`` when the
        request or the response parsing failed. Failures are printed, not
        raised. Errors from the token computation are not caught.
    """
    URL_GOOGLE = "http://translate.google.cn/translate_a/single"

    tk_1 = Py4Js().getTk(strSrc)

    query_str = dict(q=strSrc,
                     tk=tk_1,
                     kc=1,
                     tsel=1,
                     ssel=0,
                     otf=1,
                     pc=1,
                     oe="UTF-8",
                     ie="UTF-8",
                     dt="t",
                     hl="zh-CN",
                     sl="en",
                     client="t"
                     )

    headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}

    try:
        response = requests.get(URL_GOOGLE, params=query_str, headers=headers)
        translated = response_string_to_list(response.text)
    except Exception as e:
        # Best effort: one failed sentence must not abort the whole run.
        # The error is reported before yielding so it is never lost when the
        # consumer stops after the first item.
        print("%s : %s" % (Exception, e))
        translated = ""

    # Single yield outside the try block: one result per call, success or not.
    yield [translated]


def response_string_to_list(hyp):
    """Extract and concatenate the translated text from a raw response string.

    The response is expected to start with a triple-bracketed list of
    segments, each segment being a bracketed group whose first two fields are
    quoted strings separated by ``","``, for example
    ``[[["<translated>","<source>",...],["<translated>","<source>",...]],...]``.
    NOTE(review): this shape is inferred from the patterns used here; confirm
    against a real response.

    Args:
        hyp: the raw response text.

    Returns:
        The first field of every segment, joined in order without separator.
        Parsing stops at the first segment that has fewer than two fields.

    Raises:
        ValueError: if ``hyp`` contains no ``[[[ ... ]]`` section at all.
    """
    outer = re.search(r'(?<=\[\[\[).+?(?=\]\])', hyp)
    if outer is None:
        raise ValueError("unexpected translate response: %r" % (hyp,))

    # Re-wrap so that every segment, including the first and the last, is
    # enclosed in exactly one pair of brackets.
    remaining = "[" + outer.group() + "]"

    translated_parts = []
    while True:
        segment = re.search(r'(?<=\[).+?(?=\])', remaining)
        if segment is None:
            break

        content = segment.group()
        fields = content.split('","')
        if len(fields) < 2:
            # Not a translated/source pair: nothing more to collect.
            break

        # fields[0] still carries the opening double quote of the first field.
        translated_parts.append(fields[0][1:])

        # Drop only the segment just consumed, so that an identical segment
        # later in the response (a repeated sentence) is still processed.
        remaining = remaining.replace("[" + content + "]", "", 1)

    return "".join(translated_parts)




def translate_hanlder_ailib(srcStr):
    """Translate ``srcStr`` through the ailib HTTP service.

    Sends a GET request with the text and reads the ``content`` key of the
    JSON response.

    Args:
        srcStr: the text to translate.

    Yields:
        Exactly one one-element list: the translated text, or ``""`` when the
        request failed or the response could not be decoded. Failures are
        printed, not raised.
    """
    URL_AILIB = "http://www.oa.com/translate/get_result/"
    # NOTE(review): "form_lan_id" looks like a typo for a "from language"
    # parameter; left unchanged because the service API is not visible here.
    query_str = dict(source_text=srcStr,
                     form_lan_id=-1,
                     target_lang_id=1,
                     model="nmt"
                     )

    try:
        response = requests.get(URL_AILIB, params=query_str)
        hyp = json.loads(response.text)[u'content']
    except Exception as e:
        # Best effort: report the failure first, then fall back to an empty
        # translation so the caller still gets one result per input.
        print("%s : %s" % (Exception, e))
        hyp = ""

    # Single yield outside the try block: one result per call, success or not.
    yield [hyp]

def translate_hanlder_MIG(srcStr):
    """Translate ``srcStr`` through the MIG HTTP translation API.

    Posts a form with the text and reads ``result[0]["dst"]`` from the JSON
    response. The raw response text is also printed for debugging.

    Args:
        srcStr: the text to translate.

    Yields:
        Exactly one one-element list: the translated text, or ``""`` when the
        request failed or the response could not be decoded. Failures are
        printed, not raised.
    """
    URL_FANYIJUN = "http://www.oa.com/api/translate"

    # Passing the fields as pairs lets requests form-encode them, so
    # characters such as "&", "=", "+" or "%" in the text no longer corrupt
    # the request body. A list keeps the original field order.
    form_fields = [("sl", "1"), ("tl", "0"), ("st", srcStr), ("translator", "1")]
    headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}

    try:
        response = requests.post(URL_FANYIJUN, data=form_fields, headers=headers)
        print("#" * 10 + response.text)
        hyp = json.loads(response.text)[u'result'][0][u'dst']
    except Exception as e:
        # Best effort: report the failure first, then fall back to an empty
        # translation so the caller still gets one result per input.
        print("%s : %s" % (Exception, e))
        hyp = ""

    # Single yield outside the try block: one result per call, success or not.
    yield [hyp]
# 
# 
# ***** ***** ***** ***** ***** ***** ***** ***** ****






# ***** ***** ***** ***** ***** ***** ***** ***** *****
# 功能函数 Begin 
#
#
def _strip_inline_markup(fragment):
    """Remove <strong>/<b> tags and CR/LF characters from an HTML fragment."""
    for token in ("<strong>", "</strong>", "<b>", "</b>", "\r", "\n"):
        fragment = fragment.replace(token, "")
    return fragment


def _split_section(pattern, html):
    """Return the cleaned, <br>-separated pieces of the section matching ``pattern``."""
    section = re.search(pattern, html).group()
    return _strip_inline_markup(section).split("<br>")


def get_src_from_link_request(link):
    """Download an article page and yield its title and text rows.

    The page is expected to contain an ``<h1 class="ph">`` title made of two
    parts separated by ``<br>``, plus ``<div id="en">`` and ``<div id="cn">``
    sections whose lines are separated by ``<br>``.

    Args:
        link: URL of the article page.

    Yields:
        First ``[link, title_part_after_br, title_part_before_br]``, then for
        every non-empty line of the English section
        ``[link, english_line, chinese_line]``. The Chinese element is omitted
        when the Chinese section has no non-empty line at the same index.
        Each body row is also printed.

    Raises:
        AttributeError: if the title or one of the sections is not found.
        IndexError: if the title contains no ``<br>``.
    """
    response = requests.get(link)
    # Normalise once instead of once per search.
    html = response.text.replace("\r\n", "")

    # Page title. str.replace returns a new string, so the result has to be
    # kept, otherwise the <strong> tags stay in the title.
    title = re.search('(?<=h1 class="ph">).*?(?=</h1)', html).group()
    title = title.replace("<strong>", "").replace("</strong>", "")
    title_parts = title.split("<br>")

    yield [link, title_parts[1], title_parts[0]]

    english_lines = _split_section('(?<=div id="en">).*?(?=</div)', html)
    chinese_lines = _split_section('(?<=div id="cn">).*?(?=</div)', html)

    for index, english in enumerate(english_lines):
        if not english:
            continue
        row = [link, english.strip()]
        # Pair with the Chinese line at the same position when there is one.
        if index < len(chinese_lines) and chinese_lines[index]:
            row.append(chinese_lines[index].strip())
        print(row)
        yield row

    # Short pause once the page is fully consumed, before the caller moves
    # on to the next link.
    time.sleep(0.5)

def get_src_from_link(linkList):
    """Scrape every link and append the extracted rows to a new source file.

    Each row yielded by ``get_src_from_link_request`` is written as one line:
    a running row number followed by the row fields, all tab-separated.

    Args:
        linkList: iterable of article URLs.

    Returns:
        The name of the source file, ``linkSrc:<timestamp>``. The file is only
        created once the first row is written, so it does not exist when no
        rows were produced.
    """
    tmpFileName = time.strftime('linkSrc:%Y-%m-%d_%H:%M:%S')
    count = 0
    for link in linkList:
        for contentList in get_src_from_link_request(link):
            line = str(count) + '\t' + '\t'.join(contentList) + '\n'
            # Open per row and close right away: nothing leaks, and rows
            # already scraped are on disk if a later page fails.
            with open(tmpFileName, 'a') as f:
                f.write(line)
            count += 1

    return tmpFileName

def write_result_to_file(count, toFileName, strList):
    """Append one result line to ``toFileName``.

    The line is ``count`` followed by the items of ``strList``, all separated
    by tabs and terminated by a newline.

    Args:
        count: row number written as the first column.
        toFileName: path of the file to append to (created if missing).
        strList: list of strings forming the remaining columns.
    """
    # The context manager closes (and flushes) the file even if a write fails.
    with open(toFileName, 'a') as f:
        f.write(str(count) + '\t' + '\t'.join(strList) + '\n')


def read_english_from_file(fileName):
    """Yield the tab-separated fields of every line of ``fileName``.

    Each line is stripped of leading and trailing whitespace before being
    split, so a blank line yields ``['']``.

    Args:
        fileName: path of the file to read.

    Yields:
        One list of strings per line, in file order.
    """
    # Iterating the file object streams it line by line, and the context
    # manager closes it when the generator finishes or is discarded.
    with open(fileName, "r") as fp:
        for line in fp:
            yield line.strip().split('\t')
# 功能函数 end 
# ***** ***** ***** ***** ***** ***** ***** ***** ****






# ***** ***** ***** ***** ***** ***** ***** ***** *****
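# Model: read the source file one line at a time, translate the English text with the selected handler, and append the Chinese result to the matching result file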
#
#
#
def translate_module(srcFileName):
    """Translate every line of ``srcFileName`` and append the results to a file.

    Each line is split into tab-separated fields by ``read_english_from_file``;
    the third field (index 2) is sent to ``translate_hanlder_Google_web`` and
    every result is appended to ``result_google_<timestamp>`` together with
    the zero-based line number.

    Lines with fewer than three fields (blank lines, for instance) are
    reported and skipped instead of aborting the run; their line number is
    still consumed, so result rows keep matching source line numbers.

    Args:
        srcFileName: path of the tab-separated source file.
    """
    resultFileNameGoogle = "result_google_" + time.strftime('%Y-%m-%d_%H:%M:%S')

    for count, srcStringList in enumerate(read_english_from_file(srcFileName)):
        if len(srcStringList) < 3:
            print("skip line %d: fewer than 3 tab-separated fields" % count)
            continue

        # To compare backends, call translate_hanlder_MIG,
        # translate_hanlder_ailib or translate_hanlder_Google here instead,
        # writing to a result file of its own.
        for listTmp in translate_hanlder_Google_web(srcStringList[2]):
            write_result_to_file(count, resultFileNameGoogle, listTmp)

# 
# ***** ***** ***** ***** ***** ***** ***** ***** ****



def run_tanslate(srcFile):
    """Run the translation step on ``srcFile`` by delegating to translate_module()."""
    translate_module(srcFile)



def translate_manger():
    """Drive the pipeline: collect source text from article links, then translate it.

    NOTE(review): as written, ``urlList`` stays empty (the ``extend`` calls are
    commented out), so ``get_src_from_link`` is given no links, and the file
    name it returns is immediately replaced by a hard-coded, previously
    generated file name before translation starts.
    """

    # Article links, "life" group.
    lifeLink =[
    "http://www.cuyoo.com/article-30054-1.html",
    "http://www.cuyoo.com/article-20071-1.html",
    "http://www.cuyoo.com/article-19279-1.html",
    "http://www.cuyoo.com/article-17735-1.html",
    "http://www.cuyoo.com/article-15527-1.html",
    "http://www.cuyoo.com/article-19242-1.html",
    "http://www.cuyoo.com/article-18944-1.html",
    "http://www.cuyoo.com/article-25511-1.html",
    # "http://www.cuyoo.com/article-24928-1.html",  this article itself has layout errors
    "http://www.cuyoo.com/article-23211-1.html",
    "http://www.cuyoo.com/article-21211-1.html",
    "http://www.cuyoo.com/article-20726-1.html",
    "http://www.cuyoo.com/article-19734-1.html",
    "http://www.cuyoo.com/article-18207-1.html"
    ]
    # Article links, "scen" group.
    scenLink = [
    "http://www.cuyoo.com/article-32579-1.html",
    "http://www.cuyoo.com/article-32610-1.html",
    "http://www.cuyoo.com/article-32594-1.html",
    "http://www.cuyoo.com/article-32587-1.html",
    "http://www.cuyoo.com/article-32522-1.html",
    "http://www.cuyoo.com/article-32519-1.html",
    "http://www.cuyoo.com/article-32469-1.html",
    "http://www.cuyoo.com/article-32328-1.html",
    "http://www.cuyoo.com/article-32303-1.html",
    "http://www.cuyoo.com/article-32184-1.html",
    "http://www.cuyoo.com/article-32106-1.html",
    "http://www.cuyoo.com/article-32082-1.html",
    "http://www.cuyoo.com/article-31902-1.html",
    "http://www.cuyoo.com/article-31885-1.html",
    "http://www.cuyoo.com/article-31673-1.html",
    "http://www.cuyoo.com/article-31554-1.html"
    ] 


    # Uncomment one of the lines below to choose which links are scraped.
    urlList = []
    #urlList.extend(scenLink)
    #urlList.extend(lifeLink) 
    #urlList = ["http://www.cuyoo.com/article-24928-1.html"]
    tmpFileName = get_src_from_link(urlList)
    # Overrides the name returned above with an existing source file.
    tmpFileName = "linkSrc:2017-04-03_17:56:00" 
    run_tanslate(tmpFileName)


def main():
    """Script entry point: run the whole pipeline through translate_manger()."""
    translate_manger()

# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()

  

posted @ 2017-04-03 21:31  WenLe  阅读(432)  评论(0编辑  收藏  举报