[python] Model 模块把从文件里读取一行英文后就调用对应的各种handler 翻译出中文,并回写到对应的文件中
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Scrape bilingual articles and machine-translate their English lines.

Pipeline:
  1. ``get_src_from_link`` downloads article pages and stores one row per
     title / paragraph in a tab-separated "source" file.
  2. ``translate_module`` reads that file, sends the English column to a
     translation handler and appends the result to a "result" file.

The third-party packages (``requests``, ``execjs`` and ``translate``) are
imported inside the functions that need them, so the pure helpers in this
module can be imported and tested without those packages installed.
"""
import glob
import json
import pdb
import re
import sys
import time

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str <-> unicode conversions use UTF-8,
    # which the file-writing helpers below rely on.
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')


# ***** ***** ***** ***** ***** ***** ***** ***** *****
# Token generation for the Google web endpoint (get_tk).
# Reference: http://www.itdadao.com/articles/c15a852432p0.html
#
# The JavaScript text is kept exactly as it appears in the original source;
# it is only split into adjacent literals so the Python stays readable.
_TK_JS = (
    ' function TL(a) { var k = ""; var b = 406644; var b1 = 3293161072; '
    'var jd = "."; var $b = "+-a^+6"; var Zb = "+-3^+b+-f"; '
    'for (var e = [], f = 0, g = 0; g < a.length; g++) { '
    'var m = a.charCodeAt(g); '
    '128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : '
    '(55296 == (m & 64512) && g + 1 < a.length && '
    '56320 == (a.charCodeAt(g + 1) & 64512) ? '
    '(m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023), '
    'e[f++] = m >> 18 | 240, e[f++] = m >> 12 & 63 | 128) : '
    'e[f++] = m >> 12 | 224, e[f++] = m >> 6 & 63 | 128), '
    'e[f++] = m & 63 | 128) } '
    'a = b; for (f = 0; f < e.length; f++) a += e[f], a = RL(a, $b); '
    'a = RL(a, Zb); a ^= b1 || 0; '
    '0 > a && (a = (a & 2147483647) + 2147483648); a %= 1E6; '
    'return a.toString() + jd + (a ^ b) }; '
    'function RL(a, b) { var t = "a"; var Yb = "+"; '
    'for (var c = 0; c < b.length - 2; c += 3) { '
    'var d = b.charAt(c + 2), d = d >= t ? d.charCodeAt(0) - 87 : Number(d), '
    'd = b.charAt(c + 1) == Yb ? a >>> d: a << d; '
    'a = b.charAt(c) == Yb ? \n'
    'a + d & 4294967295 : a ^ d } return a } '
)


class Py4Js(object):
    """Run the JavaScript ``TL`` function that produces the ``tk`` parameter."""

    def __init__(self):
        # Third-party dependency (PyExecJS); imported lazily, see module doc.
        import execjs
        self.ctx = execjs.compile(_TK_JS)

    def getTk(self, text):
        """Return the result of calling the JavaScript ``TL`` on *text*."""
        return self.ctx.call("TL", text)


def _print_error(error):
    """Print a one-line diagnostic for an exception swallowed by a handler."""
    print("%s : %s" % (type(error).__name__, error))


# ***** ***** ***** ***** ***** ***** ***** ***** *****
# Translation handlers: each handler wraps one translation backend.
# Every handler is a generator that yields exactly one single-element list,
# which is the shape ``write_result_to_file`` expects.
#
def translate_hanlder_Google(strSrc):
    """Translate *strSrc* to Chinese with the ``translate`` package.

    Install the package first: ``pip install translate``
    (https://github.com/terryyin/google-translate-python).

    Yields:
        A one-element list containing the translated text.
    """
    from translate import Translator
    translator = Translator(to_lang="zh")
    yield [translator.translate(strSrc)]


def translate_hanlder_Google_web(strSrc):
    """Translate *strSrc* (en -> zh-CN) through the Google web endpoint.

    Yields:
        A one-element list with the translated text, or with ``""`` when the
        request or the response parsing fails (the error is printed).
    """
    import requests
    URL_GOOGLE = "http://translate.google.cn/translate_a/single"
    tk_1 = Py4Js().getTk(strSrc)
    query_str = dict(
        q=strSrc,
        tk=tk_1,
        kc=1,
        tsel=1,
        ssel=0,
        otf=1,
        pc=1,
        oe="UTF-8",
        ie="UTF-8",
        dt="t",
        hl="zh-CN",
        sl="en",
        client="t",
    )
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
    try:
        response = requests.get(URL_GOOGLE, params=query_str, headers=headers)
        translated = response_string_to_list(response.text)
    except Exception as e:
        # Best effort: one failed line must not abort the whole batch.
        _print_error(e)
        translated = ""
    yield [translated]


def response_string_to_list(hyp):
    """Extract the translated text from a raw Google web response.

    The response starts with ``[[[`` followed by segments shaped like
    ``["<translated>","<source>",...]``.  The translated part of every
    segment is collected and the parts are concatenated.

    Args:
        hyp: The raw response body.

    Returns:
        The concatenated translated text.  Parsing stops at the first
        bracketed segment that does not contain a ``","`` separator; whatever
        was collected up to that point is returned.

    Raises:
        ValueError: If *hyp* does not contain the ``[[[ ... ]]`` payload.
    """
    payload = re.search(r'(?<=\[\[\[).+?(?=\]\])', hyp)
    if payload is None:
        raise ValueError("unexpected translation response: %r" % (hyp,))
    remaining = "[" + payload.group() + "]"
    translated = []
    while True:
        segment = re.search(r'(?<=\[).+?(?=\])', remaining)
        if segment is None:
            break
        content = segment.group()
        fields = content.split('","')
        if len(fields) < 2:
            break
        # fields[0] still carries the opening quote of the translated text.
        translated.append(fields[0][1:])
        remaining = remaining.replace("[" + content + "]", "")
    return "".join(translated)


def translate_hanlder_ailib(srcStr):
    """Translate *srcStr* through the ailib HTTP service.

    Yields:
        A one-element list with the ``content`` field of the JSON response,
        or with ``""`` when the request fails (the error is printed).
    """
    import requests
    URL_AILIB = "http://www.oa.com/translate/get_result/"
    # NOTE(review): "form_lan_id" looks like a typo for "from_lang_id";
    # it is sent unchanged - confirm against the service before renaming.
    query_str = dict(
        source_text=srcStr,
        form_lan_id=-1,
        target_lang_id=1,
        model="nmt",
    )
    try:
        response = requests.get(URL_AILIB, params=query_str)
        translated = json.loads(response.text)[u'content']
    except Exception as e:
        _print_error(e)
        translated = ""
    yield [translated]


def translate_hanlder_MIG(srcStr):
    """Translate *srcStr* through the MIG (fanyijun) HTTP service.

    Yields:
        A one-element list with ``result[0]["dst"]`` of the JSON response,
        or with ``""`` when the request fails (the error is printed).
    """
    import requests
    URL_FANYIJUN = "http://www.oa.com/api/translate"
    # Passed as pairs so that requests form-encodes the text; concatenating
    # the raw text into the body breaks on "&", "=" or "+" in the source.
    request_body = [
        ("sl", "1"),
        ("tl", "0"),
        ("st", srcStr),
        ("translator", "1"),
    ]
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
    try:
        response = requests.post(
            URL_FANYIJUN, data=request_body, headers=headers)
        print("#" * 10 + response.text)
        translated = json.loads(response.text)[u'result'][0][u'dst']
    except Exception as e:
        _print_error(e)
        translated = ""
    yield [translated]


# ***** ***** ***** ***** ***** ***** ***** ***** *****
# Utility functions - begin
#
_TITLE_MARKUP = ("<strong>", "</strong>")
_BODY_MARKUP = ("<strong>", "<b>", "\r", "\n", "</b>", "</strong>")


def _strip_tokens(text, tokens):
    """Return *text* with every string in *tokens* removed, in order."""
    for token in tokens:
        text = text.replace(token, "")
    return text


def _iter_article_rows(link, html):
    """Yield the title row and the paragraph rows parsed from *html*."""
    title = re.search('(?<=h1 class="ph">).*?(?=</h1)', html).group()
    titleParts = _strip_tokens(title, _TITLE_MARKUP).split("<br>")
    # The second part is written to the column that translate_module reads
    # as English.  A title without "<br>" raises IndexError here.
    yield [link, titleParts[1], titleParts[0]]

    enText = re.search('(?<=div id="en">).*?(?=</div)', html).group()
    cnText = re.search('(?<=div id="cn">).*?(?=</div)', html).group()
    enParts = _strip_tokens(enText, _BODY_MARKUP).split("<br>")
    cnParts = _strip_tokens(cnText, _BODY_MARKUP).split("<br>")

    for index, english in enumerate(enParts):
        if not english:
            continue
        row = [link, english.strip()]
        # The Chinese paragraph is optional: pair by position when present.
        if index < len(cnParts) and cnParts[index]:
            row.append(cnParts[index].strip())
        print(row)
        yield row


def get_src_from_link_request(link):
    """Download the article at *link* and yield its bilingual rows.

    Yields:
        First ``[link, title_part_2, title_part_1]`` (the page title split
        on ``<br>``), then one ``[link, english]`` or
        ``[link, english, chinese]`` list per non-empty English paragraph.

    Raises:
        AttributeError: If the page lacks the expected ``h1 class="ph"``,
            ``div id="en"`` or ``div id="cn"`` element.
        IndexError: If the title does not contain ``<br>``.
    """
    import requests
    response = requests.get(link)
    html = response.text.replace("\r\n", "")
    for row in _iter_article_rows(link, html):
        yield row
    # NOTE(review): the original indentation was lost; the delay is assumed
    # to run once per article to throttle page requests - confirm.
    time.sleep(0.5)


def write_result_to_file(count, toFileName, strList):
    """Append ``<count>\\t<strList joined by tabs>\\n`` to *toFileName*."""
    with open(toFileName, 'a') as f:
        f.write(str(count) + '\t')
        f.write('\t'.join(strList))
        f.write('\n')


def get_src_from_link(linkList):
    """Scrape every link in *linkList* into a new time-stamped source file.

    Rows are numbered consecutively across all links.  The file is only
    created once the first row is written.

    Returns:
        The name of the source file.
    """
    tmpFileName = time.strftime('linkSrc:%Y-%m-%d_%H:%M:%S')
    count = 0
    for link in linkList:
        for contentList in get_src_from_link_request(link):
            write_result_to_file(count, tmpFileName, contentList)
            count += 1
    return tmpFileName


def read_english_from_file(fileName):
    """Yield each line of *fileName*, stripped and split on tabs."""
    with open(fileName, "r") as fp:
        for line in fp:
            yield line.strip().split('\t')
# Utility functions - end
# ***** ***** ***** ***** ***** ***** ***** ***** ****


# ***** ***** ***** ***** ***** ***** ***** ***** *****
# Model: read the source file row by row, translate the English column to
# Chinese with a handler and write the result to the matching result file.
#
def translate_module(srcFileName):
    """Translate the English column of *srcFileName* into a result file.

    Each source row is ``<count>\\t<link>\\t<english>[\\t<chinese>]``.  The
    result file is named ``result_google_<timestamp>`` and its rows carry the
    zero-based index of the source row they belong to.
    """
    resultFileNameGoogle = "result_google_" + time.strftime(
        '%Y-%m-%d_%H:%M:%S')
    # TODO(wenle): the MIG / ailib / translate-package handlers are disabled.
    # To enable one, iterate translate_hanlder_MIG, translate_hanlder_ailib
    # or translate_hanlder_Google below and write to its own result file
    # ("result_mig_<timestamp>", "result_ailib_<timestamp>").
    for count, srcStringList in enumerate(read_english_from_file(srcFileName)):
        if len(srcStringList) < 3:
            # Rows whose English text is empty lose that column when the
            # line is stripped; skip them instead of raising IndexError.
            print("skip row %d: no English text in %r"
                  % (count, srcStringList))
            continue
        for listTmp in translate_hanlder_Google_web(srcStringList[2]):
            write_result_to_file(count, resultFileNameGoogle, listTmp)
#
# ***** ***** ***** ***** ***** ***** ***** ***** ****


def run_tanslate(srcFile):
    """Run the translation step on *srcFile*."""
    translate_module(srcFile)


LIFE_LINKS = [
    "http://www.cuyoo.com/article-30054-1.html",
    "http://www.cuyoo.com/article-20071-1.html",
    "http://www.cuyoo.com/article-19279-1.html",
    "http://www.cuyoo.com/article-17735-1.html",
    "http://www.cuyoo.com/article-15527-1.html",
    "http://www.cuyoo.com/article-19242-1.html",
    "http://www.cuyoo.com/article-18944-1.html",
    "http://www.cuyoo.com/article-25511-1.html",
    # "http://www.cuyoo.com/article-24928-1.html",  this article itself has
    # layout errors
    "http://www.cuyoo.com/article-23211-1.html",
    "http://www.cuyoo.com/article-21211-1.html",
    "http://www.cuyoo.com/article-20726-1.html",
    "http://www.cuyoo.com/article-19734-1.html",
    "http://www.cuyoo.com/article-18207-1.html",
]

SCEN_LINKS = [
    "http://www.cuyoo.com/article-32579-1.html",
    "http://www.cuyoo.com/article-32610-1.html",
    "http://www.cuyoo.com/article-32594-1.html",
    "http://www.cuyoo.com/article-32587-1.html",
    "http://www.cuyoo.com/article-32522-1.html",
    "http://www.cuyoo.com/article-32519-1.html",
    "http://www.cuyoo.com/article-32469-1.html",
    "http://www.cuyoo.com/article-32328-1.html",
    "http://www.cuyoo.com/article-32303-1.html",
    "http://www.cuyoo.com/article-32184-1.html",
    "http://www.cuyoo.com/article-32106-1.html",
    "http://www.cuyoo.com/article-32082-1.html",
    "http://www.cuyoo.com/article-31902-1.html",
    "http://www.cuyoo.com/article-31885-1.html",
    "http://www.cuyoo.com/article-31673-1.html",
    "http://www.cuyoo.com/article-31554-1.html",
]

# Source file produced by an earlier scrape; used when no links are given.
DEFAULT_SRC_FILE = "linkSrc:2017-04-03_17:56:00"


def translate_manger(urlList=None, srcFileName=DEFAULT_SRC_FILE):
    """Scrape *urlList* (if any) and translate the resulting source file.

    Args:
        urlList: Article links to scrape first, e.g. ``LIFE_LINKS`` or
            ``SCEN_LINKS``.  When empty or ``None`` nothing is downloaded.
        srcFileName: Source file to translate when *urlList* is empty.

    Called without arguments this translates ``DEFAULT_SRC_FILE``.
    """
    if urlList:
        srcFileName = get_src_from_link(urlList)
    run_tanslate(srcFileName)


def main():
    """Script entry point."""
    translate_manger()


if __name__ == "__main__":
    main()