Batch Google Translation Implemented in Python
First, a disclaimer: there is no bad intent here. I use translate.google.cn all the time, so I wanted to simulate the web page's form submission in Python and batch-translate documents. There is said to be an official API, but it costs money.
Generating the token
The code Google uses to generate its anti-scraping token is JavaScript, and the token is computed dynamically from the site's TKK value and the text being submitted. How often the TKK changes is unknown, so the only option is to re-fetch it on a schedule.
Most of the Python code you can find online just calls the PyExecJS library. Leaving aside execution speed (roughly an order of magnitude slower), it is a roundabout, impure approach that I simply don't like.
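For comparison, the PyExecJS route typically looks like the sketch below. It assumes the obfuscated JavaScript from the next section has been saved to a file named tk.js (the file name and the helper function are mine, purely for illustration) and that Node.js or another JavaScript runtime is installed.

# The PyExecJS approach most of those online examples take (for comparison only).
import execjs  # pip install PyExecJS

def get_token_via_js(text, tkk, js_path='tk.js'):
    with open(js_path, 'r', encoding='utf-8') as f:
        ctx = execjs.compile(f.read())    # hand the JS source to an external JS runtime
    return ctx.call('tk', text, tkk)      # invoke the tk(a, TKK) function defined in tk.js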
The one piece of pure-Python code I eventually found had a small bug and was missing the step that fetches the TKK dynamically, so in the end I ported it myself from the JavaScript. The method is simple: first rewrite the obfuscated JavaScript into something readable, then translate that into Python. The JavaScript comes from the post "C#实现谷歌翻译API" (a C# implementation of the Google Translate API).
The original (obfuscated) JavaScript code
var b = function (a, b) {
    for (var d = 0; d < b.length - 2; d += 3) {
        var c = b.charAt(d + 2), c = "a" <= c ? c.charCodeAt(0) - 87 : Number(c), c = "+" == b.charAt(d + 1) ? a >>> c : a << c;
        a = "+" == b.charAt(d) ? a + c & 4294967295 : a ^ c
    }
    return a
};
var tk = function (a, TKK) {
    for (var e = TKK.split("."), h = Number(e[0]) || 0, g = [], d = 0, f = 0; f < a.length; f++) {
        var c = a.charCodeAt(f);
        128 > c ? g[d++] = c : (2048 > c ? g[d++] = c >> 6 | 192 : (55296 == (c & 64512) && f + 1 < a.length && 56320 == (a.charCodeAt(f + 1) & 64512) ? (c = 65536 + ((c & 1023) << 10) + (a.charCodeAt(++f) & 1023), g[d++] = c >> 18 | 240, g[d++] = c >> 12 & 63 | 128) : g[d++] = c >> 12 | 224, g[d++] = c >> 6 & 63 | 128), g[d++] = c & 63 | 128)
    }
    a = h;
    for (d = 0; d < g.length; d++) a += g[d], a = b(a, "+-a^+6");
    a = b(a, "+-3^+b+-f");
    a ^= Number(e[1]) || 0;
    0 > a && (a = (a & 2147483647) + 2147483648);
    a %= 1E6;
    return a.toString() + "." + (a ^ h)
};
The readable JavaScript code
function RL(a, b) {
    for (var d = 0; d < b.length - 2; d += 3) {
        var c = b.charAt(d + 2);
        c = 'a' <= c ? c.charCodeAt(0) - 87 : Number(c);
        c = '+' == b.charAt(d + 1) ? a >>> c : a << c;
        a = '+' == b.charAt(d) ? a + c & 4294967295 : a ^ c;
    }
    return a;
}

function TL(a, TKK) {
    var e = TKK.split('.');
    var h = Number(e[0]) || 0;
    var g = [];
    var d = 0;
    for (var f = 0; f < a.length; f++) {
        var c = a.charCodeAt(f);
        if (128 > c) {
            g[d++] = c;
        } else {
            if (2048 > c) {
                g[d++] = c >> 6 | 192;
            } else {
                if (55296 == (c & 64512) && f + 1 < a.length && 56320 == (a.charCodeAt(f + 1) & 64512)) {
                    c = 65536 + ((c & 1023) << 10) + (a.charCodeAt(++f) & 1023);
                    g[d++] = c >> 18 | 240;
                    g[d++] = c >> 12 & 63 | 128;
                } else {
                    g[d++] = c >> 12 | 224;
                }
                g[d++] = c >> 6 & 63 | 128;
            }
            g[d++] = c & 63 | 128;
        }
    }
    a = h;
    for (d = 0; d < g.length; d++) {
        a += g[d];
        a = RL(a, '+-a^+6');
    }
    a = RL(a, '+-3^+b+-f');
    a ^= Number(e[1]) || 0;
    if (0 > a) {
        a = (a & 2147483647) + 2147483648;
    }
    a %= 1E6;
    return a.toString() + '.' + (a ^ h);
}
The Python code
def getGoogleToken(a, TKK):
    def RL(a, b):
        for d in range(0, len(b) - 2, 3):
            c = b[d + 2]
            c = ord(c[0]) - 87 if 'a' <= c else int(c)
            c = a >> c if '+' == b[d + 1] else a << c
            a = a + c & 4294967295 if '+' == b[d] else a ^ c
        return a

    g = []
    f = 0
    while f < len(a):
        c = ord(a[f])
        if 128 > c:
            g.append(c)
        else:
            if 2048 > c:
                g.append((c >> 6) | 192)
            else:
                if (55296 == (c & 64512)) and (f + 1 < len(a)) and (56320 == (ord(a[f + 1]) & 64512)):
                    c = 65536 + ((c & 1023) << 10) + (ord(a[f + 1]) & 1023)
                    f += 1
                    g.append((c >> 18) | 240)
                    g.append((c >> 12) & 63 | 128)
                else:
                    g.append((c >> 12) | 224)
                g.append((c >> 6) & 63 | 128)
            g.append((c & 63) | 128)
        f += 1
    e = TKK.split('.')
    h = int(e[0]) or 0
    t = h
    for item in g:
        t += item
        t = RL(t, '+-a^+6')
    t = RL(t, '+-3^+b+-f')
    t ^= int(e[1]) or 0
    if 0 > t:
        t = (t & 2147483647) + 2147483648
    result = t % 1000000
    return str(result) + '.' + str(result ^ h)
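As a quick sanity check, the function can be called with the sample values quoted in the comments of the full code further down; per that comment the expected token is '315066.159012'.

# Sanity check against the sample values quoted in the full code below.
if __name__ == '__main__':
    print(getGoogleToken('Tablet Developer', '435102.3120524463'))  # expected per that comment: 315066.159012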
Fetching the token key (TKK)
Google's TKK can be obtained by requesting https://translate.google.cn; one of the inline scripts on that page contains a fragment of the form tkk:'xxxxxx.xxxxxx', which a regular expression can pull out.
res = requests.get('https://translate.google.cn', timeout=3)
res.raise_for_status()
result = re.search(r'tkk\:\'(\d+\.\d+)?\'', res.text).group(1)
Splitting the text into paragraphs
Because the source text is usually copied out of PDFs, line breaks cannot be relied on to mark paragraphs; only blank lines can serve as paragraph boundaries.
In addition, in the JSON that Google returns the translation is split on the English period, with each translated sentence as a separate array element, so the pieces have to be joined back into a single paragraph at the end. A minimal sketch of both steps follows.
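The sketch below uses illustrative names (split_paragraphs, join_sentences) that do not appear in the full code; the readFile generator and the translate method further down do the same jobs.

# Blank lines delimit paragraphs; the sentence pieces in Google's reply
# (jsonResult[0]) are concatenated back into a single paragraph.
def split_paragraphs(text):
    paragraph = ''
    for line in text.splitlines():
        if line.strip():              # non-blank line: still the same paragraph
            paragraph += line
        elif paragraph:               # blank line: paragraph boundary
            yield paragraph
            paragraph = ''
    if paragraph:
        yield paragraph

def join_sentences(json_result):
    # each element of json_result[0] looks like [translated_sentence, source_sentence, ...]
    return ''.join(item[0] for item in json_result[0] if item[0])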
The complete code
The code is not long; it is pasted in full below.
GoogleTranslator.py:

import requests
import re
import json
import time


class GoogleTranslator():
    _host = 'translate.google.cn'
    _headers = {
        'Host': _host,
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Mobile Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
        'Referer': 'https://' + _host,
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0'
    }
    _language = {
        'afrikaans': 'af', 'arabic': 'ar', 'belarusian': 'be', 'bulgarian': 'bg',
        'catalan': 'ca', 'czech': 'cs', 'welsh': 'cy', 'danish': 'da',
        'german': 'de', 'greek': 'el', 'english': 'en', 'esperanto': 'eo',
        'spanish': 'es', 'estonian': 'et', 'persian': 'fa', 'finnish': 'fi',
        'french': 'fr', 'irish': 'ga', 'galician': 'gl', 'hindi': 'hi',
        'croatian': 'hr', 'hungarian': 'hu', 'indonesian': 'id', 'icelandic': 'is',
        'italian': 'it', 'hebrew': 'iw', 'japanese': 'ja', 'korean': 'ko',
        'latin': 'la', 'lithuanian': 'lt', 'latvian': 'lv', 'macedonian': 'mk',
        'malay': 'ms', 'maltese': 'mt', 'dutch': 'nl', 'norwegian': 'no',
        'polish': 'pl', 'portuguese': 'pt', 'romanian': 'ro', 'russian': 'ru',
        'slovak': 'sk', 'slovenian': 'sl', 'albanian': 'sq', 'serbian': 'sr',
        'swedish': 'sv', 'swahili': 'sw', 'thai': 'th', 'filipino': 'tl',
        'turkish': 'tr', 'ukrainian': 'uk', 'vietnamese': 'vi', 'yiddish': 'yi',
        'chinese_simplified': 'zh-CN', 'chinese_traditional': 'zh-TW', 'auto': 'auto'
    }
    _url = 'https://' + _host + '/translate_a/single'
    _params = {
        'client': 'webapp',
        'sl': 'en',
        'tl': 'zh-CN',
        'hl': 'zh-CN',
        # requests sends a list value as repeated 'dt' parameters
        'dt': ['at', 'bd', 'ex', 'ld', 'md', 'qca', 'rw', 'rm', 'ss', 't'],
        'otf': '1',
        'ssel': '0',
        'tsel': '0',
        'kc': '1'
    }
    __cookies = None
    __googleTokenKey = '376032.257956'
    __googleTokenKeyUpdataTime = 600.0
    __googleTokenKeyRetireTime = time.time() + 600.0

    def __init__(self, src='en', dest='zh-CN', tkkUpdataTime=600.0):
        if src not in self._language and src not in self._language.values():
            src = 'auto'
        if dest not in self._language and dest not in self._language.values():
            dest = 'auto'
        self._params['sl'] = src
        self._params['tl'] = dest
        self.__googleTokenKeyUpdataTime = tkkUpdataTime  # seconds before the TKK is re-fetched
        self.__updateGoogleTokenKey()

    def __updateGoogleTokenKey(self):
        self.__googleTokenKey = self.__getGoogleTokenKey()
        self.__googleTokenKeyRetireTime = time.time() + self.__googleTokenKeyUpdataTime

    def __getGoogleTokenKey(self):
        """Get the Google TKK from https://translate.google.cn"""
        # TKK example: '435075.3634891900'
        result = ''
        try:
            res = requests.get('https://' + self._host, timeout=3)
            res.raise_for_status()
            self.__cookies = res.cookies
            result = re.search(r'tkk\:\'(\d+\.\d+)?\'', res.text).group(1)
        except requests.exceptions.ReadTimeout as ex:
            print('ERROR: ' + str(ex))
            time.sleep(1)
        return result

    def __getGoogleToken(self, a, TKK):
        """Calculate the Google tk from the TKK"""
        # https://www.cnblogs.com/chicsky/p/7443830.html
        # if text = 'Tablet Developer' and TKK = '435102.3120524463', then tk = '315066.159012'
        def RL(a, b):
            for d in range(0, len(b) - 2, 3):
                c = b[d + 2]
                c = ord(c[0]) - 87 if 'a' <= c else int(c)
                c = a >> c if '+' == b[d + 1] else a << c
                a = a + c & 4294967295 if '+' == b[d] else a ^ c
            return a

        g = []
        f = 0
        while f < len(a):
            c = ord(a[f])
            if 128 > c:
                g.append(c)
            else:
                if 2048 > c:
                    g.append((c >> 6) | 192)
                else:
                    if (55296 == (c & 64512)) and (f + 1 < len(a)) and (56320 == (ord(a[f + 1]) & 64512)):
                        c = 65536 + ((c & 1023) << 10) + (ord(a[f + 1]) & 1023)
                        f += 1
                        g.append((c >> 18) | 240)
                        g.append((c >> 12) & 63 | 128)
                    else:
                        g.append((c >> 12) | 224)
                    g.append((c >> 6) & 63 | 128)
                g.append((c & 63) | 128)
            f += 1
        e = TKK.split('.')
        h = int(e[0]) or 0
        t = h
        for item in g:
            t += item
            t = RL(t, '+-a^+6')
        t = RL(t, '+-3^+b+-f')
        t ^= int(e[1]) or 0
        if 0 > t:
            t = (t & 2147483647) + 2147483648
        result = t % 1000000
        return str(result) + '.' + str(result ^ h)

    def translate(self, text):
        if time.time() > self.__googleTokenKeyRetireTime:
            self.__updateGoogleTokenKey()
        data = {'q': text}
        self._params['tk'] = self.__getGoogleToken(text, self.__googleTokenKey)
        result = ''
        try:
            res = requests.post(self._url,
                                headers=self._headers,
                                cookies=self.__cookies,
                                data=data,
                                params=self._params,
                                timeout=6)
            res.raise_for_status()
            jsonText = res.text
            if len(jsonText) > 0:
                jsonResult = json.loads(jsonText)
                if len(jsonResult[0]) > 0:
                    for item in jsonResult[0]:
                        if item[0]:  # the last element may carry no translated text
                            result += item[0]
            return result
        except Exception as ex:
            print('ERROR: ' + str(ex))
            return ''

main.py:

import time
from GoogleTranslator import GoogleTranslator


def readFile(fileName):
    with open(fileName, 'r') as f:
        paragraph = ''
        for line in f:
            if line[0] != '\n':
                paragraph += line.strip('\n')
            else:
                if len(paragraph) > 0:
                    yield paragraph
                    paragraph = ''
        if len(paragraph) > 0:
            yield paragraph


def main():
    translator = GoogleTranslator()
    count = 0
    with open('C:\\dx\\python\\d.txt', 'w', encoding='utf-8') as df:
        for line in readFile('C:\\dx\\python\\s.txt'):
            if len(line) > 1:
                count += 1
                print('\r' + str(count), end='', flush=True)
                df.write(line.strip() + "\n")
                result = translator.translate(line)
                df.write(result.strip() + "\n\n")


if __name__ == "__main__":
    startTime = time.time()
    main()
    print()
    print('%.2f seconds' % (time.time() - startTime))
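The class can also be used on its own, independently of main.py; the language codes are those listed in the _language table above.

# Stand-alone use of the translator class ('en' -> 'zh-CN' is also the default).
translator = GoogleTranslator(src='en', dest='zh-CN')
print(translator.translate('Hello, world.'))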
Summary
That is the Python implementation of batch Google translation described above. I hope it is useful; if you have any questions, please leave me a message and I will reply as soon as I can. Many thanks as well for everyone's support of the 毛票票 site!
If you found this article helpful, you are welcome to repost it; please just credit the source. Thank you!