Example code for automatically downloading papers from arXiv with Python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2020/02/11 21:44
# @Author: dangxusheng
# @Email: dangxusheng163@163.com
# @File: download_by_href.py
'''Automatically download papers from arxiv.org'''

import os
import os.path as osp
import re
import time
import glob

import requests
from lxml import etree
from pprint import pprint

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
    "Host": 'arxiv.org'
}

HREF_CN = 'http://cn.arxiv.org/pdf/'   # cn.arxiv.org mirror ("CN mirror" below)
HREF_SRC = 'https://arxiv.org/pdf/'    # official arXiv address, used as the fallback
SAVE_PATH = '/media/dangxs/E/Paper/download_at_20200730'
os.makedirs(SAVE_PATH, exist_ok=True)

FAIL_URLS = []
FAIL_URLS_TXT = f'{SAVE_PATH}/fail_urls.txt'


def download(url, title):
    # strip characters that are illegal in file names
    pattern = r'[\\/:*?"\'<>|\r\n]+'
    new_title = re.sub(pattern, "", title)
    print(f'new title: {new_title}')
    save_filepath = '%s/%s.pdf' % (SAVE_PATH, new_title)
    if osp.exists(save_filepath) and osp.getsize(save_filepath) > 50 * 1024:
        print('this pdf already exists.')
        return True
    try:
        with open(save_filepath, 'wb') as file:
            # stream the PDF in chunks
            r = requests.get(url, stream=True, timeout=None)
            for chunk in r.iter_content(2048):
                file.write(chunk)
        if osp.getsize(save_filepath) >= 10 * 1024:
            print('%s downloaded successfully.' % title)
            return True
    except Exception as e:
        print(e)
    return False


# search arxiv.org via the advanced-search page
def search(start_size=0, title_keywords='Facial Expression'):
    # e.g.: https://arxiv.org/find/grp_eess,grp_stat,grp_cs,grp_econ,grp_math/1/ti:+Face/0/1/0/past,2018,2019/0/1?skip=200&query_id=1c582e6c8afc6146&client_host=cn.arxiv.org
    req_url = 'https://arxiv.org/search/advanced'
    req_data = {
        'advanced': 1,
        'terms-0-operator': 'AND',
        'terms-0-term': title_keywords,
        'terms-0-field': 'title',
        'classification-computer_science': 'y',
        'classification-physics_archives': 'all',
        'classification-include_cross_list': 'include',
        'date-filter_by': 'date_range',  # date_range | specific_year
        # 'date-year': DOWN_YEAR,
        'date-year': '',
        'date-from_date': '2015',
        'date-to_date': '2020',
        'date-date_type': 'announced_date_first',  # submitted_date | submitted_date_first | announced_date_first
        'abstracts': 'show',
        'size': 50,
        'order': '-announced_date_first',
        'start': start_size,
    }
    res = requests.get(req_url, params=req_data, headers=headers)
    html = etree.HTML(res.content.decode())
    total_text = html.xpath('//h1[@class="title is-clearfix"]/text()')
    total_text = ''.join(total_text).replace('\n', '').strip()
    # i.e.: "Showing 1–50 of 355 results"
    num = re.findall(r'\d+', total_text)
    # "Sorry, your query returned no results"
    if len(num) == 0:
        return [], 0
    total = int(num[-1])  # total number of results
    paper_list = html.xpath('//ol[@class="breathe-horizontal"]/li')
    info_list = []
    for p in paper_list:
        title = p.xpath('./p[@class="title is-5 mathjax"]//text()')
        title = ''.join(title).replace('\n', '').strip()
        href = p.xpath('./div/p/a/@href')[0]
        info_list.append({'title': title, 'href': href})
    return info_list, total


# scrape a specific page instead (here: the GAN zoo list mirrored on Gitee)
def search_special():
    res = requests.get('https://gitee.com/weberyoung/the-gan-zoo?_from=gitee_search')
    html = etree.HTML(res.content.decode())
    paper_list = html.xpath('//div[@class="file_content markdown-body"]//li')
    info_list = []
    for p in paper_list:
        title = ''.join(p.xpath('.//text()')).replace('\n', '').strip()
        href = p.xpath('./a/@href')[0]
        info_list.append({'title': title, 'href': href})
    pprint(info_list)
    return info_list


if __name__ == '__main__':
    page_idx = 0
    total = 1000
    keywords = 'Facial Action Unit'
    while page_idx <= total // 50:
        paper_list, total = search(page_idx * 50, keywords)
        print(f'total: {total}')
        if total == 0:
            print('no results found.')
            exit(0)

        for p in paper_list:
            title = p['title']
            href = HREF_CN + p['href'].split('/')[-1] + '.pdf'
            print(href)
            if not download(href, title):
                print('download from the CN mirror failed, retrying from the source address >>>>')
                # retry once with the international URL
                href = HREF_SRC + p['href'].split('/')[-1] + '.pdf'
                if not download(href, title):
                    FAIL_URLS.append(p)
        page_idx += 1

    # fetch the last partial page of results
    last_1 = total - page_idx * 50
    paper_list, total = search(last_1, keywords)
    for p in paper_list:
        title = p['title']
        href = HREF_CN + p['href'].split('/')[-1] + '.pdf'
        if not download(href, title):
            FAIL_URLS.append(p)
        time.sleep(1)

    pprint(FAIL_URLS)
    with open(FAIL_URLS_TXT, 'a+') as f:
        for item in FAIL_URLS:
            f.write(item['href'] + '\n')

    print('done.')
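The script above scrapes the HTML of arXiv's advanced-search page, so it can break whenever the page layout changes. arXiv also exposes an official Atom API at http://export.arxiv.org/api/query that returns titles and direct PDF links. The sketch below is a minimal illustration of that alternative, assuming requests and feedparser are installed; the search_api helper and its parameters are names chosen for this example, not part of the original script.

# Minimal sketch (not from the original script): query the official arXiv Atom API
# instead of scraping the search page. Requires `pip install requests feedparser`.
import time

import feedparser
import requests

API_URL = 'http://export.arxiv.org/api/query'


def search_api(keywords, start=0, max_results=50):
    params = {
        'search_query': f'ti:"{keywords}"',   # restrict the match to paper titles
        'start': start,
        'max_results': max_results,
        'sortBy': 'submittedDate',
        'sortOrder': 'descending',
    }
    feed = feedparser.parse(requests.get(API_URL, params=params, timeout=30).text)
    papers = []
    for entry in feed.entries:
        # each Atom entry carries a direct PDF link among its <link> elements
        pdf_links = [l.href for l in entry.links if l.get('title') == 'pdf']
        if pdf_links:
            papers.append({'title': entry.title, 'href': pdf_links[0]})
    return papers


if __name__ == '__main__':
    for p in search_api('facial action unit', max_results=10):
        print(p['title'], '->', p['href'])
        time.sleep(3)  # be polite: arXiv asks clients to throttle API requests

The PDF links returned this way can be passed straight to the download() function from the script above.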
That concludes the example code for automatically downloading papers from arXiv with Python. For more material on downloading papers from arXiv with Python, see the other related articles on 毛票票.