How to implement a newspaper-reading program in Python by parsing web pages
The example in this article uses Python to fetch the picture edition of the Cankao Xiaoxi (Reference News) newspaper and automatically download the current day's pages as images to a local folder for viewing. The implementation code is as follows:
#coding=gbk
import urllib2
import socket
import re
import time
import os
import sys

# timeout in seconds for all urllib2 requests
timeout = 10
urllib2.socket.setdefaulttimeout(timeout)

home_url = "http://www.hqck.net"
home_page = ""
try:
    # fetch the site's home page, which links to today's picture edition
    home_page_context = urllib2.urlopen(home_url)
    home_page = home_page_context.read()
    print "Read home page finished."
    print "-------------------------------------------------"
except urllib2.URLError, e:
    print e.code
    exit()
except:
    print "Failed to read home page."
    exit()
# the home page links today's paper with an <a class="item-baozhi"> anchor
reg_str = r'<a class="item-baozhi" href="/arc/jwbt/ckxx/\d{4}/\d{4}/\w+\.html" rel="external nofollow"><span class.+>.+</span></a>'
news_url_reg = re.compile(reg_str)
today_cankao_news = news_url_reg.findall(home_page)
if len(today_cankao_news) == 0:
    print "Cannot find today's news!"
    exit()
my_news = today_cankao_news[0]
print "Latest news link = " + my_news
print

# cut the relative article URL (/arc/....html) out of the matched anchor
url_s = my_news.find("/arc/")
url_e = my_news.find(".html")
url_e = url_e + 5
print "Link index = [" + str(url_s) + ", " + str(url_e) + "]"
my_news = my_news[url_s:url_e]
print "part url = " + my_news
full_news_url = home_url + my_news
print "full url = " + full_news_url
print

# save each page image under E:\new_folder\<YYYY-MM-DD>\
image_folder = "E:\\new_folder\\"
if not os.path.exists(image_folder):
    os.makedirs(image_folder)
today_num = time.strftime('%Y-%m-%d', time.localtime(time.time()))
image_folder = image_folder + today_num + "\\"
if not os.path.exists(image_folder):
    os.makedirs(image_folder)
print "News image folder = " + image_folder
print
# the article is split over pages named xxx.html, xxx_2.html, xxx_3.html, ...
context_uri = full_news_url[0:-5]
first_page_url = context_uri + ".html"
try:
    first_page_context = urllib2.urlopen(first_page_url)
    first_page = first_page_context.read()
except urllib2.HTTPError, e:
    print e.code
    exit()

# the first page states the page count as "共N页"; parse N out of it
tot_page_index = first_page.find("共")
tmp_str = first_page[tot_page_index:tot_page_index + 10]
end_s = tmp_str.find("页")
page_num = tmp_str[2:end_s]
print page_num
page_count = int(page_num)
print "Total " + page_num + " pages:"
print

page_index = 1
download_suc = True
while page_index <= page_count:
    # build the URL of the current page: xxx.html for page 1, xxx_N.html after that
    page_url = context_uri
    if page_index > 1:
        page_url = page_url + "_" + str(page_index)
    page_url = page_url + ".html"
    print "News page link = " + page_url
    try:
        news_img_page_context = urllib2.urlopen(page_url)
    except urllib2.URLError, e:
        print e.reason
        download_suc = False
        break
    news_img_page = news_img_page_context.read()
    #f = open("e:\\page.html", "w")
    #f.write(news_img_page)
    #f.close()

    # each page embeds a single JPEG hosted on an image server
    reg_str = r'http://image\S+jpg'
    image_reg = re.compile(reg_str)
    image_results = image_reg.findall(news_img_page)
    if len(image_results) == 0:
        print "Cannot find news page " + str(page_index) + "!"
        download_suc = False
        break
    image_url = image_results[0]
    print "News image url = " + image_url

    # stream the JPEG to disk in 10 KB chunks
    news_image_context = urllib2.urlopen(image_url)
    image_name = image_folder + "page_" + str(page_index) + ".jpg"
    imgf = open(image_name, 'wb')
    print "Getting image..."
    try:
        while True:
            data = news_image_context.read(1024 * 10)
            if not data:
                break
            imgf.write(data)
        imgf.close()
    except:
        download_suc = False
        print "Save image " + str(page_index) + " failed!"
        print "Unexpected error:", sys.exc_info()[0], sys.exc_info()[1]
    else:
        print "Save image " + str(page_index) + " succeed!"
    print
    page_index = page_index + 1
if download_suc:
    print "News download succeed! Path = \"" + str(image_folder) + "\""
    print "Enjoy it! ^^"
else:
    print "News download failed!"
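The listing above is Python 2 code (urllib2 and print statements). For readers on Python 3, the sketch below shows how the same core steps could look with urllib.request: fetch a page, pull the image URL out of it with the same regular expression, and stream the JPEG to disk. It is only a sketch under the assumptions carried over from the code above; the site URL, the image regular expression, the GBK page encoding and the E:\new_folder output path all come from the original listing and may no longer match the live site.

# -*- coding: utf-8 -*-
# Python 3 sketch of the same workflow. The site URL, the image regular
# expression, the GBK decoding and the output folder are assumptions
# carried over from the Python 2 listing above.
import os
import re
import time
import urllib.request

HOME_URL = "http://www.hqck.net"
IMAGE_RE = re.compile(r'http://image\S+jpg')   # same pattern as above

def fetch_html(url, timeout=10):
    # the site serves GBK/GB2312 pages, so decode accordingly
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return resp.read().decode("gbk", errors="ignore")

def download_image(url, dest_path, timeout=10):
    # stream the JPEG to disk in 10 KB chunks, as the original loop does
    with urllib.request.urlopen(url, timeout=timeout) as resp, \
         open(dest_path, "wb") as out:
        while True:
            chunk = resp.read(1024 * 10)
            if not chunk:
                break
            out.write(chunk)

def save_page(context_uri, page_index, folder):
    # page 1 is <context_uri>.html, later pages are <context_uri>_N.html
    page_url = context_uri if page_index == 1 else "%s_%d" % (context_uri, page_index)
    page_html = fetch_html(page_url + ".html")
    matches = IMAGE_RE.findall(page_html)
    if not matches:
        return False
    download_image(matches[0], os.path.join(folder, "page_%d.jpg" % page_index))
    return True

if __name__ == "__main__":
    folder = os.path.join("E:\\new_folder", time.strftime("%Y-%m-%d"))
    os.makedirs(folder, exist_ok=True)
    # context_uri would be extracted from the home page exactly as in the
    # listing above; the value below is only a hypothetical placeholder.
    context_uri = HOME_URL + "/arc/jwbt/ckxx/0000/0000/example"
    page_index = 1
    while save_page(context_uri, page_index, folder):
        page_index += 1

Instead of parsing the "共N页" page counter, this sketch simply stops at the first page that yields no image match; either stopping rule works for a sequentially numbered article.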