Crawling an International Weather Forecast Website with Python
This article walks through a working example of crawling an international weather forecast website (accuweather.com) with Python. It is shared here for your reference; the details are as follows.
crawl_weather.py is as follows:
#encoding=utf-8
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
lang = "fr"
count = 0

class Location:
    # Location(False, "中国", "北京", "zh")
    # Location(True, "", "亚洲", "zh")
    def __init__(self, is_beyond_country, country_name, loc_name, lang):
        self.country_name = country_name
        self.loc_name = loc_name
        self.lang = lang
        self.is_beyond_country = is_beyond_country

prn_lock = threading.RLock()
def GetLocationURLs(url, recursive):
    # A "weather-forecast" URL is a leaf page: count it and return it directly.
    global count
    if url.find("weather-forecast") != -1:
        count = count + 1
        if count % 500 == 0:
            prn_lock.acquire()
            print "count:%d" % (count)
            prn_lock.release()
        return [url]
    page = urllib2.urlopen(url).read()
    time.sleep(0.01)
    # e.g. "<h6><a href=\"http://www.accuweather.com/zh/browse-locations/afr\"><em>Africa</em></a></h6>"
    pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
    locs = re.findall(pattern, page)
    # Keep only links that lead to a location index or a forecast page.
    locs = [(url, name) for url, name in locs
            if url.find("browse-locations") != -1 or url.find("weather-forecast") != -1]
    if not recursive:
        urls = [url for url, name in locs]
        return urls
    # Recurse into every sub-page and flatten the results into one list.
    urls = []
    for _url, _name in locs:
        lst = GetLocationURLs(_url, True)
        urls.extend(lst)
    return urls
#entry_url = "http://www.accuweather.com/zh/browse-locations"
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)
#regions = ["afr", "ant", "arc", "asi", "cac", "eur", "mea", "nam", "ocn", "sam"]
#regions = ["eur"]
#region_urls = ["%s/%s" % (entry_url, reg) for reg in regions]
#region_urls = ["http://www.accuweather.com/zh/browse-locations/eur/fr"]
sub_urls = GetLocationURLs(entry_url, False)
print len(sub_urls)
print sub_urls
q = Queue()
location_urls = []
ThreadNum = 5
lock = threading.RLock()

for url in sub_urls:
    q.put(url)

def working():
    # Worker thread: recursively expand each queued index URL into
    # forecast-page URLs, then merge them into the shared list under a lock.
    while True:
        url = q.get()
        lst = GetLocationURLs(url, True)
        print "%s %d urls" % (url, len(lst))
        lock.acquire()
        location_urls.extend(lst)
        lock.release()
        q.task_done()

for i in range(ThreadNum):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
q.join()

fp = open('locations.txt', "w")
fp.write("\n".join(location_urls))
fp.close()
#for url in location_urls:
#    print url
#location_urls = GetLocationURLs(entry_url)
# The triple-quoted block below is disabled leftover code from an earlier
# Coursera download script; it never runs, and it references names (NUM,
# ppt_urls, srt_urls, video_urls) that were defined elsewhere in that script.
'''
def Fetch(url):
    try:
        print url
        web_path = url[0]
        local_name = url[1]
        print "web_path:", web_path
        print "local_name:", local_name
        sContent = urllib2.urlopen(web_path).read()
        savePath = "D:\\Course\\NLP_Manning\\%s" % (local_name)
        print savePath
        file = open(savePath, 'wb')
        file.write(sContent)
        file.close()
        print savePath + " saved"
    except:
        pass

def working():
    while True:
        url = q.get()
        Fetch(url)
        sleep(10)
        q.task_done()

#root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
root_url = "https://class.coursera.org/nlp/lecture/index?lecture_player=flash"
page = urllib2.urlopen(root_url).read()
for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
urls = copy.deepcopy(ppt_urls)
urls.extend(srt_urls)
urls.extend(video_urls)
print len(ppt_urls)
print len(srt_urls)
print len(video_urls)
print len(urls)
for url in urls:
    q.put(url)
q.join()
'''
'''
root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
page = urllib2.urlopen(root_url).read()
print page
'''
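Before moving on to the second script, the link-extraction step at the heart of GetLocationURLs can be tried in isolation. The following is a minimal standalone sketch, assuming the same Python 2 environment as the scripts above; the HTML line is an illustrative sample, not fetched from accuweather.com:
#encoding=utf-8
# regex_demo: standalone illustration of the pattern used in GetLocationURLs.
import re

# Illustrative sample of the kind of line the crawler matches.
sample = '<h6><a href="http://www.accuweather.com/fr/browse-locations/eur"><em>Europe</em></a></h6>'
pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
# re.findall returns one (url, name) tuple per match.
print re.findall(pattern, sample)
# [('http://www.accuweather.com/fr/browse-locations/eur', 'Europe')]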
FetchLocation.py is as follows:
#encoding=utf-8
import sys
import httplib
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
from time import sleep
import re
import copy
from xml.dom import minidom
import HTMLParser
import datetime
q = Queue()
locks = [threading.RLock() for i in range(2)]
ThreadNumber = 20
locations = {}
conds = {}
def FindCountryBreadCrumbs(page):
    # Cut out the <ul id="country-breadcrumbs"> ... </ul> fragment line by line.
    lines = page.splitlines()
    count = 0
    start = -1
    end = -1  # so a page without breadcrumbs yields "" instead of a NameError
    opened = False
    for line in lines:
        if line.find("<ul id=\"country-breadcrumbs\">") != -1:
            start = count
            opened = True
        if opened and line.find("</ul>") != -1:
            end = count
            opened = False
        count = count + 1
    return "\n".join(lines[start:(end + 1)])
def GetText(nodelist):
    # Concatenate the text of all TEXT_NODE children, HTML-unescaped.
    rc = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            rc.append(HTMLParser.HTMLParser().unescape(node.data))
    return ''.join(rc)

def FindCondition(page):
    # Weather-condition strings sit in <span class="cond">...</span>.
    pat = "<span class=\"cond\">(.*?)</span>"
    cds = re.findall(pat, page)
    cds = [HTMLParser.HTMLParser().unescape(cd).encode("utf-8") for cd in cds]
    return cds
def ExtractInfo(url):
    try:
        page = urllib2.urlopen(url).read()
    except Exception, e:
        # Return a pair so the caller's tuple unpacking still works.
        return [], []
    text = FindCountryBreadCrumbs(page)
    text = HTMLParser.HTMLParser().unescape(text)
    dom = minidom.parseString(text.encode("utf-8"))
    locs = []
    lis = dom.getElementsByTagName("li")
    for li in lis:
        adr_list = li.getElementsByTagName("a")
        if adr_list:
            locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
        strs = li.getElementsByTagName("strong")
        if strs:
            locs.append(GetText(strs[0].childNodes).encode("utf-8"))
    cds = FindCondition(page)
    return locs, cds
def AddMap(lst, m):
    # Use the dict as a set: record each distinct string once.
    for x in lst:
        if m.get(x) == None:
            m[x] = 1
def working():
    while True:
        urls = q.get()
        #print len(urls)
        m = {}
        m2 = {}
        count = 0
        for url in urls:
            count = count + 1
            #print "%d/%d" % (count, len(urls))
            locs, cds = ExtractInfo(url)
            AddMap(locs, m)
            AddMap(cds, m2)
        # Merge this batch's results into the shared dicts under a lock.
        locks[1].acquire()
        AddMap(m.keys(), locations)
        AddMap(m2.keys(), conds)
        locks[1].release()
        q.task_done()
def main():
    if len(sys.argv) < 2:
        exit()
    loc_path = sys.argv[1]
    fp = open(loc_path, "r")
    urls = [line.strip() for line in fp]
    fp.close()
    #urls = urls[0:1000]
    # Split the URL list into ThreadNumber chunks and queue one chunk per task.
    blocks = len(urls) / ThreadNumber + 1
    for start in range(0, len(urls), blocks):
        end = start + blocks
        if end > len(urls):
            end = len(urls)
        q.put(urls[start:end])
    for i in range(ThreadNumber):
        t = Thread(target=working)
        t.setDaemon(True)
        t.start()
    q.join()
    fp = open("location_name.fr", "w")
    fp.write("\n".join(locations.keys()))
    fp.close()
    fp = open("conditions.fr", "w")
    fp.write("\n".join(conds.keys()))
    fp.close()

if __name__ == '__main__':
    main()
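To run the whole pipeline, execute python crawl_weather.py first (it writes locations.txt), then python FetchLocation.py locations.txt (it writes location_name.fr and conditions.fr). The breadcrumb-parsing step inside ExtractInfo can likewise be tried on its own; the following is a minimal Python 2 sketch, and the <ul> fragment is an illustrative sample rather than a real AccuWeather page:
#encoding=utf-8
# breadcrumb_demo: standalone illustration of the minidom parsing in ExtractInfo.
from xml.dom import minidom
import HTMLParser

# Illustrative breadcrumb fragment in the shape FindCountryBreadCrumbs cuts out.
fragment = '<ul id="country-breadcrumbs"><li><a>Europe</a></li><li><strong>France</strong></li></ul>'
dom = minidom.parseString(fragment)
for li in dom.getElementsByTagName("li"):
    for node in li.getElementsByTagName("a") + li.getElementsByTagName("strong"):
        text = ''.join(n.data for n in node.childNodes if n.nodeType == n.TEXT_NODE)
        print HTMLParser.HTMLParser().unescape(text)
# prints "Europe" then "France"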
Hopefully this article will be helpful to everyone's Python programming.