Python实现提取谷歌音乐搜索结果的方法
本文实例讲述了Python实现提取谷歌音乐搜索结果的方法。分享给大家供大家参考。具体如下:
Python的简单脚本,用于提取谷歌音乐搜索页面中的歌曲信息,包括歌曲名,作者,专辑名,下载链接等,最多只提取10页结果。
#!/usr/bin/env python
# coding=utf-8
'''
Created on 2011-8-19
@author: yaoboyuan
'''
import re
import sys
from urllib import parse, request
def extractSongRawData(text):
    """Split a search-result page into one raw HTML fragment per song.

    All newline characters are removed first, so each ``<tbody>...</tbody>``
    element can be captured with a pattern whose ``.`` does not cross lines.
    The number of fragments found is reported on stdout.

    Args:
        text: Decoded HTML of one search-result page.

    Returns:
        List of ``<tbody>...</tbody>`` substrings (newlines removed), in
        document order; empty when the page contains none.
    """
    flattened = text.replace('\n', '')
    # Raw string: '<' needs no escaping, and '\<' in a normal string literal
    # is an invalid escape sequence that newer interpreters warn about.
    songList = re.findall(r'<tbody.*?</tbody>', flattened)
    print('search ' + str(len(songList)) + ' songs')
    return songList
def translate(text):
    """Strip ``<b>``/``</b>`` markers and decode decimal character references.

    Every ``&#NNN;`` reference is replaced in place by the character with
    that code point; all other text is kept as is. (The previous version
    returned only the decoded characters and dropped any surrounding plain
    text whenever at least one reference was present.)

    Args:
        text: Fragment of text taken from a result cell.

    Returns:
        The cleaned string. A reference whose number is not a valid code
        point is left untouched instead of raising.
    """
    text = text.replace('<b>', '').replace('</b>', '')

    def decode(match):
        # Decimal digits only, e.g. '25104' -> chr(25104).
        try:
            return chr(int(match.group(1), 10))
        except (ValueError, OverflowError):
            # Not a valid code point: keep the original reference text.
            return match.group(0)

    return re.sub(r'&#([0-9]+);', decode, text)
def extractSongName(song):
    """Return the song title from one raw ``<tbody>`` fragment.

    Looks for the table cell whose class starts with ``Title`` and takes the
    text of the first anchor inside that cell, cleaned by ``translate``.

    Args:
        song: Raw HTML of one song entry. ``.`` does not match newlines
            here, so the fragment should already have them removed (as
            ``extractSongRawData`` does).

    Returns:
        The cleaned title, or ``''`` when the cell or its anchor is missing.
    """
    # '<td class=...': the pattern needs the whitespace between tag name and
    # attribute. The non-greedy '.*?' stops at this cell's own '</td>'
    # instead of running on to the last cell of the row.
    cell = re.search(r'<td\s+class="Title.*?</td>', song)
    if cell is None:
        return ''
    anchor = re.search(r'<a\b[^>]*>(.*?)</a>', cell.group(0))
    if anchor is None:
        return ''
    return translate(anchor.group(1))
def extractAuthorName(song):
    """Return the artist name from one raw ``<tbody>`` fragment.

    Looks for the table cell whose class starts with ``Artist`` and takes
    the text of the first anchor inside that cell, cleaned by ``translate``.

    Args:
        song: Raw HTML of one song entry. ``.`` does not match newlines
            here, so the fragment should already have them removed (as
            ``extractSongRawData`` does).

    Returns:
        The cleaned artist name, or ``''`` when the cell or its anchor is
        missing.
    """
    # '<td class=...': the pattern needs the whitespace between tag name and
    # attribute. The non-greedy '.*?' stops at this cell's own '</td>'
    # instead of running on to the last cell of the row.
    cell = re.search(r'<td\s+class="Artist.*?</td>', song)
    if cell is None:
        return ''
    anchor = re.search(r'<a\b[^>]*>(.*?)</a>', cell.group(0))
    if anchor is None:
        return ''
    return translate(anchor.group(1))
def extrackAlbumName(song):
    """Return the album name from one raw ``<tbody>`` fragment.

    Looks for the table cell whose class starts with ``Album`` and takes the
    text of the first anchor inside that cell, cleaned by ``translate``.

    The function name is misspelled ("extrack"); it is kept as is because
    ``extractSongInfo`` calls it under this name.

    Args:
        song: Raw HTML of one song entry. ``.`` does not match newlines
            here, so the fragment should already have them removed (as
            ``extractSongRawData`` does).

    Returns:
        The cleaned album name, or ``''`` when the cell or its anchor is
        missing.
    """
    # '<td class=...': the pattern needs the whitespace between tag name and
    # attribute. The non-greedy '.*?' stops at this cell's own '</td>'
    # instead of running on to the last cell of the row.
    cell = re.search(r'<td\s+class="Album.*?</td>', song)
    if cell is None:
        return ''
    anchor = re.search(r'<a\b[^>]*>(.*?)</a>', cell.group(0))
    if anchor is None:
        return ''
    return translate(anchor.group(1))
def extractID(song):
    """Return the song id carried by the ``<tbody id="...">`` opening tag.

    Args:
        song: Raw HTML of one song entry.

    Returns:
        The alphanumeric value of the first ``<tbody id="...">`` attribute.
        When no such tag is found, the input string is returned unchanged
        (existing fallback, preserved for callers).
    """
    # '<tbody id=...': the pattern needs the whitespace between tag name and
    # attribute, otherwise it can never match real markup.
    found = re.search(r'<tbody\s+id="([a-zA-Z0-9]+)"', song)
    if found is None:
        return song
    return found.group(1)
def extractLink(song):
    """Return the download link for one raw ``<tbody>`` fragment.

    Finds the ``Icon`` cell whose element is titled "下载" (download) and
    reads its ``onclick`` attribute up to the next ``>``. The part of that
    value from ``http`` up to (not including) the first ``?`` is taken as
    the link, ``%3D`` is decoded to ``=``, and ``?id=<song id>`` is appended
    using ``extractID``.

    Args:
        song: Raw HTML of one song entry.

    Returns:
        ``'NULL'`` when no download cell is found; the raw ``onclick`` text
        when it contains no ``http...?`` part; otherwise the rebuilt link.
    """
    # '<td class=...': the pattern needs the whitespace between tag name and
    # attribute, otherwise it can never match real markup.
    onclick = re.search(
        r'<td\s+class="Icon.*?(?=title="下载").*?onclick="(.*?)>', song)
    if onclick is None:
        return 'NULL'
    handler = onclick.group(1)
    rawLink = re.search(r'http.*?(?=\?)', handler)
    if rawLink is None:
        return handler
    link = rawLink.group(0).replace('%3D', '=')
    # Local name deliberately not 'id', which would shadow the builtin.
    songId = extractID(song)
    return link + '?id=' + songId
def extractPageNums(text):
    """Count how many times the marker ``page_link`` occurs in ``text``.

    ``main`` treats the result as the number of result pages beyond the
    first one. No upper bound is applied here; the count is whatever the
    page contains.

    Args:
        text: Decoded HTML of a search-result page.

    Returns:
        Number of non-overlapping ``page_link`` occurrences.
    """
    return sum(1 for _ in re.finditer('page_link', text))
def extractSongInfo(song):
    """Turn raw song fragments into ``[name, artist, album, link]`` rows.

    Args:
        song: Sequence of raw ``<tbody>`` HTML fragments, one per song
            (``main`` passes the list returned by ``extractSongRawData``).

    Returns:
        A list with one 4-item list per fragment, in input order:
        song name, artist name, album name, download link.
    """
    # The per-row display index that used to be computed here was never
    # used (its print was commented out), so it has been dropped.
    return [
        [
            extractSongName(raw),
            extractAuthorName(raw),
            extrackAlbumName(raw),
            extractLink(raw),
        ]
        for raw in song
    ]
def main():
    """Prompt for keywords in a loop, search, and print every song found.

    For each keyword the first result page is fetched and parsed;
    ``extractPageNums`` gives the number of further pages, and each of those
    is requested with ``start`` advanced in steps of 20. All rows are then
    printed with a two-digit running index.

    The loop ends when input is exhausted (EOF) or interrupted with Ctrl-C
    at the prompt.
    """
    baseUrl = 'http://www.google.cn/music/search?q='

    def fetchPage(url):
        # 'with' closes the HTTP response once the body has been read;
        # read() is the documented way to get the whole body.
        with request.urlopen(url) as response:
            return response.read().decode('utf-8')

    while True:
        try:
            key = input('请输入歌曲名字或关键字:')
        except (EOFError, KeyboardInterrupt):
            return
        # quote() percent-encodes the keyword as UTF-8.
        searchUrl = baseUrl + parse.quote(key)

        page = fetchPage(searchUrl)
        num = extractPageNums(page)
        print(str(num + 1) + ' pages found')
        songList = extractSongInfo(extractSongRawData(page))

        # Request the remaining pages. Each URL is built from the unchanged
        # search URL; appending to one growing string would pile up
        # '&cat=song&start=...' parameters from earlier iterations.
        for i in range(num):
            pageUrl = searchUrl + '&cat=song&start=%d' % ((i + 1) * 20)
            songList += extractSongInfo(extractSongRawData(fetchPage(pageUrl)))

        for position, item in enumerate(songList, 1):
            print('%02d %s' % (position, item))


if __name__ == '__main__':
    main()
希望本文所述对大家的Python程序设计有所帮助。