Python实现提取谷歌音乐搜索结果的方法
本文实例讲述了Python实现提取谷歌音乐搜索结果的方法。分享给大家供大家参考。具体如下:
Python的简单脚本,用于提取谷歌音乐搜索页面中的歌曲信息,包括歌曲名,作者,专辑名,下载链接等,最多只提取10页结果。
#!/usr/bin/env python
# coding=utf-8
"""Extract song information from Google Music (google.cn) search pages.

For every search keyword typed by the user the script downloads the result
pages, and prints one ``[song, artist, album, download link]`` entry per hit.

Created on 2011-8-19

@author: yaoboyuan
"""
from urllib import request, parse
import re
import sys

# Base URL of the search endpoint; the URL-quoted keyword is appended to it.
_SEARCH_URL = 'http://www.google.cn/music/search?q='
# Offset step used by the ``start`` query parameter between result pages.
_RESULTS_PER_PAGE = 20

# Numeric character reference such as "&#25104;".
_NUMERIC_ENTITY = re.compile(r'&#([0-9]+);')
# One song row: everything from "<tbody" up to the nearest "</tbody>".
_SONG_BLOCK = re.compile(r'<tbody.*?</tbody>')
# The id attribute of the <tbody> element wrapping a song.
_SONG_ID = re.compile(r'<tbody\s+id="([a-zA-Z0-9]+)"')
# Text of the first anchor that has at least one character in front of it.
_FIRST_ANCHOR = re.compile(r'.+?<a.+?>(.*?)</a>')
# onclick handler of the "Icon" cell whose title is "下载" (download).
_DOWNLOAD_ONCLICK = re.compile(
    r'<td\s+class="Icon.*?(?=title="下载").*?onclick="(.*?)>')
# The part of a URL that precedes its first "?".
_URL_BEFORE_QUERY = re.compile(r'http.*?(?=\?)')


def extractSongRawData(text):
    """Split a result page into the raw HTML of each song.

    Newlines are removed first so that a song spread over several lines is
    matched as a single ``<tbody ...>...</tbody>`` block.  The number of
    songs found is printed as a progress message.

    Returns:
        A list of HTML fragments, one per song (possibly empty).
    """
    flattened = re.sub(r'\n+', '', text)
    songList = _SONG_BLOCK.findall(flattened)
    print('search ' + str(len(songList)) + ' songs')
    return songList


def _decodeEntity(match):
    """Return the character for one ``&#NNN;`` match, or the match itself.

    The reference is left untouched when the number is not a valid code
    point, so one malformed entity cannot abort the whole extraction.
    """
    try:
        return chr(int(match.group(1), 10))
    except (ValueError, OverflowError):
        return match.group(0)


def translate(text):
    """Strip ``<b>``/``</b>`` tags and decode numeric character references.

    References such as ``&#25104;`` are replaced in place by the character
    they denote; all surrounding text is preserved.

    Returns:
        The cleaned-up string.
    """
    text = text.replace('<b>', '').replace('</b>', '')
    return _NUMERIC_ENTITY.sub(_decodeEntity, text)


def _extractCellText(song, cssClass):
    """Return the decoded text of the first link in a ``<td class="...">``.

    The cell is located by the prefix of its class attribute (``cssClass``).
    The match runs greedily to the last ``</td>`` of the fragment and the
    first anchor inside that span is used.

    Returns:
        The link text passed through ``translate``, or ``''`` when the cell
        or the anchor is missing.
    """
    cell = re.search(r'<td\s+class="' + re.escape(cssClass) + r'.*</td>', song)
    if cell is None:
        return ''
    anchor = _FIRST_ANCHOR.search(cell.group(0))
    if anchor is None:
        return ''
    return translate(anchor.group(1))


def extractSongName(song):
    """Extract the song title from one raw song fragment ('' if absent)."""
    return _extractCellText(song, 'Title')


def extractAuthorName(song):
    """Extract the artist name from one raw song fragment ('' if absent)."""
    return _extractCellText(song, 'Artist')


def extrackAlbumName(song):
    """Extract the album name from one raw song fragment ('' if absent).

    The misspelled function name is kept for backward compatibility.
    """
    return _extractCellText(song, 'Album')


def extractID(song):
    """Extract the song id from the ``<tbody id="...">`` attribute.

    Returns:
        The id string, or ``song`` itself unchanged when no id is present.
    """
    found = _SONG_ID.search(song)
    if found is None:
        return song
    return found.group(1)


def extractLink(song):
    """Extract the download link of a song.

    Returns:
        * ``'NULL'`` when the fragment has no download cell;
        * the raw ``onclick`` text when it contains no URL followed by ``?``;
        * otherwise the URL part before its query string (with ``%3D``
          turned back into ``=``) followed by ``?id=<song id>``.
    """
    onclick = _DOWNLOAD_ONCLICK.search(song)
    if onclick is None:
        return 'NULL'
    handler = onclick.group(1)
    rawLink = _URL_BEFORE_QUERY.search(handler)
    if rawLink is None:
        return handler
    link = rawLink.group(0).replace('%3D', '=')
    return link + '?id=' + extractID(song)


def extractPageNums(text):
    """Count the ``page_link`` markers contained in a result page.

    The count is returned as is; no upper bound is applied here.
    """
    return text.count('page_link')


def extractSongInfo(song):
    """Extract the details of every raw song fragment in ``song``.

    Args:
        song: iterable of raw HTML fragments as returned by
            ``extractSongRawData``.

    Returns:
        A list with one ``[songName, authorName, albumName, link]`` list per
        fragment, in input order.
    """
    return [
        [
            extractSongName(rawSong),
            extractAuthorName(rawSong),
            extrackAlbumName(rawSong),
            extractLink(rawSong),
        ]
        for rawSong in song
    ]


def _fetchPage(url):
    """Download ``url`` and return its body decoded as UTF-8.

    The response is closed as soon as it has been read.
    """
    with request.urlopen(url) as response:
        return response.read().decode('utf-8')


def main():
    """Prompt for keywords forever and print the songs found for each one.

    The loop ends when standard input is exhausted or the user interrupts
    the prompt.
    """
    while True:
        try:
            key = input('请输入歌曲名字或关键字:')
        except (EOFError, KeyboardInterrupt):
            print()
            break

        # quote() percent-encodes the keyword as UTF-8.
        url = _SEARCH_URL + parse.quote(key)
        page = _fetchPage(url)
        num = extractPageNums(page)
        print(str(num + 1) + ' pages found')
        songList = extractSongInfo(extractSongRawData(page))

        # Request every further page.  Each page URL is derived from the
        # base query URL so the paging parameters never pile up.
        for pageIndex in range(1, num + 1):
            pageUrl = url + '&cat=song&start=%d' % (pageIndex * _RESULTS_PER_PAGE)
            songList += extractSongInfo(extractSongRawData(_fetchPage(pageUrl)))

        # Print the results with a two-digit, 1-based running number.
        for position, songItem in enumerate(songList, 1):
            print('%02d %s' % (position, songItem))


if __name__ == '__main__':
    main()
希望本文所述对大家的Python程序设计有所帮助。