一听音乐歌曲抓取
# coding: utf-8
"""Scrape the 1ting.com ranking page and append song titles/URLs to music.txt.

The script runs on both Python 2.6+ and Python 3.
"""
from contextlib import closing
import io
import re
import sys

try:  # Python 2
    import urllib2
    from urlparse import urljoin
except ImportError:  # Python 3: urllib.request offers the same Request/urlopen API
    import urllib.request as urllib2
    from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Example player page URL:
# http://www.1ting.com/player/d3/player_1089440.html

# Browser-like User-Agent sent with every request.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/40.0.2214.94 Safari/537.36'
)


def _to_text(value):
    """Return *value* as text, decoding UTF-8 bytes when necessary."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


class MySpider(object):
    """Fetches the ranking page, extracts song links and saves them to disk."""

    def getHtml(self, url, timeout=30):
        """Fetch the raw page source of *url*.

        Args:
            url: Address of the page to download.
            timeout: Socket timeout in seconds, so a stalled server cannot
                hang the script forever.

        Returns:
            The undecoded response body.
        """
        req = urllib2.Request(url)
        req.add_header('User-Agent', USER_AGENT)
        # closing() releases the connection even if read() raises.
        with closing(urllib2.urlopen(req, timeout=timeout)) as response:
            return response.read()

    def get_allrank(self, res):
        """Return every ``<div class="allrank">`` element found in *res*.

        Args:
            res: HTML source of the ranking page.
        """
        # Name the parser explicitly: without it bs4 picks whichever parser is
        # installed (results differ between machines) and emits a warning.
        soup = BeautifulSoup(res, 'html.parser')
        return soup.find_all('div', class_='allrank')

    def get_li(self, rank):
        """Return all ``<li>`` elements contained in the *rank* element."""
        return rank.find_all('li')

    def musicInfo(self, base_url, res):
        """Collect the title and absolute URL of one song entry.

        Args:
            base_url: Site root used to resolve the entry's link.
            res: Element whose first ``<a>`` child is the song link.

        Returns:
            A dict with ``'title'`` (UTF-8 encoded bytes) and ``'url'``.

        Raises:
            AttributeError: If *res* contains no ``<a>`` element.
            KeyError: If the ``<a>`` element has no ``href`` attribute.
        """
        link = res.a
        return {
            # get_text() also works when the anchor wraps nested markup,
            # a case in which ``.string`` is None.
            'title': link.get_text().encode('utf8'),
            # urljoin copes with absolute hrefs and hrefs lacking a leading
            # slash, which plain string concatenation would corrupt.
            'url': urljoin(base_url, link.attrs['href']),
        }

    def saveToText(self, content):
        """Append every entry of *content* to ``music.txt`` as UTF-8 text.

        Args:
            content: Iterable of dicts as produced by :meth:`musicInfo`.
        """
        # io.open with an explicit encoding avoids the implicit ASCII
        # decode that mixing byte and unicode strings triggers on Python 2.
        with io.open('music.txt', 'a', encoding='utf-8') as f:
            for each in content:
                f.write(u'音乐名字:' + _to_text(each['title']) + u'\n')
                f.write(u'音乐地址:' + _to_text(each['url']) + u'\n\n')


def main():
    """Download the ranking page and store every listed song in music.txt."""
    base_url = 'http://www.1ting.com'
    rank_url = 'http://www.1ting.com/rank.html'
    spider = MySpider()
    page = spider.getHtml(rank_url)
    allrank = spider.get_allrank(page)
    songs = []
    try:
        for rank in allrank:
            for item in spider.get_li(rank):
                # Skip list items without a usable link instead of aborting
                # the whole run on the first one.
                if item.a is None or not item.a.get('href'):
                    continue
                songs.append(spider.musicInfo(base_url, item))
        spider.saveToText(songs)
    except Exception as e:
        print(str(e))
        # A failure must not report a success exit status.
        sys.exit(1)


if __name__ == '__main__':
    main()
结果:


浙公网安备 33010602011771号