一听音乐歌曲抓取

# coding: utf-8

from bs4 import BeautifulSoup
import urllib2
import re


#http://www.1ting.com/player/d3/player_1089440.html

class MySpider(object):
    """Small scraping helper.

    The methods form a pipeline: download a page, locate the ranking
    containers in it, list their ``<li>`` items, turn each item into a
    ``{'title', 'url'}`` dict and append the collected dicts to a text file.
    """

    def getHtml(self, url):
        """Download ``url`` and return the raw response body.

        A browser-like ``User-Agent`` header is sent with the request.
        """
        req = urllib2.Request(url)
        req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.94 Safari/537.36')
        response = urllib2.urlopen(req)
        try:
            return response.read()
        finally:
            # urllib2 response objects do not support ``with``, so release
            # the connection explicitly.
            response.close()

    def get_allrank(self, res):
        """Return every ``<div class="allrank">`` element found in ``res``.

        ``res`` is the HTML markup to parse. The parser is named explicitly
        so the result does not depend on which optional parsers happen to be
        installed alongside BeautifulSoup.
        """
        soup = BeautifulSoup(res, 'html.parser')
        return soup.find_all('div', class_='allrank')

    def get_li(self, rank):
        """Return all ``<li>`` elements contained in ``rank``."""
        return rank.find_all('li')

    def musicInfo(self, base_url, res):
        """Build the info dict for one list item.

        ``res.a`` must be the item's anchor element. The returned dict has:

        * ``'title'`` - the anchor text as a UTF-8 encoded byte string
        * ``'url'``   - ``base_url`` concatenated with the anchor's ``href``

        Raises ``AttributeError`` if ``res`` has no anchor and ``KeyError``
        if the anchor has no ``href`` attribute.
        """
        anchor = res.a
        title = anchor.string
        if title is None:
            # ``.string`` is None when the anchor holds more than one child
            # node (nested markup); fall back to its combined text.
            title = anchor.get_text()
        return {
            'title': title.encode('utf8'),
            'url': base_url + anchor.attrs['href'],
        }

    def _as_text(self, value):
        """Return ``value`` as text, decoding UTF-8 byte strings."""
        if isinstance(value, bytes):
            return value.decode('utf8')
        return value

    def saveToText(self, content):
        """Append each entry's title and URL to ``music.txt`` as UTF-8 text.

        ``content`` is an iterable of dicts with ``'title'`` and ``'url'``
        keys. Each value may be a UTF-8 byte string or text.
        """
        import io

        # musicInfo yields the title as bytes but the URL as text. Joining
        # those with a non-ASCII label forces an implicit ASCII conversion
        # and fails, so everything is normalised to text first and encoded
        # once by the file object.
        with io.open('music.txt', 'a', encoding='utf8') as f:
            for each in content:
                f.write(u'音乐名字:' + self._as_text(each['title']) + u'\n')
                f.write(u'音乐地址:' + self._as_text(each['url']) + u'\n\n')




def main():
    """Fetch the 1ting.com ranking page and save every listed song.

    Each ``<li>`` inside the page's ranking containers is converted to a
    title/URL dict, and the collected dicts are written out through
    ``MySpider.saveToText``. If extraction or saving fails, the error
    message is printed and the process exits with status 1.
    """
    import sys

    lis = []
    base_url = 'http://www.1ting.com'
    rank_url = 'http://www.1ting.com/rank.html'
    ms = MySpider()
    page = ms.getHtml(rank_url)
    allrank = ms.get_allrank(page)
    try:
        for rank in allrank:
            # A distinct loop name keeps the page source from being shadowed.
            for item in ms.get_li(rank):
                lis.append(ms.musicInfo(base_url, item))
        ms.saveToText(lis)
    except Exception as e:
        print(str(e))
        # Non-zero status so callers can tell the run failed; sys.exit is
        # used because the bare ``exit`` helper is provided by ``site`` and
        # is not guaranteed to exist.
        sys.exit(1)


# Run the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

结果:

posted @ 2016-06-12 14:52  kennyhip  阅读(225)  评论(0)    收藏  举报