A scraper for JD.com product information and prices

Python 2.7. The script walks a JD category listing page by page, pulls each product id, then fetches the detail page to write the name, JD price, market price, and basic parameters to res.txt.

# -*- coding:utf-8 -*-

# import modules
import urllib2, re, urllib
from bs4 import BeautifulSoup
import json, time
import sys
reload(sys)
# force utf-8 as the default encoding (a common Python 2 workaround)
sys.setdefaultencoding('utf8')

fout = open(r'res.txt', "wb")
tot = 0

# the scraper class
class JD:
    # number of products scraped so far
    prodNum = 1
    # initialize parameters
    def __init__(self, baseurl, page):
        self.baseurl = baseurl
        self.page = page
        # assemble the full url
        self.url = self.baseurl + '&' + 'page=' + str(self.page)

    def getHtml(self, url):
        try:
            # build the request object
            request = urllib2.Request(url)
            # fetch the response
            response = urllib2.urlopen(request)
            # read the page source
            html = response.read()
        except:
            # on any failure, wait briefly and retry
            time.sleep(0.1)
            return self.getHtml(url)
        # return the page source
        return html


    # get the total number of pages
    def getNum(self, html):
        # wrap the source in a BeautifulSoup object
        soup = BeautifulSoup(html, 'html.parser')
        # locate the node holding the total page count
        items = soup.find_all('span', class_='p-skip')
        # extract the total page count
        for item in items:
            pagenum = item.find('em').find('b').string
        return pagenum
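    # For reference, the pagination markup targeted above is assumed to be
    # shaped roughly like:
    #   <span class="p-skip">...<em>共<b>53</b>页</em>...</span>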

    # get the list of product ids on a listing page
    def getIds(self, html):
        # pattern matching product links like //item.jd.com/<id>.html
        pattern = re.compile('<a target="_blank" href="//item.jd.com/(.*?)\.html".*?>')
        # collect every matching id
        items = re.findall(pattern, html)
        return items

    # get the list of same-model variant ids for a product id
    def getIdByItems(self, id):
        # assemble the detail-page url
        url = basePd + str(id) + '.html'
        # fetch the page source
        html = self.getHtml(url)
        # wrap the source in a BeautifulSoup object
        soup = BeautifulSoup(html, 'html.parser')
        # locate the variant block
        items = soup.find('div', class_='dd clearfix')
        l = []
        if items is None:
            return l
        # collect the variant ids
        for item in items:
            pattern = re.compile('href="//item.jd.com/(.*?)\.html".*?>')
            id = re.findall(pattern, str(item))
            if id:
                l += id
        return l

    # get the product price from JD's price API
    def getPrice(self, id):
        url = 'http://p.3.cn/prices/mgets?skuIds=J_' + str(id)
        jsonString = self.getHtml(url)
        jsonObject = json.loads(jsonString.decode())
        price_jd = jsonObject[0]['p']
        price_mk = jsonObject[0]['m']
        fout.write('jd price:' + str(price_jd) + '\n')
        fout.write('market price:' + str(price_mk) + '\n')
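    # For reference, the mgets endpoint returns a JSON array; a response for
    # one sku is assumed to look roughly like (values are illustrative):
    #   [{"id": "J_1234567", "p": "599.00", "m": "999.00"}]
    # where 'p' is the JD price and 'm' is the market/list price.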

    # download the product image (currently disabled)
    def getImg(self, html, subid):
        '''
        pattern = re.compile(r'<img id=.*?data-origin="(.*?)" alt=.*?', re.S)
        items = re.findall(pattern, html)
        for item in items:
            imgurl = 'http:%s' % (item)
            urllib.urlretrieve(imgurl, 'd:/temp/jdimg/%s.jpg' % (str(subid)))
        '''

    # extract and write the product details
    def getContent(self, html, subid):
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find('div', class_='sku-name')
        fout.write('\n-----------------' + str(JD.prodNum) + '--------------------\n')
        try:
            # product name
            for t in title:
                fout.write('name:' + t.string + '\n')
        except:
            return
        time.sleep(1)
        # price
        self.getPrice(subid)
        # first parameter list: blank out its <p> entries
        items1 = soup.find_all('ul', class_='parameter1 p-parameter-list')
        for item in items1:
            p = item.findAll('p')
            for i in p:
                i.string = ""
        # second parameter list: basic product info
        items2 = soup.find_all('ul', class_='parameter2 p-parameter-list')
        for item in items2:
            p = item.findAll('li')
            if len(str(p[0].string)) > 0:
                fout.write(str(p[0].string))
            fout.write('\n')
            '''
            for i in p:
                if len(str(i.string)) > 0:
                    fout.write(str(i.string))
                fout.write('\n')
            '''
        # specs and packaging (currently disabled)
        '''
        items3 = soup.find_all('div', class_='Ptable-item')
        for item in items3:
            contents1 = item.findAll('dt')
            contents2 = item.findAll('dd')
            for i in range(len(contents1)):
                if len(str(contents1[i].string)) > 0 and len(str(contents2[i].string)) > 0:
                    fout.write(contents1[i].string)
                    if len(str(contents2[i].string)) > 0:
                        fout.write(str(contents2[i].string))
                    fout.write('\n')
        '''
        JD.prodNum += 1
        print JD.prodNum
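    # For reference, each product ends up in res.txt as a numbered block
    # shaped roughly like this (values are illustrative):
    #   -----------------1--------------------
    #   name:<product title>
    #   jd price:599.00
    #   market price:999.00
    #   <basic parameter lines>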

    # run the scraper
    def start(self):
        html = self.getHtml(self.url)
        pageNum = self.getNum(html)
        print 'doing............'
        #time.sleep(3)
        print 'finish. all', pageNum, 'pages'
        #time.sleep(1)
        print 'doing.........'
        # loop 1 -- pages
        for page in range(1, int(pageNum) + 1):
            url = self.baseurl + '&' + 'page=' + str(page)
            html = self.getHtml(url)
            ids = self.getIds(html)
            # loop 2 -- products on the page
            for id in ids:
                urlprod = basePd + str(id) + '.html'
                htmlprod = self.getHtml(urlprod)
                '''
                subids = self.getIdByItems(id)
                '''
                self.getContent(htmlprod, id)
                self.getImg(htmlprod, id)
                '''
                # loop 3 -- variant products (currently disabled)
                for subid in subids:
                    urlsubprod = basePd + str(subid) + '.html'
                    subhtml = self.getHtml(urlsubprod)
                    time.sleep(1)
                    self.getContent(subhtml, subid)
                    self.getImg(subhtml, subid)
                '''


# base url for product detail pages
basePd = 'http://item.jd.com/'
# entry url for the category listing
baseURL = 'http://list.jd.com/list.html?cat=9987,653,655'
# create the spider object
spider = JD(baseURL, 1)

# start scraping
spider.start()
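
To reuse the spider for another category, swap the cat parameter in the listing URL. A minimal sketch, assuming the same listing-page layout (the cat ids below are made-up placeholders):

# scrape a different category (cat ids here are illustrative placeholders)
baseURL = 'http://list.jd.com/list.html?cat=1234,567,890'
spider = JD(baseURL, 1)
spider.start()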
posted @ 2017-05-07 11:07  qscqesze