东篱野鹤

导航

初学爬虫之爬取某电商IPHONE信息

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/12/30 10:51
# @Site    : 
# @File    : jd_iphone.py
# @Software: PyCharm

import os
import json
import time
import urllib3
import logging
import requests
import xlwt
# import xlsxwriter as xlwt
from pyquery import PyQuery
from selenium import webdriver

# Suppress urllib3's InsecureRequestWarning (the original note mentions
# capturing traffic through Fiddler, where certificate checks are skipped).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# HTTP headers: a Referer plus a desktop Chrome User-Agent string.
headers = {
    "Referer": "https://search.jd.com/",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/78.0.3904.108 Safari/537.36"
    ),
}

# Console logging at INFO level; one named logger for the search-page stage
# and one for the product-detail stage.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger_page = logging.getLogger("jd_iphone_page")
logger_detail = logging.getLogger("jd_iphone_detail")

def _parse_goods_item(item):
    """Turn one ``#J_goodsList li`` entry into ``[href, price, shop, tag]``.

    ``item`` is the PyQuery selection for a single search-result entry.
    Returns ``None`` when the entry has no product link, or when none of the
    ``<font>`` texts inside the link (presumably the keywords the site
    highlights -- confirm against the live markup) is an Apple/iPhone keyword.
    """
    # Price and tag nodes are addressed through the entry's data-sku value.
    data_sku = item.attr('data-sku')
    price = item.find(".J_%s" % data_sku).text()
    shop = item.find('.J_im_icon').text()
    tag = item.find("#J_pro_%s" % data_sku).text()

    link = item.find(".gl-i-wrap div a")
    href = link.attr('href')
    if not href:
        # No usable link: nothing to fetch later, skip the entry.
        return None

    # Compare case-insensitively so "iPhone" / "APPLE" are accepted too.
    apple_keywords = (u'苹果', u'iphone', u'apple')
    highlighted = [font.text().lower() for font in link.find('font').items()]
    if not any(word in apple_keywords for word in highlighted):
        return None

    # Store the link in protocol-relative form ("//host/path"); only a leading
    # scheme is stripped, never an "http:" that appears inside the URL.
    for scheme in ("https:", "http:"):
        if href.startswith(scheme):
            href = href[len(scheme):]
            break
    return [href, price, shop, tag]


def get_page_detail(maxp):
    """Collect iPhone product entries from the first ``maxp`` search pages.

    Each page is rendered in one shared headless Chrome session, scrolled by
    a small script, and parsed with PyQuery.

    :param maxp: number of result pages to visit; the site's ``page`` query
        parameter takes the values 1, 3, 5, ... for pages 1, 2, 3, ...
    :return: list of ``[href, price, shop, tag]`` lists, where ``href`` is
        protocol-relative (``//...``). Empty when ``maxp`` yields no pages.
    """
    product_list = []
    pages = range(1, 2*maxp, 2)
    if not pages:
        # Nothing to visit: do not launch a browser at all.
        return product_list

    # Scroll script: every 20 ms move down by 1/100 of the page height and
    # stop once the position passes 90% of the height (presumably so that
    # lazily loaded results get rendered -- confirm).
    js = '''
        timer = setInterval(function(){
           var scrollTop=document.documentElement.scrollTop||document.body.scrollTop;
           var ispeed=Math.floor(document.body.scrollHeight / 100);
           if(scrollTop > document.body.scrollHeight * 90 / 100){
               clearInterval(timer);
           }
           console.log('scrollTop:'+scrollTop)
           console.log('scrollHeight:'+document.body.scrollHeight)
           window.scrollTo(0, scrollTop+ispeed)
        }, 20)
        '''

    # Headless (windowless) browser, created once and reused for every page.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.implicitly_wait(10)
        for page in pages:
            page_no = (page + 1) // 2  # integer page number on Python 2 and 3
            # Search URL filtered to iPhone products.
            url = 'https://search.jd.com/Search?keyword=iphoneapple&page='+str(page)+'&click=0'
            driver.get(url)
            driver.execute_script(js)
            time.sleep(5)  # give the scroll script time to run
            # Author's note: without parser="html" PyQuery cannot parse this
            # source, because the page is XHTML rather than plain HTML.
            doc = PyQuery(driver.page_source, parser="html")
            logger_page.info("正在获取%s页数据......", page_no)
            for item in doc("#J_goodsList li").items():
                row = _parse_goods_item(item)
                if row is None:
                    continue
                product_list.append(row)
                logger_page.info('正在获取%s页,第%s个产品信息......', page_no, len(product_list))
    finally:
        # Always shut the browser down, even when a page fails, so no Chrome
        # process is left behind.
        driver.quit()
    return product_list

def save_excel(contents,workbook,worksheet):
    """Write every record to the sheet in one pass, then save the workbook.

    :param contents: records to store, as a list of dicts (``[{}, {}, {}]``).
        The keys of the first dict become the header row (row 0); the values
        of record ``i`` are written to row ``i + 1`` in that dict's own
        iteration order.
    :param workbook: instantiated workbook; only ``save(path)`` is called.
    :param worksheet: instantiated sheet of that workbook; only
        ``write(row, col, value)`` is called.

    The file is saved as ``result_data/JD_iphone_<timestamp>.xls`` under the
    parent of the current working directory; the directory is created when
    it does not exist.
    """
    now = time.strftime("%Y_%m_%d %H_%M_%S")
    for row, content in enumerate(contents):
        if row == 0:
            # Iterating the dict yields its keys; enumerate avoids indexing
            # a dict view, which is not subscriptable on Python 3.
            for col, key in enumerate(content):
                worksheet.write(0, col, key)
        for col, value in enumerate(content.values()):
            worksheet.write(row + 1, col, value)
        logger_detail.info("第%s条信息保存成功,success!!!", row + 1)
    # os.path.join keeps the path valid on every platform (the result is the
    # same as the hand-built backslash path on Windows).
    out_dir = os.path.join(os.path.dirname(os.path.abspath('.')), 'result_data')
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    workbook.save(os.path.join(out_dir, 'JD_iphone_' + now + '.xls'))

def product_detail(list):
    """Fetch each product's detail page and export the rows to an .xls file.

    :param list: entries shaped ``[href, price, shop, tag]`` where ``href`` is
        protocol-relative (``//...``). The parameter name shadows the builtin
        and is kept only for backward compatibility, so the builtin ``list``
        is deliberately not used inside this function.
    :return: list of the product dicts that were written successfully, with
        the keys Title, JD_price, Shop, Tag, Colour, RAM, Style and Link.

    Rows are written one at a time and the workbook is saved after every row,
    so an interrupted run keeps the rows gathered so far. The file is
    ``result_data/JD_iphone_<timestamp>.xls`` under the parent of the current
    working directory. A failure on one entry is logged and the entry is
    skipped; it does not abort the run.
    """
    # Fixed column order, independent of dict ordering.
    columns = ("Title", "JD_price", "Shop", "Tag", "Colour", "RAM", "Style", "Link")
    now = time.strftime("%Y_%m_%d %H_%M_%S")
    out_dir = os.path.join(os.path.dirname(os.path.abspath('.')), 'result_data')
    out_path = os.path.join(out_dir, 'JD_iphone_' + now + '.xls')

    product_info = []
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('Jd_iphone')
    # The column names double as the header row.
    for col, key in enumerate(columns):
        worksheet.write(0, col, key)

    row = 0  # last sheet row used; advances only for entries that are written
    for index, link in enumerate(list, 1):
        try:
            url = 'http:'+link[0]
            logger_detail.info("正在获取第%s条信息......", index)
            # A timeout keeps one stalled request from hanging the whole run.
            detail_html = requests.get(url, verify=False, timeout=10)
            doc = PyQuery(detail_html.text, parser="html")

            # Price, shop and tag come from the entry being processed, so
            # they stay aligned with the URL even after earlier failures.
            product_dic = {
                "Title": doc(".itemInfo-wrap div.sku-name").text(),
                "JD_price": link[1],
                "Shop": link[2],
                "Tag": link[3],
                "Colour": doc("#choose-attr-1 div.item").text(),
                "RAM": doc("#choose-attr-2 div.item").text(),
                "Style": doc("#choose-attr-3 div.item").text(),
                "Link": url,
            }

            # Advance before writing so a row that fails half-way is never
            # written to again.
            row += 1
            for col, key in enumerate(columns):
                worksheet.write(row, col, product_dic[key])
            if not os.path.isdir(out_dir):
                os.makedirs(out_dir)
            workbook.save(out_path)
            product_info.append(product_dic)
            logger_detail.info("第%s条信息保存成功,success!!!", index)
        except Exception as e:
            # Best-effort per entry: record the reason and move on.
            logger_detail.error("第%s条信息保存失败\n失败原因:%s", index, e)
            continue
    return product_info
        # print(json.dumps(product_dic,encoding='UTF-8', ensure_ascii=False))#字典中文输出

if __name__ == '__main__':
    # Scrape the first search-result page, then fetch every product's detail
    # page and export the rows to an .xls file.
    # The result is not bound to the name ``list``: rebinding it at module
    # level would shadow the builtin for every function in this file.
    products = get_page_detail(1)
    product_detail(products)

posted on 2020-01-01 17:40  东篱野鹤  阅读(328)  评论(0)    收藏  举报