初学爬虫之爬取某电商IPHONE信息
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/12/30 10:51
# @Site    :
# @File    : jd_iphone.py
# @Software: PyCharm
"""Scrape iPhone listings from JD.com search results into an .xls workbook.

Search-result pages are rendered with headless Chrome (Selenium) and parsed
with PyQuery; each product's detail page is then fetched with ``requests``
and one row per product is written with ``xlwt``.

The code is written to run on both Python 2 and Python 3.
"""
import json
import logging
import os
import time

import requests
import urllib3
import xlwt
from pyquery import PyQuery
from selenium import webdriver

# TLS verification is switched off for the detail-page requests (the original
# author did this to capture traffic with Fiddler), so silence the warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

headers = {
    "Referer": "https://search.jd.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}

# Console logging.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger_page = logging.getLogger("jd_iphone_page")
logger_detail = logging.getLogger("jd_iphone_detail")

# A search hit is kept only if one of its highlighted <font> keywords is
# exactly one of these strings.
_APPLE_KEYWORDS = frozenset([u'苹果', u'iphone', u'Apple', u'apple'])

# Seconds to wait for a detail page before giving up on that product.
_REQUEST_TIMEOUT = 30

# Spreadsheet columns, in output order (also the keys of each product dict).
_DETAIL_COLUMNS = ("Title", "JD_price", "Shop", "Tag",
                   "Colour", "RAM", "Style", "Link")

# Scrolls the page down in small steps until ~90% of the document height has
# been reached, so that lazily loaded search results get rendered.
_SCROLL_JS = '''
timer = setInterval(function(){
    var scrollTop=document.documentElement.scrollTop||document.body.scrollTop;
    var ispeed=Math.floor(document.body.scrollHeight / 100);
    if(scrollTop > document.body.scrollHeight * 90 / 100){
        clearInterval(timer);
    }
    console.log('scrollTop:'+scrollTop)
    console.log('scrollHeight:'+document.body.scrollHeight)
    window.scrollTo(0, scrollTop+ispeed)
}, 20)
'''


def _strip_scheme(href):
    """Return *href* without a leading ``http:``/``https:`` scheme."""
    for scheme in ("https:", "http:"):
        if href.startswith(scheme):
            return href[len(scheme):]
    return href


def _result_path(timestamp):
    """Return the output .xls path for *timestamp*, creating its directory.

    The file lives in ``result_data`` under the parent of the current working
    directory, as in the original script.
    """
    result_dir = os.path.join(os.path.dirname(os.path.abspath('.')),
                              'result_data')
    if not os.path.isdir(result_dir):
        os.makedirs(result_dir)
    return os.path.join(result_dir, 'JD_iphone_' + timestamp + '.xls')


def _parse_search_item(item):
    """Extract ``[href, price, shop, tag]`` from one search-result ``<li>``.

    Returns ``None`` when the item has no usable link or none of its
    highlighted keywords identifies it as an Apple phone.
    """
    data_sku = item.attr('data-sku')
    price = item.find(".J_%s" % data_sku).text()
    shop = item.find('.J_im_icon').text()
    tag = item.find("#J_pro_%s" % data_sku).text()

    anchors = item.find(".gl-i-wrap div a")
    keywords = [font.text() for font in anchors.find('font').items()]
    href = anchors.attr('href')

    # attr() yields None when the first matched anchor has no href; skip such
    # items instead of crashing the whole crawl.
    if not href or _APPLE_KEYWORDS.isdisjoint(keywords):
        return None
    return [_strip_scheme(href), price, shop, tag]


def get_page_detail(maxp):
    """Collect Apple-phone hits from the first *maxp* JD search-result pages.

    Args:
        maxp: number of result pages to crawl.

    Returns:
        A list of ``[href, price, shop, tag]`` lists, one per accepted
        product. ``href`` has its ``http:``/``https:`` scheme removed.
    """
    product_list = []
    p_no = 0

    # Headless Chrome; one browser is shared by all pages and always closed.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.implicitly_wait(10)
        # The URL's page parameter takes the odd values 1, 3, 5, ... which
        # are reported as pages 1, 2, 3, ... in the log.
        for page in range(1, 2 * maxp, 2):
            page_no = (page + 1) // 2
            url = 'https://search.jd.com/Search?keyword=iphoneapple&page=' + str(page) + '&click=0'
            driver.get(url)
            driver.execute_script(_SCROLL_JS)
            time.sleep(5)  # give the scroll script time to reach the bottom

            # Original author's note: parser="html" is required here; without
            # it PyQuery could not parse this page.
            doc = PyQuery(driver.page_source, parser="html")
            logger_page.info("正在获取%s页数据......", page_no)

            for item in doc("#J_goodsList li").items():
                row = _parse_search_item(item)
                if row is None:
                    continue
                product_list.append(row)
                p_no += 1
                logger_page.info('正在获取%s页,第%s个产品信息......', page_no, p_no)
    finally:
        driver.quit()
    return product_list


def save_excel(contents, workbook, worksheet):
    """Write all of *contents* to *worksheet* and save the workbook once.

    Args:
        contents: list of dicts, ``[{...}, {...}]``; the keys of the first
            dict become the header row, and each dict's values become one
            data row.
        workbook: the ``xlwt`` workbook that owns *worksheet*.
        worksheet: the sheet to write into.
    """
    now = time.strftime("%Y_%m_%d %H_%M_%S")
    for row, content in enumerate(contents):
        if row == 0:
            # Keys of the first record form the header.
            for col, key in enumerate(content):
                worksheet.write(0, col, key)
        for col, value in enumerate(content.values()):
            worksheet.write(row + 1, col, value)
        logger_detail.info("第%s条信息保存成功,success!!!", row + 1)
    workbook.save(_result_path(now))


def product_detail(list):
    """Fetch every product's detail page and save one spreadsheet row each.

    The workbook is saved after every row, so rows written so far survive a
    later failure. A product that cannot be fetched, parsed or saved is
    logged and skipped.

    Args:
        list: rows as returned by ``get_page_detail``:
            ``[href, price, shop, tag]`` with a scheme-less ``href``.
            (The parameter name shadows the builtin; it is kept so existing
            callers keep working.)

    Returns:
        A list with the dict of every product that was saved successfully.
    """
    products = list
    now = time.strftime("%Y_%m_%d %H_%M_%S")
    output_path = _result_path(now)
    product_info = []
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('Jd_iphone')
    next_row = 0  # sheet row of the most recently started data row

    for index, link in enumerate(products, 1):
        try:
            url = 'http:' + link[0]
            logger_detail.info("正在获取第%s条信息......", index)
            detail_html = requests.get(url, headers=headers, verify=False,
                                       timeout=_REQUEST_TIMEOUT)
            doc = PyQuery(detail_html.text, parser="html")
            # Price/shop/tag come from the search row being processed, so
            # they stay paired with this link even after earlier failures.
            product_dic = {
                "Title": doc(".itemInfo-wrap div.sku-name").text(),
                "JD_price": link[1],
                "Shop": link[2],
                "Tag": link[3],
                "Colour": doc("#choose-attr-1 div.item").text(),
                "RAM": doc("#choose-attr-2 div.item").text(),
                "Style": doc("#choose-attr-3 div.item").text(),
                "Link": url,
            }

            if next_row == 0:
                for col, key in enumerate(_DETAIL_COLUMNS):
                    worksheet.write(0, col, key)
            # Claim the row before writing: xlwt refuses to overwrite a cell,
            # so a half-written row must never be reused.
            next_row += 1
            for col, key in enumerate(_DETAIL_COLUMNS):
                worksheet.write(next_row, col, product_dic[key])
            workbook.save(output_path)

            product_info.append(product_dic)
            logger_detail.info("第%s条信息保存成功,success!!!", index)
        except Exception as e:  # per-item boundary: log and move on
            logger_detail.error("第%s条信息保存失败\n失败原因:%s", index, e)
    return product_info


if __name__ == '__main__':
    products = get_page_detail(1)
    product_detail(products)
所有转载均用于学习,不作商业用途!!!
浙公网安备 33010602011771号