京东手机信息爬取(全部手机)

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

仅学习参考,不可用于商业用途

version_0

说明:单线程爬虫,使用模块为python自带模块,包括urllib,json等

   写这个爬虫是为了熟悉urllib的基本使用,包括常用函数.urllib.build_opener()、urllib.parse.urljoin、urllib.parse.quote、urllib.request.urlopen

   urllib.request.install_opener()、http.cookiejar、urllib.request.HTTPHandler()、urllib.request.HTTPCookiesProcessor()

   请求频率通过random.uniform(),随机选取

   本爬虫目前只支持获取手机页面的信息。

   所有的图片信息,以链接方式保存。可以使用urllib.request.urlretrieve()下载。

   若要构造多线程爬虫,请参考:https://www.cnblogs.com/nuochengze/p/12861358.html

效果预览:

  

源码如下:

from urllib import request
from urllib import parse
from urllib import error
from http import cookiejar
import re
from pprint import pprint
import time
import random
import json


class JdPhoneInfo(object):
    def __init__(self,key_word):
        self.key_word = key_word

    def get_url(self,key_word,page_num,page_count):
        url_list = list()
        url_base = "https://search.jd.com/s_new.php?keyword=%E6%89%8B%E6%9C%BA&page=2&s=30"
        while page_num<page_count:
            info = {
                "keyword":key_word,
                "page":page_num+1,
                "s":page_num*30,
            }
            url_ = "s_new.php?"+parse.urlencode(info)
            url = parse.urljoin(base=url_base,url=url_)
            url_list.append(url)
            page_num += 1
        return url_list

    def parse_info(self,html_str):
        """获取整页的响应信息,包括page_count,page_current"""
        page_info = dict()
        # 获取页面总数
        page_count = re.compile(r'page_count:\"(.*?)\"',re.S).findall(html_str)
        page_info["page_count"] = int(page_count[0]) if page_count else None
        # 获取页面当页数
        page_current = re.compile(r'page:"(.*?)",page_count',re.S).findall(html_str)
        page_info["page_current"] = int(page_current[0]) if page_count else None
        # 获取所有的产品信息
        page_info["product_list"] = list()
        product_info_list = re.compile(r'class="p-img"(.*?)class="p-icons"', re.S).findall(html_str)
        ## 获取单个产品的信息
        for one_product_info in product_info_list:
            info = dict()
            # 获取标题及链接
            str_ = re.compile(r'p-name p-name-type-2(.*?)</div>',re.S).findall(one_product_info)[0]
            title = re.compile(r'em>(.*?)</em>',re.S).findall(str_)
            info["title"] =re.sub(r'\n|\t|\s|(<.*?>)','',title[0]).strip() if title else None
            href = re.compile(r'href="(.*?)"',re.S).findall(str_)
            info["href"] = "https:"+href[0] if href else None
            # 获取价格
            str_ = re.compile(r'class="p-price"(.*?)</div>',re.S).findall(one_product_info)[0]
            price = re.compile(r'i>(.*?)</i>', re.S).findall(str_)
            info["price"] = price[0] if price else None
            # 获取图片
            info["pic_info"] = list()
            img_list = re.compile(r'class="ps-item">(.*?)</li>',re.S).findall(one_product_info)
            for img in img_list:
                pic_info_ = dict()
                pic_title = re.compile(r'title="(.*?)">',re.S).findall(img)
                pic_info_["pic_title"] = pic_title[0] if pic_title else None
                pic_href = re.compile(r'data-lazy-img="(.*?)"',re.S).findall(img)
                pic_info_["pic_href"] = "https:"+pic_href[0] if pic_href else "---"
                info["pic_info"].append(pic_info_)
            # 获取评价连接
            info["comment_href"] = info["href"]+"#comment"
            # 获取售卖店铺及链接
            info["store"] = dict()
            str_ = re.compile(r'class="p-shop"(.*?)</div>',re.S).findall(one_product_info)[0]
            shop_name = re.compile(r'title="(.*?)"',re.S).findall(str_)
            info["store"]["shop_name"] = shop_name[0] if shop_name else None
            shop_href = re.compile(r'href="(.*?)"', re.S).findall(str_)
            info["store"]["shop_href"] = "https:"+shop_href[0] if shop_href else None
            # 将单个产品添加到产品列表
            page_info["product_list"].append(info)
        return page_info

    def get_request(self,first_url,url=None,url_index_num=None,url_list=None):
        # 构造cookie_handler和https_handler处理器
        cookjar_ = cookiejar.CookieJar()
        cookie_handler = request.HTTPCookieProcessor(cookjar_)
        https_handler = request.HTTPSHandler()
        opener = request.build_opener(cookie_handler, https_handler)
        request.install_opener(opener)
        use_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
        if url_list is not None:
            request_ = request.Request(url=url)
            request_.add_header("User-Agent", use_agent)
            if url_index_num == 0:
                request_.add_header(key="referer",val=first_url)
            else:
                request_.add_header(key="referer",val=url_list[url_index_num-1])
        else:
            # Request实例
            request_ = request.Request(url=first_url)
            # 添加header
            request_.add_header("User-Agent", use_agent)
        response_ = request.urlopen(request_)
        return response_
    
    def save_content(self,info):
        with open("jindong_phone_info.json",'a+',encoding='utf8') as f:
            f.write(json.dumps(info,ensure_ascii=False,indent=2))
            print("当前写入url",info["page_current"])

    def run(self):

        first_url = "https://search.jd.com/Search?keyword={}".format(parse.quote(self.key_word))
        # 获取页面的总页数
        ## 请求第一页
        first_response_html = self.get_request(first_url=first_url).read().decode()
        ## 提取信息
        page_info = self.parse_info(first_response_html)    # page_info接收一个字典
        # 保存内容
        self.save_content(page_info)
        # 获取构造的所有url
        url_list = self.get_url(self.key_word,page_num=page_info["page_current"],page_count=page_info["page_count"])
        for url in url_list:
            response_html = self.get_request(first_url=first_url,url=url,url_list=url_list,url_index_num=url_list.index(url)).read().decode()
            page_info = self.parse_info(response_html)
            # 保存内容
            self.save_content(page_info)
            num = random.uniform(1,2)
            time.sleep(num)
            
    
if __name__=="__main__":
    # key_word = input("请输入关键字:")
    key_word = "手机"
    print("本程序将采集以下信息:标题及连接,价格,图片,评价连接,售卖店铺及链接")
    obj = JdPhoneInfo(key_word)
    obj.run()

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

posted @ 2020-06-04 15:55  Norni  阅读(594)  评论(0编辑  收藏  举报