
Scraping the major e-commerce platforms

Fetch book listings from Tmall, JD.com, Dangdang, and Kongfuzi based on a search keyword.

Tmall

import requests
from bs4 import BeautifulSoup
import bs4

proxies = {
    'http': 'http://117.135.153.10:80'
}

headers = {'User-Agent': 'Mozilla/5.0'}


def getHTMLText(url):   # generic page fetch
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)   # set proxy and timeout
        r.raise_for_status()
        r.encoding = 'gbk'
    except:
        print("Tmall search failed")
        return ""
    return r.text


def getUnivList(allUnivList, text, Limit):

    bookInfo = []  # per-item record: 0 price, 1 title, 2 review count, 3 image, 4 shop name, 5 platform, 6 detail-page URL, 7 filter flag, 8 product ID

    soup = BeautifulSoup(text, "html.parser")         # parse the full listing in one pass
    books = soup.findAll('div', class_='product', limit=Limit)  # one div per book
    length = len(books)
    if length == 0: return ""    # empty search result: bail out

    for book in books:
        if isinstance(book, bs4.element.Tag):
            productPrice = book.findAll('p', class_='productPrice')  # price
            bookInfo.append(str(productPrice[0].text)[2:])  # strip the leading currency prefix

            productTitle = book.findAll('p', class_='productTitle')  # title, truncated to 22 characters
            if len(productTitle):
                if len(str(productTitle[0].text.strip())) <= 22:
                    bookInfo.append(str(productTitle[0].text.strip()))
                else:
                    bookInfo.append(str(productTitle[0].text.strip())[0:22] + "...")
            else:
                bookInfo.append('')


            productStatus = book.findAll('p', class_='productStatus')  # review count
            bookInfo.append(str(productStatus[0].text))

            productImg = book.findAll('div', class_='productImg-wrap')  # cover image
            try:
                productImg = productImg[0].a.img['src']
            except:
                productImg = productImg[0].a.img['data-ks-lazyload']  # lazy-loaded images keep the URL here
            bookInfo.append('https:' + str(productImg))

            productShopName = book.findAll('a', class_='productShop-name')  # shop name
            bookInfo.append(str(productShopName[0].text))

            bookInfo.append('Tmall')  # platform label

            link = book.findAll('a', class_='productImg')  # detail-page link
            bookInfo.append('https:' + str(link[0]['href']))

            bookInfo.append('2')  # filter flag

            bookInfo.append(str(book['data-id']))  # product ID

            allUnivList.append(bookInfo)
            bookInfo = []                     # reset the per-item record

def tianmao(key, Limit):   # known bug: breaks when the search returns fewer than 10 items
    allUnivList = []
    url = 'https://list.tmall.com/search_product.htm?q=' + key
    text = getHTMLText(url)
    if text == "": return ""  # nothing fetched: return empty
    getUnivList(allUnivList, text, Limit)
    return allUnivList
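
A minimal usage sketch (the keyword is hypothetical; the record layout follows the index comment in getUnivList above):

for record in tianmao('python', 10):
    # record: [price, title, review count, image URL, shop, platform, detail URL, filter flag, product ID]
    print(record[1], record[0], record[6])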

JD.com

import bs4
import random
import json
import requests
from bs4 import BeautifulSoup

proxies = {
    'http': 'http://117.135.153.10:80'
}

headers = {'User-Agent': 'Mozilla/5.0'}

def getHTMLText(url):   # generic page fetch
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)   # set proxy and timeout
        r.raise_for_status()
        if url.find('https://item.jd.com/') != -1:
            r.encoding = 'gbk'    # item pages are GBK-encoded
        else:
            r.encoding = 'utf-8'  # search pages are UTF-8
    except:
        print("JD search failed")
        return ""
    return r.text


def getPrice(url):
    try:
        r = requests.get(url, headers=headers, timeout=10)   # price API fetched directly, no proxy
        r.raise_for_status()
        r.encoding = 'utf-8'
        result = json.loads(r.text)
        price = result[0]['p']   # the API returns a JSON list; 'p' holds the current price
    except:
        print("failed to fetch price")
        return ""
    return price
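
The price endpoint returns a JSON array with one object per queried SKU. The field set below is an assumption reconstructed from how getPrice reads it ('p') plus typical responses while the endpoint was live; treat it as illustrative:

# hypothetical response for skuIds=J_12146215:
# [{"id": "J_12146215", "p": "89.00", "m": "128.00", "op": "99.00"}]
# 'p' is the current price; getPrice() returns it as a string
print(getPrice('https://p.3.cn/prices/mgets?skuIds=J_12146215'))  # -> '89.00'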


def getUnivList(allUnivList, text, Limit):
    bookUrls = []
    bookPriceURLs = []
    bookInfo = []  # per-item record: 0 price, 1 title, 2 comment count, 3 image, 4 shop name, 5 platform, 6 detail-page URL

    soup = BeautifulSoup(text, "html.parser")
    bookIDs = soup.findAll('li', {'class': 'gl-item'}, limit=Limit)  # each result li carries the SKU in data-sku
    length = len(bookIDs)
    if length == 0: return ""    # empty search result: bail out
    commentNum = soup.findAll('div', class_='p-commit', limit=Limit)  # comment counts
    img = soup.findAll('div', class_='p-img', limit=Limit)  # cover images

    for i in range(length):
        bookUrls.append('https://item.jd.com/' + str(bookIDs[i].get('data-sku')) + '.html')  # detail-page URL
        bookPriceURLs.append('https://p.3.cn/prices/mgets?skuIds=J_' + str(bookIDs[i].get('data-sku')))  # price-API URL

    for i in range(length):  # known bug: raises when an element is missing; guard with isinstance(div, bs4.element.Tag) and fall back to ''
        text = getHTMLText(bookUrls[i])
        if text == "": return ""  # nothing fetched: return empty
        soup = BeautifulSoup(text, "html.parser")

        bookInfo.append(getPrice(bookPriceURLs[i]))  # price

        name = soup.findAll('div', class_='sku-name', limit=1)  # title, truncated to 22 characters
        if len(name):
            if len(str(name[0].text.strip())) <= 22:
                bookInfo.append(str(name[0].text.strip()))
            else:
                bookInfo.append(str(name[0].text.strip())[0:22] + "...")
        else:
            bookInfo.append('')

        bookInfo.append(str(commentNum[i].text.strip()))  # comment count

        if img[i].img.get('src') is None:
            bookInfo.append('http:' + str(img[i].img.get('source-data-lazy-img')))  # lazy-loaded image
        else:
            bookInfo.append('http:' + str(img[i].img.get('src')))  # image


        shopName = soup.findAll('div', class_='J-hove-wrap', limit=1)  # shop name

        if len(shopName):
            shopName = shopName[0].findAll('a', limit=1)  # findAll() works on any tag, not just the soup root
            if len(shopName):
                bookInfo.append(str(shopName[0].text.strip()))
            else:
                bookInfo.append('JD self-operated')
        else:
            bookInfo.append('')

        bookInfo.append("JD")         # platform label
        bookInfo.append(bookUrls[i])  # detail-page URL
        allUnivList.append(bookInfo)
        bookInfo = []

def jingdong(key, allUnivList, Limit):   # known bug: breaks when the search returns fewer than 10 items
    url = 'https://search.jd.com/Search?keyword=' + key + '&enc=utf-8&wq=' + key
    text = getHTMLText(url)
    if text == "": return ""  # nothing fetched: return empty
    getUnivList(allUnivList, text, Limit)


def main():
    key = ""
    Limit = 10
    allUnivList = []
    jingdong(key, allUnivList, Limit)


main()



# r.request.headers shows the request that was actually sent
# reading a missing attribute with tag.get() returns None instead of raising
# isinstance(div, bs4.element.Tag) checks that a lookup really returned a tag
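
A minimal illustration of the last two tips, on a throwaway snippet:

snippet = BeautifulSoup('<div class="p-img"><img alt="no src"/></div>', 'html.parser')
div = snippet.find('div', class_='p-img')
if isinstance(div, bs4.element.Tag):   # find() returned a tag, not None
    print(div.img.get('src'))          # missing attribute -> None, no KeyError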


'''from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(options=chrome_options)

driver.get("https://item.jd.com/12146215.html")
element = driver.find_element_by_id("jd-price").text
print(element)'''

Dangdang

import requests
from bs4 import BeautifulSoup
import bs4

proxies = {
    'http': 'http://123.59.199.100:80'
}

headers = {'User-Agent': 'Mozilla/5.0'}


def getHTMLText(url):   # generic page fetch
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)   # set proxy and timeout
        r.raise_for_status()
        r.encoding = 'gbk'
    except:
        print("Dangdang search failed")
        return ""
    return r.text


def getUnivList(allUnivList, text, Limit):

    bookInfo = []  # per-item record: 0 price, 1 title, 2 rating, 3 image, 4 shop name, 5 platform, 6 detail-page URL, 7 filter flag, 8 product ID

    soup = BeautifulSoup(text, "html.parser")         # parse the full listing in one pass
    books = soup.findAll('ul', class_='bigimg', limit=1)  # the result list
    books = books[0].findAll('li', limit=Limit)           # one li per book
    length = len(books)
    if length == 0: return ""    # empty search result: bail out

    for book in books:
        if isinstance(book, bs4.element.Tag):
            productPrice = book.findAll('span', class_='search_now_price')  # price
            bookInfo.append(str(productPrice[0].text))

            productTitle = book.findAll('p', class_='name')  # title, truncated to 22 characters
            if len(productTitle):
                if len(str(productTitle[0].text.strip())) <= 22:
                    bookInfo.append(str(productTitle[0].text.strip()))
                else:
                    bookInfo.append(str(productTitle[0].text.strip())[0:22] + "...")
            else:
                bookInfo.append('')

            productStatus = book.findAll('p', class_='search_star_line')  # rating
            bookInfo.append(str(productStatus[0].text))

            productImg = book.findAll('a', class_='pic')  # cover image
            link = productImg[0]['href']                  # detail-page URL
            try:
                productImg = productImg[0].img['data-original']  # lazy-loaded images keep the URL here
            except:
                productImg = productImg[0].img['src']
            bookInfo.append(str(productImg))

            productShopName = book.findAll('p', class_='search_shangjia')  # shop name: only third-party sellers carry this tag
            if len(productShopName):
                productShopName = productShopName[0].find('a')['title']
                bookInfo.append(productShopName)
            else:
                bookInfo.append('Dangdang self-operated')

            bookInfo.append('Dangdang')  # platform label

            bookInfo.append(link)

            bookInfo.append('3')  # filter flag

            bookInfo.append(book['id'][1:])  # strip the leading character of the li id to get the product ID

            allUnivList.append(bookInfo)
            bookInfo = []


def dangdang(key, Limit):   # known bug: breaks when the search returns fewer than 10 items
    allUnivList = []
    url = 'http://search.dangdang.com/?key=' + key + '&act=input'
    text = getHTMLText(url)
    if text == "": return ""  # nothing fetched: return empty
    getUnivList(allUnivList, text, Limit)
    print(allUnivList)
    return allUnivList

Kongfuzi

import requests
import json
proxies = {
}
headers = {'User-Agent': 'Mozilla/5.0'}

def getHTMLText(url):   # generic page fetch
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)   # set proxy and timeout
        r.raise_for_status()
        r.encoding = 'utf-8'
    except:
        print("Kongfuzi search failed")
        return ""
    return r.text

def getFee(userId, itemId):
    url = 'http://shop.kongfz.com/book/shopsearch/getShippingFee?callback=jQuery11120008516125003493968_1545017785216&params={%22params%22:[{%22userId%22:%22' + userId + '%22,%22itemId%22:%22' + itemId + '%22}],%22area%22:%221006000000%22}'
    text = getHTMLText(url)
    if text == "": return "failed to fetch shipping fee"  # nothing fetched
    text = text.replace('jQuery11120008516125003493968_1545017785216(', '').replace(')', '')  # strip the JSONP callback wrapper
    feeAll = json.loads(text)
    fee = 'Shipping: ' + feeAll['data'][0]['fee'][0]['totalFee']
    return fee
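
The endpoint wraps its JSON in a JSONP callback, which is why getFee() strips the hard-coded callback name before json.loads. That breaks as soon as the callback string changes; a regex-based unwrap (a sketch, not part of the original code) is more robust:

import re

def strip_jsonp(text):
    # keep only the payload between the first '(' and the last ')'
    m = re.match(r'^[^(]*\((.*)\)\s*$', text, re.S)
    return m.group(1) if m else text

# usage inside getFee: feeAll = json.loads(strip_jsonp(text))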

def getUnivList(allUnivList, text, Limit):
    bookdetail = json.loads(text)
    bookInfo = []  # per-item record: 0 price, 1 title, 2 shipping fee (comment-count slot), 3 image, 4 shop name, 5 condition (platform slot), 6 detail-page URL, 7 filter flag

    books = bookdetail['data']['itemList']
    length = len(books)
    if length == 0:
        return ""
    if length > Limit:
        length = Limit
    for i in range(length):
        bookInfo.append(float(books[i]['price']))  # price
        bookInfo.append(str(books[i]['itemname_snippet']).replace('<b>', '').replace('</b>', ''))  # title, minus search-highlight tags
        bookInfo.append(getFee(str(books[i]['userid']), str(books[i]['itemid'])))  # shipping fee, looked up by user ID and item ID
        if str(books[i]['imgurl']) == "":
            bookInfo.append("")
        else:
            bookInfo.append('http://www.kfzimg.com/' + str(books[i]['imgurl']))  # cover image
        bookInfo.append(str(books[i]['shopname'])[0:15])  # shop name, truncated to 15 characters
        bookInfo.append(str(books[i]['qualityname']))     # book condition
        bookInfo.append('http://book.kongfz.com/' + str(books[i]['shopid']) + '/' + str(books[i]['itemid']) + '/')  # detail-page URL
        bookInfo.append('1')  # filter flag
        allUnivList.append(bookInfo)
        bookInfo = []

def kongfuzi(key, Limit):   # known bug: breaks when the search returns fewer than 10 items
    allUnivList = []
    url = 'http://search.kongfz.com/product_result/?key=' + key + '&exact=1&type=1&ajaxdata=1'
    text = getHTMLText(url)
    if text == "": return ""  # nothing fetched: return empty
    getUnivList(allUnivList, text, Limit)
    print(allUnivList)
    return allUnivList

kongfuzi("三体", 10)
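
Tying the four scrapers together: a sketch that assumes all four entry functions have been collected into one module (each script above defines its own duplicate helpers, so they cannot be pasted together as-is) and remembers that jingdong() fills a caller-supplied list instead of returning one:

def _price(record):
    try:
        return float(str(record[0]).lstrip('¥'))  # some platforms store the price as a string
    except ValueError:
        return float('inf')   # unparseable prices sort last

def search_all(key, limit=10):
    results = []
    results += tianmao(key, limit) or []   # each returns a list, or "" on failure
    results += dangdang(key, limit) or []
    results += kongfuzi(key, limit) or []
    jingdong(key, results, limit)          # JD variant appends in place
    results.sort(key=_price)               # cheapest first
    return results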