爬虫

爬虫

import requests
from bs4 import BeautifulSoup
import re
from lxml import etree

# Crawl the goods detail pages and append one "id<TAB>name<TAB>price" row per
# product to goods.txt.  Views, sales and stock are not scraped.

# Detail-page URL template; the placeholder is the numeric goods id.
BASE_URL = 'http://39.106.254.162/index.php?s=/index/goods/index/id/{}.html'
# Container holding the product details, and the fields read from inside it.
GOODS_XPATH = '/html/body/div[4]/div[2]/div[2]'
NAME_XPATH = './/h1[@class="detail-title am-margin-bottom-xs"]/text()'
PRICE_XPATH = './/b[@class="goods-price"]/text()'
OUTPUT_PATH = 'goods.txt'
# Inclusive range of goods ids to visit (ids 1..1276).
FIRST_ID = 1
LAST_ID = 1276
# Seconds to wait for the server before giving up on a page.
REQUEST_TIMEOUT = 10


def first_text(nodes):
    """Return the first entry of *nodes* with whitespace stripped.

    *nodes* is the list produced by a ``text()`` XPath query.  An empty list
    yields ``''`` instead of raising ``IndexError``, so a page that lacks the
    element is treated as "no data" by the caller.
    """
    return nodes[0].strip() if nodes else ''


def format_record(goods_id, name, price):
    """Return the line written to the output file for one product.

    The trailing tab before the newline is kept so that new rows match the
    rows already present in an existing goods.txt.
    """
    return f'{goods_id}\t{name}\t{price}\t\n'


def parse_goods(html):
    """Extract ``(name, price)`` from a detail page's HTML text.

    Both values are ``''`` when the page is blank, cannot be parsed into a
    tree, or does not contain the product container.
    """
    if not html or not html.strip():
        return '', ''
    selector = etree.HTML(html)
    if selector is None:
        return '', ''

    name, price = '', ''
    # The absolute path normally matches at most one node; if it ever matches
    # several, the last one wins.
    for goods in selector.xpath(GOODS_XPATH):
        name = first_text(goods.xpath(NAME_XPATH))
        price = first_text(goods.xpath(PRICE_XPATH))
    return name, price


def fetch_html(url):
    """Download *url* and return the body text, or ``None`` on a network error.

    Failures are reported on stderr so that one unreachable page does not
    abort the whole crawl and does not pollute the normal console output.
    """
    import sys

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f'request failed for {url}: {exc}', file=sys.stderr)
        return None
    return response.text


def crawl(first_id=FIRST_ID, last_id=LAST_ID, output_path=OUTPUT_PATH):
    """Visit every goods id in ``[first_id, last_id]`` and record name/price.

    Pages without a product name are skipped.  Each recorded product is
    echoed to stdout and appended to *output_path* (UTF-8).
    """
    with open(output_path, 'a', encoding='utf-8') as out:
        for goods_id in range(first_id, last_id + 1):
            html = fetch_html(BASE_URL.format(goods_id))
            if html is None:
                continue

            name, price = parse_goods(html)
            if not name:
                continue

            print(f'ID:{goods_id} {name}')
            print(price)

            out.write(format_record(goods_id, name, price))
            # Flush per record so progress survives an interrupted run.
            out.flush()


if __name__ == '__main__':
    crawl()

posted @ 2023-06-10 00:18  Cuckoo~  阅读(29)  评论(0)    收藏  举报