# 爬虫 (web crawler)
import requests
from bs4 import BeautifulSoup
import re
from lxml import etree
# Crawl every goods detail page, print its name and price, and append one
# tab-separated record per goods item to goods.txt.
#
# Only the name and price are scraped; earlier attempts at views, sales and
# stock were left disabled in the original script and have been removed.

GOODS_URL = 'http://39.106.254.162/index.php?s=/index/goods/index/id/{}.html'
GOODS_CONTAINER_XPATH = '/html/body/div[4]/div[2]/div[2]'
GOODS_NAME_XPATH = './/h1[@class="detail-title am-margin-bottom-xs"]/text()'
GOODS_PRICE_XPATH = './/b[@class="goods-price"]/text()'
# Seconds to wait for the server; without a timeout requests.get() may block
# indefinitely on an unresponsive connection.
REQUEST_TIMEOUT = 10


def _first_text(node, xpath):
    """Return the stripped first result of *xpath* under *node*, or ''.

    Indexing the XPath result with ``[0]`` directly raises IndexError when
    the page lacks the element, so the empty case is handled explicitly.
    """
    matches = node.xpath(xpath)
    return matches[0].strip() if matches else ''


def _parse_goods(html):
    """Extract ``(name, price)`` from the HTML of a goods detail page.

    Returns ``('', '')`` when the document is empty or does not contain the
    goods container, so the caller can skip the page.
    """
    if not html.strip():
        return '', ''
    selector = etree.HTML(html)
    # NOTE(review): etree.HTML appears to return None for some unparseable
    # input -- guard defensively; confirm against the lxml version in use.
    if selector is None:
        return '', ''
    containers = selector.xpath(GOODS_CONTAINER_XPATH)
    if not containers:
        return '', ''
    # Every step of the absolute path carries a positional predicate, so at
    # most one container can match.
    goods = containers[0]
    name = _first_text(goods, GOODS_NAME_XPATH)
    price = _first_text(goods, GOODS_PRICE_XPATH)
    return name, price


for goods_id in range(1, 1277):  # goods ids 1..1276 inclusive
    url = GOODS_URL.format(goods_id)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable page should not abort the whole crawl.
        print(f'ID:{goods_id} request failed: {exc}')
        continue

    goods_name, goods_price = _parse_goods(response.text)
    if not goods_name:
        # No goods on this page (e.g. unused id): nothing to record.
        continue

    print(f'ID:{goods_id} {goods_name}')
    print(goods_price)

    # Append per record so that progress already made survives an
    # interrupted run.
    with open('goods.txt', 'a', encoding='utf-8') as f:
        f.write(f'{goods_id}\t{goods_name}\t{goods_price}\t\n')