import os,xlrd,openpyxl
import time
import re
from selenium import webdriver
from time import sleep
from selenium.webdriver.chrome.service import Service
cwd = os.getcwd()
# Date stamp (e.g. "2024.01.31") used to name today's output workbook.
date = time.strftime('%Y.%m.%d', time.localtime(time.time()))
filepath = '商品链接状态' + date + '.xlsx'  # output workbook named after the current date
wb1 = openpyxl.Workbook()
ws1 = wb1.active
wb1.save(filepath)  # create the (still empty) output file up front; filled in and re-saved at the end
# Use os.path.join instead of hand-built '\\' concatenation so the paths
# are correct on any OS, and reuse `filepath` rather than rebuilding the name.
path = os.path.join(cwd, '商品列表.xlsx')   # input workbook: sku list with links
path1 = os.path.join(cwd, filepath)          # full path of the dated output workbook
def down_data(url):
    """Open *url* in a logged-in Chrome profile and return the page HTML.

    A ChromeDriver Service is started per call so the chromedriver process
    is reliably torn down afterwards (see
    https://www.cnblogs.com/muchengnanfeng/p/9553186.html : when a large
    suite starts/stops browsers frequently, managing the driver process
    via Service avoids orphaned chromedriver processes).

    :param url: page URL to fetch
    :return: the rendered page source (str)
    """
    # NOTE(review): this Service is started manually but never handed to
    # webdriver.Chrome(), which launches its own driver process — presumably
    # intentional per the blog post above; confirm it is still needed.
    c_service = Service(r'D:\Python\Scripts\chromedriver.exe')
    c_service.command_line_args()
    c_service.start()
    try:
        # Reuse the local Chrome user-data dir so the session is logged in.
        profile_directory = r'--user-data-dir=C:\Users\Administrator\AppData\Local\Google\Chrome\User Data'
        option = webdriver.ChromeOptions()
        option.add_argument(profile_directory)
        driver = webdriver.Chrome(options=option)
        try:
            driver.get(url)
            sleep(3)  # crude wait for the page to render before grabbing HTML
            data = driver.page_source
            sleep(2)
        finally:
            # Always close the browser, even if driver.get/page_source raised.
            driver.quit()
            sleep(1)
    finally:
        # Always stop the service so no chromedriver process is leaked.
        c_service.stop()
    return data
def down_data1(url):
    """Classify a detail.* (Tmall) product page reached via *url*.

    :return: one of "商品有效" (buy button present), "抱歉,您要访问的页面不存在"
             (Error 404 page), or "商品下架" (anything else).

    Bug fix: the original only assigned `sation` inside some branches, so a
    page whose buy-button text was not "立即订购", or that matched neither
    regex, raised UnboundLocalError on `return sation`. Every path now
    yields a status, defaulting to "商品下架". Each regex is also evaluated
    once instead of twice.
    """
    data = down_data(url)
    buy = re.findall('class="do-purchase ms-yh " title=".*?" rel="nofollow"><span>(.*?)</span></a>', data, re.S)
    if buy and buy[0] == "立即订购":
        sation = "商品有效"
    else:
        # "hightlight" is the (misspelled) class name actually used on the 404 page.
        err = re.findall('<em class="hightlight">(.*?)</em>', data, re.S)
        if err and err[0] == "Error 404":
            sation = "抱歉,您要访问的页面不存在"
        else:
            sation = "商品下架"
    print(sation)
    return sation
def down_data2(url):
    """Classify an item.* (Taobao) product page reached via *url*.

    :return: one of "商品有效" (buy button present), "抱歉,您要访问的页面不存在"
             (error page), or "商品下架" (anything else).

    Bug fix: mirrors down_data1 — the original left `sation` unassigned when
    the buy-button label was not "立即购买" or when neither regex matched,
    raising UnboundLocalError on `return sation`. Every path now yields a
    status, defaulting to "商品下架". Each regex runs once instead of twice.
    """
    data = down_data(url)
    buy = re.findall('class="J_LinkBuy" shortcut-key=".*?" shortcut-label="(.*?)"', data, re.S)
    if buy and buy[0] == "立即购买":
        sation = "商品有效"
    else:
        err = re.findall('<div class="error-notice-hd">(.*?),', data, re.S)
        if err and err[0] == "很抱歉":
            sation = "抱歉,您要访问的页面不存在"
        else:
            sation = "商品下架"
    print(sation)
    return sation
# Read every row of the input workbook: column A holds "sku-..." codes
# (only the part before the first '-' is kept), column index 13 holds the
# '|'-separated product links.
wb = xlrd.open_workbook(path)
ws = wb.sheets()[0]
all_rows = [ws.row_values(r) for r in range(ws.nrows)]
sku = [row[0].split('-')[0] for row in all_rows]
url = [row[13] for row in all_rows]
sku1 = []     # output column 1: sku codes
url1 = []     # output column 2: individual product links
sation1 = []  # output column 3: link status text
# Pair every sku with its raw link field, then deduplicate the pairs.
# Fix: dict.fromkeys keeps first-seen order, so the output row order is
# deterministic across runs (list(set(...)) ordered rows arbitrarily).
sku_url = list(zip(sku, url))
print(len(sku_url), sku_url)
sku_urls = list(dict.fromkeys(sku_url))  # unique (sku, links) pairs
print(len(sku_urls), sku_urls)
# Canonical status strings produced by down_data1/down_data2.
sationa = '商品有效'
sationb = '抱歉,您要访问的页面不存在'
sationc = '商品下架'
# Visit every link of every unique sku and record its status.
# Fix: the original duplicated the three append statements six times across
# both URL branches; the appended status string always equalled `sation`
# itself (down_data1/down_data2 only return the three canonical strings),
# so the branches collapse to a single append per link.
for sku_code, link_field in sku_urls:
    # One sku's link field may hold several URLs separated by '|'.
    for link in link_field.split('|'):
        if link.startswith('https://detail'):
            sation = down_data1(link)  # Tmall detail page
        elif link.startswith('https://item'):
            sation = down_data2(link)  # Taobao item page
        else:
            continue  # unrecognized link type: skipped, as in the original
        sku1.append(sku_code)
        url1.append(link)
        sation1.append(sation)
# Header row of the output workbook.
for col, title in enumerate(("sku", "商品链接", "链接状态"), start=1):
    ws1.cell(1, col).value = title
# Data rows start on row 2, one row per checked link.
for row, (code, link, status) in enumerate(zip(sku1, url1, sation1), start=2):
    ws1.cell(row, 1).value = code
    ws1.cell(row, 2).value = link
    ws1.cell(row, 3).value = status
wb1.save(path1)