import requests
import re
from bs4 import BeautifulSoup
import csv
import os
# Send a GET request to fetch the page source
#url = 'https://www.blackview.hk/'
# Prompt the user for a URL and validate its format
while True:
    url = input("Enter a URL: ")
    pattern = r'^https?://[\w\-]+(\.[\w\-]+)+[/#?]?.*$'  # simple URL-validation regex
    if re.match(pattern, url):
        print("Fetching data......")
        break
    else:
        print("Please enter a valid URL!")
response = requests.get(url, timeout=30)  # timeout so a stalled server cannot hang the script
html = response.text
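# Not in the original script: fail fast on HTTP errors (4xx/5xx) instead of
# scraping an error page; raise_for_status() is the standard requests idiom.
response.raise_for_status()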
# Use a regular expression to extract all matching links
pattern = r'<a\s+(?:[^>]*?\s+)?href="/products/(\d+)"'
links = re.findall(pattern, html)
# Deduplicate the links
unique_links = list(set(links))
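# Sketch of an order-preserving alternative: set() discards document order,
# while dict.fromkeys() keeps the first occurrence of each link.
#   unique_links = list(dict.fromkeys(links))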
# Write the links to a file
file_path = 'price.txt'
with open(file_path, 'w', encoding='utf-8') as file:
    for link in unique_links:
        # rstrip('/') so the join works whether or not the entered URL ends with '/'
        file.write(f"{url.rstrip('/')}/products/{link}\n")
#print('Links saved to', file_path)
#print('3...')
# Read the link list back from the file
url_list = []
file_path = 'price.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line:  # skip empty lines
            url_list.append(line)
# Truncate the file
data = ""
with open(file_path, "w") as file:
    file.write(data)
#print("Emptied price.txt")
#print("Please wait......!")
### At this point the top-level category links are stored in url_list
# Truncate the file
data = ""
with open('details.txt', "w") as file:
    file.write(data)
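# Sketch: the same truncation can be written with pathlib (identical effect):
#   from pathlib import Path
#   Path('details.txt').write_text('', encoding='utf-8')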
# Loop over the category pages, scraping product models and prices
for url in url_list:
    # Send a GET request to fetch the page content
    response = requests.get(url, timeout=30)
    html = response.text
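    # Sketch (an assumption, not in the original): a short pause between
    # requests keeps the crawl polite; uncomment to enable.
    #   import time; time.sleep(1)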
    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    # Find the tag whose class is "goods-list"
    goods_list = soup.find(class_='goods-list')
    # Skip pages that have no goods list at all (find() returns None)
    if goods_list is None:
        continue
    #-------- Collect the per-product <a> links of this category --------#
    # Gather the links of all <a> tags under the goods list and deduplicate;
    # href=True skips anchors that carry no href attribute at all
    links = list(set(a['href'] for a in goods_list.find_all('a', href=True) if '/products/item' in a['href']))
    # Count the links
    num_links = len(links)
    # Write the count and the links to a file
    with open('details.txt', 'a', encoding='utf-8') as file:
        #file.write("Number of links: " + str(num_links) + "\n")
        file.write('\n'.join(links))
        file.write('\n')  # trailing newline keeps each batch separate
    #print("Links written to file")
    #----------------------#
    # Get all the text under the goods list
    content = goods_list.get_text()
    # Drop lines that contain no digits (model numbers and prices keep theirs)
    lines = content.split('\n')
    filtered_lines = [line for line in lines if any(char.isnumeric() for char in line)]
    # Write the content to a file
    with open('price.txt', 'a', encoding='utf-8') as file:
        file.write('\n'.join(filtered_lines))
        file.write('\n\n')  # blank line separates the content of different links
    #print("Product categories and prices written to price.txt")
print("Links written to file")
output_data = []  # data to export
with open("price.txt", "r") as file:
    current_group = []  # lines of the current group
    for line in file:
        if line.strip():  # non-empty line: add it to the current group
            current_group.append(line.strip())
        else:
            output_data.append(current_group)  # blank line: close the current group
            current_group = []  # start a new group
    if current_group:  # handle the final group
        output_data.append(current_group)
# Export to a CSV file, one column per group
with open("price.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    max_size = max(len(group) for group in output_data)  # longest group = number of rows
    for i in range(max_size):
        row = []
        for group in output_data:
            if i < len(group):
                row.append(group[i])
            else:
                row.append("")  # pad with an empty string when a group has no such row
        writer.writerow(row)
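# Worked example of the transposition above (assumed data): if price.txt held
# the groups ["A55", "$99"] and ["A96", "$199", "4GB+64GB"], the CSV becomes
#   A55,A96
#   $99,$199
#   ,4GB+64GB
# i.e. each blank-line-separated group turns into one column.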
print("导出产品型号与价格.csv文件成功!")
print("打开price.csv表格后鼠标请右击选择表格整理美化!")
# Delete the price.txt file
file_path = "price.txt"
if os.path.exists(file_path):
    os.remove(file_path)
    print(f"Deleted file: {file_path}")
else:
    print(f"File not found: {file_path}")
url_1 = "https://www.blackview.hk"
with open("details.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()
new_lines = [url_1 + line.strip() for line in lines if line.strip()]  # skip blank lines
# Sort the data
new_lines.sort()
with open("details.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(new_lines))
print("URL地址已成功添加到每一行之前!")
#---------- Fetch the detailed information of every product ----------
url_list_all = []
file_path = 'details.txt'
file_path_temp = 'temp.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line:  # skip empty lines
            url_list_all.append(line)
# Truncate the file
data = ""
cnt = 1
with open(file_path, "w") as file:
    file.write(data)
for url in url_list_all:
    # Send a GET request to fetch the page content
    response = requests.get(url, timeout=30)
    html = response.text
    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    # Find the tags whose class is "left"/"right", "li-tit"/"li-msg", etc.
    # (these class names come from the site's spec-table markup)
    tags = soup.find_all(class_=["left", "right", "li-tit", "li-msg", "tit", "msg", "left-box", "right-box", "left-item", "right-item"])
    # Extract the tag contents, stripping surrounding whitespace and newlines
    content = [tag.get_text(strip=True) for tag in tags]
    # Write the content to a temporary file
    with open(file_path_temp, 'w', encoding='utf-8') as file:
        file.write('\n'.join(content))
    # Read the file back and locate the first line containing "Model"
    with open(file_path_temp, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    model_line_index = -1
    for i, line in enumerate(lines):
        if 'Model' in line:
            model_line_index = i
            break
    # If a "Model" line was found, drop everything before it (the line itself is kept)
    if model_line_index >= 0:
        lines = lines[model_line_index:]
        # Write the trimmed content back to the temporary file
        with open(file_path_temp, 'w', encoding='utf-8') as file:
            file.writelines(lines)
    with open(file_path_temp, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    with open(file_path, 'a', encoding='utf-8') as file:
        file.writelines(lines)
        file.write('\n\n')  # blank line separates products
    print(f"Product {cnt} written to details.txt")
    # Increment the counter
    cnt += 1
# Delete the temporary file
if os.path.exists(file_path_temp):
    os.remove(file_path_temp)
    print(f"Deleted file: {file_path_temp}")
else:
    print(f"File not found: {file_path_temp}")
# Keep only the blocks whose first line is "Model"
def filter_data(file_path):
    filtered_data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    is_target_data = False
    data = ''
    for line in lines:
        if line.strip() == 'Model':
            # A new block starts; flush any block collected so far
            if data:
                filtered_data.append(data.strip())
                filtered_data.append('')
            data = ''
            is_target_data = True
            data += line
        elif line.strip() == '':
            # A blank line ends the current block
            if data and is_target_data:
                filtered_data.append(data.strip())
                filtered_data.append('')
            data = ''
            is_target_data = False
        else:
            if is_target_data:
                data += line
    if data and is_target_data:  # flush the final block
        filtered_data.append(data.strip())
    with open(file_path, 'w', encoding='utf-8') as file:
        for i, data in enumerate(filtered_data):
            file.write(data)
            if i < len(filtered_data) - 1:
                file.write('\n')

file_path = 'details.txt'
filter_data(file_path)
print("Filtering of blocks starting with 'Model' is complete")
# Define the input and output file names
input_file = "details.txt"
output_file = "details.csv"
# Open the input and output files
with open(input_file, 'r', encoding='utf-8') as file_in, open(output_file, 'w', encoding='utf-8', newline='') as file_out:
    # Create a CSV writer
    writer = csv.writer(file_out)
    header = ["Parameter", "Value"]  # header row of the table
    writer.writerow(header)  # write the header row
    # Split the data and process it block by block
    lines = file_in.read().splitlines()
    num_lines = len(lines)
    i = 0  # current line index
    while i < num_lines:
        if lines[i] == "Model":
            model = lines[i + 1]
            # Find the next "Model" line, or the end of the file
            j = i + 2
            while j < num_lines and lines[j] != "Model":
                j += 1
            # Extract the parameter/value pairs of the current block
            data = []
            while i + 1 < j:
                parameter = lines[i].strip()
                value = lines[i + 1].strip()
                data.append([parameter, value])
                i += 2
            # Write a blank row to separate blocks
            if len(data) > 0:
                #writer.writerow(["Model", model])
                writer.writerows(data)
                writer.writerow([])  # blank row
        else:
            i += 1  # skip non-Model lines
print("数据已成功提取并保存到details.csv文件中。")
# Delete the details.txt file
if os.path.exists(input_file):
    os.remove(input_file)
    print(f"Deleted file: {input_file}")
else:
    print(f"File not found: {input_file}")