1. Get the first-level URLs:

import requests
import re

# Send a GET request to fetch the page source
url = 'https://www.blackview.hk/'
response = requests.get(url)
html = response.text

# Extract all matching links with a regular expression;
# the group captures the numeric id in href="/products/<id>"
pattern = r'<a\s+(?:[^>]*?\s+)?href="/products/(\d+)"'
links = re.findall(pattern, html)

# Deduplicate the links
unique_links = list(set(links))

# Write the links to a file
file_path = 'F:/url-1.txt'
with open(file_path, 'w') as file:
    for link in unique_links:
        file.write(f"{url}products/{link}\n")
print('Links saved to', file_path)
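If the request fails or the site rejects the default client, the script above silently writes nothing useful. A minimal hardening sketch under the same page and pattern (the User-Agent string and the 10-second timeout are illustrative assumptions, not values from the original):

import re
import requests
from urllib.parse import urljoin

url = 'https://www.blackview.hk/'
# A browser-like User-Agent and a timeout are common precautions; both values are illustrative
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page

pattern = r'<a\s+(?:[^>]*?\s+)?href="/products/(\d+)"'
unique_links = sorted(set(re.findall(pattern, response.text)))

with open('F:/url-1.txt', 'w') as file:
    for link in unique_links:
        # urljoin builds the absolute URL without relying on the trailing slash
        file.write(urljoin(url, f'products/{link}') + '\n')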
2. Get the price and model from a specific first-level URL:

import requests
from bs4 import BeautifulSoup

url = 'https://www.blackview.hk/products/43'
# Send a GET request to fetch the page content
response = requests.get(url)
html = response.text

# Parse the page with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Find the tag whose class is "goods-list"
goods_list = soup.find(class_='goods-list')

# Get all the text under that tag
content = goods_list.get_text()

# Drop the lines that contain no digits
lines = content.split('\n')
filtered_lines = [line for line in lines if any(char.isnumeric() for char in line)]

# Write the result to a file
with open('F:\\price.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(filtered_lines))
print("Page content written to price.txt")
3. Get all the <a> tags from a specific first-level URL:

import requests
from bs4 import BeautifulSoup

url = 'https://www.blackview.hk/products/58'
# Send a GET request to fetch the page content
response = requests.get(url)
html = response.text

# Parse the page with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Find the tag whose class is "goods-list"
goods_list = soup.find(class_='goods-list')

# Collect the product-item links under that tag and deduplicate them;
# href=True skips <a> tags that have no href attribute
links = list(set(a['href'] for a in goods_list.find_all('a', href=True) if '/products/item' in a['href']))

# Count the links
num_links = len(links)

# Write the count and the links to a file
with open('F:\\url-2.txt', 'w', encoding='utf-8') as file:
    file.write("Link count: " + str(num_links) + "\n")
    file.write('\n'.join(links))
print("Links written to url-2.txt")
4. Get the detailed product information from the second level:

import requests
from bs4 import BeautifulSoup

url = 'https://www.blackview.hk/products/item/tab6'
file_path = 'F:\\url.txt'

# Send a GET request to fetch the page content
response = requests.get(url)
html = response.text

# Parse the page with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Find the tags whose class is "left", "right", "li-tit" or "li-msg"
tags = soup.find_all(class_=["left", "right", "li-tit", "li-msg"])

# Extract the tag text, stripping leading/trailing whitespace and newlines
content = [tag.get_text(strip=True) for tag in tags]

# Write the content to the file
with open(file_path, 'w', encoding='utf-8') as file:
    file.write('\n'.join(content))

# Read the file back and find the first line containing "Model"
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()
model_line_index = -1
for i, line in enumerate(lines):
    if 'Model' in line:
        model_line_index = i
        break

# If a "Model" line was found, drop everything before it so the file
# starts at the "Model" line, then write the trimmed content back
if model_line_index >= 0:
    lines = lines[model_line_index:]
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(lines)
print("Content written to url.txt")
5. Read the second-level data and save it to a CSV table:

import pandas as pd
import os.path

# Read the data from the file
with open('F:\\url.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Create an empty table
table = pd.DataFrame(columns=['Parameter', 'Value'])

# Check whether the output file already exists
output_file_exists = os.path.isfile('output.csv')

# Parse the lines in parameter/value pairs and fill the table
for i in range(0, len(lines), 2):  # step two lines at a time
    parameter = lines[i].strip()
    if i + 1 < len(lines):  # make sure a value line exists
        value = lines[i + 1].strip()
    else:
        value = ''  # a trailing odd line gets an empty value
    table = pd.concat([table, pd.DataFrame({'Parameter': [parameter], 'Value': [value]})], ignore_index=True)

# Save the table as a CSV file (appending if it already exists)
if output_file_exists:
    # Append the rows, preceded by a blank line, without repeating the header
    with open('output.csv', 'a', encoding='utf-8') as file:
        file.write('\n\n')
        table.to_csv(file, header=False, index=False)
else:
    # Otherwise create the CSV file with a header
    table.to_csv('output.csv', index=False)
print("ok")