Python爬虫爬取html中div下的多个class标签并存入数据库

使用 Python 爬虫爬取 HTML 页面中带有多个不同 class 的 div 元素，将获取到的文本存成列表，然后写入数据库。

import mysql.connector
import pymysql
import requests
from bs4 import BeautifulSoup

# Scrape <div> elements carrying either of two CSS classes from a page and
# store them pairwise as (date, text) rows in the MySQL table `data`,
# skipping rows that are already present.

# Connect to the database.
# NOTE(review): credentials are hard-coded here; consider loading them from
# configuration or the environment before using this outside a local test.
conn = mysql.connector.connect(user='root', password='123456', host='127.0.0.1', database='listdb')
try:
    # A buffered cursor reads the whole result set on execute(), so calling
    # fetchone() and then execute() again never trips over unread rows when
    # the duplicate check matches more than one row.
    cursor = conn.cursor(buffered=True)

    # Create the target table on first run.
    cursor.execute('''CREATE TABLE IF NOT EXISTS data (
        id INT AUTO_INCREMENT PRIMARY KEY,
        date VARCHAR(255),
        text VARCHAR(255)
    )''')
    conn.commit()

    # Fetch the page. The timeout keeps the script from hanging forever on an
    # unresponsive server, and raise_for_status() stops us from parsing and
    # storing the body of an HTTP error page.
    url = "https://url/"
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    html_content = response.text

    # Parse the HTML with BeautifulSoup.
    soup = BeautifulSoup(html_content, "html.parser")

    # Collect the stripped text of every <div> whose class list contains
    # "post-meta" or "post-title", in document order.
    data = [
        div.text.strip()
        for div in soup.find_all("div", class_=["post-meta", "post-title"])
    ]

    # Consume the list two items at a time: the first item of each pair is
    # stored as `date`, the second as `text`. zip() over the two slices drops
    # a trailing unpaired item instead of raising IndexError.
    # NOTE(review): this assumes the matched divs strictly alternate
    # meta/title on the page -- confirm against the real markup.
    for date, text in zip(data[::2], data[1::2]):
        # Scraped text is untrusted, so pass it as query parameters rather
        # than interpolating it into the SQL string.
        cursor.execute(
            "SELECT id FROM data WHERE date = %s AND text = %s",
            (date, text),
        )

        # Insert only when no identical row exists yet.
        if cursor.fetchone() is None:
            cursor.execute(
                "INSERT INTO data (date, text) VALUES (%s, %s)",
                (date, text),
            )

    # Persist all inserts in one transaction.
    conn.commit()
finally:
    # Always release the connection, even if scraping or a query failed.
    conn.close()

 

posted @ 2023-02-09 11:14  Old·Artist  阅读(1051)  评论(0)    收藏  举报