import re
import pymysql
import requests
from bs4 import BeautifulSoup
import lxml
import sys, io
import openpyxl
# 数据库信息
host = '192.168.56.101'
username = "root"
passwd = "123456"
database = "test"
port = 3306
charset = "utf8"
excel_name = input("输入Excel表格名称:") + ".xlsx"
# 连接数据库创建表
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='UTF8')
db = pymysql.connect(host=host, user=username, password=passwd, database=database)
cursor = db.cursor()
cursor.execute("drop table if exists test001")
sql = 'CREATE TABLE test001(ID int primary key auto_increment,title char(20),amouts char(255))'
cursor.execute(sql)
# 函数 创建文本文档
def mk_file(name):
return name + ".txt"
# 函数,插入数据
def insert(value):
db = pymysql.connect(host=host, user=username, password=passwd, database=database, charset="utf8")
cursor = db.cursor()
sql = "insert into test001(title,amouts) values(%s,%s)"
try:
cursor.execute(sql, value)
db.commit()
print("插入数据成功")
except Exception as e:
db.rollback()
print(e)
print("插入数据失败")
# 函数 生成Excel表格
def wr_Excel(table_name):
# 连数据库,读取数据
conn = pymysql.connect(host=host, user=username, password=passwd, database=database, port=3306, charset="utf8")
cur = conn.cursor()
sql = "select * from %s;" % table_name
cur.execute(sql)
# 使用%s占位符可以占位where条件,但是不能占位表名
# 实际为了防止sql注入,在where条件使用占位符通常使用下述方法
# sql = "select * from table_name where %s;"(此时假如要输入的变量为age)
# cur.execute(sql,age)
sql_result = cur.fetchall()
cur.close()
conn.close()
# 写Excel
book = openpyxl.Workbook()
sheet = book.active
fff = [filed[0] for filed in cur.description] # 获取表头信息
sheet.append(fff)
# 进行流程
newtext = mk_file("text")
data_file = open(newtext, 'w', encoding="UTF8")
with open('C:\\Users\\35548\Desktop\\百度产品大全.html', 'r', encoding='UTF8') as web_data:
Soup = BeautifulSoup(web_data, 'lxml')
titles = Soup.select('#content > div > div >a')
amouts = Soup.select('#content > div > div >span')
for titles, amouts in zip(titles, amouts):
data = {
'titles': titles.get_text(),
'amouts': amouts.get_text()
}
data_str = str(data)
data_file.write(data_str)
insert((titles.get_text(), amouts.get_text()))
wr_Excel('test001')