# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from jieba import analyse
import pymysql as sql
from tqdm import tqdm # 添加进度条
def load_and_clean_data(file_paths):
"""加载并清理数据"""
dfs = []
for path in file_paths:
try:
df = pd.read_csv(path, encoding='utf-8', dtype='str')
df.drop(df.columns[0], axis=1, inplace=True) # 删除第一列索引
dfs.append(df)
except Exception as e:
print(f"加载文件 {path} 失败: {str(e)}")
return pd.concat(dfs, ignore_index=True)
def merge_similar_columns(df):
"""合并相似列"""
# 简介合并
df['简介'] = df.get('项目简介', pd.Series()).combine_first(
df.get('成果简介', pd.Series()))
df.drop(['项目简介', '成果简介'], axis=1, errors='ignore', inplace=True)
# 单位合并
df['单位'] = df.get('单位名称', pd.Series()).combine_first(
df.get('单位', pd.Series())).combine_first(
df.get('完成单位', pd.Series())).combine_first(
df.get('成果产出单位', pd.Series()))
df.drop(['单位名称', '完成单位', '成果产出单位'], axis=1, errors='ignore', inplace=True)
# 其他字段合并...
return df
def extract_keywords(df):
"""从简介中提取关键词"""
df['简介'] = df['简介'].fillna('')
keywords_list = []
for text in tqdm(df['简介'], desc="提取关键词"):
try:
keywords = analyse.extract_tags(str(text)) if text else []
keywords_list.append(", ".join(keywords))
except Exception as e:
print(f"关键词提取出错: {str(e)}")
keywords_list.append("")
df['关键词'] = keywords_list
return df
def save_to_database(df, db_config):
"""保存到数据库"""
try:
conn = sql.connect(**db_config)
cursor = conn.cursor()
# 生成表结构
columns = df.columns
define_type = ",\n".join([f"{col} text null" for col in columns])
# SQL语句
drop_table_query = 'DROP TABLE IF EXISTS combination_table;'
create_table_query = f'CREATE TABLE combination_table ({define_type});'
insert_query = f'INSERT INTO combination_table ({", ".join(columns)}) VALUES ({", ".join(["%s"] * len(columns))});'
# 执行SQL
cursor.execute(drop_table_query)
cursor.execute(create_table_query)
# 批量插入
data_tuples = [tuple(x) for x in df.to_numpy()]
cursor.executemany(insert_query, data_tuples)
conn.commit()
print(f"成功插入 {len(data_tuples)} 条数据")
except Exception as e:
print(f"数据库操作失败: {str(e)}")
finally:
cursor.close()
conn.close()
def main():
# 配置
file_paths = [
r'F:\ruo项目\数据\cg_2015年第1-2-3期.csv',
r'F:\ruo项目\数据\cg_天津科技成果.csv',
r'F:\ruo项目\数据\科技成果.csv'
r'F:\ruo项目\数据\1.4北京推介项目成果.csv'
]
db_config = {
'host': 'localhost',
'user': 'root',
'password': '123456',
'database': 'world',
'charset': 'utf8'
}
# 数据处理流程
print("开始处理数据...")
df = load_and_clean_data(file_paths)
df = merge_similar_columns(df)
df = extract_keywords(df)
# 填充缺失值
df = df.fillna("null")
df.insert(0, '序号', range(1, len(df) + 1))
# 保存结果
df.to_csv('combination.csv', index=False, encoding='utf-8-sig')
save_to_database(df, db_config)
print("数据处理完成!")
if __name__ == "__main__":
main()