数据合并清洗相关代码02

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from jieba import analyse
import pymysql as sql
from tqdm import tqdm  # 添加进度条


def load_and_clean_data(file_paths):
    """加载并清理数据"""
    dfs = []
    for path in file_paths:
        try:
            df = pd.read_csv(path, encoding='utf-8', dtype='str')
            df.drop(df.columns[0], axis=1, inplace=True)  # 删除第一列索引
            dfs.append(df)
        except Exception as e:
            print(f"加载文件 {path} 失败: {str(e)}")
    return pd.concat(dfs, ignore_index=True)


def merge_similar_columns(df):
    """合并相似列"""
    # 简介合并
    df['简介'] = df.get('项目简介', pd.Series()).combine_first(
        df.get('成果简介', pd.Series()))
    df.drop(['项目简介', '成果简介'], axis=1, errors='ignore', inplace=True)

    # 单位合并
    df['单位'] = df.get('单位名称', pd.Series()).combine_first(
        df.get('单位', pd.Series())).combine_first(
        df.get('完成单位', pd.Series())).combine_first(
        df.get('成果产出单位', pd.Series()))
    df.drop(['单位名称', '完成单位', '成果产出单位'], axis=1, errors='ignore', inplace=True)

    # 其他字段合并...
    return df


def extract_keywords(df):
    """从简介中提取关键词"""
    df['简介'] = df['简介'].fillna('')
    keywords_list = []
    for text in tqdm(df['简介'], desc="提取关键词"):
        try:
            keywords = analyse.extract_tags(str(text)) if text else []
            keywords_list.append(", ".join(keywords))
        except Exception as e:
            print(f"关键词提取出错: {str(e)}")
            keywords_list.append("")
    df['关键词'] = keywords_list
    return df


def save_to_database(df, db_config):
    """保存到数据库"""
    try:
        conn = sql.connect(**db_config)
        cursor = conn.cursor()

        # 生成表结构
        columns = df.columns
        define_type = ",\n".join([f"{col} text null" for col in columns])

        # SQL语句
        drop_table_query = 'DROP TABLE IF EXISTS combination_table;'
        create_table_query = f'CREATE TABLE combination_table ({define_type});'
        insert_query = f'INSERT INTO combination_table ({", ".join(columns)}) VALUES ({", ".join(["%s"] * len(columns))});'

        # 执行SQL
        cursor.execute(drop_table_query)
        cursor.execute(create_table_query)

        # 批量插入
        data_tuples = [tuple(x) for x in df.to_numpy()]
        cursor.executemany(insert_query, data_tuples)
        conn.commit()

        print(f"成功插入 {len(data_tuples)} 条数据")
    except Exception as e:
        print(f"数据库操作失败: {str(e)}")
    finally:
        cursor.close()
        conn.close()


def main():
    # 配置
    file_paths = [
        r'F:\ruo项目\数据\cg_2015年第1-2-3期.csv',
        r'F:\ruo项目\数据\cg_天津科技成果.csv',
        r'F:\ruo项目\数据\科技成果.csv'
        r'F:\ruo项目\数据\1.4北京推介项目成果.csv'
    ]
    db_config = {
        'host': 'localhost',
        'user': 'root',
        'password': '123456',
        'database': 'world',
        'charset': 'utf8'
    }

    # 数据处理流程
    print("开始处理数据...")
    df = load_and_clean_data(file_paths)
    df = merge_similar_columns(df)
    df = extract_keywords(df)

    # 填充缺失值
    df = df.fillna("null")
    df.insert(0, '序号', range(1, len(df) + 1))

    # 保存结果
    df.to_csv('combination.csv', index=False, encoding='utf-8-sig')
    save_to_database(df, db_config)
    print("数据处理完成!")


if __name__ == "__main__":
    main()

 

posted @ 2025-06-14 13:27  艾鑫4646  阅读(10)  评论(0)    收藏  举报