一、     数据结构分析:

(1)京津冀三省的2015年度的科技成果数据原始表,为Access数据库;

 

(2)要求将三省的科技成果数据汇总到同一表中(要求结果表为MySql数据表);

(3)三个原始数据表结构不一致,要求结果表中包括所有的字段,表达意思相同或相似的字段要进行合并,不允许丢失字段(若只有本表独有字段,在结果表中其他两表数据在该字段填入空值)。

二、     数据同步练习:要求采用编程实现三个原始表数据同步功能,将三个表的数据同步到一个结果表中。

三、    数据清洗练习:

(1)重复记录清洗,分析结果表中是否存在重复的数据记录,主要是地域和成果名称相同即判定为重复记录,保留一条记录,并补充其他重复记录中独有的数据字段内容,再删除其余记录。

(2)在结果表中追加年份和地域两个标准维度字段,如果原始表中存在该字段则直接转化成维度字段,若不存在则根据单位名称确定地域字段内容,天津科技成果表中不存在年度字段,则直接将年度维度字段确定为2015年。

# -*- coding: utf-8 -*-
import pandas as pd
#  从excel中读取数据,第一行为列名
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# import sys
# from sentence_transformers.util import cos_sim
# from sentence_transformers import SentenceTransformer as SBert

# Load the four exported result tables.  dtype='str' keeps every raw value
# intact (years, codes and IDs keep leading zeros and are never coerced).
_source_files = ['tc1.csv', 'tc2.csv', 'tc3.csv', 'tc4.csv']
_sources = []
for _path in _source_files:
    _df = pd.read_csv(_path, encoding='utf-8', dtype='str')
    # The first column of each export is a per-file row index — drop it
    # before merging so it does not collide across files.
    _df.drop(_df.columns[0], axis=1, inplace=True)
    _sources.append(_df)

# Stack all sources into one frame.  ignore_index=True gives the combined
# frame a unique 0..n-1 index; the original concat kept each file's own
# indices, so labels repeated, which silently misaligns any later
# index-aligned assignment (e.g. adding a keyword column).
combination_df = pd.concat(_sources, ignore_index=True)

# Report any column names that appear more than once after the concat.
# (One vectorized pass replaces the old O(n^2) pairwise scan; the unused
# `cnt` counter is gone.)
_dup_names = combination_df.columns[combination_df.columns.duplicated()]
for _name in _dup_names.unique():
    print(_name)

# Merge the two "introduction" columns into a single 简介 field:
# prefer 项目简介, fall back to 成果简介 where it is missing.
combination_df['简介'] = combination_df['项目简介'].combine_first(combination_df['成果简介'])
combination_df.drop(['项目简介', '成果简介'], axis=1, inplace=True)

# Extract TF-IDF keywords from the merged introduction text (从简介中提取关键词).
from jieba import analyse

# Missing introductions become the empty string so jieba always receives a
# str — the old fillna(0) produced ints, which extract_tags cannot handle.
_intro_texts = combination_df['简介'].fillna('').tolist()

# analyse.extract_tags returns a (possibly empty) list of keywords, never
# None, so no None-check is needed.  Keywords are joined with spaces so they
# stay distinguishable.  The old loops started at index 1, which dropped the
# first record entirely and the first keyword of every remaining record.
_keyword_strings = [' '.join(analyse.extract_tags(str(_text))) for _text in _intro_texts]

# Assign as a plain list: values align by position, which is safe even when
# the frame's index contains repeated labels.
combination_df['关键词'] = _keyword_strings

# Collapse groups of same-meaning columns into one canonical field.
# For each group, the first listed source wins; later sources only fill
# the gaps (combine_first), then every source column except the canonical
# one is dropped.
_field_groups = [
    # 标准字段       原始字段(按优先级)
    ('单位', ['单位名称', '单位', '完成单位', '成果产出单位']),
    ('应用行业', ['应用行业名称', '行业']),
    ('知识产权', ['取得的知识产权', '知识产权情况']),
    # 标准化维度 '年度': keep 年度, back-fill from 成果分布年份.
    ('年度', ['年度', '成果分布年份']),
]

for _target, _sources in _field_groups:
    _merged = combination_df[_sources[0]]
    for _src in _sources[1:]:
        _merged = _merged.combine_first(combination_df[_src])
    combination_df[_target] = _merged
    _to_drop = [_col for _col in _sources if _col != _target]
    combination_df.drop(_to_drop, axis=1, inplace=True)

# ---- Duplicate-record cleaning (spec 三(1)) ----
# A record counts as a duplicate when both 成果名称 and 省市 match.
# keep=False marks every copy so the full duplicate set is printed.
duplicate_rows = combination_df[combination_df.duplicated(subset=['成果名称', '省市'], keep=False)]

if len(duplicate_rows) > 0:
    print("清除的重复数据行:")
    print(duplicate_rows)
    # Actually delete the extra copies (the old code only printed them,
    # despite the "removed" message), keeping the first occurrence.
    # TODO(review): spec 三(1) also asks to back-fill fields that only the
    # dropped copies contain before deleting — not implemented here.
    combination_df.drop_duplicates(subset=['成果名称', '省市'], keep='first', inplace=True)
else:
    print('无重复数据')

# ---- Standard dimension 地域 ----
# Use 省市 directly as the region; fall back to 联系单位地址 when missing.
combination_df['地域'] = combination_df['省市'].combine_first(combination_df['联系单位地址'])
combination_df.drop(['省市', '联系单位地址'], axis=1, inplace=True)

# ---- Standard dimension 年度 ----
# The Tianjin source table has no year field; per the task spec those rows
# get the fixed value 2015.
combination_df['年度'] = combination_df['年度'].fillna('2015')

# Remaining gaps become the literal string "null" for the MySQL load below.
combination_df = combination_df.fillna("null")
# 序号: 1-based running number inserted as the first column.
combination_df.insert(0, '序号', range(1, len(combination_df) + 1))

# DDL fragment "<col> text null, ..." — every field is stored as nullable
# text because the heterogeneous sources are all read as strings.
define_type = '\n' + ',\n'.join(f'{column} text null' for column in combination_df.columns) + ' \n'

# Column list and matching "%s" placeholders for the parameterized INSERT.
# str.join replaces the old "is this the last column name?" comparison,
# which misfired whenever the final column's name also appeared earlier.
str_columns = ', '.join(combination_df.columns)
str_s = ', '.join(['%s'] * len(combination_df.columns))

print(str_columns)
print(str_s)
# ---- Write the merged result to MySQL (写入数据库) ----
import pymysql as sql

# NOTE(review): credentials are hard-coded; move to config/env for real use.
conn = sql.connect(host='localhost', user='root', password='20240108', database='usr', charset='utf8')
cursor = conn.cursor()

drop_table_query = 'drop table if exists combination_table;'
create_table_query = f'create table if not exists combination_table({define_type});'
# Parameterized INSERT — values are bound by the driver, never interpolated.
insert_table_query = f'insert into combination_table({str_columns}) values ({str_s});'
print(drop_table_query)
print(create_table_query)
print(insert_table_query)

try:
    # Convert the frame to the row tuples the DB-API driver expects.
    # (Built once; the old code built this list twice.)
    data_tuples = [tuple(x) for x in combination_df.to_numpy()]

    cursor.execute(drop_table_query)
    cursor.execute(create_table_query)
    # executemany + a single commit instead of one commit per row:
    # one transaction flush rather than one per record.
    cursor.executemany(insert_table_query, data_tuples)
    conn.commit()
    print("SQL statements executed successfully.")
except Exception as e:
    print(f"Error executing SQL statements: {str(e)}")
finally:
    cursor.close()
    conn.close()

# 将DataFrame写入CSV文件 — also keep a CSV copy of the final result table.
combination_df.to_csv('combination.csv', index=False)

 

posted on 2024-04-09 17:43  XiSoil  阅读(29)  评论(0)    收藏  举报