一、 数据结构分析:
(1)京津冀三省的2015年度的科技成果数据原始表,为Access数据库;
(2)要求将三省的科技成果数据汇总到同一表中(要求结果表为MySql数据表);
(3)三个原始数据表结构不一致,要求结果表中包括所有的字段,表达意思相同或相似的字段要进行合并,不允许丢失字段(若只有本表独有字段,在结果表中其他两表数据在该字段填入空值)。
二、 数据同步练习:要求采用编程方式实现三个原始表数据同步功能,将三个表的数据同步到一个结果表中。
三、 数据清洗练习:
(1)重复记录清洗,分析结果表中是否存在重复的数据记录,主要是地域和成果名称相同即判定为重复记录,保留一条记录,并补充其他重复记录中独有的数据字段内容,再删除其余记录。
(2)在结果表中追加年度和地域两个标准维度字段,如果原始表中存在该字段则直接转化成维度字段,若不存在则根据单位名称确定地域字段内容,天津科技成果表中不存在年度字段,则直接将年度维度字段确定为2015年。
# -*- coding: utf-8 -*-
"""京津冀科技成果数据同步与清洗脚本。

流程:
1. 读取四个原始 CSV 表(首行为列名,全部按字符串读取),丢弃导出时附带的索引列;
2. 纵向合并为一张结果表,同名字段自动对齐,语义相同/相似的字段用 combine_first 合并;
3. 用 jieba 的 TF-IDF 从"简介"中逐行抽取关键词;
4. 按(成果名称, 省市)清洗重复记录:组内互补缺失字段后仅保留一条;
5. 补充标准维度字段:年度(缺失统一填 2015)与地域;
6. 结果写入 MySQL 表 combination_table,并导出 combination.csv。
"""

import numpy as np
import pandas as pd
import pymysql as sql
from jieba import analyse
from sklearn.feature_extraction.text import TfidfVectorizer  # 保留原有依赖(当前未使用)

# ---------- 一、读取原始数据 ----------
frames = []
for path in ('tc1.csv', 'tc2.csv', 'tc3.csv', 'tc4.csv'):
    df = pd.read_csv(path, encoding='utf-8', dtype='str')
    # 第一列是导出时附带的行索引字段,与业务无关,直接丢弃
    df.drop(df.columns[0], axis=1, inplace=True)
    frames.append(df)

# ignore_index=True 修复原实现的索引重复问题:concat 后各源表的行索引互相
# 重叠,后续按位置新增"关键词"列时会发生错位对齐
combination_df = pd.concat(frames, ignore_index=True)

# 校验输出:合并后若仍存在重名列则打印(pandas 按列名对齐后理论上不会出现)
_cols = combination_df.columns.tolist()
for x in range(len(_cols) - 1):
    for y in range(x + 1, len(_cols)):
        if _cols[x] == _cols[y]:
            print(_cols[x])


def _merge_columns(df, target, candidates):
    """将若干表达相同含义的列依次 combine_first 合并为 target 列。

    candidates 中实际存在的列按给定顺序取第一个非空值写入 target;
    除 target 外的原始列在合并后删除,字段内容不丢失。
    """
    existing = [c for c in candidates if c in df.columns]
    merged = df[existing[0]]
    for c in existing[1:]:
        merged = merged.combine_first(df[c])
    df[target] = merged
    df.drop([c for c in existing if c != target], axis=1, inplace=True)


# ---------- 二、合并相似字段 ----------
_merge_columns(combination_df, '简介', ['项目简介', '成果简介'])
_merge_columns(combination_df, '单位', ['单位名称', '单位', '完成单位', '成果产出单位'])
_merge_columns(combination_df, '应用行业', ['应用行业名称', '行业'])
_merge_columns(combination_df, '知识产权', ['取得的知识产权', '知识产权情况'])
_merge_columns(combination_df, '年度', ['年度', '成果分布年份'])


def _extract_keywords(text):
    """用 jieba 的 TF-IDF 从简介文本抽取关键词并拼接成串;空值返回空串。

    修复原实现的三处问题:extract_tags 返回 list 而非 None,无需判 None;
    原 range(1, ...) 同时跳过了首行记录和每行第一个关键词(off-by-one);
    原 dd.fillna("0") 未赋值回变量,属于无效调用。
    """
    if not isinstance(text, str) or not text:
        return ''
    # 与原实现一致:关键词之间不加分隔符直接拼接
    return ''.join(analyse.extract_tags(text))


# 索引已重建(ignore_index),map 结果可与结果表逐行对齐
combination_df['关键词'] = combination_df['简介'].fillna('').map(_extract_keywords)

# ---------- 三、清洗重复记录 ----------
# 需求:地域与成果名称相同即判定为重复;保留一条并用组内其他记录补齐缺失字段
dup_mask = combination_df.duplicated(subset=['成果名称', '省市'], keep=False)
if dup_mask.any():
    print("清除的重复数据行:")
    print(combination_df[dup_mask])
    ordered_cols = combination_df.columns
    # groupby(...).first() 对每列取组内第一个非空值,即"保留一条记录并
    # 补充其余重复记录独有的字段内容",随后恢复原有列顺序
    combination_df = (combination_df
                      .groupby(['成果名称', '省市'], dropna=False, sort=False)
                      .first()
                      .reset_index()[ordered_cols])
else:
    print('无重复数据')

# ---------- 四、标准维度字段 ----------
# 地域:优先取"省市",缺失时退回"联系单位地址"
combination_df['地域'] = combination_df['省市'].combine_first(combination_df['联系单位地址'])
combination_df.drop(['省市', '联系单位地址'], axis=1, inplace=True)

# 年度:天津原始表不含年度字段,按需求统一填 2015
combination_df['年度'] = combination_df['年度'].fillna('2015')

# 其余缺失值统一填字符串 "null",并在首列插入自增序号
combination_df = combination_df.fillna("null")
combination_df.insert(0, '序号', range(1, len(combination_df) + 1))

# ---------- 五、生成建表 / 插入语句 ----------
columns = list(combination_df.columns)
define_type = '\n' + ',\n'.join(f'{col} text null' for col in columns) + ' \n'
str_columns = ', '.join(columns)
# 值一律走 %s 参数化占位符,避免拼接 SQL
str_s = ', '.join(['%s'] * len(columns))
print(str_columns)
print(str_s)

drop_table_query = 'drop table if exists combination_table;'
create_table_query = f'create table if not exists combination_table({define_type});'
insert_table_query = f'insert into combination_table({str_columns}) values ({str_s});'
print(drop_table_query)
print(create_table_query)
print(insert_table_query)

# ---------- 六、写入 MySQL 并导出 CSV ----------
data_tuples = [tuple(row) for row in combination_df.to_numpy()]

# NOTE(review): 口令硬编码在源码中,生产环境应改用环境变量或配置文件
conn = sql.connect(host='localhost', user='root', password='20240108',
                   database='usr', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute(drop_table_query)
        cursor.execute(create_table_query)
        # executemany 批量插入,替代原来的逐行 execute
        cursor.executemany(insert_table_query, data_tuples)
    conn.commit()
    print("SQL statements executed successfully.")
except Exception as e:
    print(f"Error executing SQL statements: {str(e)}")
finally:
    conn.close()

# 同步结果另存一份 CSV 备查
combination_df.to_csv('combination.csv', index=False)