pandas 比较2个df的不同
pandas, 比较2个dataframe, 其中basedf是基础表, 有所有的数据列, odoodf是数据表里面的列在basedf里都存在, 但是行数可能没有basedf多, 现在需要比较这2个df, 比较指定的几个列,这几个列必须在2个df里都存在. 基于一个主键列, 如果这2个df任何一列的数据不一样, 把结果列出来, 同时如果basedf某些主键的行数据在odoodf不存在, 也列出来, 结果放在同一个结果表里.
import pandas as pd
def compare_dataframes_detailed(basedf, odoodf, key_column, compare_columns):
# 确保列存在
required_columns = [key_column] + compare_columns
for col in required_columns:
if col not in basedf.columns or col not in odoodf.columns:
raise ValueError(f"列 '{col}' 在其中一个DataFrame中不存在")
# 设置索引
base = basedf[required_columns].set_index(key_column)
odoo = odoodf[required_columns].set_index(key_column)
# 找出所有差异
differences = []
# 检查odoo中缺失的行
missing_keys = base.index.difference(odoo.index)
for key in missing_keys:
diff = {
'主键': key,
'差异类型': 'odoo中缺失',
**base.loc[key].to_dict()
}
differences.append(diff)
# 检查共同存在的行是否有差异
common_keys = base.index.intersection(odoo.index)
for key in common_keys:
base_row = base.loc[key]
odoo_row = odoo.loc[key]
if not base_row.equals(odoo_row):
diff_cols = [col for col in compare_columns if base_row[col] != odoo_row[col]]
diff = {
'主键': key,
'差异类型': '数据不一致',
'差异列': ', '.join(diff_cols),
**base_row.to_dict()
}
# 添加odoo的值作为参考
for col in diff_cols:
diff[f'odoo_{col}'] = odoo_row[col]
differences.append(diff)
# 创建结果DataFrame
if differences:
return pd.DataFrame(differences)
return pd.DataFrame(columns=['主键', '差异类型'] + compare_columns)
# 示例数据
basedf = pd.DataFrame({
'id': [1, 2, 3, 4],
'name': ['Alice', 'Bob', 'Charlie', 'David'],
'age': [25, 30, 35, 40],
'salary': [5000, 6000, 7000, 8000]
})
odoodf = pd.DataFrame({
'id': [1, 2, 4],
'name': ['Alice', 'Bobby', 'David'],
'age': [25, 30, 45],
'salary': [5000, 6000, 8000]
})
# 比较
key_column = 'id'
compare_columns = ['name', 'age', 'salary']
result = compare_dataframes_detailed(basedf, odoodf, key_column, compare_columns)
print(result)
self
函数
def compare_dfs(old_df: pd.DataFrame, new_df: pd.DataFrame, key: str, cols_to_compare: list[str]):
for col in cols_to_compare:
if col not in old_df.columns or col not in new_df.columns:
raise ValueError(f"列 '{col}' 在其中一个DataFrame中不存在")
old_df = old_df[cols_to_compare]
new_df = new_df[cols_to_compare]
old_df.columns = ["old_{}".format(x) if x!=key else key for x in old_df.columns]
new_df.columns = ["new_{}".format(x) if x!=key else key for x in new_df.columns]
df = pd.merge(old_df, new_df, on=key, how="outer")
df['status'] = ''
r = pd.DataFrame()
for _, row in df.iterrows():
t = 0
for col in cols_to_compare:
if col != key:
old = row["old_{}".format(col)]
new = row["new_{}".format(col)]
row[col] = True if old == new else False
if old == new:
row[col] = True
else:
t = t + 1
row[col] = False
row['status'] = '{}个不同'.format(t) if t != len(cols_to_compare)-1 else '不存在'
r = pd.concat([r, pd.DataFrame([row.to_dict()])], axis=0)
return r
调用
def compare():
lastweek_path = basedict['currentweek']
lastweek_df = pd.read_excel(lastweek_path, dtype=str, na_values={None: 'N/A'}, keep_default_na=False)
currentdf = get_customer_master_data()
cols = list(currentdf.columns)
for col in cols:
currentdf[col] = currentdf[col].fillna('')
lastweek_df[col] = lastweek_df[col].fillna('')
currentdf[col] = currentdf[col].astype(str)
lastweek_df[col] = lastweek_df[col].astype(str)
result = compare_dfs(old_df=lastweek_df, new_df=currentdf, key='uid', cols_to_compare=cols)
lastweek_df.to_excel(basedict['lastweek'], index=False)
currentdf.to_excel(basedict['currentweek'], index=False)
result.to_excel(basedict['compare'], index=False)
浙公网安备 33010602011771号