pandas 比较2个df的不同

pandas, 比较2个dataframe, 其中basedf是基础表, 有所有的数据列, odoodf是数据表里面的列在basedf里都存在, 但是行数可能没有basedf多, 现在需要比较这2个df, 比较指定的几个列,这几个列必须在2个df里都存在. 基于一个主键列, 如果这2个df任何一列的数据不一样, 把结果列出来, 同时如果basedf某些主键的行数据在odoodf不存在, 也列出来, 结果放在同一个结果表里.

 

import pandas as pd


def compare_dataframes_detailed(basedf, odoodf, key_column, compare_columns):
    # 确保列存在
    required_columns = [key_column] + compare_columns
    for col in required_columns:
        if col not in basedf.columns or col not in odoodf.columns:
            raise ValueError(f"列 '{col}' 在其中一个DataFrame中不存在")

    # 设置索引
    base = basedf[required_columns].set_index(key_column)
    odoo = odoodf[required_columns].set_index(key_column)

    # 找出所有差异
    differences = []

    # 检查odoo中缺失的行
    missing_keys = base.index.difference(odoo.index)
    for key in missing_keys:
        diff = {
            '主键': key,
            '差异类型': 'odoo中缺失',
            **base.loc[key].to_dict()
        }
        differences.append(diff)

    # 检查共同存在的行是否有差异
    common_keys = base.index.intersection(odoo.index)
    for key in common_keys:
        base_row = base.loc[key]
        odoo_row = odoo.loc[key]

        if not base_row.equals(odoo_row):
            diff_cols = [col for col in compare_columns if base_row[col] != odoo_row[col]]
            diff = {
                '主键': key,
                '差异类型': '数据不一致',
                '差异列': ', '.join(diff_cols),
                **base_row.to_dict()
            }
            # 添加odoo的值作为参考
            for col in diff_cols:
                diff[f'odoo_{col}'] = odoo_row[col]

            differences.append(diff)

    # 创建结果DataFrame
    if differences:
        return pd.DataFrame(differences)
    return pd.DataFrame(columns=['主键', '差异类型'] + compare_columns)

# 示例数据
basedf = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'age': [25, 30, 35, 40],
    'salary': [5000, 6000, 7000, 8000]
})

odoodf = pd.DataFrame({
    'id': [1, 2, 4],
    'name': ['Alice', 'Bobby', 'David'],
    'age': [25, 30, 45],
    'salary': [5000, 6000, 8000]
})

# 比较
key_column = 'id'
compare_columns = ['name', 'age', 'salary']

result = compare_dataframes_detailed(basedf, odoodf, key_column, compare_columns)
print(result)

  

 

self

函数

def compare_dfs(old_df: pd.DataFrame, new_df: pd.DataFrame, key: str, cols_to_compare: list[str]):
    for col in cols_to_compare:
        if col not in old_df.columns or col not in new_df.columns:
            raise ValueError(f"列 '{col}' 在其中一个DataFrame中不存在")
    old_df = old_df[cols_to_compare]
    new_df = new_df[cols_to_compare]
    old_df.columns = ["old_{}".format(x) if x!=key else key for x in old_df.columns]
    new_df.columns = ["new_{}".format(x) if x!=key else key for x in new_df.columns]
    df = pd.merge(old_df, new_df, on=key, how="outer")
    df['status'] = ''
    r = pd.DataFrame()
    for _, row in df.iterrows():
        t = 0
        for col in cols_to_compare:
            if col != key:
                old = row["old_{}".format(col)]
                new = row["new_{}".format(col)]
                row[col] = True if old == new else False
                if old == new:
                    row[col] = True
                else:
                    t = t + 1
                    row[col] = False
        row['status'] = '{}个不同'.format(t) if t != len(cols_to_compare)-1 else '不存在'
        r = pd.concat([r, pd.DataFrame([row.to_dict()])], axis=0)
    return r

  

 

 

 

 

 

调用

def compare():
    lastweek_path = basedict['currentweek']
    lastweek_df = pd.read_excel(lastweek_path, dtype=str, na_values={None: 'N/A'}, keep_default_na=False)
    currentdf = get_customer_master_data()
    cols = list(currentdf.columns)
    for col in cols:
        currentdf[col] = currentdf[col].fillna('')
        lastweek_df[col] = lastweek_df[col].fillna('')
        currentdf[col] = currentdf[col].astype(str)
        lastweek_df[col] = lastweek_df[col].astype(str)
    result = compare_dfs(old_df=lastweek_df, new_df=currentdf, key='uid', cols_to_compare=cols)
    lastweek_df.to_excel(basedict['lastweek'], index=False)
    currentdf.to_excel(basedict['currentweek'], index=False)
    result.to_excel(basedict['compare'], index=False)

  

posted @ 2025-04-28 16:03  CrossPython  阅读(155)  评论(0)    收藏  举报