比对两个excel文件数据差异
背景
工作中需要一个测试需求:需要比对两个excel文件的内容,以门店编码为唯一键,比对其他字段值不一致的地方,如有不一致需要写入另外一个文件
解决方案
使用python代码实现
# -*- coding: utf-8 -*-
"""
@File : 数据核对脚本.py
@Author : simon
@email : 294168604@qq.com
@Software: PyCharm
@Time : 2024/7/16 09:00
"""
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import PatternFill
# 读取Excel文件
file1 = 'MDM.xlsx'
file2 = 'MDP.xlsx'
df1 = pd.read_excel(file1)
df2 = pd.read_excel(file2)
# 用于替换NaN的值
na_value = "null"
# 合并数据,比较两个文件
merged_df = pd.merge(df1, df2, on='store_code', suffixes=('_file1', '_file2'), how='outer', indicator=True)
# 新建一个Workbook
wb = Workbook()
ws = wb.active
# 定义黄色填充
yellow_fill = PatternFill(start_color='FFFF00', end_color='FFFF00', fill_type='solid')
red_fill = PatternFill(start_color='FF0000', end_color='FF0000', fill_type='solid')
green_fill = PatternFill(start_color='00FF00', end_color='00FF00', fill_type='solid')
# 写入表头
headers = ['store_code'] + [col for col in df1.columns if col != 'store_code'] + ['_merge']
ws.append(headers)
# 找出不同的地方并写入新的Workbook
for index, row in merged_df.iterrows():
diff = [row['store_code']]
has_difference = False
for col in df1.columns:
if col != 'store_code':
val_file1 = row[f'{col}_file1'] if pd.notna(row[f'{col}_file1']) else na_value
val_file2 = row[f'{col}_file2'] if pd.notna(row[f'{col}_file2']) else na_value
if val_file1 != val_file2:
cell_value = f"{val_file1} -> {val_file2}"
has_difference = True
else:
cell_value = val_file1
diff.append(cell_value)
diff.append(row['_merge'])
ws.append(diff)
if not has_difference:
for cell in ws[-1]:
cell.fill = green_fill
# 标记不同的地方为黄色
for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=2, max_col=ws.max_column - 1):
for cell in row:
if '->' in str(cell.value):
cell.fill = yellow_fill
# 标识哪一个表没有某个值
for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=ws.max_column, max_col=ws.max_column):
for cell in row:
if cell.value == 'left_only':
cell.fill = red_fill
cell.value = 'MDP表缺失'
elif cell.value == 'right_only':
cell.fill = red_fill
cell.value = 'MDM表缺失'
# 保存文件
output_file = 'diff_output.xlsx'
wb.save(output_file)
效果
可以看到下图将不同的地方做了标黄处理,填入了两个表的值