开篇之始,咱先感叹一下近期AI的迅速发展,国产AI平台Deepseek的发布与崛起,可以说是将整个AI圈子进行清洗了一圈,而且Deepseek还是一个开源的AI,只要是我们有足够的硬件支持,就可以在自己的本地部署建立一套Deepseek的AI工具进行使用。奈何自己属于社会底层一员,就只能是先借用官方Deepseek工具,或者是后台是Deepseek的其他AI工具进行使用,如腾讯元宝等。
这段时间根据业务需求输出的报表文件有点多,还需要对相关数据与原数据进行核对,但是这部分的科目段值有点多,以及相关数据量也比较大,一个科目一个科目的去核对,实在是太费人了,又费眼睛又费脑子更费手。。。。。。
由于之前我简单的入门过关于Python数据处理的内容,我就想开发一个Python的数据校验脚本,帮我去核对这部分输出数据。说干就干!!!
因为我在Python开发上面的精力投入和使用并不是很多,所以我在Python开发上面的能力是非常薄弱的,导致自己最开始开发的初版代码,看上去真的就是纯纯为了实现需求而实现的内容,并没有一点儿设计的元素在内...T_T
在耗费了差不多一整天的时间,以及各种AI搜索,终于是构建出来了可以满足整个数据处理和验证的代码,但是在运行时,真的就是属于顺序执行,也没有使用到面向对象的开发理念。。。话不多说,直接上垃圾代码:
开发环境:
windows10;
conda version: conda 24.11.3
python version(conda env): Python 3.10.16
pandas version: 2.2.3
cchardet version: 2.1.7
Tips:真属于是垃圾代码,各位大佬请手下留情~~~
初版垃圾代码【点击查看代码】
import os
import cchardet
import pandas as pd
from datetime import datetime as dt
jeDataFile = r"C:\Users\Desktop\0010JE.xlsx"
jeFileColumns = ["PERIOD", "ACCOUNT", "IC", "ACCOUNTED_DR", "ACCOUNTED_CR", "JE_SOURCE"]
balanceDataFile = r"C:\Users\Desktop\校验用表.xlsx"
balanceColumns = ["ACCOUNT", "IC", "BEGINNINGBALANCE", "PERIODACTIVITY", "ENDINGBALANCE"]
targetDataFile = r"C:\Users\Desktop\供应商.xls"
targetFileColumns = ["会计科目", "IC", "期初余额", "本期借方", "本期贷方", "期末余额"]
utf8_data_file = r"utf8_data.xls"
# pandas 配置
# 显示列的所有内容,列宽无限制
pd.set_option('display.max_colwidth', None)
# 显示所有行
pd.set_option('display.max_rows', None)
# 显示所有列
pd.set_option('display.max_columns', None)
# 设置Pandas全局显示格式(不影响数据存储)
pd.options.display.float_format = '{:.2f}'.format
def detect_encoding(file_path):
with open(file_path, 'rb') as f:
raw_data = f.read()
result = cchardet.detect(raw_data)
return result['encoding']
def read_and_convert(gbk_file, utf8_file):
# 读取二进制数据并解码(优先尝试GB2312,若失败则用GB18030)
try:
with open(gbk_file, 'rb') as f:
content = f.read().decode(encoding="GB2312")
except UnicodeDecodeError:
with open(gbk_file, 'rb') as f:
content = f.read().decode('GB18030') # 更兼容的中文编码
# 替换HTML中的编码声明为UTF-8
content = content.replace('charset=GB2312', 'charset=UTF-8')
print(f"构建UTF-8格式文件...")
# 写入UTF8文件
with open(utf8_file, 'w', encoding='utf-8') as f:
f.write(content)
def check_utf8_file(original_file, utf8_file):
if os.path.exists(utf8_file):
encode = detect_encoding(utf8_file)
if encode != "utf-8":
read_and_convert(original_file, utf8_file)
else:
read_and_convert(original_file, utf8_file)
def read_je_data(je_data_file, segment3: str, segment8: str):
je_sum_data = {}
global je_balance
try:
je_df = pd.read_excel(je_data_file, engine="openpyxl")
# 数据类型转换及过滤列
je_df = je_df.assign(PERIOD=lambda x: x["PERIOD"].astype(str).str.strip(),
ACCOUNT=lambda x: x["ACCOUNT"].astype(str).str.strip(),
IC=lambda x: x["IC"].astype(str).str.pad(3, fillchar='0'),
ACCOUNTED_DR=lambda x: pd.to_numeric(x["ACCOUNTED_DR"], errors='coerce').round(2),
ACCOUNTED_CR=lambda x: pd.to_numeric(x["ACCOUNTED_CR"], errors='coerce').round(2)
).filter(jeFileColumns)
account_data = je_df[(je_df["ACCOUNT"] == segment3) & (je_df["IC"] == segment8)]
if not account_data.empty:
je_summary = account_data[["ACCOUNTED_DR", "ACCOUNTED_CR"]].sum()
else:
print(f"日记账数据为空;科目段:{segment3};IC段:{segment8}")
je_summary = pd.Series({"ACCOUNTED_DR": 0.00,
"ACCOUNTED_CR": 0.00})
je_balance = (je_summary["ACCOUNTED_DR"] - je_summary["ACCOUNTED_CR"]).round(2)
je_sum_data["ACCOUNTED_DR"] = je_summary["ACCOUNTED_DR"].astype(float).round(2)
je_sum_data["ACCOUNTED_CR"] = je_summary["ACCOUNTED_CR"].astype(float).round(2)
je_sum_data["JE_BALANCE"] = je_balance.astype(float).round(2)
return je_sum_data
except:
print(f"处理日记账明细数据文件{je_data_file}异常!")
def read_balances_data(balance_data_file, segment3: str, segment8: str):
balances = {}
try:
balance_df = pd.read_excel(balance_data_file, engine="openpyxl")
# 数据类型强制转换及过滤列
balance_df = balance_df.assign(ACCOUNT=lambda x: x["ACCOUNT"].astype(str).str.strip(),
IC=lambda x: x["IC"].astype(str).str.pad(3, fillchar='0'),
BEGINNINGBALANCE=lambda x: pd.to_numeric(x["BEGINNINGBALANCE"],
errors='coerce').round(2),
PERIODACTIVITY=lambda x: pd.to_numeric(x["PERIODACTIVITY"],
errors='coerce').round(2),
ENDINGBALANCE=lambda x: pd.to_numeric(x["ENDINGBALANCE"], errors='coerce').round(
2)).filter(balanceColumns)
balances_data = balance_df[(balance_df["ACCOUNT"] == segment3) & (balance_df["IC"] == segment8)]
if not balances_data.empty:
balance_summary = balances_data[["BEGINNINGBALANCE", "PERIODACTIVITY", "ENDINGBALANCE"]].sum()
else:
balance_summary = pd.Series({"BEGINNINGBALANCE": 0.00,
"PERIODACTIVITY": 0.00,
"ENDINGBALANCE": 0.00})
balances["BEGINNINGBALANCE"] = balance_summary["BEGINNINGBALANCE"].astype(float).round(2)
balances["PERIODACTIVITY"] = balance_summary["PERIODACTIVITY"].astype(float).round(2)
balances["ENDINGBALANCE"] = balance_summary["ENDINGBALANCE"].astype(float).round(2)
return balances
except:
print(f"读取余额表数据文件{balance_data_file}异常;")
def get_request_account(utf8_file):
try:
# 读取数据
df = pd.read_html(utf8_file, flavor="html5lib")[1]
# 类型转换管道操作(使用assign避免原地修改)
coa_df = df.assign(会计科目=lambda x: x["会计科目"].astype(str).str.strip(),
IC=lambda x: x["IC"].astype(str).str.pad(3, fillchar='0')).filter(["会计科目", "IC"])
accounts = coa_df["会计科目"].unique()
ics = coa_df["IC"].unique()
return accounts, ics
except:
print("获取请求输出文件中的科目组合失败!")
def get_request_parameter(utf8_file):
try:
period = pd.read_html(utf8_file, flavor="html5lib")[0].iloc[1, 1]
period_from, period_to = period[2:8], period[9:15]
# 转换日期格式为 YYYY-MM
period_from_format = dt.strptime(period_from, "%y-%b").strftime("%Y-%m")
period_to_format = dt.strptime(period_to, "%y-%b").strftime("%Y-%m")
print(f"数据期间为 {period_from_format} 至 {period_to_format}")
except:
print(f"读取xls文件的table1异常!")
def read_xls_data(utf8_file, segment3: str, segment8: str):
balance_data = {}
# 读取期间参数
global net_balance
try:
# 读取数据
xls_df = pd.read_html(utf8_file, flavor="html5lib")[1]
# 类型转换管道操作(使用assign避免原地修改)
data_df = xls_df.assign(会计科目=lambda x: x["会计科目"].astype(str).str.strip(),
IC=lambda x: x["IC"].astype(str).str.pad(3, fillchar='0'),
期初余额=lambda x: pd.to_numeric(x["期初余额"], errors='coerce').round(2),
本期借方=lambda x: pd.to_numeric(x["本期借方"], errors='coerce').round(2),
本期贷方=lambda x: pd.to_numeric(x["本期贷方"], errors='coerce').round(2),
期末余额=lambda x: pd.to_numeric(x["期末余额"], errors='coerce').round(2)
).filter(targetFileColumns)
if segment3 is not None and segment8 is not None:
account_amount = data_df[(data_df["会计科目"] == segment3) & (data_df["IC"] == segment8)]
if not account_amount.empty:
summary_data = account_amount[["期初余额", "本期借方", "本期贷方", "期末余额"]].sum()
else:
print(f"筛选数据为空。")
summary_data = pd.Series({"期初余额": 0.00,
"本期借方": 0.00,
"本期贷方": 0.00,
"期末余额": 0.00})
net_balance = (summary_data["本期借方"] - summary_data["本期贷方"]).round(2)
balance_data["期初余额"] = summary_data['期初余额'].astype(float).round(2)
balance_data["本期借方"] = summary_data['本期借方'].astype(float).round(2)
balance_data["本期贷方"] = summary_data['本期贷方'].astype(float).round(2)
balance_data["本期余额"] = net_balance.astype(float).round(2)
balance_data["期末余额"] = summary_data['期末余额'].astype(float).round(2)
return balance_data
else:
print(f"筛选科目不可为空!")
return None
except:
print(f"目标文件{utf8_file}不存在.")
def check_equal(je_data: float, balance_data: float):
if je_data is not None and balance_data is not None:
return je_data == balance_data
def _drop_file(utf8_file):
try:
if os.path.isfile(utf8_file): # 确保是文件而非目录
os.remove(utf8_file)
elif os.path.exists(utf8_file):
print(f"路径 {utf8_file} 是文件夹,未执行删除")
return False
else:
print(f"文件 {utf8_file} 不存在或已经删除。")
except PermissionError:
print(f"权限不足,无法删除 {utf8_file}")
return False
except Exception as e:
print(f"删除 {utf8_file} 时发生未知错误: {str(e)}")
return False
if __name__ == '__main__':
user_input = str(input("并发请求输出文件【全路径+名称】(默认 {}):".format(targetDataFile)).strip())
# print(f'校验文件为:r"{user_input}" ')
request_file = r"{}".format(user_input) if r"{}".format(user_input) else targetDataFile
print(request_file)
# print(f"输入为空,自动使用默认输出文件:{targetDataFile}")
# 新建utf8编码文件,源文件编码为GB2312/GB18030
check_utf8_file(request_file, utf8_data_file)
# print(get_request_account(utf8_data_file)[0])
ic_segment = get_request_account(utf8_data_file)[1]
ic_segment = ic_segment[ic_segment != 'YYY']
# print(get_request_account(utf8_data_file)[1])
for acc in (get_request_account(utf8_data_file)[0]):
for ic in ic_segment:
# deal the output file
xls_data = read_xls_data(utf8_file=utf8_data_file, segment3=acc, segment8=ic)
# deal the je data file
balance_data = read_balances_data(balanceDataFile, segment3=acc, segment8=ic)
# 期初金额判断
print(f"当前科目段:{acc},IC段:{ic} ======> ")
if check_equal(xls_data["期初余额"], balance_data["BEGINNINGBALANCE"]):
print(f"科目段{acc},IC段{ic} 的期初数据一致。报表期初余额:{xls_data['期初余额']},日记账期初余额:{balance_data['BEGINNINGBALANCE']}")
else:
print(f"科目段{acc},IC段{ic} 的期初数据不等。报表期初余额:{xls_data['期初余额']},日记账期初余额:{balance_data['BEGINNINGBALANCE']}")
# 当期发生额余额判断
if check_equal(xls_data["本期余额"], balance_data["PERIODACTIVITY"]):
print(f"科目段{acc},IC段{ic} 的当期余额一致。报表当期余额:{xls_data['本期余额']},日记账余额:{balance_data['PERIODACTIVITY']}")
else:
print(f"科目段{acc},IC段{ic} 的期初数据不等。报表当期余额:{xls_data['本期余额']},日记账余额:{balance_data['PERIODACTIVITY']}")
# 期末余额判断
if check_equal(xls_data["期末余额"], balance_data["ENDINGBALANCE"]):
print(f"科目段{acc},IC段{ic} 的期末数据一致。报表期末余额:{xls_data['期末余额']},日记账期末余额:{balance_data['ENDINGBALANCE']}")
else:
print(f"科目段{acc},IC段{ic} 的期末数据不等。报表期末余额:{xls_data['期末余额']},日记账期末余额:{balance_data['ENDINGBALANCE']}")
# 删除生成的UTF8文件
_drop_file(utf8_data_file)
虽然说最后实现的数据处理是完成的我的需求内容,但是运行起来之后,发现跑的是真有些慢,是属于一个一个往出来蹦的那种。。。
但是!但是!但是!咱们不是有AI工具嘛,我直接丢给腾讯元宝,使用Deepseek模型帮我优化。。。哈哈哈哈哈
是的,他帮我优化出来了,也同样实现了我的需求项,运行起来是很快的,在解析完毕数据文件之后,几乎结果是一秒就出的。同样我也开始了自闭和自我怀疑:
- 我之前写的那是Python代码嘛?
- 我之前学的是Python语言嘛?
- 为啥Deepseek写出来的代码我一下看不懂了呢?
我只看到它优化出来的代码用到了面向对象的思想,其他的内容我是看着一脸懵圈。。。
来吧,我们一起看看这部分Deepseek优化出来的代码吧:
"""
功能:比对日记账、余额表与EBS请求报表数据的一致性
"""
import os
import logging
from datetime import datetime as dt
from typing import Tuple, Dict, Optional
import cchardet
import pandas as pd
from pandas import DataFrame
# --------------------------
# 配置类(参数集中管理)
# --------------------------
class Config:
# 文件路径
JE_DATA_PATH = r"C:\Users\w01986\Desktop\0010JE.xlsx"
BALANCE_DATA_PATH = r"C:\Users\w01986\Desktop\校验用表.xlsx"
TARGET_DATA_PATH = r"C:\Users\w01986\Desktop\供应商.xls"
UTF8_TEMP_FILE = r"utf8_temp.xls"
# 列映射
JE_COLUMNS = ["PERIOD", "ACCOUNT", "IC", "ACCOUNTED_DR", "ACCOUNTED_CR"]
BALANCE_COLUMNS = ["ACCOUNT", "IC", "BEGINNINGBALANCE", "PERIODACTIVITY", "ENDINGBALANCE"]
TARGET_COLUMNS = ["会计科目", "IC", "期初余额", "本期借方", "本期贷方", "期末余额"]
# 日志配置
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
# --------------------------
# 初始化配置
# --------------------------
logging.basicConfig(level=logging.INFO, format=Config.LOG_FORMAT)
pd.set_option("display.max_columns", None)
pd.options.display.float_format = "{:,.2f}".format
# --------------------------
# 工具函数
# --------------------------
def detect_file_encoding(file_path: str) -> str:
"""检测文件编码"""
with open(file_path, "rb") as f:
return cchardet.detect(f.read())["encoding"]
def read_and_convert(gbk_file, utf8_file):
# 读取二进制数据并解码(优先尝试GB2312,若失败则用GB18030)
try:
with open(gbk_file, 'rb') as f:
content = f.read().decode(encoding="GB2312")
except UnicodeDecodeError:
with open(gbk_file, 'rb') as f:
content = f.read().decode('GB18030') # 更兼容的中文编码
# 替换HTML中的编码声明为UTF-8
content = content.replace('charset=GB2312', 'charset=UTF-8')
logging.info(f"构建UTF-8格式文件...")
# 写入UTF8文件
with open(utf8_file, 'w', encoding='utf-8') as f:
f.write(content)
# --------------------------
# 数据处理核心类
# --------------------------
class DataProcessor:
def __init__(self):
self.je_df = self._load_je_data()
self.balance_df = self._load_balance_data()
self.target_df = self._load_target_data()
@staticmethod
def _preprocess_data(
df: DataFrame,
str_cols: list,
num_cols: list,
ic_col: str = "IC"
) -> DataFrame:
"""通用数据预处理"""
# 类型转换
for col in str_cols:
df[col] = df[col].astype(str).str.strip()
for col in num_cols:
df[col] = pd.to_numeric(df[col], errors="coerce").round(2)
# IC列特殊处理
if ic_col in df.columns:
df[ic_col] = df[ic_col].str.pad(3, fillchar="0")
# 清理 str_cols 的列中为空的行
return df.dropna(subset=str_cols)
def _load_je_data(self) -> DataFrame:
"""加载日记账数据"""
try:
df = pd.read_excel(
Config.JE_DATA_PATH,
engine="openpyxl",
usecols=Config.JE_COLUMNS
)
return self._preprocess_data(
df=df,
str_cols=["PERIOD", "ACCOUNT", "IC"],
num_cols=["ACCOUNTED_DR", "ACCOUNTED_CR"]
)
except Exception as e:
logging.error(f"日记账数据加载失败: {str(e)}")
raise
def _load_balance_data(self) -> DataFrame:
"""加载余额表数据"""
try:
df = pd.read_excel(
Config.BALANCE_DATA_PATH,
engine="openpyxl",
usecols=Config.BALANCE_COLUMNS
)
return self._preprocess_data(
df=df,
str_cols=["ACCOUNT", "IC"],
num_cols=["BEGINNINGBALANCE", "PERIODACTIVITY", "ENDINGBALANCE"]
)
except Exception as e:
logging.error(f"余额表数据加载失败: {str(e)}")
raise
@staticmethod
def format_datetime(period_name: str) -> str:
try:
return dt.strptime(period_name, "%y-%b").strftime("%Y-%m")
except Exception as e:
logging.error(f"期间格式转换失败:{period_name}")
def _load_target_data(self) -> DataFrame:
"""加载目标数据并转换编码"""
if not os.path.exists(Config.UTF8_TEMP_FILE):
read_and_convert(Config.TARGET_DATA_PATH, Config.UTF8_TEMP_FILE)
try:
period = pd.read_html(Config.UTF8_TEMP_FILE, flavor="html5lib")[0].iloc[1, 1]
period_from, period_to = self.format_datetime(period[2:8]), self.format_datetime(period[9:15])
logging.info(f"报表数据文件的期间为:{period_from} 至 {period_to}")
df = pd.read_html(Config.UTF8_TEMP_FILE, flavor="html5lib")[1]
return self._preprocess_data(
df=df,
str_cols=["会计科目", "IC"],
num_cols=["期初余额", "本期借方", "本期贷方", "期末余额"],
ic_col="IC"
).rename(columns={"会计科目": "ACCOUNT"})
except Exception as e:
logging.error(f"目标数据加载失败: {str(e)}")
raise
def _query_dataset(
self,
df: DataFrame,
account: str,
ic: str,
sum_cols: list
) -> Dict[str, float]:
"""通用数据查询方法"""
filtered = df[(df["ACCOUNT"] == account) & (df["IC"] == ic)]
if filtered.empty:
logging.warning(f"未找到数据: 科目={account}, IC={ic}")
return {col: 0.0 for col in sum_cols}
return filtered[sum_cols].sum().round(2).to_dict()
def get_je_summary(self, account: str, ic: str) -> Dict[str, float]:
"""获取日记账汇总数据"""
result = self._query_dataset(
self.je_df, account, ic, ["ACCOUNTED_DR", "ACCOUNTED_CR"]
)
result["BALANCE"] = result["ACCOUNTED_DR"] - result["ACCOUNTED_CR"]
return result
def get_balance_summary(self, account: str, ic: str) -> Dict[str, float]:
"""获取余额表汇总数据"""
return self._query_dataset(
self.balance_df,
account,
ic,
["BEGINNINGBALANCE", "PERIODACTIVITY", "ENDINGBALANCE"]
)
def get_target_summary(self, account: str, ic: str) -> Dict[str, float]:
"""获取目标报表汇总数据"""
result = self._query_dataset(
self.target_df,
account,
ic,
["期初余额", "本期借方", "本期贷方", "期末余额"]
)
result["本期余额"] = result["本期借方"] - result["本期贷方"]
return result
# --------------------------
# 业务逻辑处理器
# --------------------------
class ValidationEngine:
@staticmethod
def compare_values(
label: str,
expected: float,
actual: float,
tolerance: float = 0.01
) -> Tuple[bool, str]:
"""数值比对工具"""
diff = abs(expected - actual)
msg = (
f"{label}比对结果: "
f"预期={expected:.2f}, 实际={actual:.2f}, 差异={diff:.2f}"
)
return (diff <= tolerance), msg
def run_validation(self) -> None:
"""执行完整校验流程"""
processor = DataProcessor()
target_df = processor.target_df
# 获取需要校验的科目组合
unique_pairs = target_df[["ACCOUNT", "IC"]].drop_duplicates().values
for account, ic in unique_pairs:
if ic == "YYY":
continue
logging.info(f"\n{'=' * 40}\n正在校验: 科目={account}, IC={ic}\n{'=' * 40}")
try:
# 获取三套数据
target_data = processor.get_target_summary(account, ic)
je_data = processor.get_je_summary(account, ic)
balance_data = processor.get_balance_summary(account, ic)
# 执行比对
self._validate_beginning_balance(target_data, balance_data)
self._validate_period_activity(je_data, balance_data)
self._validate_ending_balance(target_data, balance_data)
except Exception as e:
logging.error(f"校验过程中发生异常: {str(e)}")
def _validate_beginning_balance(
self,
target: dict,
balance: dict
) -> None:
"""期初余额校验"""
is_match, msg = self.compare_values(
"期初余额",
target["期初余额"],
balance["BEGINNINGBALANCE"]
)
logging.info(msg if is_match else f"!! 异常 !! {msg}")
def _validate_period_activity(
self,
je: dict,
balance: dict
) -> None:
"""本期发生额校验"""
is_match, msg = self.compare_values(
"本期发生额",
je["BALANCE"],
balance["PERIODACTIVITY"]
)
logging.info(msg if is_match else f"!! 异常 !! {msg}")
def _validate_ending_balance(
self,
target: dict,
balance: dict
) -> None:
"""期末余额校验"""
is_match, msg = self.compare_values(
"期末余额",
target["期末余额"],
balance["ENDINGBALANCE"]
)
logging.info(msg if is_match else f"!! 异常 !! {msg}")
# --------------------------
# 主程序
# --------------------------
if __name__ == "__main__":
try:
# 用户输入处理
user_input = input(f"请输入文件路径(默认:{Config.TARGET_DATA_PATH}): ").strip()
Config.TARGET_DATA_PATH = user_input if user_input else Config.TARGET_DATA_PATH
# 数据校验对比表
base_file = input(f"请输入文件路径(默认:{Config.JE_DATA_PATH}): ").strip()
Config.JE_DATA_PATH = user_input if user_input else Config.JE_DATA_PATH
# 执行校验
ValidationEngine().run_validation()
except KeyboardInterrupt:
logging.warning("用户中断操作")
finally:
# 清理临时文件
if os.path.exists(Config.UTF8_TEMP_FILE):
os.remove(Config.UTF8_TEMP_FILE)
logging.info("临时文件已清理")
- 感谢 Deepseek!
- 感谢 腾讯元宝!
+++==========================================================================+++
以上内容仅为个人学习记录使用。
如有转载,请附有原文出处链接。
浙公网安备 33010602011771号