python 样条插值法补全缺失值
我的excel里面有时间列、站点列和MI列,是2002至2022年5-9月的数据,这个只能插值5-9的数据,再多月份不可以,想补全MI的缺失值
采用样条插值法
import pandas as pd import numpy as np from scipy.interpolate import interp1d from itertools import product def spline_interpolation_MI(input_file, output_file=None): """ 使用样条插值法补全MI缺失数据,并将结果写入原文件的新列 参数: input_file: 输入Excel文件路径 output_file: 输出文件路径(可选),如果为None则覆盖原文件 """ # 读取Excel文件 df = pd.read_excel(input_file) # 创建原始数据的副本用于标记插值点 df['MI_interpolated'] = df['MI'].copy() df['is_interpolated'] = False # 标记是否为插值点 # 将time列转换为年份和月份 df['year'] = df['time'].apply(lambda x: int(x.split('-')[0])) df['month'] = df['time'].apply(lambda x: int(x.split('-')[1])) # 获取所有站点 all_stations = df['station'].unique() # 对每个站点进行样条插值 for station in all_stations: # 获取当前站点的数据 station_data = df[df['station'] == station].copy() # 创建时间序号(从0开始的连续整数) station_data['time_num'] = (station_data['year'] - 2002) * 5 + (station_data['month'] - 5) # 分离有数据和无数据的点 known = station_data.dropna(subset=['MI']) unknown = station_data[station_data['MI'].isna()] if len(known) >= 2: # 至少有2个点才能插值 # 创建样条插值函数 spline = interp1d( known['time_num'], known['MI'], kind='cubic', fill_value='extrapolate' ) # 对缺失值进行插值 interpolated_values = spline(unknown['time_num']) # 更新原DataFrame中的插值结果 for idx, row in unknown.iterrows(): original_idx = row.name # 获取原始索引 df.at[original_idx, 'MI_interpolated'] = interpolated_values[unknown['time_num'] == row['time_num']][0] df.at[original_idx, 'is_interpolated'] = True # 清理临时列 df.drop(columns=['year', 'month', 'time_num'], errors='ignore', inplace=True) # 确定输出路径 if output_file is None: output_file = input_file # 保存结果 df.to_excel(output_file, index=False) print(f"插值完成,结果已保存到 {output_file}") print(f"插值点数量: {df['is_interpolated'].sum()}") # 使用示例 input_excel = "/root/suyue/dry/output_RH_interpolated.xlsx" # 替换为你的输入文件路径 spline_interpolation_MI(input_excel) # 不指定output_file将覆盖原文件 # 如果想保留原文件,可以指定输出路径 # spline_interpolation_MI(input_excel, "output_with_interpolation.xlsx")
插值1-12月的数据

import pandas as pd import numpy as np from scipy.interpolate import interp1d def spline_interpolation_RH(input_file, output_file=None): """ 使用样条插值法补全RH缺失数据,并将结果写入原文件的新列 参数: input_file: 输入Excel文件路径 output_file: 输出文件路径(可选),如果为None则覆盖原文件 """ # 读取Excel文件 df = pd.read_excel(input_file) # 创建原始数据的副本用于标记插值点 df['RH_interpolated'] = df['RH'].copy() df['is_interpolated'] = False # 标记是否为插值点 # 将time列转换为年份和月份 df['year'] = df['time'].apply(lambda x: int(x.split('-')[0])) df['month'] = df['time'].apply(lambda x: int(x.split('-')[1])) # 获取所有站点 all_stations = df['station'].unique() # 对每个站点进行插值 for station in all_stations: # 获取当前站点的数据 station_data = df[df['station'] == station].copy() # 创建时间序号(2002-01=0, 2002-02=1, ..., 2022-12=251) station_data['time_num'] = (station_data['year'] - 2002) * 12 + (station_data['month'] - 1) # 检查是否有重复的 time_num(同一站点同一时间的数据) if station_data.duplicated(subset=['time_num']).any(): print(f"警告:站点 {station} 存在重复时间数据,将取平均值处理") # 对重复时间点的 RH 取平均值 station_data = station_data.groupby('time_num').agg({ 'RH': 'mean', 'year': 'first', 'month': 'first', 'station': 'first', 'time': 'first', 'RH_interpolated': 'first', 'is_interpolated': 'first' }).reset_index() # 分离有数据和无数据的点 known = station_data.dropna(subset=['RH']) unknown = station_data[station_data['RH'].isna()] if len(known) >= 4: # 样条插值至少需要4个点 try: # 尝试样条插值 spline = interp1d( known['time_num'], known['RH'], kind='cubic', fill_value='extrapolate' ) interpolated_values = spline(unknown['time_num']) except: print(f"站点 {station} 样条插值失败,改用线性插值") # 如果失败,改用线性插值 linear = interp1d( known['time_num'], known['RH'], kind='linear', fill_value='extrapolate' ) interpolated_values = linear(unknown['time_num']) elif len(known) >= 2: # 至少2个点才能线性插值 linear = interp1d( known['time_num'], known['RH'], kind='linear', fill_value='extrapolate' ) interpolated_values = linear(unknown['time_num']) else: print(f"站点 {station} 数据点不足,无法插值") continue # 更新原DataFrame中的插值结果(修正索引问题) for i, (idx, row) in enumerate(unknown.iterrows()): mask = (df['station'] == station) & (df['time'] == row['time']) df.loc[mask, 'RH_interpolated'] = interpolated_values[i] df.loc[mask, 'is_interpolated'] = True # 清理临时列 df.drop(columns=['year', 'month', 'time_num'], errors='ignore', inplace=True) # 确定输出路径 if output_file is None: output_file = input_file # 保存结果 df.to_excel(output_file, index=False) print(f"插值完成,结果已保存到 {output_file}") print(f"插值点数量: {df['is_interpolated'].sum()}") # 使用示例 input_excel = "F:/lianxi/eerduosi_month.xlsx" # 替换为你的输入文件路径 spline_interpolation_RH(input_excel) # 不指定output_file将覆盖原文件

浙公网安备 33010602011771号