pandas常用函数

 from functools import reduce

df=reduce(lambda left,right:pd.merge(left,right,on='date'),dfs) ## merge multiple DataFrames pairwise on the 'date' column

def func(x1,x2):
    """Return the product and the sum of ``x1`` and ``x2`` as a 2-tuple."""
    return x1*x2,x1+x2

df[['y1', 'y2']] = df.apply(lambda row: func(row['x1'],row['x2']),axis=1, result_type='expand') ## create two new columns from a single row-wise apply

# Append the 'category' column to the existing index as an additional MultiIndex level
df.set_index('category', append=True, inplace=True)

group = xx.groupby(xx['date'].dt.date) ## group rows by calendar day (the original note mentions 96 points per day)
for date,df in group:
    # each key is a datetime.date; store it formatted as YYYY-MM-DD
    dic['date']=date.strftime('%Y-%m-%d')

## keep only the rows where column `col` is null, then collect their index labels
temp = temp[temp[col].isnull()]
abnormal_index = temp.index.tolist()

## read a CSV using the first column as the index and parsing 'date' as datetimes
df=pd.read_csv(path,index_col=[0],parse_dates=['date'])

## select the rows with 2022-05-01 <= date < 2022-06-01
testset=df.loc[(df['date'] >= pd.to_datetime('2022-5-1')) & (df['date'] < pd.to_datetime('2022-6-1'))]

## build the target column by shifting 'load' by y_shift rows
self.data['target']=self.data['load'].shift(y_shift)

len(df[df['load'].isnull()].index)   ## number of missing values in 'load'

# Compute the difference between consecutive timestamps in the time index
time_diff = data.index.to_series().diff().dropna()
# Flag the differences that are not equal to the expected interval (15 minutes in the original note)
not_interval = time_diff != pd.Timedelta(minutes=interval)

## NOTE(review): this writes into the array returned by index.values; whether the index itself changes may depend on the pandas version — confirm
df.index.values[1] = df.index.values[1] + pd.Timedelta(minutes=2) ## modify an index value

df.isnull().sum() ## number of missing values per column

df.loc[df['load']<0,'load']=0 ## step 1: set negative load values to 0 (single .loc assignment instead of chained indexing)

df.interpolate(method='spline', order=num_of_order) ## spline interpolation; the polynomial order is num_of_order

df2 = df.reset_index(drop=True) ## replace the index with a default one and discard the old index

df.rename(columns={'ws50':'ws50_nwp'}, inplace=True) ## rename a column in place

df.loc[abnormal_index,"abnormal_flag"] = 1 ## set the flag column for the given index labels

df.loc[:, col].diff() ## difference between consecutive rows of `col`

df["diff"].rolling(window=window - 1).sum() ## rolling sum over window - 1 rows

df.drop(["diff", "roll"], axis=1, inplace=True) ## drop the helper columns in place

temp.interpolate(method="linear",axis=0,limit_direction ='both',inplace=True)   ## axis=0 fills down each column

df_res[cols]=df_res[cols].interpolate(method="linear",axis=0)  ## interpolate only the selected columns

df['real_speed30_class'] = pd.qcut(df['real_speed30'], q=10, labels=False) ## split into 10 quantile bins and label each row with its bin number

# fillna(method=...) is deprecated in recent pandas; ffill/bfill are the direct equivalents
temp.ffill(axis=0,inplace=True)   ## forward-fill missing values down each column
temp.bfill(axis=0,inplace=True)   ## back-fill whatever is still missing at the top

start_date=pd.to_datetime('2020/1/1',format='%Y/%m/%d') ## the format string must match the '/' separators of the input

xx=df['ds'].dt.strftime('%Y-%m-%d') ## format the datetime column as YYYY-MM-DD strings
time_delta=pd.Timedelta(5,unit='d') ## a duration of 5 days

self.suntime["sunset_after"] = self.suntime["Sunset"].apply(lambda x:pd.to_datetime(x).round('15min'))  ## snap each time to the nearest 15-minute mark ('15min' replaces the deprecated '15T' alias)

## NOTE(review): the original line read `last_time+,` with the right-hand operand missing; one 15-minute step is assumed here — confirm
time=pd.date_range(last_time+pd.Timedelta(minutes=15),'2024/12/31 23:45',freq='15min')

start_date.strftime('%Y-%m-%d %H:%M:%S') ## format a timestamp as a string

output=output.iloc[[i for i in range(2,len(output),3)]] ## keep every third row starting at position 2 (5-minute data to 15-minute data)

## NOTE(review): pd.merge needs `time` to be a DataFrame or named Series with a 'date' column — confirm
df=pd.merge(df,time,on='date',how='right')
## index label of the first 00:00 row and of the last 23:45 row
start=df[(df['date'].dt.hour==0) & (df['date'].dt.minute==0)].index[0]
end=df[(df['date'].dt.hour==23) & (df['date'].dt.minute==45)].index[-1]
## NOTE(review): the labels are used as positions here, which presumably assumes a default 0..n-1 index — confirm
df=df.iloc[start:end+1:3]

df_list=[df.iloc[i*288:(i+1)*288] for i in range(int(len(df)/288))] ## split into consecutive chunks of 288 rows; a trailing partial chunk is dropped

pd.infer_freq(trainset['ds'].tail(5))  ## infer the time interval from the last 5 timestamps

a_b = a.pct_change(periods=2,fill_method='pad') ## growth rate relative to the value 2 rows earlier

df2=df2.sample(frac=1) ## shuffle rows: frac is the sampling fraction, 1 means all rows

nan_index = df[(df[col] > upper) | (df[col] < lower)].index ## index labels of the rows outside [lower, upper]
df.loc[nan_index, col] = np.nan ## assign NaN by index label

def timeStamp(timeNum):
    """Convert a millisecond Unix timestamp to a local-time string.

    Args:
        timeNum: Timestamp in milliseconds since the epoch.

    Returns:
        The timestamp formatted as ``'%Y-%m-%d %H:%M:%S'`` in local time.
        The value is returned (not printed) so the function can be used
        with ``Series.apply`` to build a new column.
    """
    # Imported locally because the module-level name `time` is rebound to
    # other objects elsewhere in this file.
    import time

    seconds = float(timeNum/1000)
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))
df['date'] = df['timestamp'].apply(timeStamp)   ## convert the timestamp column to date strings

def _fun(row, flag=None):
    """Compute a per-row accuracy score from several columns of one row.

    Intended for row-wise use via ``df.apply(_fun, axis=1)``.

    Args:
        row: Mapping or Series providing ``'gt'`` and ``'pred'``.
        flag: Optional extra argument forwarded by ``DataFrame.apply``;
            accepted so such calls do not fail, currently unused.

    Returns:
        ``1 - sqrt((gt - pred)**2 / scale**2)`` where ``scale`` is
        ``0.2*Capacity`` when ``gt < 0.2*Capacity`` and ``Capacity``
        otherwise. ``Capacity`` is read from the enclosing module scope.
    """
    # Rows with a small 'gt' are normalised by 20% of capacity instead of full capacity.
    scale = 0.2*Capacity if row['gt']<0.2*Capacity else Capacity
    return 1-np.sqrt((row['gt'] - row['pred'])**2/scale**2)
df['acc']=df.apply(_fun,flag=flag,axis=1)  ## apply forwards extra keyword arguments (here flag) to _fun

 

# Resample to 15-minute intervals without filling missing values
# ('15min' replaces the deprecated '15T' alias)
data = data.resample('15min', closed='right').asfreq()
# Build a new time index that starts at 00:00 of the first day and ends at 23:45 of the last day
new_index = pd.date_range(start=data.index[0].replace(hour=0, minute=0),
                          end=data.index[-1].replace(hour=23, minute=45),
                          freq='15min')

# Reindex onto the full-day grid; timestamps absent from the data become NaN rows
data = data.reindex(new_index)

posted @ 2022-03-25 12:47  笨笨和呆呆  阅读(108)  评论(0)    收藏  举报