使用Pandas处理时间序列数据(对齐K线、重采样、滚动计算)

数据加载与预处理
import pandas as pd
import numpy as np

# Read the raw snapshot rows for the contract from the local CSV file.
data = pd.read_csv('D:/data/AG2406.csv')

# Parse the timestamp column so it can later serve as a DatetimeIndex.
data = data.assign(TradingTime=pd.to_datetime(data['TradingTime']))

# Per-row volume is the difference between consecutive TotalVolume values
# (presumably a cumulative counter -- TODO confirm). The first row has no
# predecessor, so it is differenced against 0 and keeps its TotalVolume.
# NOTE(review): if the file does not start at the session open, the first
# row's Volume will include everything traded before it -- verify.
# Every row carries one price, so LastPrice seeds Open/High/Low alike; the
# real OHLC values only emerge once rows are aggregated into bars.
data = data.assign(
    Volume=data['TotalVolume'] - data['TotalVolume'].shift(1).fillna(0),
    Open=data['LastPrice'],
    High=data['LastPrice'],
    Low=data['LastPrice'],
)

# LastPrice becomes the Close column; index the frame by timestamp.
data = data.rename(columns={'LastPrice': 'Close'})
df_seseco = data.set_index('TradingTime')
# Presumably the column list of the source CSV, kept for reference:
# [Symbol, Exchange, TradingDate, TradingTime, OpenPrice, LastPrice, HighPrice, LowPrice, BuyPrice01, BuyPrice02, BuyPrice03, BuyPrice04, BuyPrice05, SellPrice01, SellPrice02, SellPrice03, SellPrice04, SellPrice05, BuyVolume01, BuyVolume02, BuyVolume03, BuyVolume04, BuyVolume05, SellVolume01, SellVolume02, SellVolume03, SellVolume04, SellVolume05, PriceUpLimit, PriceDownLimit, TotalVolume, PreTotalPosition, TotalPosition, Turnover, SettlePrice, PreSettlePrice, ClosePrice, PreClosePrice]
 
# Collapse the time-indexed rows into 1-minute OHLCV bars: first/max/min/last
# of the price columns and the summed volume of each minute.
df_1min = df_seseco.resample('1min').agg(
    {'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last', 'Volume': 'sum'}
)

# Minutes without any rows come back with NaN prices; drop them so only
# populated bars remain.
df_1min.dropna(inplace=True)

# Roll the 1-minute bars up into 5-minute bars with the same aggregation.
# The '5min' spelling is used instead of '5T': the 'T' alias is deprecated
# since pandas 2.2, and '5min' matches the '1min' spelling used above.
df_5min = df_1min.resample('5min').agg(
    {'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last', 'Volume': 'sum'}
)
df_5min.dropna(inplace=True)


重采样

# Resample the 1-minute bars into 15-minute K-line bars.
# '15min' replaces the 'T' offset alias, which is deprecated since pandas 2.2.
resampled = df_1min.resample('15min').agg(
    {'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last', 'Volume': 'sum'}
)

# Forward-fill missing values, but only across at most 3 consecutive missing
# bars. DataFrame.ffill is the supported replacement for the deprecated
# fillna(method='ffill') form and fills identically.
resampled = resampled.ffill(limit=3)

绘制K线图

import matplotlib.pyplot as plt
import mplfinance as mpf
import matplotlib.dates as mdates


# Candlestick chart of the 1-minute bars with a volume panel underneath.
# show_nontrading=False asks mplfinance not to show non-trading periods.
mpf.plot(
    df_1min,
    type='candle',
    style='charles',
    title='1分钟K线图',
    ylabel='价格',
    ylabel_lower='成交量',
    volume=True,
    show_nontrading=False,
)

# Same chart layout for the 5-minute bars.
mpf.plot(
    df_5min,
    type='candle',
    style='charles',
    title='5分钟K线图',
    ylabel='价格',
    ylabel_lower='成交量',
    volume=True,
    show_nontrading=False,
)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mplfinance as mpf
import matplotlib.dates as mdates

# Configure a font with Chinese glyphs so the Chinese labels render.
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei (bold sans) for Chinese text
plt.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box

# Draw the candlestick chart on an axes created here (mplfinance
# "external axes" mode), which lets this script control the figure size.
fig, ax = plt.subplots(figsize=(14, 7))

# With show_nontrading left at its default (False), mplfinance plots bars
# against integer positions rather than real dates -- that is how it removes
# the gaps for non-trading hours. A matplotlib.dates locator/formatter applied
# to this axis would therefore mislabel the ticks, so the tick label format
# and rotation are passed to mplfinance itself.
mpf.plot(
    df_1min,
    type='candle',
    ax=ax,
    style='charles',
    ylabel='价格',
    datetime_format='%H:%M',  # label ticks as hour:minute
    xrotation=45,  # rotate labels for readability
)

# The figure is owned by this script, so set its title directly.
fig.suptitle('K线图')

# Display the plot
plt.show()
# Time-series alignment: merge bars of different frequencies.
# merge_asof pairs every 1-minute row with the latest 5-minute row whose
# TradingTime is at or before it (direction defaults to 'backward'); columns
# present in both frames get the '_1min' / '_5min' suffixes.
# NOTE(review): the 5-minute bars come from resample() with default labelling,
# so a bar stamped at its window start may carry values from later minutes of
# that window -- confirm this look-ahead is acceptable for the intended use.
aligned_df = pd.merge_asof(
    left=df_1min.reset_index(),
    right=df_5min.reset_index(),
    on='TradingTime',
    suffixes=('_1min', '_5min'),
)
print(aligned_df)

滚动特征计算

# Simple moving average (equal weights) of volume over 26 bars.
# Uses df_1min, the 1-minute bar frame built earlier; the bare name `df`
# is not defined anywhere in this script.
df_1min['Volume'].rolling(26).mean()

# Exponentially weighted average (decaying weights) with span 26.
df_1min['Volume'].ewm(span=26).mean()

# Dual moving averages of the close price (20-bar and 60-bar windows).
df_1min['MA20'] = df_1min['Close'].rolling(window=20).mean()
df_1min['MA60'] = df_1min['Close'].rolling(window=60).mean()

# Hourly volatility from a one-hour (60-bar) rolling window.
# The standard deviation is taken over 1-minute returns, not price levels:
# square-root-of-time scaling only applies to returns, so std(returns) *
# sqrt(60) turns per-minute volatility into an hourly figure. This value is
# hourly, not annualized.
minute_returns = df_1min['Close'].pct_change()
df_1min['hourly_volatility'] = minute_returns.rolling(window=60).std() * np.sqrt(60)

# Exponentially weighted average of volume (span 26). adjust=False selects
# the recursive form of the EWM calculation.
df_1min['volume_EMA'] = (
    df_1min['Volume']
    .ewm(span=26, adjust=False)
    .mean()
)

时间序列可视化验证

import matplotlib.pyplot as plt

# Four stacked panels for eyeballing the derived series against the prices.
fix, ax = plt.subplots(nrows=4, ncols=1, figsize=(16, 12))

# Panel 0: close price with both moving averages.
df_1min.loc[:, ['Close', 'MA20', 'MA60']].plot(
    title='price & moving averages',
    ax=ax[0],
)
# Panel 1: rolling volatility column.
df_1min['hourly_volatility'].plot(title='hourly volatility', ax=ax[1])
# Panel 2: exponentially weighted volume.
df_1min['volume_EMA'].plot(title='volume EMA', ax=ax[2])
# Panel 3: 5-minute close for comparison with the 1-minute series.
df_5min['Close'].plot(title='price 5min', ax=ax[3])

plt.tight_layout()
plt.show()
异常值检测
# Detect abnormal moves with Bollinger Bands: a 20-bar rolling mean of the
# close, plus/minus two rolling standard deviations.
rolling_mean = df_1min['Close'].rolling(window=20).mean()
rolling_std = df_1min['Close'].rolling(window=20).std()
df_1min['Upper'] = rolling_mean + rolling_std * 2
df_1min['Lower'] = rolling_mean - rolling_std * 2

# Flag bars whose close breaks out of the band: 1 = outside, 0 = inside.
# Bars where the bands are still NaN (window not yet full) compare False
# and are therefore flagged 0.
df_1min['anomaly'] = np.where(
    df_1min['Close'].gt(df_1min['Upper']) | df_1min['Close'].lt(df_1min['Lower']),
    1,
    0,
)

df_1min['anomaly']

TradingTime

2024-04-18 20:59:00 0

2024-04-18 21:00:00 0

2024-04-18 21:01:00 0

2024-04-18 21:02:00 0

2024-04-18 21:03:00 0

..

2024-04-19 14:57:00 1

2024-04-19 14:58:00 1

2024-04-19 14:59:00 1

2024-04-19 15:00:00 0

2024-04-19 15:20:00 0

Name: anomaly, Length: 562, dtype: int32

 

数据验证步骤

# Check the time index of the 1-minute bars.
# Empty minutes were dropped earlier, so breaks in the data leave gaps longer
# than one minute. Requiring *every* gap to be one minute would fail on such
# data; instead require a sorted, duplicate-free index whose most common gap
# is one minute, and print the gap distribution for inspection.
gap_counts = df_1min.index.to_series().diff().value_counts()
assert df_1min.index.is_monotonic_increasing, 'bar timestamps are not sorted'
assert df_1min.index.is_unique, 'duplicate bar timestamps found'
# value_counts() sorts by frequency, so the first index entry is the modal gap.
assert gap_counts.empty or gap_counts.index[0] == pd.Timedelta(minutes=1), \
    'most common gap between bars is not 1 minute'

# Check OHLC consistency on every populated 15-minute bar rather than on a
# fixed time window, which silently checks nothing when the window lies
# outside the data. Rows that are still NaN after the limited forward fill
# are excluded, since comparisons against NaN are always False.
sample = resampled.dropna(subset=['Open', 'High', 'Low', 'Close'])
assert (sample['High'] >= sample[['Open', 'Close']].max(axis=1)).all()
assert (sample['Low'] <= sample[['Open', 'Close']].min(axis=1)).all()

print(gap_counts)

len(df_1min)

TradingTime

0 days 00:01:00 557

0 days 06:29:00 1

0 days 00:15:00 1

0 days 02:00:00 1

0 days 00:20:00 1

Name: count, dtype: int64

 

posted @ 2025-03-31 22:09  vhao11  阅读(113)  评论(0)    收藏  举报