使用Pandas处理时间序列数据(对齐K线、重采样、滚动计算)
数据模拟和加载
import pandas as pd
import numpy as np

# Read the raw tick snapshots and parse the timestamp column.
data = pd.read_csv('D:/data/AG2406.csv')
data['TradingTime'] = pd.to_datetime(data['TradingTime'])

# TotalVolume is treated as a running total: the per-row volume is the
# difference to the previous row (the first row is compared against 0).
previous_total = data['TotalVolume'].shift(1).fillna(0)
data['Volume'] = data['TotalVolume'] - previous_total

# Seed the OHLC columns from the last traded price so that a later resample
# can aggregate them with first/max/min/last.
for column in ('Open', 'High', 'Low'):
    data[column] = data['LastPrice']
data = data.rename(columns={'LastPrice': 'Close'})

df_seseco = data.set_index('TradingTime')

# Columns of the source CSV, as listed by the author:
# [Symbol, Exchange, TradingDate, TradingTime, OpenPrice, LastPrice, HighPrice,
#  LowPrice, BuyPrice01, BuyPrice02, BuyPrice03, BuyPrice04, BuyPrice05,
#  SellPrice01, SellPrice02, SellPrice03, SellPrice04, SellPrice05,
#  BuyVolume01, BuyVolume02, BuyVolume03, BuyVolume04, BuyVolume05,
#  SellVolume01, SellVolume02, SellVolume03, SellVolume04, SellVolume05,
#  PriceUpLimit, PriceDownLimit, TotalVolume, PreTotalPosition, TotalPosition,
#  Turnover, SettlePrice, PreSettlePrice, ClosePrice, PreClosePrice]
# Aggregation rules that turn finer-grained rows into OHLCV bars.
ohlcv_rules = {
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum',
}

# Build 1-minute bars from the tick frame; bins without any ticks produce NaN
# prices and are dropped.
df_1min = df_seseco.resample('1min').agg(ohlcv_rules)
df_1min.dropna(inplace=True)
# df_1min.head(100)

# Build 5-minute bars from the 1-minute bars. The 'min' alias is used instead
# of 'T', which newer pandas releases deprecate, and matches the '1min' above.
df_5min = df_1min.resample('5min').agg(ohlcv_rules)
df_5min.dropna(inplace=True)
重采样
# Resample the 1-minute bars into 15-minute K-lines. The 'min' alias replaces
# the deprecated 'T' alias.
resampled = df_1min.resample('15min').agg({
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum',
})

# Forward-fill missing values, covering at most 3 consecutive missing bars.
# DataFrame.ffill replaces the deprecated fillna(method='ffill') spelling.
resampled = resampled.ffill(limit=3)
绘制K线图
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mplfinance as mpf
import matplotlib.dates as mdates

# Configure a CJK-capable font before any chart is drawn, so the Chinese
# titles and labels of every figure below can render.
# NOTE(review): an mplfinance style may override rcParams fonts - confirm the
# glyphs actually render with style='charles'.
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei provides Chinese glyphs
plt.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box

# Candlestick + volume charts for the 1-minute and 5-minute bars.
mpf.plot(df_1min, type='candle', style='charles', title='1分钟K线图',
         ylabel='价格', ylabel_lower='成交量', show_nontrading=False,
         volume=True)
mpf.plot(df_5min, type='candle', style='charles', title='5分钟K线图',
         ylabel='价格', ylabel_lower='成交量', volume=True,
         show_nontrading=False)

# Candlestick chart drawn on a caller-supplied, larger axes.
fig, ax = plt.subplots(figsize=(14, 7))

# With show_nontrading left at its default (False) mplfinance positions the
# candles by integer row number, not by matplotlib date values. Date locators
# and formatters from matplotlib.dates therefore mislabel the axis; the tick
# label format and rotation are passed to mplfinance instead.
mpf.plot(df_1min, type='candle', ax=ax, style='charles', ylabel='价格',
         title='K线图', datetime_format='%H:%M', xrotation=45)

# Display the plot.
plt.show()
# 合并不同频率数据 时间序列对齐
# Merge data of different frequencies: every 1-minute row is paired with the
# most recent 5-minute row whose TradingTime is not later than its own.
# NOTE(review): resample labels bars by their left edge by default, so a
# 1-minute row may be paired with a 5-minute bar that was still forming at
# that time - confirm this is acceptable for the intended use.
minute_bars = df_1min.reset_index()
five_minute_bars = df_5min.reset_index()
aligned_df = pd.merge_asof(
    minute_bars,
    five_minute_bars,
    on='TradingTime',
    suffixes=('_1min', '_5min'),
)
print(aligned_df)
滚动特征计算
# Simple moving average (equal weights) over the last 26 bars.
# The 1-minute frame is used because no frame named `df` is defined here.
df_1min['Volume'].rolling(26).mean()
# Exponentially weighted average (decaying weights) with a span of 26 bars.
df_1min['Volume'].ewm(span=26).mean()
# Dual moving averages of the close price (20-bar and 60-bar windows).
df_1min['MA20'] = df_1min['Close'].rolling(window=20).mean()
df_1min['MA60'] = df_1min['Close'].rolling(window=60).mean()

# Hourly volatility from a 60-bar window. Square-root-of-time scaling only
# holds for returns, so the standard deviation is taken over 1-minute returns
# rather than over price levels. The result is hourly, not annualized.
# NOTE(review): a return that spans a gap between non-adjacent bars is treated
# like any other 1-minute return.
minute_returns = df_1min['Close'].pct_change(fill_method=None)
df_1min['hourly_volatility'] = (
    minute_returns.rolling(window=60).std() * np.sqrt(60)
)

# Exponentially weighted (decaying) average of the traded volume.
df_1min['volume_EMA'] = (
    df_1min['Volume']
    .ewm(span=26, adjust=False)
    .mean()
)
时间序列可视化验证
import matplotlib.pyplot as plt

# Four stacked panels for eyeballing the derived series.
# (The figure handle was previously misspelled as `fix`.)
fig, ax = plt.subplots(4, 1, figsize=(16, 12))
df_1min[['Close', 'MA20', 'MA60']].plot(ax=ax[0], title='price & moving averages')
df_1min['hourly_volatility'].plot(ax=ax[1], title='hourly volatility')
df_1min['volume_EMA'].plot(ax=ax[2], title='volume EMA')
df_5min['Close'].plot(ax=ax[3], title='price 5min')
plt.tight_layout()
plt.show()
异常值检测
# Flag unusual moves with Bollinger bands: a 20-bar rolling mean plus/minus
# two rolling standard deviations of the close price.
close = df_1min['Close']
window = close.rolling(20)
rolling_mean = window.mean()
rolling_std = window.std()
df_1min['Upper'] = rolling_mean + 2 * rolling_std
df_1min['Lower'] = rolling_mean - 2 * rolling_std

# Mark bars whose close lies outside the band with 1, all others with 0.
above_band = df_1min['Close'] > df_1min['Upper']
below_band = df_1min['Close'] < df_1min['Lower']
df_1min['anomaly'] = np.where(above_band | below_band, 1, 0)
df_1min['anomaly']
TradingTime
2024-04-18 20:59:00 0
2024-04-18 21:00:00 0
2024-04-18 21:01:00 0
2024-04-18 21:02:00 0
2024-04-18 21:03:00 0
..
2024-04-19 14:57:00 1
2024-04-19 14:58:00 1
2024-04-19 14:59:00 1
2024-04-19 15:00:00 0
2024-04-19 15:20:00 0
Name: anomaly, Length: 562, dtype: int32
数据验证步骤
# Check the time axis. Bars for minutes without ticks were dropped, so the
# index legitimately contains gaps; requiring every spacing to be exactly one
# minute cannot hold. Instead require that timestamps strictly advance by at
# least one minute, which also rules out duplicates and ordering errors.
bar_spacing = df_1min.index.to_series().diff()
assert (bar_spacing.dropna() >= pd.Timedelta(minutes=1)).all()

# Check OHLC consistency on every populated 15-minute bar rather than on a
# fixed time window, which would pass vacuously if it contained no rows.
# Rows still NaN after the limited forward fill are skipped, because NaN
# comparisons are always False.
sample = resampled.dropna(subset=['Open', 'High', 'Low', 'Close'])
assert (sample['High'] >= sample[['Open', 'Close']].max(axis=1)).all()
assert (sample['Low'] <= sample[['Open', 'Close']].min(axis=1)).all()

# Show the distribution of bar spacings and the number of 1-minute bars.
print(bar_spacing.value_counts())
len(df_1min)
TradingTime
0 days 00:01:00 557
0 days 06:29:00 1
0 days 00:15:00 1
0 days 02:00:00 1
0 days 00:20:00 1
Name: count, dtype: int64

浙公网安备 33010602011771号