如何编写自动抓取基金公告或净值文件的Python代码示例
我来提供一个实用的Python代码框架,用于自动抓取基金公告和净值文件,以满足LOF套利策略的数据需求。
📋 完整抓取方案框架
1. 抓取基金官方净值文件
以下是抓取国投瑞银基金净值的示例代码:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time
import json
class FundDataFetcher:
    """Fetch net asset value (NAV) records for a single fund.

    Records are read from the Eastmoney historical-NAV endpoint. The parsing
    code expects a JSON payload shaped like
    ``{"Data": {"LSJZList": [{"FSRQ": ..., "DWJZ": ..., "LJJZ": ...,
    "JZZZL": ...}]}}`` where ``FSRQ`` is mapped to the record date, ``DWJZ``
    to the unit NAV and ``LJJZ`` to the accumulated NAV.
    NOTE(review): the endpoint is a third-party API; re-verify the payload
    shape if it changes.
    """

    NAV_API_URL = "http://api.fund.eastmoney.com/f10/lsjz"
    NAV_API_REFERER = "http://fundf10.eastmoney.com/"
    REQUEST_TIMEOUT = 10  # seconds

    def __init__(self, fund_code="161226"):
        """Remember the fund code and the browser-like headers sent with requests."""
        self.fund_code = fund_code
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    @staticmethod
    def _to_float(value):
        """Return ``value`` as a float, or None when it is blank or not numeric."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    @classmethod
    def _parse_nav_record(cls, record):
        """Convert one raw API record into ``{'date', 'nav', 'accumulated_nav'}``.

        Returns None when the record has no usable unit NAV. A blank
        accumulated NAV is reported as None instead of failing the record.
        """
        if not isinstance(record, dict):
            return None
        nav = cls._to_float(record.get('DWJZ'))
        if nav is None:
            return None
        return {
            'date': record.get('FSRQ'),
            'nav': nav,
            'accumulated_nav': cls._to_float(record.get('LJJZ')),
        }

    def _request_nav_records(self, page_size):
        """Request the first page of NAV records and return the raw record list.

        Raises ``requests.RequestException`` on transport/HTTP errors and
        ``ValueError`` when the body is not valid JSON. An unexpected payload
        shape yields an empty list.
        """
        response = requests.get(
            self.NAV_API_URL,
            params={
                'fundCode': self.fund_code,
                'pageIndex': 1,
                'pageSize': page_size,
            },
            headers={**self.headers, 'Referer': self.NAV_API_REFERER},
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        payload = response.json()
        data = payload.get('Data') if isinstance(payload, dict) else None
        records = data.get('LSJZList') if isinstance(data, dict) else None
        return records if isinstance(records, list) else []

    def fetch_official_nav(self, date_str=None):
        """Return the most recent NAV record, or None when it cannot be obtained.

        Args:
            date_str: Accepted for backward compatibility only; the lookup
                always returns the first record of the endpoint's first page.

        Returns:
            A dict with ``date``, ``nav`` (float) and ``accumulated_nav``
            (float or None), or None on any failure.
        """
        # The earlier version also requested a cninfo page and discarded the
        # response; that call could only add latency or make this lookup fail,
        # so it has been removed.
        try:
            records = self._request_nav_records(page_size=1)
        except (requests.RequestException, ValueError) as e:
            print(f"获取净值失败: {e}")
            return None
        if not records:
            return None
        parsed = self._parse_nav_record(records[0])
        if parsed is None:
            print(f"获取净值失败: 无效的净值记录 {records[0]!r}")
        return parsed

    def fetch_nav_history(self, days=30):
        """Return recent NAV history as a DataFrame with date/nav/change columns.

        Args:
            days: Passed to the endpoint as ``pageSize``, i.e. the number of
                records requested rather than a calendar-day window.

        Returns:
            A ``pandas.DataFrame`` with columns ``date``, ``nav`` and
            ``change``; empty (but with those columns) on failure. Records
            without a usable unit NAV are skipped instead of discarding the
            whole history.
        """
        columns = ['date', 'nav', 'change']
        try:
            records = self._request_nav_records(page_size=days)
        except (requests.RequestException, ValueError) as e:
            print(f"获取历史净值失败: {e}")
            return pd.DataFrame(columns=columns)

        rows = []
        for item in records:
            parsed = self._parse_nav_record(item)
            if parsed is None:
                continue
            rows.append({
                'date': parsed['date'],
                'nav': parsed['nav'],
                'change': item.get('JZZZL', '0%'),
            })
        return pd.DataFrame(rows, columns=columns)
2. 抓取基金公告与申购状态
class FundAnnouncementFetcher:
    """Fetch fund announcements and derive the current subscription status.

    NOTE(review): the query path, parameter names and response fields
    (``announcements``, ``title``, ``date``, ``id``, ``content``) are kept as
    written in the original code; confirm them against the site's real API.
    """

    # Title keywords that mark an announcement as relevant to subscription
    # and redemption. '限额' is included because the status check looks for it.
    KEY_TERMS = ('申购', '赎回', '限购', '限额', '额度', '暂停', '恢复')

    def __init__(self, fund_code="161226"):
        """Remember the fund code and the base URL used for announcement queries."""
        self.fund_code = fund_code
        self.base_url = "http://www.cninfo.com.cn/new/disclosure"

    def fetch_latest_announcements(self, limit=10):
        """Return announcements whose title mentions a subscription-related keyword.

        Args:
            limit: Page size sent with the query.

        Returns:
            A list of dicts with ``title``, ``date``, ``url`` and
            ``content_preview`` (at most 200 characters). Empty on a non-200
            response, a request failure, or an unexpected payload.
        """
        params = {
            'fundCode': self.fund_code,
            'pageNum': 1,
            'pageSize': limit,
            'category': '基金公告'
        }
        try:
            response = requests.get(
                f"{self.base_url}/query",
                params=params,
                headers={'User-Agent': 'Mozilla/5.0'},
                timeout=15
            )
            if response.status_code != 200:
                return []
            payload = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"获取公告失败: {e}")
            return []

        items = payload.get('announcements') if isinstance(payload, dict) else None
        announcements = []
        for item in items or []:
            if not isinstance(item, dict):
                continue
            # A null title/content must not abort the whole batch.
            title = str(item.get('title') or '')
            if not any(term in title for term in self.KEY_TERMS):
                continue
            announcements.append({
                'title': title,
                'date': item.get('date'),
                'url': f"{self.base_url}/download/{item.get('id')}",
                'content_preview': str(item.get('content') or '')[:200]
            })
        return announcements

    @staticmethod
    def _parse_limit_amount(text):
        """Extract the first amount written as ``<number>[万|亿]元`` from ``text``.

        Thousands separators are accepted, so ``1,000,000元`` yields 1000000
        and ``10万元`` yields 100000. Returns an int when the amount is whole,
        a float otherwise, and None when no amount is found.
        """
        import re

        match = re.search(r'(\d[\d,,]*(?:\.\d+)?)\s*([万亿]?)元', text or '')
        if match is None:
            return None
        digits = match.group(1).replace(',', '').replace(',', '')
        scale = {'万': 10000, '亿': 100000000}.get(match.group(2), 1)
        amount = round(float(digits) * scale, 2)
        return int(amount) if amount.is_integer() else amount

    def check_subscription_status(self):
        """Summarise whether the fund can be subscribed and any daily limit.

        Returns:
            A dict with ``can_subscribe`` (bool), ``daily_limit`` (number in
            yuan, or None), ``last_update`` (date of the deciding
            announcement, or None) and ``reason`` (human-readable text).
        """
        announcements = self.fetch_latest_announcements(limit=20)
        current_status = {
            'can_subscribe': True,
            'daily_limit': None,
            'last_update': None,
            'reason': '正常'
        }
        # Assumes announcements arrive newest-first — TODO confirm. The first
        # matching notice therefore wins for both the pause and the limit.
        # NOTE(review): "resume subscription" notices are not interpreted, so
        # an old pause notice still inside the window keeps the fund blocked.
        for ann in announcements:
            title = ann.get('title') or ''
            preview = ann.get('content_preview') or ''

            if '暂停申购' in title or '暂停申购' in preview:
                current_status['can_subscribe'] = False
                current_status['reason'] = '已暂停申购'
                current_status['last_update'] = ann.get('date')
                break

            if current_status['daily_limit'] is not None:
                continue
            if '限购' in title or '限额' in title:
                amount = self._parse_limit_amount(title + preview)
                if amount is not None:
                    # Stored as a number so callers can compare it numerically.
                    current_status['daily_limit'] = amount
                    current_status['reason'] = f"每日限购{current_status['daily_limit']}元"
                    current_status['last_update'] = ann.get('date')
        return current_status
3. 与QMT集成的数据服务模块
class QMTDataService:
    """Combine NAV and subscription-status lookups for use by a QMT strategy.

    Fetched data is cached as JSON in ``<fund_code>_nav_cache.json`` relative
    to the current working directory.
    """

    def __init__(self, fund_code="161226"):
        """Create the NAV and announcement fetchers and derive the cache file name."""
        self.fund_code = fund_code
        self.nav_fetcher = FundDataFetcher(fund_code)
        self.ann_fetcher = FundAnnouncementFetcher(fund_code)
        self.cache_file = f"{fund_code}_nav_cache.json"

    @staticmethod
    def _is_cache_fresh(cached, today):
        """Return True when ``cached`` was fetched today or carries today's NAV date.

        The NAV date alone is not a usable key: the latest published NAV is
        normally dated before today, which would make the cache miss on every
        call. The fetch timestamp is therefore accepted as well.
        """
        if not isinstance(cached, dict):
            return False
        if cached.get('date') == today:
            return True
        return str(cached.get('update_time') or '').startswith(today)

    def get_trading_nav_data(self):
        """Return today's NAV plus subscription status, using the cache when fresh.

        Returns:
            A dict holding the NAV fields, ``subscription_status``,
            ``update_time`` (ISO timestamp) and ``fund_code``; or None when
            the NAV could not be fetched.
        """
        today = datetime.now().strftime("%Y-%m-%d")
        cached_data = self._load_cached_data()
        if self._is_cache_fresh(cached_data, today):
            return cached_data

        nav_data = self.nav_fetcher.fetch_official_nav()
        if not nav_data:
            # The fetcher signals failure with None; unpacking it would raise.
            return None

        sub_status = self.ann_fetcher.check_subscription_status()
        trading_data = {
            **nav_data,
            'subscription_status': sub_status,
            'update_time': datetime.now().isoformat(),
            'fund_code': self.fund_code
        }
        self._save_to_cache(trading_data)
        return trading_data

    def _load_cached_data(self):
        """Return the cached dict, or None when the file is missing or unreadable."""
        try:
            with open(self.cache_file, 'r', encoding='utf-8') as f:
                cached = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError (corrupt or partial file).
            return None
        return cached if isinstance(cached, dict) else None

    def _save_to_cache(self, data):
        """Write ``data`` to the cache file; a failed write is reported, not raised."""
        try:
            with open(self.cache_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        except (OSError, TypeError, ValueError) as e:
            # Caching is an optimisation; it must not abort a trading decision.
            print(f"保存缓存失败: {e}")

    def should_trigger_arbitrage(self, market_price, total_cost_rate=0.006):
        """Decide whether the premium/discount against NAV justifies arbitrage.

        Args:
            market_price: Current exchange price of the fund.
            total_cost_rate: Round-trip cost threshold as a fraction
                (default 0.006, i.e. 0.6%).

        Returns:
            A dict whose ``should_trigger`` flag is always present. Triggered
            results carry ``direction`` (``premium_arb`` or ``discount_arb``)
            and ``premium_rate``; rejected results carry ``reason``.
        """
        data = self.get_trading_nav_data()
        if not data or 'nav' not in data:
            return {'should_trigger': False, 'reason': '净值数据缺失'}

        try:
            nav = float(data['nav'])
        except (TypeError, ValueError):
            nav = 0.0
        if nav <= 0:
            # Guards the division below against a zero or corrupt NAV.
            return {'should_trigger': False, 'reason': '净值数据异常'}

        premium_rate = (market_price - nav) / nav

        # Discount arbitrage buys on the exchange and redeems, so it does not
        # depend on whether subscription is open.
        if premium_rate < -total_cost_rate:
            return {
                'should_trigger': True,
                'direction': 'discount_arb',
                'premium_rate': premium_rate
            }

        sub_status = data.get('subscription_status') or {}
        if not sub_status.get('can_subscribe', False):
            reason = sub_status.get('reason', '申购状态未知')
            return {'should_trigger': False, 'reason': f"无法申购: {reason}"}

        if premium_rate > total_cost_rate:
            return {
                'should_trigger': True,
                'direction': 'premium_arb',
                'premium_rate': premium_rate,
                'nav': nav,
                'market_price': market_price,
                'daily_limit': sub_status.get('daily_limit')
            }
        return {'should_trigger': False, 'reason': '溢价率未达阈值'}
# 在QMT策略中的使用方式
def initialize(context):
    """Attach the data service and the arbitrage parameters to the strategy context."""
    service = QMTDataService(fund_code="161226")
    context.data_service = service

    # Arbitrage parameters: minimum premium rate (1.5%) and the polling
    # interval in seconds.
    context.min_premium = 0.015
    context.check_interval = 60
def handle_data(context, data):
    """Check the latest price against NAV and run the arbitrage when warranted.

    ``get_market_data``, ``log`` and ``execute_arbitrage`` are not defined in
    this section; presumably they come from the QMT runtime or elsewhere in
    the strategy — verify before deploying.

    Args:
        context: Strategy context carrying ``data_service`` and, optionally,
            ``min_premium``.
        data: Bar data passed in by the platform; not used here.
    """
    # Latest one-minute close of the fund on the exchange.
    market_price = get_market_data(
        ['close'],
        stock_code=['161226.SZ'],
        period='1m',
        count=1
    )['close'].iloc[-1]

    decision = context.data_service.should_trigger_arbitrage(market_price)
    if not decision['should_trigger']:
        return

    log.info(f"发现套利机会: {decision}")

    if decision.get('direction') == 'premium_arb':
        # Honour the minimum premium configured on the context; it was
        # previously set but never consulted.
        min_premium = getattr(context, 'min_premium', 0) or 0
        premium_rate = decision.get('premium_rate', 0)
        if premium_rate < min_premium:
            log.info(f"溢价率{premium_rate:.2%}低于最小溢价率{min_premium:.2%},跳过")
            return

        daily_limit = decision.get('daily_limit')
        if daily_limit:
            # The limit may arrive as text such as "1,000"; comparing a str
            # with an int raises TypeError, so normalise it first.
            try:
                limit_value = float(str(daily_limit).replace(',', ''))
            except (TypeError, ValueError):
                limit_value = None
            if limit_value is not None and limit_value < 1000:
                log.warning(f"申购限额过低: {daily_limit}元,可能无法有效套利")
                return

    # Hand over to the arbitrage routine defined elsewhere in the strategy.
    execute_arbitrage(context, decision)
4. 实际使用示例
⚠️ 关键注意事项
- 法律合规性
  - 检查目标网站的 robots.txt 文件,尊重爬取频率限制
  - 添加适当的请求间隔(如 time.sleep(1))
  - 仅用于个人研究,避免商业用途
- 数据源稳定性
  - 官方平台(如巨潮资讯)是最可靠的数据源
  - 东方财富等第三方API可能有频率限制
  - 实现多数据源备选机制
- 错误处理
  - 增加重试机制和超时设置
  - 实现数据验证(检查净值范围合理性)
  - 记录日志以便调试
- 性能优化
  - 缓存频繁访问的数据
  - 使用异步请求提高效率
  - 定期清理历史数据
🔄 完整工作流程
class QMTDataService:
    """Combine NAV and subscription-status lookups for use by a QMT strategy.

    Fetched data is cached as JSON in ``<fund_code>_nav_cache.json`` relative
    to the current working directory.
    """

    def __init__(self, fund_code="161226"):
        """Create the NAV and announcement fetchers and derive the cache file name."""
        self.fund_code = fund_code
        self.nav_fetcher = FundDataFetcher(fund_code)
        self.ann_fetcher = FundAnnouncementFetcher(fund_code)
        self.cache_file = f"{fund_code}_nav_cache.json"

    @staticmethod
    def _is_cache_fresh(cached, today):
        """Return True when ``cached`` was fetched today or carries today's NAV date.

        The NAV date alone is not a usable key: the latest published NAV is
        normally dated before today, which would make the cache miss on every
        call. The fetch timestamp is therefore accepted as well.
        """
        if not isinstance(cached, dict):
            return False
        if cached.get('date') == today:
            return True
        return str(cached.get('update_time') or '').startswith(today)

    def get_trading_nav_data(self):
        """Return today's NAV plus subscription status, using the cache when fresh.

        Returns:
            A dict holding the NAV fields, ``subscription_status``,
            ``update_time`` (ISO timestamp) and ``fund_code``; or None when
            the NAV could not be fetched.
        """
        today = datetime.now().strftime("%Y-%m-%d")
        cached_data = self._load_cached_data()
        if self._is_cache_fresh(cached_data, today):
            return cached_data

        nav_data = self.nav_fetcher.fetch_official_nav()
        if not nav_data:
            # The fetcher signals failure with None; unpacking it would raise.
            return None

        sub_status = self.ann_fetcher.check_subscription_status()
        trading_data = {
            **nav_data,
            'subscription_status': sub_status,
            'update_time': datetime.now().isoformat(),
            'fund_code': self.fund_code
        }
        self._save_to_cache(trading_data)
        return trading_data

    def _load_cached_data(self):
        """Return the cached dict, or None when the file is missing or unreadable."""
        try:
            with open(self.cache_file, 'r', encoding='utf-8') as f:
                cached = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError (corrupt or partial file).
            return None
        return cached if isinstance(cached, dict) else None

    def _save_to_cache(self, data):
        """Write ``data`` to the cache file; a failed write is reported, not raised."""
        try:
            with open(self.cache_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        except (OSError, TypeError, ValueError) as e:
            # Caching is an optimisation; it must not abort a trading decision.
            print(f"保存缓存失败: {e}")

    def should_trigger_arbitrage(self, market_price, total_cost_rate=0.006):
        """Decide whether the premium/discount against NAV justifies arbitrage.

        Args:
            market_price: Current exchange price of the fund.
            total_cost_rate: Round-trip cost threshold as a fraction
                (default 0.006, i.e. 0.6%).

        Returns:
            A dict whose ``should_trigger`` flag is always present. Triggered
            results carry ``direction`` (``premium_arb`` or ``discount_arb``)
            and ``premium_rate``; rejected results carry ``reason``.
        """
        data = self.get_trading_nav_data()
        if not data or 'nav' not in data:
            return {'should_trigger': False, 'reason': '净值数据缺失'}

        try:
            nav = float(data['nav'])
        except (TypeError, ValueError):
            nav = 0.0
        if nav <= 0:
            # Guards the division below against a zero or corrupt NAV.
            return {'should_trigger': False, 'reason': '净值数据异常'}

        premium_rate = (market_price - nav) / nav

        # Discount arbitrage buys on the exchange and redeems, so it does not
        # depend on whether subscription is open.
        if premium_rate < -total_cost_rate:
            return {
                'should_trigger': True,
                'direction': 'discount_arb',
                'premium_rate': premium_rate
            }

        sub_status = data.get('subscription_status') or {}
        if not sub_status.get('can_subscribe', False):
            reason = sub_status.get('reason', '申购状态未知')
            return {'should_trigger': False, 'reason': f"无法申购: {reason}"}

        if premium_rate > total_cost_rate:
            return {
                'should_trigger': True,
                'direction': 'premium_arb',
                'premium_rate': premium_rate,
                'nav': nav,
                'market_price': market_price,
                'daily_limit': sub_status.get('daily_limit')
            }
        return {'should_trigger': False, 'reason': '溢价率未达阈值'}
# 主程序示例
def main():
    """Demonstrate the workflow: subscription status, latest NAV, arbitrage decision."""
    service = QMTDataService("161226")

    # Subscription status, taken directly from the announcement fetcher.
    subscription = service.ann_fetcher.check_subscription_status()
    print(f"当前申购状态: {subscription}")

    # Latest NAV bundle (served from the cache when available).
    latest = service.get_trading_nav_data()
    print(f"最新净值: {latest}")

    # Arbitrage decision for an assumed market price of 2.1 yuan.
    assumed_price = 2.1
    outcome = service.should_trigger_arbitrage(market_price=assumed_price)
    print(f"套利决策: {outcome}")


if __name__ == "__main__":
    main()
# 主程序示例
def main():
    """Demonstrate the workflow: subscription status, latest NAV, arbitrage decision."""
    service = QMTDataService("161226")

    # Subscription status, taken directly from the announcement fetcher.
    subscription = service.ann_fetcher.check_subscription_status()
    print(f"当前申购状态: {subscription}")

    # Latest NAV bundle (served from the cache when available).
    latest = service.get_trading_nav_data()
    print(f"最新净值: {latest}")

    # Arbitrage decision for an assumed market price of 2.1 yuan.
    assumed_price = 2.1
    outcome = service.should_trigger_arbitrage(market_price=assumed_price)
    print(f"套利决策: {outcome}")


if __name__ == "__main__":
    main()
📊 后续优化建议
- 增加数据源:同时监控多个数据源,提高可靠性
- 实时监控:使用WebSocket或定时轮询,及时发现公告更新
- 机器学习:对公告文本进行自然语言处理,自动识别限制条件
- 集成到QMT:将数据服务作为QMT策略的常驻模块
这个框架提供了从数据抓取到决策判断的完整流程。实际使用时,你需要根据目标网站的实际HTML结构调整解析逻辑,并充分测试各种异常情况。

浙公网安备 33010602011771号