Optimizations made:
1. Fetching dividend data for some stocks kept failing, and every run re-fetched it from scratch.
2. Even after all the data had been downloaded, each analysis run still went through the target stocks one by one, which took too long.
3. Too many separate report files; opening them one by one wastes time (a small preview helper is sketched after the script below).
4. The akshare library actually scrapes its data from the web, so frequent requests cause failures; caching plus anti-scraping measures (rate limiting) are needed, as sketched right after this list.
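Items 1 and 4 boil down to the same remedy: cache every result and pace the requests so the scraped sources do not start rejecting them. A minimal sketch of that idea, assuming a hypothetical fetch_with_cache() wrapper (the full script below persists its cache to an Excel file and applies the same pacing with jittered sleeps and a long rest every 30 calls):

import time
import random

_cache = {}        # in-memory cache: symbol -> result
_call_count = 0    # number of network calls made so far

def fetch_with_cache(fetch_fn, symbol):
    """Return a cached result if present; otherwise fetch politely and cache it."""
    global _call_count
    if symbol in _cache:
        return _cache[symbol]
    if _call_count > 0 and _call_count % 30 == 0:
        time.sleep(60)                # long rest every 30 calls, as in the script below
    time.sleep(random.uniform(3, 5))  # jittered delay between consecutive requests
    result = fetch_fn(symbol)         # e.g. lambda s: ak.stock_history_dividend_detail(symbol=s, indicator="分红")
    _call_count += 1
    _cache[symbol] = result
    return result

Anything already cached is returned immediately, which is also what lets a partially completed run resume where it left off.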
Final output:
Sector-leader stock data:

High-dividend analysis results:

The adjusted code (it can be run directly):
import akshare as ak
import pandas as pd
import time
import random
from datetime import datetime
import traceback
import numpy as np
import os
def main(enable_high_dividend=True):
    """Main flow: first fetch market-wide dividend data into Excel, then analyse sector leaders and high-dividend stocks."""
    try:
        print("="*50)
        print("开始执行股票分析脚本")
        print(f"当前时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("="*50)
        # Step 1: fetch market-wide dividend data into the Excel cache
        print("\n[步骤1/6] 获取全市场分红数据到excel...")
        get_dividend_data_for_codes()  # the script still runs if this step is commented out
        # print("步骤1已跳过(已注释)")
        # Step 2: load dividend data from Excel into the in-memory cache
        print("\n[步骤2/6] 从excel加载分红数据到缓存...")
        dividend_cache = load_dividend_cache()
        print(f"从缓存加载到 {len(dividend_cache)} 只股票分红数据")
        # Step 3: fetch basic stock information
        print("\n[步骤3/6] 获取股票基本信息...")
        stock_basic_df = get_stock_basic_info()
        print(f"获取到 {len(stock_basic_df)} 只股票基本信息")
        # Step 4: fetch financial data
        print("\n[步骤4/6] 获取财务数据(市值/营收/利润)...")
        financial_df = get_financial_data(stock_basic_df)
        # Step 5: read dividend data from Excel
        print("\n[步骤5/6] 从excel读取分红数据...")
        dividend_from_excel = read_dividend_from_excel()
        # Step 6: merge the data and run the analysis
        print("\n[步骤6/6] 数据分析与筛选...")
        merged_df = merge_data(stock_basic_df, financial_df, dividend_from_excel)
        # Pick sector leaders per industry board (dividend yield not considered here)
        top_stocks = []
        for industry, group in merged_df.groupby('板块'):
            valid_group = group.dropna(subset=['市值(亿)'])
            if len(valid_group) == 0:
                continue
            sorted_group = valid_group.sort_values('市值(亿)', ascending=False)
            top_three = sorted_group.head(3)
            top_stocks.append(top_three)
        if top_stocks:
            all_top_stocks = pd.concat(top_stocks).sort_values('板块')
            print("\n" + "="*50)
            print(f"各板块龙头股分析完成,共筛选出 {len(all_top_stocks)} 只龙头股")
            print("="*50 + "\n")
            print_top_stocks(all_top_stocks)
            # Save the sector-leader report to a file
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            top_file = f"龙头股_{timestamp}.csv"
            output_columns = ['code', 'name', '板块', '市值(亿)', '营收(亿)', '利润(亿)', '分红率', '平均分红']
            all_top_stocks[output_columns].to_csv(top_file, index=False, float_format='%.2f')
            print(f"\n龙头股分析报告已保存: {top_file}")
        else:
            print("未找到龙头股")
        # High-dividend stocks are reported separately, at the end
        if enable_high_dividend:
            high_dividend_stocks = merged_df[merged_df['分红率'] > 3]
            print(f"其中高分红率股票(>3%)有 {len(high_dividend_stocks)} 只")
            print_high_dividend_stocks(high_dividend_stocks)
            # Save the high-dividend stocks
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            div_file = f"高分红股票_{timestamp}.csv"
            output_columns = ['code', 'name', '板块', '分红率', '市值(亿)', '营收(亿)', '利润(亿)', '平均分红']
            high_dividend_stocks[output_columns].to_csv(div_file, index=False, float_format='%.2f')
            print(f"高分红股票已保存: {div_file}")
    except Exception as e:
        print("\n" + "!"*50)
        print("脚本执行失败!错误信息:")
        print(traceback.format_exc())
        print("!"*50)
        exit(1)
def get_stock_basic_info():
    """Fetch basic stock information: code, name and industry board."""
    print(" > 获取A股基本信息...")
    stock_info_a = ak.stock_info_a_code_name()
    time.sleep(2)
    print(" > 获取深股基本信息...")
    stock_info_sz = ak.stock_info_sz_name_code()
    time.sleep(2)
    # Merge Shanghai and Shenzhen listings
    df = pd.concat([
        stock_info_a[['code', 'name']],
        stock_info_sz[['A股代码', 'A股简称']].rename(columns={'A股代码': 'code', 'A股简称': 'name'})
    ]).drop_duplicates('code')
    print(" > 获取行业分类信息...")
    industry_df = ak.stock_board_industry_name_em()
    industry_map = {}
    for _, row in industry_df.iterrows():
        cons_df = ak.stock_board_industry_cons_em(symbol=row['板块名称'])
        for code in cons_df['代码']:
            industry_map[code] = row['板块名称']
    # Attach the industry board to each stock
    df['板块'] = df['code'].apply(lambda x: industry_map.get(x))
    return df.dropna(subset=['板块'])
def get_financial_data(stock_basic_df):
    """Fetch financial data in bulk: market cap, revenue and profit."""
    print(" > 批量获取市值数据...")
    spot_df = ak.stock_zh_a_spot_em()
    spot_df = spot_df.rename(columns={'代码': 'code', '总市值': '市值(元)'})
    time.sleep(6)
    print(" > 批量获取利润表数据...")
    lrb_df = ak.stock_lrb_em()
    lrb_df = lrb_df.sort_values('公告日期', ascending=False).drop_duplicates('股票代码')
    # Tolerate different revenue column names across akshare versions
    if '营业总收入' in lrb_df.columns:
        revenue_col = '营业总收入'
    elif '营业收入' in lrb_df.columns:
        revenue_col = '营业收入'
    else:
        revenue_col = None
    lrb_df = lrb_df.rename(columns={'股票代码': 'code'})
    if revenue_col:
        lrb_df = lrb_df.rename(columns={revenue_col: '营收(元)'})
    lrb_df = lrb_df.rename(columns={'净利润': '利润(元)'})
    time.sleep(6)
    print(" > 合并财务数据...")
    df = stock_basic_df[['code']].copy()
    df = df.merge(spot_df, on='code', how='left')
    df = df.merge(lrb_df, on='code', how='left')
    # Convert units to 亿 (hundred million yuan)
    df['市值(亿)'] = df['市值(元)'] / 1e8
    if '营收(元)' in df.columns:
        df['营收(亿)'] = df['营收(元)'] / 1e8
    else:
        df['营收(亿)'] = np.nan  # no recognised revenue column in this akshare version
    df['利润(亿)'] = df['利润(元)'] / 1e8
    return df[['code', '市值(亿)', '营收(亿)', '利润(亿)']]
def load_dividend_cache():
    """Load dividend data from the Excel cache file into a dict."""
    cache_file = 'dividend_cache.xlsx'
    if not os.path.exists(cache_file):
        print(f" > 分红数据缓存文件 {cache_file} 不存在,将创建空缓存")
        return {}
    try:
        df = pd.read_excel(cache_file, dtype={'code': str})
        df['code'] = df['code'].astype(str)
        # Make sure every expected column exists; fill missing ones with 0
        for col in ['平均分红', '分红率']:
            if col not in df.columns:
                df[col] = 0
        # Convert to a dict for fast lookup
        cache_dict = {}
        for _, row in df.iterrows():
            cache_dict[row['code']] = {
                '平均分红': row.get('平均分红', 0),
                '分红率': row.get('分红率', 0)
            }
        print(f" > 从Excel加载到 {len(cache_dict)} 只股票的分红数据到缓存")
        return cache_dict
    except Exception as e:
        print(f"从Excel加载分红数据失败: {e}")
        return {}
def read_dividend_from_excel():
    """Read dividend data from the Excel cache file."""
    cache_file = 'dividend_cache.xlsx'
    if not os.path.exists(cache_file):
        print(f"错误:分红数据缓存文件 {cache_file} 不存在。请先运行 get_dividend_data_for_codes 获取数据。")
        return pd.DataFrame(columns=['code', '平均分红', '分红率'])
    try:
        df = pd.read_excel(cache_file, dtype={'code': str})
        # Validate and normalise the data
        df = validate_dividend_data(df)
        print(f" > 从Excel读取到 {len(df)} 只股票的分红数据")
        return df
    except Exception as e:
        print(f"从Excel读取分红数据失败: {e}")
        return pd.DataFrame(columns=['code', '平均分红', '分红率'])
def validate_dividend_data(df):
    """Validate the dividend data and make sure every record has complete fields."""
    if df.empty:
        return df
    # Make sure all required columns exist
    required_columns = ['code', '平均分红', '分红率']
    for col in required_columns:
        if col not in df.columns:
            df[col] = 0
    # Coerce the numeric fields to numbers
    df['平均分红'] = pd.to_numeric(df['平均分红'], errors='coerce').fillna(0)
    df['分红率'] = pd.to_numeric(df['分红率'], errors='coerce').fillna(0)
    # Make sure the code field is a string
    df['code'] = df['code'].astype(str)
    return df
def get_dividend_data_for_codes(year_start=None, year_end=None):
    """Fetch dividend data for all A-shares into the Excel cache, in batches, resumable across runs."""
    cache_file = 'dividend_cache.xlsx'
    dividend_data = []
    current_year = datetime.now().year
    if year_start is None:
        year_start = current_year - 3
    if year_end is None:
        year_end = current_year
    # Get the full list of A-share codes and latest prices
    price_df = ak.stock_zh_a_spot_em()
    code_list = list(price_df['代码'].astype(str))
    price_map = dict(zip(price_df['代码'], price_df['最新价']))
    # Load the existing cache, if any
    if os.path.exists(cache_file):
        cache_df = pd.read_excel(cache_file, dtype={'code': str})
        cache_df['code'] = cache_df['code'].astype(str)
    else:
        cache_df = pd.DataFrame(columns=['code', '平均分红', '分红率'])
    cache_map = {row['code']: row for _, row in cache_df.iterrows()}
    new_records = []
    failed = 0
    batch_size = 25  # process 25 stocks per batch
    total = len(code_list)
    call_count = 0  # number of network calls made so far
    print(f" > 开始获取分红数据,A股总数: {total} 区间: {year_start}~{year_end}")
    for batch_start in range(0, total, batch_size):
        batch_codes = code_list[batch_start:batch_start+batch_size]
        print(f"\n==== 处理第 {batch_start+1} ~ {min(batch_start+batch_size, total)} 只股票 ====")
        batch_new_records = []
        for idx, code_str in enumerate(batch_codes):
            print(f"正在处理的股票编号是: {code_str} (全市场第 {batch_start+idx+1}/{total})")
            # Use the cache first: if the stock is already in the Excel file, skip fetching
            if code_str in cache_map:
                print(f" > 从缓存中找到 {code_str} 的数据,跳过获取")
                cache_record = cache_map[code_str]
                # Fill in defaults so the cached record has complete fields
                dividend_data.append({
                    'code': code_str,
                    '平均分红': cache_record.get('平均分红', 0),
                    '分红率': cache_record.get('分红率', 0)
                })
                continue
            # Rest for 60 seconds after every 30 network calls
            if call_count > 0 and call_count % 30 == 0:
                print(f" > 已调用 {call_count} 次,等待60秒...")
                time.sleep(60)
            max_retries = 1
            retry_count = 0
            success = False
            while not success and retry_count < max_retries:
                try:
                    # Random delay of 3 to 5 seconds between requests
                    delay = random.uniform(3, 5)
                    time.sleep(delay)
                    div_df = ak.stock_history_dividend_detail(symbol=code_str, indicator="分红")
                    call_count += 1  # count this network call
                    if div_df.empty:
                        # No dividend history at all: record a zero-dividend entry
                        record = {
                            'code': code_str,
                            '平均分红': 0,
                            '分红率': 0
                        }
                        dividend_data.append(record)
                        batch_new_records.append(record)
                        new_records.append(record)
                        success = True
                        continue
                    div_df = div_df[div_df['公告日期'].astype(str).str[:4].astype(int).between(year_start, year_end)]
                    if not div_df.empty:
                        div_df['year'] = div_df['公告日期'].astype(str).str[:4].astype(int)
                        year_sum = div_df.groupby('year')['派息'].sum()
                        years = list(range(year_start, year_end+1))
                        year_sum = year_sum.reindex(years, fill_value=0)
                        # 派息 is quoted per 10 shares, hence the division by 10 for the per-share average
                        avg_div = year_sum.sum() / len(years) / 10
                        current_price = price_map.get(code_str, 0)
                        div_rate = (avg_div / current_price * 100) if current_price > 0 else 0
                        record = {
                            'code': code_str,
                            '平均分红': avg_div,
                            '分红率': div_rate
                        }
                        dividend_data.append(record)
                        batch_new_records.append(record)
                        new_records.append(record)
                        success = True
                    else:
                        # No dividends within the requested year range: record a zero-dividend entry
                        record = {
                            'code': code_str,
                            '平均分红': 0,
                            '分红率': 0
                        }
                        dividend_data.append(record)
                        batch_new_records.append(record)
                        new_records.append(record)
                        success = True
                except Exception as e:
                    retry_count += 1
                    error_msg = str(e)[:100]
                    print(f" {code_str} 分红数据获取失败 ({retry_count}/{max_retries}) 错误: {error_msg}")
                    if retry_count < max_retries:
                        # Exponential backoff to avoid hammering the source
                        wait_time = min(30, 2 ** retry_count + random.uniform(1, 5))
                        print(f" > 等待 {wait_time:.1f} 秒后重试...")
                        time.sleep(wait_time)
                    else:
                        print(f" {code_str} 分红数据获取最终失败,插入分红为0的记录")
                        # On final failure, still record a zero-dividend entry
                        record = {
                            'code': code_str,
                            '平均分红': 0,
                            '分红率': 0
                        }
                        dividend_data.append(record)
                        batch_new_records.append(record)
                        new_records.append(record)
                        failed += 1
                        # Extra wait after a failure to avoid consecutive failures
                        time.sleep(random.uniform(5, 10))
        # Save the cache after every batch of 25 stocks
        if batch_new_records:
            batch_df = pd.DataFrame(batch_new_records)
            # Make sure all required columns exist
            for col in ['code', '平均分红', '分红率']:
                if col not in batch_df.columns:
                    batch_df[col] = 0
            all_df = pd.concat([cache_df, batch_df], ignore_index=True)
            all_df.drop_duplicates('code', keep='last', inplace=True)
            # Validate and normalise before writing
            all_df = validate_dividend_data(all_df)
            all_df.to_excel(cache_file, index=False)
            cache_df = all_df
            cache_map = {row['code']: row for _, row in cache_df.iterrows()}
            print(f" > 本批新分红数据已保存到 {cache_file}")
        print(f"---- 本批处理完毕,已处理 {min(batch_start+batch_size, total)}/{total} 只 ----\n")
    # Merge any remaining new data into the cache and save
    if new_records:
        new_df = pd.DataFrame(new_records)
        # Make sure all required columns exist
        for col in ['code', '平均分红', '分红率']:
            if col not in new_df.columns:
                new_df[col] = 0
        all_df = pd.concat([cache_df, new_df], ignore_index=True)
        all_df.drop_duplicates('code', keep='last', inplace=True)
        # Validate and normalise before writing
        all_df = validate_dividend_data(all_df)
        all_df.to_excel(cache_file, index=False)
        print(f" > 新分红数据已追加保存到 {cache_file}")
    else:
        print(f" > 未有新分红数据写入缓存 {cache_file}")
    print(f" > 分红数据获取完成,成功: {len(dividend_data)},失败: {failed}")
    # Return the final DataFrame with complete fields for every record
    final_df = pd.DataFrame(dividend_data)
    final_df = validate_dividend_data(final_df)
    return final_df
def merge_data(basic_df, financial_df, dividend_df):
    """Merge all data sources."""
    merged = basic_df.merge(financial_df, on='code', how='left')
    merged = merged.merge(dividend_df, on='code', how='left')
    # Fill missing dividend fields
    merged['分红率'] = merged['分红率'].fillna(0)
    merged['平均分红'] = merged['平均分红'].fillna(0)
    # Drop rows with no financial data
    merged = merged.dropna(subset=['市值(亿)', '营收(亿)', '利润(亿)'])
    merged = merged[merged['市值(亿)'] > 0]  # exclude stocks with zero market cap
    return merged
def get_industry_top_stocks(df):
    """Pick the top three stocks by market cap in each industry board."""
    top_stocks = []
    for industry, group in df.groupby('板块'):
        # Exclude stocks without financial data
        valid_group = group.dropna(subset=['市值(亿)'])
        if len(valid_group) == 0:
            continue
        # Sort by market cap, descending
        sorted_group = valid_group.sort_values('市值(亿)', ascending=False)
        # Keep the top three
        top_three = sorted_group.head(3)
        top_stocks.append(top_three)
    return pd.concat(top_stocks).sort_values('板块')
def print_top_stocks(df):
    """Print the sector-leader report."""
    print("\n" + "="*50)
    print("各板块龙头股报告:")
    print("="*50)
    for industry, group in df.groupby('板块'):
        print(f"\n【{industry}】板块龙头股:")
        for _, row in group.iterrows():
            print(f" {row['code']} {row['name']} | "
                  f"市值:{row['市值(亿)']:.1f}亿 | "
                  f"分红率:{row['分红率']:.2f}% | "
                  f"营收:{row['营收(亿)']:.1f}亿 | "
                  f"利润:{row['利润(亿)']:.1f}亿")
def print_high_dividend_stocks(df):
    """Print the high-dividend stock report."""
    if not df.empty:
        print("\n" + "="*50)
        print("高分红率股票(>3%):")
        print("="*50)
        for _, row in df.iterrows():
            print(f" {row['code']} {row['name']} ({row['板块']}) | "
                  f"分红率:{row['分红率']:.2f}% | "
                  f"市值:{row['市值(亿)']:.1f}亿")
    else:
        print("\n未找到分红率>3%的股票")
def save_results(top_stocks, high_dividend_stocks):
    """Save the results to CSV files (currently unused; main() writes its own reports)."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    top_file = f"龙头股分析_{timestamp}.csv"
    div_file = f"高分红股票_{timestamp}.csv"
    # Tidy output column order
    output_columns = ['code', 'name', '板块', '分红率', '市值(亿)', '营收(亿)', '利润(亿)', '平均分红']
    top_stocks[output_columns].to_csv(top_file, index=False, float_format='%.2f')
    high_dividend_stocks[output_columns].to_csv(div_file, index=False, float_format='%.2f')
    print("\n" + "="*50)
    print(f"结果已保存: {top_file}, {div_file}")
    print("="*50)
if __name__ == "__main__":
    main()
    # To fetch the dividend data on its own, uncomment the line below
    # get_dividend_data_for_codes()
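Addressing item 3 above (too many report files): the timestamped CSVs that main() writes can be previewed in one place instead of opened one by one. A minimal, optional sketch that is not part of the script above, assuming a hypothetical preview_latest_reports() helper and the file-name patterns the script actually uses:

import glob
import pandas as pd

def preview_latest_reports():
    """Show the newest sector-leader and high-dividend report without opening the files by hand."""
    for pattern in ('龙头股_*.csv', '高分红股票_*.csv'):
        files = sorted(glob.glob(pattern))
        if not files:
            print(f"没有找到匹配 {pattern} 的报告")
            continue
        latest = files[-1]  # the timestamped file names sort chronologically
        df = pd.read_csv(latest, dtype={'code': str})
        print(f"\n=== {latest} ({len(df)} 行) ===")
        print(df.head(10).to_string(index=False))

Call preview_latest_reports() in the output directory after a run to see the newest of each report.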
Further analyses layered on top:
1. Deeply oversold stocks
2. Oversold stocks near a bottom

The specifics are not shown here, so as not to lead anyone astray~
Happy coding every day, from the code-writing little panda~~