List of optimizations made:

1. Fetching dividend data kept failing for some stocks, and every run re-fetched them from scratch.

2. Even after all the data has been downloaded, each analysis of the target stocks still walks through them one by one, which takes far too long.

3. Many separate reports are generated, and opening them one by one wastes time.

4. The akshare library actually gets its data by web scraping, so frequent requests make the data fetch fail; this calls for caching plus anti-scraping throttling (see the sketch right after this list).
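
Point 4 is handled in the full script below with random per-call sleeps, a 60-second pause every 30 calls, and an Excel cache. As a standalone sketch of the same idea (not part of the script itself; cached_fetch and the pickle file name are made up for this example), a throttled, disk-cached fetch wrapper can look like this:

import os
import pickle
import random
import time

CACHE_FILE = "fetch_cache.pkl"  # hypothetical cache file used only by this sketch

def cached_fetch(key, fetch_fn, min_delay=3, max_delay=5):
    """Return the cached result for `key` if present; otherwise wait a random
    interval (to avoid hammering the data source), call fetch_fn(), and cache it."""
    cache = {}
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, "rb") as f:
            cache = pickle.load(f)
    if key in cache:  # cache hit: no network request at all
        return cache[key]
    time.sleep(random.uniform(min_delay, max_delay))  # throttle before scraping
    result = fetch_fn()
    cache[key] = result
    with open(CACHE_FILE, "wb") as f:  # persist so the next run can skip this key
        pickle.dump(cache, f)
    return result

# Usage sketch, wrapping one akshare call per stock code:
# import akshare as ak
# df = cached_fetch("600519", lambda: ak.stock_history_dividend_detail(symbol="600519", indicator="分红"))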

 

Final output:

Leading-stock data by sector:

[screenshot: per-sector leading stocks]

 

 

High-dividend analysis results:

[screenshot: high-dividend screening results]

 

The adjusted code (it can be run directly):

import akshare as ak
import pandas as pd
import time
import random
from datetime import datetime
import traceback
import numpy as np
import os

def main(enable_high_dividend=True):
    """主流程:先获取全市场分红数据到excel,再分析龙头股和高分红股票"""
    try:
        print("="*50)
        print("开始执行股票分析脚本")
        print(f"当前时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("="*50)
        
        # Step 1: fetch market-wide dividend data into Excel
        print("\n[步骤1/6] 获取全市场分红数据到excel...")
        get_dividend_data_for_codes()  # the script still runs if this step is commented out
        #print("Step 1 skipped (commented out)")
        
        # Step 2: load dividend data from Excel into the cache
        print("\n[步骤2/6] 从excel加载分红数据到缓存...")
        dividend_cache = load_dividend_cache()
        print(f"从缓存加载到 {len(dividend_cache)} 只股票分红数据")
        
        # Step 3: fetch basic stock information
        print("\n[步骤3/6] 获取股票基本信息...")
        stock_basic_df = get_stock_basic_info()
        print(f"获取到 {len(stock_basic_df)} 只股票基本信息")
        
        # Step 4: fetch financial data (market cap / revenue / profit)
        print("\n[步骤4/6] 获取财务数据(市值/营收/利润)...")
        financial_df = get_financial_data(stock_basic_df)
        
        # Step 5: read dividend data from Excel
        print("\n[步骤5/6] 从excel读取分红数据...")
        dividend_from_excel = read_dividend_from_excel()
        
        # Step 6: merge and analyze the data
        print("\n[步骤6/6] 数据分析与筛选...")
        merged_df = merge_data(stock_basic_df, financial_df, dividend_from_excel)
        
        # Select the leading stocks in each industry board (dividend yield not considered)
        top_stocks = []
        for industry, group in merged_df.groupby('板块'):
            valid_group = group.dropna(subset=['市值(亿)'])
            if len(valid_group) == 0:
                continue
            sorted_group = valid_group.sort_values('市值(亿)', ascending=False)
            top_three = sorted_group.head(3)
            top_stocks.append(top_three)
        if top_stocks:
            all_top_stocks = pd.concat(top_stocks).sort_values('板块')
            print("\n" + "="*50)
            print(f"各板块龙头股分析完成,共筛选出 {len(all_top_stocks)} 只龙头股")
            print("="*50 + "\n")
            print_top_stocks(all_top_stocks)
            
            # Save the sector-leader report to a file
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            top_file = f"龙头股_{timestamp}.csv"
            output_columns = ['code', 'name', '板块', '市值(亿)', '营收(亿)', '利润(亿)', '分红率', '平均分红']
            all_top_stocks[output_columns].to_csv(top_file, index=False, float_format='%.2f')
            print(f"\n龙头股分析报告已保存: {top_file}")
        else:
            print("未找到龙头股")
        
        # Output high-dividend stocks separately, at the very end
        if enable_high_dividend:
            high_dividend_stocks = merged_df[merged_df['分红率'] > 3]
            print(f"其中高分红率股票(>3%)有 {len(high_dividend_stocks)} 只")
            print_high_dividend_stocks(high_dividend_stocks)
            # Save the high-dividend stocks
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            div_file = f"高分红股票_{timestamp}.csv"
            output_columns = ['code', 'name', '板块', '分红率', '市值(亿)', '营收(亿)', '利润(亿)', '平均分红']
            high_dividend_stocks[output_columns].to_csv(div_file, index=False, float_format='%.2f')
            print(f"高分红股票已保存: {div_file}")
    except Exception as e:
        print("\n" + "!"*50)
        print("脚本执行失败!错误信息:")
        print(traceback.format_exc())
        print("!"*50)
        raise SystemExit(1)

def get_stock_basic_info():
    """获取股票基本信息:代码、名称、所属行业"""
    print("  > 获取A股基本信息...")
    stock_info_a = ak.stock_info_a_code_name()
    time.sleep(2)
    print("  > 获取深股基本信息...")
    stock_info_sz = ak.stock_info_sz_name_code()
    time.sleep(2)
    
    # Merge stocks from the Shanghai and Shenzhen exchanges
    df = pd.concat([
        stock_info_a[['code', 'name']],
        stock_info_sz[['A股代码', 'A股简称']].rename(columns={'A股代码': 'code', 'A股简称': 'name'})
    ]).drop_duplicates('code')
    
    print("  > 获取行业分类信息...")
    industry_df = ak.stock_board_industry_name_em()
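    # Build a code -> industry-board map; this issues one constituents request per industry board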
    industry_map = {}
    for _, row in industry_df.iterrows():
        cons_df = ak.stock_board_industry_cons_em(symbol=row['板块名称'])
        for code in cons_df['代码']:
            industry_map[code] = row['板块名称']
    
    # Attach industry information
    df['板块'] = df['code'].apply(lambda x: industry_map.get(x))
    return df.dropna(subset=['板块'])


def get_financial_data(stock_basic_df):
    """批量获取财务数据:市值、营收、利润"""
    print("  > 批量获取市值数据...")
    spot_df = ak.stock_zh_a_spot_em()
    spot_df = spot_df.rename(columns={'代码': 'code', '总市值': '市值(元)'})
    time.sleep(6)

    print("  > 批量获取利润表数据...")
    lrb_df = ak.stock_lrb_em()
    lrb_df = lrb_df.sort_values('公告日期', ascending=False).drop_duplicates('股票代码')
    # Handle different possible revenue column names
    if '营业总收入' in lrb_df.columns:
        revenue_col = '营业总收入'
    elif '营业收入' in lrb_df.columns:
        revenue_col = '营业收入'
    else:
        revenue_col = None
    lrb_df = lrb_df.rename(columns={'股票代码': 'code'})
    if revenue_col:
        lrb_df = lrb_df.rename(columns={revenue_col: '营收(元)'})
    lrb_df = lrb_df.rename(columns={'净利润': '利润(元)'})
    time.sleep(6)

    print("  > 合并财务数据...")
    df = stock_basic_df[['code']].copy()
    df = df.merge(spot_df, on='code', how='left')
    df = df.merge(lrb_df, on='code', how='left')

    # Normalize units to 亿 (1e8 yuan)
    df['市值(亿)'] = df['市值(元)'] / 1e8
    df['营收(亿)'] = df['营收(元)'] / 1e8
    df['利润(亿)'] = df['利润(元)'] / 1e8

    return df[['code', '市值(亿)', '营收(亿)', '利润(亿)']]


def load_dividend_cache():
    """从Excel文件加载分红数据到缓存"""
    cache_file = 'dividend_cache.xlsx'
    if not os.path.exists(cache_file):
        print(f"  > 分红数据缓存文件 {cache_file} 不存在,将创建空缓存")
        return {}
    
    try:
        df = pd.read_excel(cache_file, dtype={'code': str})
        df['code'] = df['code'].astype(str)
        # Make sure the required columns exist; fill any missing ones with 0
        for col in ['平均分红', '分红率']:
            if col not in df.columns:
                df[col] = 0
        
        # Convert to a dict for fast lookups
        cache_dict = {}
        for _, row in df.iterrows():
            cache_dict[row['code']] = {
                '平均分红': row.get('平均分红', 0),
                '分红率': row.get('分红率', 0)
            }
        print(f"  > 从Excel加载到 {len(cache_dict)} 只股票的分红数据到缓存")
        return cache_dict
    except Exception as e:
        print(f"从Excel加载分红数据失败: {e}")
        return {}

def read_dividend_from_excel():
    """从Excel文件读取分红数据"""
    cache_file = 'dividend_cache.xlsx'
    if not os.path.exists(cache_file):
        print(f"错误:分红数据缓存文件 {cache_file} 不存在。请先运行 get_dividend_data_for_codes 获取数据。")
        return pd.DataFrame(columns=['code', '平均分红', '分红率'])
    
    try:
        df = pd.read_excel(cache_file, dtype={'code': str})
        # Validate and ensure data completeness
        df = validate_dividend_data(df)
        print(f"  > 从Excel读取到 {len(df)} 只股票的分红数据")
        return df
    except Exception as e:
        print(f"从Excel读取分红数据失败: {e}")
        return pd.DataFrame(columns=['code', '平均分红', '分红率'])

def validate_dividend_data(df):
    """验证分红数据的完整性,确保所有记录都有完整的字段"""
    if df.empty:
        return df
    
    # Make sure all required columns exist
    required_columns = ['code', '平均分红', '分红率']
    for col in required_columns:
        if col not in df.columns:
            df[col] = 0
    
    # Make sure the numeric fields really are numeric
    df['平均分红'] = pd.to_numeric(df['平均分红'], errors='coerce').fillna(0)
    df['分红率'] = pd.to_numeric(df['分红率'], errors='coerce').fillna(0)
    
    # Make sure the code field is a string
    df['code'] = df['code'].astype(str)
    
    return df

def get_dividend_data_for_codes(year_start=None, year_end=None):
    """获取全市场A股分红数据到Excel缓存文件,分批处理,支持断续执行"""
    import os
    cache_file = 'dividend_cache.xlsx'
    dividend_data = []
    current_year = datetime.now().year
    if year_start is None:
        year_start = current_year - 3
    if year_end is None:
        year_end = current_year

    # Fetch the full list of A-share codes
    price_df = ak.stock_zh_a_spot_em()
    code_list = list(price_df['代码'].astype(str))
    price_map = dict(zip(price_df['代码'], price_df['最新价']))

    # Load the existing cache
    if os.path.exists(cache_file):
        cache_df = pd.read_excel(cache_file, dtype={'code': str})
        cache_df['code'] = cache_df['code'].astype(str)
    else:
        cache_df = pd.DataFrame(columns=['code', '平均分红', '分红率'])

    cache_map = {row['code']: row for _, row in cache_df.iterrows()}
    new_records = []
    failed = 0
    batch_size = 25  # process 25 stocks per batch
    total = len(code_list)
    call_count = 0  # network-call counter
    print(f"  > 开始获取分红数据,A股总数: {total} 区间: {year_start}~{year_end}")

    for batch_start in range(0, total, batch_size):
        batch_codes = code_list[batch_start:batch_start+batch_size]
        print(f"\n==== 处理第 {batch_start+1} ~ {min(batch_start+batch_size, total)} 只股票 ====")
        batch_new_records = []
        for idx, code_str in enumerate(batch_codes):
            print(f"正在处理的股票编号是: {code_str}  (全市场第 {batch_start+idx+1}/{total})")
            
            # Use the cache first: if data for this code is already in the sheet, skip fetching and say so
            if code_str in cache_map:
                print(f"    > 从缓存中找到 {code_str} 的数据,跳过获取")
                cache_record = cache_map[code_str]
                # Make sure the cached record has all required fields
                dividend_data.append({
                    'code': code_str,
                    '平均分红': cache_record.get('平均分红', 0),
                    '分红率': cache_record.get('分红率', 0)
                })
                continue
            
            # Pause for 60 seconds every 30 network calls
            if call_count > 0 and call_count % 30 == 0:
                print(f"    > 已调用 {call_count} 次,等待60秒...")
                time.sleep(60)
            
            max_retries = 1  # a single attempt per stock; persistent failures are recorded as zero instead of retried (see item 1 above)
            retry_count = 0
            success = False
            while not success and retry_count < max_retries:
                try:
                    # Random 3-5 second delay between requests
                    delay = random.uniform(3, 5)
                    time.sleep(delay)
                    
                    div_df = ak.stock_history_dividend_detail(symbol=code_str, indicator="分红")
                    call_count += 1  # count this network call
                    
                    if div_df.empty:
                        # No dividend history at all: insert a zero-dividend record
                        record = {
                            'code': code_str,
                            '平均分红': 0,
                            '分红率': 0
                        }
                        dividend_data.append(record)
                        batch_new_records.append(record)
                        new_records.append(record)
                        success = True
                        continue
                    
                    div_df = div_df[div_df['公告日期'].astype(str).str[:4].astype(int).between(year_start, year_end)]
                    if not div_df.empty:
                        div_df['year'] = div_df['公告日期'].astype(str).str[:4].astype(int)
                        year_sum = div_df.groupby('year')['派息'].sum()
                        years = list(range(year_start, year_end+1))
                        year_sum = year_sum.reindex(years, fill_value=0)
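                        # 派息 here is treated as the cash payout per 10 shares, so the /10 below converts it to a per-share dividend;
                        # 分红率 is then that average per-share dividend over the latest price, in percent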
                        avg_div = year_sum.sum() / len(years) / 10
                        current_price = price_map.get(code_str, 0)
                        div_rate = (avg_div / current_price * 100) if current_price > 0 else 0
                        record = {
                            'code': code_str,
                            '平均分红': avg_div,
                            '分红率': div_rate
                        }
                        dividend_data.append(record)
                        batch_new_records.append(record)
                        new_records.append(record)
                        success = True
                    else:
                        # No dividends within the requested year range: insert a zero-dividend record
                        record = {
                            'code': code_str,
                            '平均分红': 0,
                            '分红率': 0
                        }
                        dividend_data.append(record)
                        batch_new_records.append(record)
                        new_records.append(record)
                        success = True
                except Exception as e:
                    retry_count += 1
                    error_msg = str(e)[:100]
                    print(f"    {code_str} 分红数据获取失败 ({retry_count}/{max_retries}) 错误: {error_msg}")
                    
                    if retry_count < max_retries:
                        # Exponential backoff to avoid rapid retries (only reachable if max_retries > 1)
                        wait_time = min(30, 2 ** retry_count + random.uniform(1, 5))
                        print(f"    > 等待 {wait_time:.1f} 秒后重试...")
                        time.sleep(wait_time)
                    else:
                        print(f"    {code_str} 分红数据获取最终失败,插入分红为0的记录")
                        # On final failure, still insert a zero-dividend record
                        record = {
                            'code': code_str,
                            '平均分红': 0,
                            '分红率': 0
                        }
                        dividend_data.append(record)
                        batch_new_records.append(record)
                        new_records.append(record)
                        failed += 1
                        # Wait a bit extra after a failure to avoid failing repeatedly in a row
                        time.sleep(random.uniform(5, 10))
        
        # Save once per batch of 25 stocks
        if batch_new_records:
            # Make sure every record has all required fields
            batch_df = pd.DataFrame(batch_new_records)
            # Make sure all required columns exist
            for col in ['code', '平均分红', '分红率']:
                if col not in batch_df.columns:
                    batch_df[col] = 0
            
            all_df = pd.concat([cache_df, batch_df], ignore_index=True)
            all_df.drop_duplicates('code', keep='last', inplace=True)
            # Validate and ensure data completeness
            all_df = validate_dividend_data(all_df)
            all_df.to_excel(cache_file, index=False)
            cache_df = all_df
            cache_map = {row['code']: row for _, row in cache_df.iterrows()}
            print(f"  > 本批新分红数据已保存到 {cache_file}")
        
        print(f"---- 本批处理完毕,已处理 {min(batch_start+batch_size, total)}/{total} 只 ----\n")
    
    # Merge the new data and save the cache
    if new_records:
        # Make sure every record has all required fields
        new_df = pd.DataFrame(new_records)
        # Make sure all required columns exist
        for col in ['code', '平均分红', '分红率']:
            if col not in new_df.columns:
                new_df[col] = 0
        
        all_df = pd.concat([cache_df, new_df], ignore_index=True)
        all_df.drop_duplicates('code', keep='last', inplace=True)
        # Validate and ensure data completeness
        all_df = validate_dividend_data(all_df)
        all_df.to_excel(cache_file, index=False)
        print(f"  > 新分红数据已追加保存到 {cache_file}")
    else:
        print(f"  > 未有新分红数据写入缓存 {cache_file}")
    print(f"  > 分红数据获取完成,成功: {len(dividend_data)},失败: {failed}")
    
    # Return the final DataFrame, ensuring every record has complete fields
    final_df = pd.DataFrame(dividend_data)
    final_df = validate_dividend_data(final_df)
    return final_df



def merge_data(basic_df, financial_df, dividend_df):
    """合并所有数据源"""
    merged = basic_df.merge(financial_df, on='code', how='left')
    merged = merged.merge(dividend_df, on='code', how='left')
    
    # Handle missing values
    merged['分红率'] = merged['分红率'].fillna(0)
    merged['平均分红'] = merged['平均分红'].fillna(0)
    
    # Filter out invalid rows
    merged = merged.dropna(subset=['市值(亿)', '营收(亿)', '利润(亿)'])
    merged = merged[merged['市值(亿)'] > 0]  # drop stocks with zero market cap
    
    return merged

def get_industry_top_stocks(df):
    """按行业筛选市值前三的龙头股"""
    top_stocks = []
    
    for industry, group in df.groupby('板块'):
        # Drop stocks without financial data
        valid_group = group.dropna(subset=['市值(亿)'])
        if len(valid_group) == 0:
            continue
            
        # Sort by market cap, descending
        sorted_group = valid_group.sort_values('市值(亿)', ascending=False)
        # Take the top three
        top_three = sorted_group.head(3)
        top_stocks.append(top_three)
    
    return pd.concat(top_stocks).sort_values('板块')

def print_top_stocks(df):
    """打印龙头股报告"""
    print("\n" + "="*50)
    print("各板块龙头股报告:")
    print("="*50)
    for industry, group in df.groupby('板块'):
        print(f"\n【{industry}】板块龙头股:")
        for _, row in group.iterrows():
            print(f"  {row['code']} {row['name']} | "
                  f"市值:{row['市值(亿)']:.1f}亿 | "
                  f"分红率:{row['分红率']:.2f}% | "
                  f"营收:{row['营收(亿)']:.1f}亿 | "
                  f"利润:{row['利润(亿)']:.1f}亿")

def print_high_dividend_stocks(df):
    """打印高分红股票报告"""
    if not df.empty:
        print("\n" + "="*50)
        print("高分红率股票(>3%):")
        print("="*50)
        for _, row in df.iterrows():
            print(f"  {row['code']} {row['name']} ({row['板块']}) | "
                  f"分红率:{row['分红率']:.2f}% | "
                  f"市值:{row['市值(亿)']:.1f}亿")
    else:
        print("\n未找到分红率>3%的股票")

def save_results(top_stocks, high_dividend_stocks):
    """保存结果到CSV文件"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    top_file = f"龙头股分析_{timestamp}.csv"
    div_file = f"高分红股票_{timestamp}.csv"
    
    # Tidy up the output columns
    output_columns = ['code', 'name', '板块', '分红率', '市值(亿)', '营收(亿)', '利润(亿)', '平均分红']
    
    top_stocks[output_columns].to_csv(top_file, index=False, float_format='%.2f')
    high_dividend_stocks[output_columns].to_csv(div_file, index=False, float_format='%.2f')
    
    print("\n" + "="*50)
    print(f"结果已保存: {top_file}, {div_file}")
    print("="*50)

if __name__ == "__main__":
    main()
    # To fetch dividend data on its own, uncomment the line below
    # get_dividend_data_for_codes()
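
A side effect of writing failed fetches into the cache as zero-dividend rows is that those stocks are never retried on later runs, because the cache hit skips them. To force a re-fetch for specific codes, a small helper along these lines (not part of the script above; purge_cache_codes is just an illustrative name) can drop their rows from dividend_cache.xlsx before the next run:

import os
import pandas as pd

def purge_cache_codes(codes, cache_file='dividend_cache.xlsx'):
    """Remove the given stock codes from the dividend cache so that
    get_dividend_data_for_codes() fetches them again on the next run."""
    if not os.path.exists(cache_file):
        return
    df = pd.read_excel(cache_file, dtype={'code': str})
    df = df[~df['code'].isin([str(c) for c in codes])]
    df.to_excel(cache_file, index=False)

# Example (the codes here are placeholders):
# purge_cache_codes(["600000", "000001"])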

  

Additional overlay analyses:

1. Oversold stocks

2. Oversold stocks near a bottom

[screenshot: overlay analysis results]

I won't show the specific results here, so as not to lead anyone astray~
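
Purely as a generic illustration, and not the screening rule actually used above, one common way to quantify "oversold" is the drawdown of the latest close from its highest close over roughly the past year; the 40% threshold below is an arbitrary assumption:

import akshare as ak
from datetime import datetime, timedelta

def drawdown_from_high(code, days=365, threshold=-40):
    """Return (drawdown_pct, is_oversold) based on how far the latest close
    sits below the highest close of the last `days` days. Threshold is arbitrary."""
    end = datetime.now()
    start = end - timedelta(days=days)
    hist = ak.stock_zh_a_hist(symbol=code, period="daily",
                              start_date=start.strftime("%Y%m%d"),
                              end_date=end.strftime("%Y%m%d"), adjust="qfq")
    if hist.empty:
        return None, False
    high = hist['收盘'].max()
    last = hist['收盘'].iloc[-1]
    drawdown = (last / high - 1) * 100  # negative percentage
    return drawdown, drawdown <= threshold

# Example: dd, oversold = drawdown_from_high("600519")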


Happy every single day, from the little code-writing panda~~