pandas_format06

如何得到按列分组的dataframe的平均值和标准差

df = pd.DataFrame({'col1': ['apple', 'banana', 'orange'] * 2,
                   'col2': np.random.randint(0,15,6),
                   'col3': np.random.randint(0, 15, 6)})
print(df)
# 按列col1分组后的平均值
df_grouped_mean = df.groupby(['col1']).mean()
print(df_grouped_mean)
# 按列col1分组后的标准差
df_grouped_std = df.groupby(['col1']).mean()
print(df_grouped_std)

#>         col1  col2  col3
    0   apple     2    14
    1  banana    11     8
    2  orange     8    10
    3   apple     5     2
    4  banana     6    12
    5  orange    11    13
#>            col2  col3
    col1              
    apple    3.5   8.0
    banana   8.5  10.0
    orange   9.5  11.5
#>            col2  col3
    col1              
    apple    3.5   8.0
    banana   8.5  10.0
    orange   9.5  11.5
  1. 如何得到按列分组后另一列的第n大的值
    df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 2,
                       'taste': np.random.rand(6),
                       'price': np.random.randint(0, 15, 6)})
    
    print(df)
    
    # teste列按fruit分组
    df_grpd = df['taste'].groupby(df.fruit)
    # teste列中banana元素的信息
    x=df_grpd.get_group('banana')
    # 排序并找第2大的值
    s = x.sort_values().iloc[-2]
    print(s)
    
    #>        fruit     taste  price
        0   apple  0.521990      7
        1  banana  0.640444      0
        2  orange  0.460509      9
        3   apple  0.818963      4
        4  banana  0.646138      7
        5  orange  0.917056     12
    
    #>    0.6404436436085967
  2. 如何计算分组dataframe的平均值,并将分组列保留为另一列
    df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 2,
                       'rating': np.random.rand(6),
                       'price': np.random.randint(0, 15, 6)})
    
    # 按fruit分组后,price列的平均值,并将分组置为一列
    out = df.groupby('fruit', as_index=False)['price'].mean()
    print(out)
    
    #>        fruit  price
        0   apple    4.0
        1  banana    6.5
        2  orange   11.0
    33.如何获取两列值元素相等的位置(并非索引)
    df = pd.DataFrame({'fruit1': np.random.choice(['apple', 'orange', 'banana'], 3),
                        'fruit2': np.random.choice(['apple', 'orange', 'banana'], 3)})
    
    print(df)
    # 获取两列元素相等的行
    np.where(df.fruit1 == df.fruit2)
    
    #>       fruit1  fruit2
        0   apple  banana
        1   apple   apple
        2  orange   apple
    
    #>    (array([1], dtype=int64),)
  3. 如何创建指定列偏移后的新列
    df = pd.DataFrame(np.random.randint(1, 100, 20).reshape(-1, 4), columns = list('abcd'))
    
    # 创建往下偏移后的列
    df['a_lag1'] = df['a'].shift(1)
    # 创建往上偏移后的列
    df['b_lead1'] = df['b'].shift(-1)
    print(df)
    
    #>        a   b   c   d  a_lag1  b_lead1
        0  29  90  43  24     NaN     36.0
        1  94  36  67  66    29.0     76.0
        2  81  76  44  49    94.0     97.0
        3  55  97  10  74    81.0     43.0
        4  32  43  62  62    55.0      NaN
  4. 如何获得dataframe中单一值的频数
    df = pd.DataFrame(np.random.randint(1, 10, 20).reshape(-1, 4), columns = list('abcd'))
    
    # 统计元素值的个数
    pd.value_counts(df.values.ravel())
    
    #>    9    3
        7    3
        3    3
        1    3
        6    2
        5    2
        4    2
        8    1
        2    1
        dtype: int64
  5. df = pd.DataFrame(["STD, City    State",
    "33, Kolkata    West Bengal",
    "44, Chennai    Tamil Nadu",
    "40, Hyderabad    Telengana",
    "80, Bangalore    Karnataka"], columns=['row'])
    
    print(df)
    # expand=True表示以分割符把字符串分成两列
    df_out = df.row.str.split(',|\t', expand=True)
    
    # 获取新的列
    new_header = df_out.iloc[0]
    # 重新赋值
    df_out = df_out[1:]
    df_out.columns = new_header
    print(df_out)
    #>	                          row
    	0          STD, City    State
    	1  33, Kolkata    West Bengal
    	2   44, Chennai    Tamil Nadu
    	3  40, Hyderabad    Telengana
    	4  80, Bangalore    Karnataka
    
    #>	0 STD     City       State
    	1  33   Kolkata    West Bengal
    	2  44    Chennai    Tamil Nadu
    	3  40   Hyderabad    Telengana
    	4  80   Bangalore    Karnataka
    

      

    如何将文本拆分为两个单独的列
  6. 如何构建多级索引的dataframe
    我们利用元组(Tuple)构建多级索引,然后定义dataframe.
    # 如何构建多级索引的dataframe
    # 先通过元组方式构建多级索引
    import numpy as np
    outside = ['A','A','A','B','B','B']
    inside =[1,2,3,1,2,3]
    my_index = list(zip(outside,inside))
    # my_index
    # 转化为pd格式的索引
    my_index = pd.MultiIndex.from_tuples(my_index)
    # my_index
    # 构建多级索引dataframe
    df = pd.DataFrame(np.random.randn(6,2),index =my_index,columns=['fea1','fea2'])
    df
    多索引dataframe结果:
    
    获取多索引dataframe的数据:
    df.loc['A'].iloc[1]
    
    #>    fea1   -0.794461
        fea2    0.882104
        Name: 2, dtype: float64
    
    df.loc['A'].iloc[1]['fea1']
    
    #>    -0.7944609970323794

     

posted @ 2021-12-14 11:42  青竹之下  阅读(116)  评论(0)    收藏  举报