数据分析(Pandas模块:政治献金分析)

  导包:

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

  将月份和参选人以及所在政党进行定义:

# Three-letter upper-case month abbreviations mapped to month numbers 1-12.
months = {
    abbr: number
    for number, abbr in enumerate(
        ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
         'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'],
        start=1,
    )
}

# Candidate names singled out as being of particular interest.
of_interest = [
    'Obama, Barack',
    'Romney, Mitt',
    'Santorum, Rick',
    'Paul, Ron',
    'Gingrich, Newt',
]

# Candidate name -> party affiliation.
parties = dict([
    ('Bachmann, Michelle', 'Republican'),
    ('Romney, Mitt', 'Republican'),
    ('Obama, Barack', 'Democrat'),
    ("Roemer, Charles E. 'Buddy' III", 'Reform'),
    ('Pawlenty, Timothy', 'Republican'),
    ('Johnson, Gary Earl', 'Libertarian'),
    ('Paul, Ron', 'Republican'),
    ('Santorum, Rick', 'Republican'),
    ('Cain, Herman', 'Republican'),
    ('Gingrich, Newt', 'Republican'),
    ('McCotter, Thaddeus G', 'Republican'),
    ('Huntsman, Jon', 'Republican'),
    ('Perry, Rick', 'Republican'),
])

  读取文件:

# Load the election data file into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='./data/usa_election.txt')
df.head(n=5)
#结果:>>>
cmte_id    cand_id      cand_nm     contbr_nm    contbr_city    contbr_st         contbr_zip    contbr_employer    contbr_occupation    contb_receipt_amt       contb_receipt_dt    receipt_desc    memo_cd    memo_text     form_tp    file_num
0        C00410118    P20002978    Bachmann,   Michelle      HARVEY, WILLIAM     MOBILE      AL            3.6601e+08        RETIRED RETIRED 250.0    20-JUN-11        NaN          NaN      NaN        SA17A      736166
1        C00410118    P20002978    Bachmann,   Michelle      HARVEY, WILLIAM     MOBILE      AL            3.6601e+08         RETIRED    50.0          23-JUN-11        NaN          NaN      NaN        SA17A      736166
2        C00410118    P20002978    Bachmann,   Michelle      SMITH, LANIER      LANETT      AL            3.68633e+08       INFORMATION REQUESTED    05-JUL-11        NaN          NaN      NaN        SA17A      749073
3        C00410118    P20002978    Bachmann,   Michelle      BLEVINS, DARONDA    PIGGOTT      AR            7.24548e+08        NONE    RETIRED 250.0    01-AUG-11        NaN          NaN      NaN        SA17A      749073
4        C00410118    P20002978    Bachmann,   Michelle      WARDENBURG, HAROLD  HOT SPRINGS  NATION    AR      7.19016e+08       NONE    RETIRED  300.0   20-JUN-11        NaN          NaN      NaN        SA17A      736166

  需求分析与实现:

# Add a 'party' column by looking up each candidate name in the parties mapping
candidate_names = df['cand_nm']
df['party'] = candidate_names.map(parties)
df.head(n=1)
#结果:>>>
cmte_id    cand_id    cand_nm    contbr_nm    contbr_city    contbr_st    contbr_zip    contbr_employer    contbr_occupation    contb_receipt_amt    contb_receipt_dt    receipt_desc    memo_cd    memo_text    form_tp    file_num    party
0    C00410118    P20002978    Bachmann, Michelle    HARVEY, WILLIAM    MOBILE    AL    3.6601e+08    RETIRED    RETIRED    250.0    20-JUN-11    NaN    NaN    NaN    SA17A    736166    Republican

# Distinct values that occur in the 'party' column
df.loc[:, 'party'].unique()
#结果:>>>
array(['Republican', 'Democrat', 'Reform', 'Libertarian'], dtype=object)

# Number of rows per party, most frequent first
df['party'].value_counts(sort=True, ascending=False)
#结果:>>>
Democrat       292400
Republican     237575
Reform           5364
Libertarian       702
Name: party, dtype: int64

# Total contribution amount (contb_receipt_amt) received by each party
amount_by_party = df.groupby('party')['contb_receipt_amt']
amount_by_party.sum()
#结果>>>
party
Democrat       8.105758e+07
Libertarian    4.132769e+05
Reform         3.390338e+05
Republican     1.192255e+08
Name: contb_receipt_amt, dtype: float64

# Total contribution amount (contb_receipt_amt) received by each party on each date
group_keys = ['contb_receipt_dt', 'party']
df.groupby(group_keys)['contb_receipt_amt'].sum()

#定义日期转换函数
def transform_date(d):
    day,month,year = d.split('-')
    month = months[month]
    return '20'+year+'-'+str(month)+'-'+day

#将表中日期格式转换为'yyyy-mm-dd'
df['contb_receipt_dt'] = df['contb_receipt_dt'].map(transform_date)
df.head(1)
#结果:>>>
cmte_id    cand_id    cand_nm    contbr_nm    contbr_city    contbr_st    contbr_zip    contbr_employer    contbr_occupation    contb_receipt_amt    contb_receipt_dt    receipt_desc    memo_cd    memo_text    form_tp    file_num    party
0    C00410118    P20002978    Bachmann, Michelle    HARVEY, WILLIAM    MOBILE    AL    3.6601e+08    RETIRED    RETIRED    250.0    2011-6-20    NaN    NaN    NaN    SA17A    736166    Republican


# Which candidates do contributors with occupation DISABLED VETERAN mainly support?
# 1. Select the rows whose contributor occupation is DISABLED VETERAN
is_disabled_veteran = df['contbr_occupation'] == 'DISABLED VETERAN'
old_bing_df = df.loc[is_disabled_veteran]


# 2. Group those rows by candidate and total the contribution amounts
old_bing_df.groupby('cand_nm')['contb_receipt_amt'].sum()
#结果>>>
cand_nm
Cain, Herman       300.00
Obama, Barack     4205.00
Paul, Ron         2425.49
Santorum, Rick     250.00
Name: contb_receipt_amt, dtype: float64

# Occupation and amount of the largest single contribution: compute the
# maximum amount, then use query() to fetch the matching row(s).
max_amt = df['contb_receipt_amt'].max()
max_amt
# Result >>> 1944042.43

# Reference the computed maximum with @ instead of a hard-coded float literal,
# so the lookup stays correct if the underlying data changes.
df.query('contb_receipt_amt == @max_amt')
#结果>>>
cmte_id    cand_id    cand_nm    contbr_nm    contbr_city    contbr_st    contbr_zip    contbr_employer    contbr_occupation    contb_receipt_amt    contb_receipt_dt    receipt_desc    memo_cd    memo_text    form_tp    file_num    party
176127    C00431445    P80003338    Obama, Barack    OBAMA VICTORY FUND 2012 - UNITEMIZED    CHICAGO    IL    60680    NaN    NaN    1944042.43    2011-12-31    NaN    X    *    SA18    763233    Democrat

 

posted @ 2019-08-13 22:32  Amorphous  阅读(250)  评论(0)  编辑  收藏  举报