爬虫 美国政治献金案例分析

美国2012年总统候选人政治献金数据分析

 

导入包

In [4]:
 
 
 
 
 
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
 
 
 

方便大家操作,将月份和参选人以及所在政党进行定义

In [1]:
 
 
 
 
 
months = {'JAN' : 1, 'FEB' : 2, 'MAR' : 3, 'APR' : 4, 'MAY' : 5, 'JUN' : 6,
          'JUL' : 7, 'AUG' : 8, 'SEP' : 9, 'OCT': 10, 'NOV': 11, 'DEC' : 12}
of_interest = ['Obama, Barack', 'Romney, Mitt', 'Santorum, Rick', 
               'Paul, Ron', 'Gingrich, Newt']
parties = {
  'Bachmann, Michelle': 'Republican',
  'Romney, Mitt': 'Republican',
  'Obama, Barack': 'Democrat',
  "Roemer, Charles E. 'Buddy' III": 'Reform',
  'Pawlenty, Timothy': 'Republican',
  'Johnson, Gary Earl': 'Libertarian',
  'Paul, Ron': 'Republican',
  'Santorum, Rick': 'Republican',
  'Cain, Herman': 'Republican',
  'Gingrich, Newt': 'Republican',
  'McCotter, Thaddeus G': 'Republican',
  'Huntsman, Jon': 'Republican',
  'Perry, Rick': 'Republican'           
 }
 
 
 

读取文件

In [5]:
 
 
 
 
 
# 读文件
df = pd.read_csv('./data/usa_election.txt')
df.head(2)
 
 
 
D:\Anaconda\lib\site-packages\IPython\core\interactiveshell.py:2728: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[5]:
 cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_num
0 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 3.6601e+08 RETIRED RETIRED 250.0 20-JUN-11 NaN NaN NaN SA17A 736166
1 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 3.6601e+08 RETIRED RETIRED 50.0 23-JUN-11 NaN NaN NaN SA17A 736166
In [7]:
 
 
 
 
 
#新建一列各个候选人所在党派party
# 参看上面的parties 字典
df['party']=df['cand_nm'].map(parties)
df.head(2)
 
 
Out[7]:
 cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
0 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 3.6601e+08 RETIRED RETIRED 250.0 20-JUN-11 NaN NaN NaN SA17A 736166 Republican
1 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 3.6601e+08 RETIRED RETIRED 50.0 23-JUN-11 NaN NaN NaN SA17A 736166 Republican
In [8]:
 
 
 
 
 
#party这一列中有哪些元素
df['party'].unique()
 
 
Out[8]:
array(['Republican', 'Democrat', 'Reform', 'Libertarian'], dtype=object)
In [9]:
 
 
 
 
 
#统计party列中各个元素出现次数
df['party'].value_counts()
 
 
Out[9]:
Democrat       292400
Republican     237575
Reform           5364
Libertarian       702
Name: party, dtype: int64
In [10]:
 
 
 
 
 
#查看各个党派收到的政治献金总数contb_receipt_amt
df.groupby(by='party')['contb_receipt_amt'].sum()
 
 
Out[10]:
party
Democrat       8.105758e+07
Libertarian    4.132769e+05
Reform         3.390338e+05
Republican     1.192255e+08
Name: contb_receipt_amt, dtype: float64
In [11]:
 
 
 
 
 
df.head(2)
 
 
Out[11]:
 cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
0 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 3.6601e+08 RETIRED RETIRED 250.0 20-JUN-11 NaN NaN NaN SA17A 736166 Republican
1 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 3.6601e+08 RETIRED RETIRED 50.0 23-JUN-11 NaN NaN NaN SA17A 736166 Republican
In [14]:
 
 
 
 
 
#查看具体每天各个党派收到的政治献金总数contb_receipt_amt
df.groupby(by=['contb_receipt_dt','party'])['contb_receipt_amt'].sum()
 
. . .
In [15]:
 
 
 
 
 
def transform_date(d):
    day,month,year=d.split('-')
    month=months[month]
    return '20'+year+'-'+str(month)+'-'+day

 
 
In [16]:
 
 
 
 
 
#将表中日期格式转换为'yyyy-mm-dd'
df['contb_receipt_dt'] = df['contb_receipt_dt'].map(transform_date)
 
 
In [18]:
 
 
 
 
 
df.head(1)
 
 
Out[18]:
 cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
0 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 3.6601e+08 RETIRED RETIRED 250.0 2011-6-20 NaN NaN NaN SA17A 736166 Republican
In [19]:
 
 
 
 
 
#查看老兵(捐献者职业)DISABLED VETERAN主要支持谁
#1.找出老兵对应的行数据
df['contbr_occupation'] == 'DISABLED VETERAN'
old_bing_df = df.loc[df['contbr_occupation'] == 'DISABLED VETERAN']
#对候选人分组对金额做聚合
old_bing_df.groupby(by='cand_nm')['contb_receipt_amt'].sum()
 
 
Out[19]:
cand_nm
Cain, Herman       300.00
Obama, Barack     4205.00
Paul, Ron         2425.49
Santorum, Rick     250.00
Name: contb_receipt_amt, dtype: float64
In [ ]:
 
 
 
 
 
#捐赠金额最大的人的职业以及捐献额  .通过query("查询条件来查找捐献人职业")
 
 
In [22]:
 
 
 
 
 
df['contb_receipt_amt'].max()
 
 
Out[22]:
1944042.43
In [23]:
 
 
 
 
 
df.query('contb_receipt_amt == 1944042.43')
 
 
Out[23]:
 cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
176127 C00431445 P80003338 Obama, Barack OBAMA VICTORY FUND 2012 - UNITEMIZED CHICAGO IL 60680 NaN NaN 1944042.43 2011-12-31 NaN X * SA18 763233 Democrat
posted @ 2019-08-13 22:16  我的IT007  阅读(185)  评论(0)    收藏  举报