# -*- coding: utf-8 -*-"""Created on Wed Feb 15 20:36:15 2017@author: zzpp220"""import json,sysfrom pandas import DataFrame,Seriesimport pandas as pdimport numpy as np#=======================dataset1-web用户数据=======================================================# path='/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch02/usagov_bitly_data2012-03-16-1331923249.txt'# records=[json.loads(lines) for lines in open(path)]##json可将java脚本的格式化成字典对象# # frame=DataFrame(records)# ####==============================================================================# tz=frame['tz']##只取出该列的所有值,或者tz=frame.tz# # ##frame['tz']返回的col_tz(series)有 一个value_counts的方法, return series 统计出tz中想同内容有多少个# # col_tz=frame['tz'].value_counts() ##返回只有一个元素的元祖对象# # subna_tz=tz.fillna('Missing')## repalce 'nan' with 'Missing'# # subna_tz[subna_tz=='']='Unknown'##!!!查找空白并且替换空白的方法# # col_tz_1=subna_tz.value_counts()# # col_tz_1[:10].plot(kind='barh',rot=0)# #==============================================================================# a=frame.a.dropna()##去掉‘a’字段中的nan值# tmp2=[x.split()[0] for x in a]##取'a'字段中每条记录的第一个空格前的内容# result=Series(tmp2)# #print result[:10]# count_result=result.value_counts()# ##将a字段的非nan的记录按照Windows与否的进行分解# notnull_a=frame.a.notnull()# frame_notnulla=frame[notnull_a]# condition=frame_notnulla['a'].str.contains('Windows')# operation_system=np.where(condition,'Windows','Not Windows')# ##根据'tz'字段和新得到的操作系统对数据表进行分组:# #==============================================================================# # tz[tz=='']='Unknown'# # gru_tz_os=frame_notnulla.groupby([tz,operation_system])# # agg_counts=gru_tz_os.size().unstack().fillna(0)# #==============================================================================# gru_tz_os=frame_notnulla.groupby(['tz',operation_system])# agg_counts=gru_tz_os.size().unstack().fillna(0)# indexer=agg_counts.sum(1)##对每行的条目进行求和# sort_index=indexer.argsort()##给每个条目按照总和的高低进行排序,给出索引号# # sub_sort_index=agg_counts.take(sort_index)[-10:]###按照上面索引的顺序截取分组表的后10行记录# sub_sort_index.plot(kind='barh',stacked=True)##堆积# ##归一化,改进优化比例显示# normed=sub_sort_index.div(sub_sort_index.sum(1),axis=0)# normed.plot(kind='barh',stacked=True)#==============================================================================#==================dataset2-电影评分============================================================# # dir_path='/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch02/movielens/'# # #==================将每个表分别读到一个DataFrame的对象中============================================================# unames=['user_id','gender','age','occupation','zip_code']##表中的各个字段# users=pd.read_table(dir_path+'users.dat',sep='::',header=None,names=unames)# # mnames=['movie_id','title','genres']# movies=pd.read_table(dir_path+'movies.dat',sep='::',header=None,names=mnames)# # rnames=['user_id','movie_id','rating','timestamp']# ratings=pd.read_table(dir_path+'ratings.dat',sep='::',header=None,names=rnames)# #==============================================================================# # ##==================依次连接多个DataFrame的对象=======================# whole=pd.merge(pd.merge(users,ratings),movies)##表的顺序就是总表中字段的排列顺序,相同的字段名会自动判断# ##whole.ix[0] 这是每天记录的查看方法,不能直接whole[0]# ##按性别计算每部电影的平均分,所以要选择的字段为'title','gender'其中名字作为行名,性别做为字段名# avg_ratings=whole.pivot_table('rating',index='title',columns='gender',aggfunc='mean')# # # ##先对title进行分组,然后利用size()得到一个包含各电影分组大小的Series对象--rating_by_title# ##count_movie=whole.title.value_counts()--作用同下# 
# ================== dataset2: movie ratings (MovieLens) ==================
#
# dir_path = '/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch02/movielens/'
#
# ========== read each table into its own DataFrame ==========
# ('::' is a multi-character separator, so the python parser engine is required)
# unames = ['user_id', 'gender', 'age', 'occupation', 'zip_code']  # column names for the table
# users = pd.read_table(dir_path + 'users.dat', sep='::', header=None, names=unames, engine='python')
#
# mnames = ['movie_id', 'title', 'genres']
# movies = pd.read_table(dir_path + 'movies.dat', sep='::', header=None, names=mnames, engine='python')
#
# rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
# ratings = pd.read_table(dir_path + 'ratings.dat', sep='::', header=None, names=rnames, engine='python')
# ==============================================================================
#
# ========== merge the DataFrames one after another ==========
# whole = pd.merge(pd.merge(users, ratings), movies)  # merge order sets the column order; shared column names are picked as join keys automatically
# Inspect a single record with whole.iloc[0]; plain whole[0] does not work.
# Mean rating of each movie by gender: 'title' becomes the row index, 'gender' the columns:
# avg_ratings = whole.pivot_table('rating', index='title', columns='gender', aggfunc='mean')
#
# Group by title, then use size() to get a Series with the number of ratings per movie.
# count_movie = whole.title.value_counts() has the same effect as the next line.
#
# rating_by_title = whole.groupby('title').size()  # whole.groupby('title') alone only builds a GroupBy object and shows nothing
# needed_movie = rating_by_title.index[rating_by_title >= 250]  # titles with at least 250 ratings
# avg_rating_needed = avg_ratings.loc[needed_movie]
#
# desc_female_rating = avg_rating_needed.sort_values(by='F', ascending=False)  # sort descending by the female mean rating
#
# Movies with the biggest disagreement between men and women:
# avg_rating_needed['diff'] = avg_rating_needed['M'] - avg_rating_needed['F']
# sort_by_diff = avg_rating_needed.sort_values(by='diff')
#
# ======== disagreement regardless of gender: the variance or standard deviation of the ratings, grouped by title ========
# group by title and take the standard deviation of 'rating' within each group
# rating_std_title = whole.groupby('title')['rating'].std()
# keep only the titles with at least 250 ratings
# rating_std_title = rating_std_title.loc[needed_movie]
# sorted descending, the first entry has the highest spread, i.e. the most divisive movie
# rating_std_title.sort_values(ascending=False)[:10]
# ==============================================================================

# ================ dataset3: baby names ================
path_name = '/media/zzpp220/Data/Linux_Documents/DOWNLOAD/python-DataAnalysis/pydata-book-master/ch02/names/'
columns = ['name', 'gender', 'births']
# names1880 = pd.read_csv(path_name + 'yob1880.txt', names=['name', 'gender', 'births'])  # births = number of babies given that name that year
# birth_sum = names1880.groupby('gender').births.sum()  # group by 'gender', sum births within each group

# Gather all the yearly tables into one table, adding a 'year' column:
years = range(1880, 2011)
piece = []
for year in years:
    sep_path = path_name + 'yob%d.txt' % year
    sep_frame = pd.read_csv(sep_path, names=columns)
    sep_frame['year'] = year  # tag every row with the year it came from
    piece.append(sep_frame)
names = pd.concat(piece, ignore_index=True)  # concat stacks the pieces row-wise by default

# Sum births (the values) for each year (index) by gender (columns):
total_births = names.pivot_table('births', index='year', columns='gender', aggfunc='sum')
total_births.plot(title='Total births by gender and year')
# groupby_total_births = names.groupby(['year', 'gender']).births.sum()
# groupby_total_births.plot(title='Total births by gender and year')
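# ------------------------------------------------------------------
# A minimal, self-contained sketch of the same concat + pivot_table
# pattern for when the names/ files are not on disk. The names and
# birth counts below are made up purely for illustration.
toy_pieces = []
for yr in (1880, 1881):
    tf = DataFrame({'name': ['Mary', 'John'],
                    'gender': ['F', 'M'],
                    'births': [700 + yr % 100, 900 + yr % 100]})  # made-up counts
    tf['year'] = yr  # same tagging step as the loop above
    toy_pieces.append(tf)
toy_names = pd.concat(toy_pieces, ignore_index=True)
# rows: year, columns: gender, values: summed births
toy_total = toy_names.pivot_table('births', index='year',
                                  columns='gender', aggfunc='sum')
# toy_total.plot(title='Total births by gender and year')
# ------------------------------------------------------------------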