import pandas as pd
import numpy as np
# Ways to build a Series.
# An explicit index need not be ordered or contiguous: pd.Series(data, index=index)
values = [1, 2, 3, 4]
labels = [3, 4, 5, 6]
x = pd.Series(values, index=labels)
print(x)

# A scalar is repeated for every index label, much like broadcasting.
x = pd.Series("Hanks", index=[1, 2, 4, 5])
print(x)

# A dict supplies label -> value pairs; when `index` is passed as well, only
# the key-value pairs named in `index` are kept, in the order given there.
letters = {3: 'c', 2: 'b', 1: 'a'}
x = pd.Series(letters, index=[2, 3])
print(x)
# Ways to build a DataFrame.
population = pd.Series({'henan': 1000, 'shandong': 200, 'hubei': 400})
area = {'henan': 98, 'shandong': 900, 'hubei': 4000}
# Each column here is given as a Series (population) or a plain dict (area).
province = pd.DataFrame({'population': population, 'area': area})
print(province)

# Unlike indexing a 2-D array, indexing by a column label returns that column.
area_column = province['area']
print(area_column)

# Build a DataFrame from a 2-D array with explicit row and column labels.
abc = pd.DataFrame(
    np.random.rand(3, 2),
    index=['a', 'b', 'c'],
    columns=['foo', 'bar'],
)
print(abc)
# Index: an immutable array.
x = pd.Index([3, 2, 5, 9])
try:
    # Item assignment is rejected with
    # "TypeError: Index does not support mutable operations".
    # It is caught here so the rest of the script keeps running.
    x[0] = 5
except TypeError as err:
    print(err)
print(x)

y = pd.Index([4, 6, 9, 23, 3])
# Use the named set methods: the &, | and ^ operators no longer perform
# set operations on Index objects in pandas 2.x.
common = x.intersection(y)               # labels present in both
combined = x.union(y)                    # labels present in either
exclusive = x.symmetric_difference(y)    # labels present in exactly one
print(common)
print(combined)
print(exclusive)
# Indexers: loc, iloc and ix.
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])

# A single integer key is looked up as a label (explicit index).
print(data[3])

window = slice(1, 3)
# A plain slice is positional (implicit index).
print(data[window])
# .loc always uses labels (explicit index); the end label is included.
print(data.loc[window])
# .iloc always uses positions (implicit index); the end position is excluded.
print(data.iloc[window])
# Selecting data from a DataFrame.
area = pd.Series({'California': 423967, 'Texas': 695662, 'New York': 141297,
                  'Florida': 170312, 'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area': area, 'pop': pop})

# Attribute access is not always usable: it fails when a column name collides
# with a DataFrame method or is not a string. `data.pop` is the DataFrame.pop
# method, so the 'pop' column has to be read with brackets.
print(data.area, data['pop'])
# Compare contents rather than identity: whether two lookups return the very
# same object depends on pandas' internal caching.
print(data.area.equals(data['area']))

data['density'] = data['pop'] / data['area']
print(data.values)
print(data.T)

# iloc: positional selection (first three rows, first two columns).
print(data.iloc[:3, :2])
# loc: label-based selection (both end labels are included).
print(data.loc[:'Illinois', :'pop'])
# ix (mixed positional/label indexing) has been removed from pandas;
# chain iloc for the positional rows and loc for the labelled columns instead.
mixed = data.iloc[:3].loc[:, :'pop']
print(mixed)
# Seeded generator so the random array below is reproducible between runs.
rng = np.random.RandomState(43)
print(rng)

# pandas arithmetic: unary operations keep the index and column labels;
# binary operations align the operands on their index before computing.
area = pd.Series({'Alaska': 1723337, 'Texas': 695662, 'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193, 'New York': 19651127}, name='population')
# Index alignment: labels present in only one operand yield NaN.
x = area / population
print(x)

# Draw from the seeded generator rather than the global numpy random state.
A = rng.randint(10, size=(3, 4))
print(A)
# Subtracting the first row broadcasts it across every row.
print(A - A[0])

# Detecting and dropping missing values.
print(x.isnull())
print(x[x.notnull()])
print(x.dropna())
# dropna returned a new Series; x itself still contains the NaN entries.
print(x)
# Handling missing values in a DataFrame.
df = pd.DataFrame([[1, np.nan, 2],
                   [2, 3, 5],
                   [np.nan, 4, 6]])
# By default dropna removes every row that contains a missing value.
print(df.dropna())
# axis='columns' removes every column that contains a missing value instead.
print(df.dropna(axis='columns'))

# Add a column made up entirely of missing values.
df[3] = np.nan
print(df)
# how='all' only drops columns whose values are all missing.
print(df.dropna(axis='columns', how='all'))
# thresh=3 keeps only the rows with at least three non-missing values.
print(df.dropna(axis='index', thresh=3))

# Fill every missing value with one constant.
print(df.fillna(9999))
# Forward fill along each row (take the previous column's value).
# DataFrame.ffill / bfill replace the deprecated fillna(method=...) form.
forward_filled = df.ffill(axis=1)
print(forward_filled)
# Backward fill along each row (take the next column's value).
backward_filled = df.bfill(axis=1)
print(backward_filled)