数据结构

# Series: a one-dimensional labelled array.
from pandas import Series

# Build the same values three times: with the default 0..n-1 index, with an
# explicit integer index, and with an explicit string index.
for labels in (None, [1, 2, 3], ['a', 'b', 'c']):
    a = Series([1, 2, 3], index=labels)
    print(a)

from pandas import Series

# Element access on a Series that uses the default 0..n-1 index.
a = Series([12, 3, 5])
print(a)
print(a[0])
print(a[1])
# Label 5 is not in the index, so this lookup raises. It is caught here so
# the demonstration does not abort everything that follows in the script.
try:
    print(a[5])
except (KeyError, IndexError) as err:
    print('lookup failed:', repr(err))

from pandas import Series
from pandas import concat

# NOTE: bare expressions below only display a value in an interactive
# session; in a plain script they are evaluated and discarded.
x = Series(['a', True, 1], index=['first', 'second', 'third'])
x.iloc[1]      # access by position
x['second']    # access by index label
# Position 3 is past the end of a three-element Series; catch the error so
# the rest of the walkthrough still runs.
try:
    x.iloc[3]
except IndexError as err:
    print('out of bounds:', err)
# A bare scalar cannot be appended; wrap it in a Series and concatenate.
# concat returns a new Series, so the result has to be bound to a name.
a = Series(['2'])
x = concat([x, a])
2 in x.values  # membership test on the values; numbers/booleans need no quotes
x.iloc[1:3]          # slice by position
x.iloc[[0, 2, 1]]    # pick positions in a chosen order (handy for sampling)
x.drop('first')      # drop by index label, returns a new Series
x.drop(x.index[3])   # drop by position via the label stored at that position
x[x.values != 2]     # keep only the entries whose value is not 2
x.index[x.values == '2']      # look up the index label(s) holding a given value
x.index = list(range(len(x)))  # replace the index; length must match the data
s = Series({'a': 1, 'b': 2, 'c': 3})  # build a Series from a dict (keys become the index)
# reindex: conform the Series to a new list of labels
A = Series([4, 5, 67, 8, 9], index=["a", "b", "c", "d", "e"])
print(A)
B = A.reindex(["a", "b", "c", "d", "e", "f"])  # the new label "f" gets NaN
B
C = A.reindex(["a", "b", "c", "d", "e", "f"], fill_value="*")  # the new label "f" gets "*"
C

# A DataFrame stores a table of rows and columns; it is a container of Series.
# The class name is capitalised as "DataFrame" and must be imported from
# pandas before use.
from pandas import DataFrame, Series

df = DataFrame(
    {
        'age': Series([21, 22, 32]),
        'name': Series(['lili', 'july', 'llj']),
    },
    index=[0, 1, 2],
)
df
A = df.age  # a single column, returned as a Series
print(A)
B = df[:1]  # row slice: just the first row
B
C = df.iloc[:2, :2]  # block made of the first two rows and first two columns
C
D = df.at[0, 'name']  # single cell at row label 0, column 'name'
D

df1 = DataFrame({'age': [21, 11, 23], 'name': ['lil', 'gbk', 'pig']})
df1['newColumn'] = [2, 4, 6]  # assigning to a new key adds a column
df1
# Stack two DataFrames that share the same columns.
# DataFrame.append no longer exists in pandas 2.x, so pandas.concat is used;
# the results are bound to names so they can actually be inspected.
from pandas import DataFrame
from pandas import concat

df = DataFrame([[1, 2], [3, 4]], columns=list('AB'))
df1 = DataFrame([[5, 6], [7, 8]], columns=list('AB'))

# Keep each frame's original index (labels 0, 1, 0, 1).
stacked = concat([df, df1])
stacked

# Discard the old labels and renumber the rows 0..n-1.
renumbered = concat([df, df1], ignore_index=True)
renumbered

数据导入
（1）导入TXT文件
（2）导入Excel文件
（3）导入CSV文件
（4）导入MySQL数据库

# Comma-separated TXT file. Column names are supplied explicitly, so the
# first line of the file is read as data rather than as a header.
from pandas import read_table
df = read_table('F:/shuju/rz1.txt', names=['YHM', 'DLSJ', 'TCSJ', 'YWXT', 'IP', 'REMARK'], sep=",")
df

# CSV file, same column layout.
from pandas import read_csv
df = read_csv('F:/shuju/rz20.csv', names=['YHM', 'DLSJ', 'TCSJ', 'YWXT', 'IP', 'REMARK'], sep=",")
df

# Excel file. The keyword is sheet_name; the old spelling "sheetname" is
# rejected by current pandas releases. header=1 takes the column names from
# the second row of the sheet.
from pandas import read_excel
df = read_excel('F:/shuju/rz1.xlsx', sheet_name='Sheet2', header=1)
df


# Load a MySQL table into a DataFrame.
import pandas
import MySQLdb

connection = MySQLdb.connect(
        host='127.0.0.1',
        user='root',
        passwd='',
        db='test',
        port=5029,
        charset='utf8')
# Close the connection even when the query fails, so it is never leaked.
try:
    data = pandas.read_sql("select*from t_user;", con=connection)
finally:
    connection.close()

数据导出

# Write a small DataFrame to an Excel workbook.
from pandas import DataFrame
from pandas import Series
df = DataFrame({'age': Series([11, 22, 33]), 'name': Series(['jhon', 'jerry', 'ben'])})
# The closing quote of the path was missing, which made the whole file a
# syntax error.
df.to_excel('D:\\01.xlsx')


# Field splitting: break the IP column into two columns at the first dot.
from pandas import DataFrame
from pandas import read_excel

df = read_excel('F:/shuju/rz2.xlsx')
df

# Strip leading/trailing whitespace first. strip must be *called*, and its
# result must feed the split, otherwise the stripping has no effect.
# n=1 splits at the first '.' only; expand=True returns the pieces as
# separate DataFrame columns. The options are passed by keyword because
# newer pandas no longer accepts them positionally.
newdf = df['IP'].str.strip().str.split('.', n=1, expand=True)
newdf
newdf.columns = ['IP1', 'IP2']
newdf

# Record extraction: select rows that satisfy a condition.
# Each filter returns a new DataFrame and leaves df untouched, so the
# results are bound to names.
import pandas
from pandas import read_excel

df = read_excel('F:/shuju/rz2.xlsx')

exact = df[df.TCSJ == 13322252452]  # rows whose TCSJ equals one value
exact
above = df[df.TCSJ > 13500000000]  # rows whose TCSJ exceeds a threshold
above
# Rows whose TCSJ lies in a closed range. The column name and the method
# name were both misspelled before (TSCJ / betwwen).
in_range = df[df.TCSJ.between(13400000000, 13999999999)]
in_range

# Random sampling of rows, plus a tour of positional indexing with iloc.
import numpy
import pandas
from pandas import read_excel

df = read_excel('F:/shuju/rz2.xlsx')
df
# Draw 3 row numbers (with replacement) from the rows that actually exist.
# The upper bound used to be a hard-coded 10, which fails on shorter tables.
# NOTE: randint raises ValueError if the table is empty.
r = numpy.random.randint(0, len(df), 3)
r  # array of sampled row numbers
df.loc[r, :]  # rows with those index labels (labels equal positions for the default index)
df.iloc[1, 1]  # single value: second row, second column
df.iloc[[0, 2], :]  # first and third rows
df.iloc[0:2, :]  # first two rows (the end position 2 is excluded)
df.iloc[:, 1]  # second column, returned as a Series
df.iloc[1, :]  # second row, returned as a Series
# iloc indexes by integer position, not by row label.
# loc indexes by row label, not by position.
# The old .ix indexer (position or label) no longer exists in pandas;
# use iloc or loc explicitly instead.
import pandas as pd

index_loc = ['a', 'b']
index_iloc = [1, 2]
data = [[1, 2, 3, 4], [5, 6, 7, 8]]
columns = ['one', 'two', 'three', 'four']
df1 = pd.DataFrame(data=data, index=index_loc, columns=columns)
df2 = pd.DataFrame(data=data, index=index_iloc, columns=columns)
df1.loc['a']
# iloc rejects string labels; catch the error so the script keeps running.
try:
    df1.iloc['a']
except TypeError as err:
    print('iloc rejects labels:', err)
df2.iloc[0]  # first row by position
df2.loc[1]  # row whose label is 1
df1.iloc[0]  # positional replacement for the former df1.ix[0]
df1.loc['a']  # label replacement for the former df1.ix['a']

# Dict data: turn a dict into a two-column key/value DataFrame.
import pandas
from pandas import DataFrame

d1 = {'a': '[1,2,3]', 'b': '[4,5,6]'}
# orient='index' makes each dict key a row label.
a1 = DataFrame.from_dict(d1, orient='index').rename_axis('key')  # index is now named 'key'
# reset_index moves the labels into a regular 'key' column and installs a
# fresh 0..n-1 index; the remaining data column (labelled 0) becomes 'value'.
b1 = a1.reset_index().rename(columns={0: 'value'})
b1

# Use each dict entry as a column; plain-list values must all have the same
# length.
import pandas
from pandas import DataFrame

# This assignment used to carry a stray leading space, which is an
# IndentationError at module level.
d2 = {'a': [1, 2, 3], 'b': [4, 5, 6]}
a2 = DataFrame(d2)
a2

# With Series values the lengths may differ: rows are aligned on the index
# and the missing cells become NaN.
d = {'one': pandas.Series([1, 2, 3]), 'second': pandas.Series([4, 5, 6, 7])}
df = pandas.DataFrame(d)
df

# Sorting rows by the values of one column.
from pandas import DataFrame

df1 = {'ohio': [0, 6, 3], 'texi': [7, 4, 1], 'california': [2, 8, 5]}
df = DataFrame(df1, index=['a', 'b', 'c'])
df
# sort_index(by=...) is gone from pandas; sorting by a column's values is
# done with sort_values. It returns a new, sorted DataFrame.
ranked = df.sort_values(by='ohio')
ranked

# Re-indexing a Series against a new list of labels.
from pandas import Series

s = Series({'d': 1, 'b': 2, 'a': 3, 'c': 4, 'e': 5})
A = ['a', 'b', 'c', 'd', 'e', 'f']
s.reindex(A)  # label 'f' has no data, so it shows up as NaN
s = s.reindex(A, fill_value=0)  # supply a value for the missing label instead
s
# 'pad' is the same as 'ffill' (carry the previous value forward);
# 'bfill' takes the next value instead.
s.reindex(A, method='pad')

# DataFrame.reindex works like the Series version, with an extra optional
# `columns` argument for re-indexing the column labels.
from pandas import DataFrame

df1 = {'ohio': [0, 6, 3], 'texi': [7, 4, 1], 'california': [2, 8, 5]}
df = DataFrame(df1, index=['a', 'b', 'c'])
df
state = ['ohio', 'california', 'utha']
by_state = df.reindex(columns=state)  # the unknown column 'utha' is filled with NaN
by_state
# A fill method needs a monotonic axis, and these column labels are not
# sorted, so passing method='ffill' together with columns= raises
# ValueError. Select the columns first, then forward-fill along the rows.
filled = df.reindex(columns=state).reindex(index=['a', 'b', 'c'], method='ffill')
filled

# Data merging (1) record concatenation: stack two DataFrames that have the
# same structure.
import pandas
from pandas import DataFrame
from pandas import read_excel

# Forward slashes throughout: the previous 'F:\shuju//...' literals
# contained the invalid escape sequence '\s'.
df1 = read_excel('F:/shuju/rz2.xlsx')
df2 = read_excel('F:/shuju/rz3.xlsx')
df = pandas.concat([df1, df2])  # rows of df2 are placed after the rows of df1
df
# (2) Field concatenation: join several columns into one string column.
import pandas
from pandas import DataFrame
from pandas import read_csv

df = read_csv(
    'F:/shuju//rz4.csv',
    sep=' ',
    names=['brand', 'area', 'num'],
)
df
# Convert every column to str so that + concatenates instead of adding.
df = df.astype(str)
tel = df.brand + df.area + df.num  # a Series holding the joined text
tel
# (3) Field matching: combine DataFrames of different structure on a shared
# key, i.e. append columns from one onto the other.
import pandas
from pandas import DataFrame
from pandas import read_excel

# The keyword is sheet_name; the old spelling "sheetname" is rejected by
# current pandas releases.
df1 = read_excel('F:/shuju/rz2.xlsx', sheet_name='Sheet3')
df2 = read_excel('F:/shuju/rz2.xlsx', sheet_name='Sheet4')
pandas.merge(df1, df2, left_on='id', right_on='id')
# If you hit "SyntaxError: invalid character in identifier", look for
# full-width (Chinese) punctuation, stray spaces or tabs in the code.

# Data calculation (1) simple arithmetic: compute a value from existing
# columns and store it as a new column.
from pandas import read_csv

df = read_csv('F:/shuju//rz2.csv', sep=',')
df
r = df['price'] * df['num']
r
df['r'] = r
df

# (2) Normalisation: rescale price with (value - min) / (max - min).
df = read_csv('F:/shuju//rz2.csv', sep=',')
s = (df['price'] - df['price'].min()) / (df['price'].max() - df['price'].min())
s
df['s'] = s
df

# Binning: assign each price to a labelled interval.
import pandas
from pandas import DataFrame
from pandas import read_csv

df = read_csv('F:/shuju//rz2.csv', sep=',')
df
# Bin edges: one below the minimum, the 500 cut point, one above the maximum.
bins = [min(df['price']) - 1, 500, max(df['price']) + 1]
labels = ['500以下', '500以上']  # custom label for each interval
pandas.cut(df['price'], bins)
pandas.cut(df['price'], bins, right=False)  # intervals are open on the right
p = pandas.cut(df['price'], bins, right=False, labels=labels)
p
df['label'] = pandas.cut(df['price'], bins, right=False, labels=labels)
df

# Date handling (1) conversion: parse date strings into datetime values.
from datetime import datetime

from pandas import read_csv
from pandas import to_datetime

df = read_csv('F:/shuju//rz3.csv', sep=',', encoding='utf8')
df
df_dt = to_datetime(df['date'], format='%Y/%m/%d')
df_dt


# (2) Formatting: render datetime values back to text in a chosen layout.
def _as_text(stamp):
    """Format one datetime value as a year/month/day string."""
    return datetime.strftime(stamp, '%Y/%m/%d')


df = read_csv('F:/shuju//rz3.csv', sep=',', encoding='utf8')
df_dt = to_datetime(df['date'], format='%Y/%m/%d')
df_dt_str = df_dt.apply(_as_text)
df_dt_str
# apply() runs a function over a DataFrame: axis=0 feeds it one column at a
# time, axis=1 feeds it one row at a time.
from pandas import DataFrame

df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df


def f(x):
    """Return the spread (maximum minus minimum) of *x*."""
    return x.max() - x.min()


df.apply(f)  # no axis given: computed per column
df.apply(f, axis=1)  # axis=1: computed per row
# (3) Date extraction: pull individual components out of datetime values
# through the .dt accessor.
from pandas import read_csv, to_datetime

df = read_csv('F:/shuju//rz3.csv', sep=',', encoding='utf8')
df_dt = to_datetime(df['date'], format='%Y/%m/%d')
parts = df_dt.dt
parts.year
parts.day
parts.month
parts.weekday
parts.hour
parts.second

posted on 2018-09-09 15:10 lljaway 阅读(254) 评论(0) 收藏举报

刷新页面返回顶部

数据结构

导航

公告