padas 操作及数据库2
Series的创建
使用Python的列表创建Series:
import numpy as np import pandas as pd # 使用list创建 s1 = pd.Series([1,2,3,4]) # 可以发现索引index默认从0开始进行自动索引 s1 0 1 1 2 2 3 3 4 dtype: int64 # 值属性,可以方便查看Series的值 s1.values array([1, 2, 3, 4], dtype=int64) # 索引index属性,返回的是索引从开始到结束和间隔的值 s1.index RangeIndex(start=0, stop=4, step=1) # 使用numpy的数组进行创建 s2 = pd.Series(np.arange(10)) s2 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 dtype: int32 # 通过字典进行创建 s3 = pd.Series({'1':1, '2':2, '3':3}) s3 1 1 2 2 3 3 dtype: int64 s3.values array([1, 2, 3], dtype=int64) s3.index Index(['1', '2', '3'], dtype='object') # 手动赋值索引 s4 = pd.Series([1,2,3,4], index=['A','B','C','D']) s4 A 1 B 2 C 3 D 4 dtype: int64 s4.values array([1, 2, 3, 4], dtype=int64) s4.index Index(['A', 'B', 'C', 'D'], dtype='object') s4['A'] # 根据索引取值 1 s4[s4>1] # 根据值得范围取值 B 2 C 3 D 4 dtype: int64 s4.to_dict() # 把Series转换为字典输出,也就是说可以通过字典创建Series,也可以通过Series转换为字典 {'A': 1, 'B': 2, 'C': 3, 'D': 4} s5 = pd.Series(s4.to_dict()) # 来回转 s5 A 1 B 2 C 3 D 4 dtype: int64 index_1 = ['A','B','C','D','E'] # 可单独把索引写出,再赋值给Series,同时多增加一个索引 s6 = pd.Series(s5, index=index_1) s6 # 多增加的索引的值为NAN A 1.0 B 2.0 C 3.0 D 4.0 E NaN dtype: float64 pd.isnull(s6) # 根据pd.isnall()判断Series的元素是否有空值,如果有返回Ture,反之False A False B False C False D False E True dtype: bool pd.notnull(s6) # 类似的操作 A True B True C True D True E False dtype: bool s6.name = 'demo' # 给Series赋予名字 s6 A 1.0 B 2.0 C 3.0 D 4.0 E NaN Name: demo, dtype: float64 s6.index.name = 'demo_index' # 给索引起个名字 s6 demo_index A 1.0 B 2.0 C 3.0 D 4.0 E NaN Name: demo, dtype: float64 s6.index Index(['A', 'B', 'C', 'D', 'E'], dtype='object', name='demo_index')
pandas的Dataframe
pandas的Dataframe学习 import numpy as np import pandas as pd from pandas import Series, DataFrame import webbrowser link = 'https://www.tiobe.com//tiobe-index/' webbrowser.open(link) True df = pd.read_clipboard() # 读取粘贴板的内容 df Sep 2018 Sep.1 2017 Change Programming Language Ratings Change.1 0 0 1 1 NaN Java 17.436% +4.75% NaN NaN 1 1 2 2 NaN C 15.447% +8.06% NaN NaN 2 2 3 5 change Python 7.653% +4.67% NaN NaN 3 3 4 3 change C++ 7.394% +1.83% NaN NaN 4 4 5 8 change Visual Basic .NET 5.308% +3.33% 5 5 6 4 change C# 3.295% -1.48% NaN NaN 6 6 7 6 change PHP 2.775% +0.57% NaN NaN 7 7 8 7 change JavaScript 2.131% +0.11% NaN NaN 8 8 9 - change SQL 2.062% +2.06% NaN NaN 9 9 10 18 change Objective-C 1.509% +0.00% NaN NaN type(df) pandas.core.frame.DataFrame df.columns # 读取索引,该操作作用很大,可以读取行的索引,以此可以通过该索引进行访问 Index(['Sep', '2018', 'Sep.1', '2017', 'Change', 'Programming', 'Language', 'Ratings', 'Change.1'], dtype='object') df.Ratings # 通过标签读取该列内容 0 17.436% 1 15.447% 2 7.653% 3 7.394% 4 5.308% 5 3.295% 6 2.775% 7 2.131% 8 2.062% 9 1.509% Name: Ratings, dtype: object # 从原先的dataframe中提取想要的数据,并赋给新的dataframe,该操作很有用 df_new = DataFrame(df, columns=['Sep 2019','Sep 2018', 'Change', 'Programming Language']) # 同时增加新的标签,并自动赋值nan df_new Sep 2019 Sep 2018 Change Programming Language 0 NaN 1 NaN Java 1 NaN 2 NaN C 2 NaN 3 change Python 3 NaN 4 change C++ 4 NaN 5 change Visual Basic .NET 5 NaN 6 change C# 6 NaN 7 change PHP 7 NaN 8 change JavaScript 8 NaN 9 change SQL 9 NaN 10 change Objective-C df_new['Sep 2019'] = range(10) # 给新的列标签赋值 type(df['Sep 2018']) pandas.core.series.Series df_new Sep 2019 Sep 2018 Change Programming Language 0 0 1 NaN Java 1 1 2 NaN C 2 2 3 change Python 3 3 4 change C++ 4 4 5 change Visual Basic .NET 5 5 6 change C# 6 6 7 change PHP 7 7 8 change JavaScript 8 8 9 change SQL 9 9 10 change Objective-C
pandas_DataFrame常用读取各种格式的数据操作
import numpy as np import pandas as pd from pandas import Series, DataFrame import webbrowser link = 'https://pandas.pydata.org/pandas-docs/version/0.20/io.html' webbrowser.open(link) True df1 = pd.read_clipboard() # 读取粘贴板的内容 df1 Format Type Data Description Reader Writer 0 text CSV read_csv to_csv 1 text JSON read_json to_json 2 text HTML read_html to_html 3 text Local clipboard read_clipboard to_clipboard 4 binary MS Excel read_excel to_excel 5 binary HDF5 Format read_hdf to_hdf 6 binary Feather Format read_feather to_feather 7 binary Msgpack read_msgpack to_msgpack 8 binary Stata read_stata to_stata 9 binary SAS read_sas None 10 binary Python Pickle Format read_pickle to_pickle 11 SQL SQL read_sql to_sql 12 SQL Google Big Query read_gbq to_gbq df1.to_clipboard() df1.to_csv('df1.csv',index=False) # 把内容写到名为df1.csv的文件中,把索引序号去除 !type df1.csv # window安装的jupyter使用该操作查看文件内容 Format Type,Data Description,Reader,Writer text,CSV,read_csv,to_csv text,JSON,read_json,to_json text,HTML,read_html,to_html text,Local clipboard,read_clipboard,to_clipboard binary,MS Excel,read_excel,to_excel binary,HDF5 Format,read_hdf,to_hdf binary,Feather Format,read_feather,to_feather binary,Msgpack,read_msgpack,to_msgpack binary,Stata,read_stata,to_stata binary,SAS,read_sas, binary,Python Pickle Format,read_pickle,to_pickle SQL,SQL,read_sql,to_sql SQL,Google Big Query,read_gbq,to_gbq df2 = pd.read_csv('df1.csv') # 在通过pd.read_csv读出来 df2 Format Type Data Description Reader Writer 0 text CSV read_csv to_csv 1 text JSON read_json to_json 2 text HTML read_html to_html 3 text Local clipboard read_clipboard to_clipboard 4 binary MS Excel read_excel to_excel 5 binary HDF5 Format read_hdf to_hdf 6 binary Feather Format read_feather to_feather 7 binary Msgpack read_msgpack to_msgpack 8 binary Stata read_stata to_stata 9 binary SAS read_sas NaN 10 binary Python Pickle Format read_pickle to_pickle 11 SQL SQL read_sql to_sql 12 SQL Google Big Query read_gbq to_gbq df1.to_json() '{"Format Type":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},"Data Description":{"0":"CSV","1":"JSON","2":"HTML","3":"Local clipboard","4":"MS Excel","5":"HDF5 Format","6":"Feather Format","7":"Msgpack","8":"Stata","9":"SAS","10":"Python Pickle Format","11":"SQL","12":"Google Big Query"},"Reader":{"0":"read_csv","1":"read_json","2":"read_html","3":"read_clipboard","4":"read_excel","5":"read_hdf","6":"read_feather","7":"read_msgpack","8":"read_stata","9":"read_sas","10":"read_pickle","11":"read_sql","12":"read_gbq"},"Writer":{"0":"to_csv","1":"to_json","2":"to_html","3":"to_clipboard","4":"to_excel","5":"to_hdf","6":"to_feather","7":"to_msgpack","8":"to_stata","9":null,"10":"to_pickle","11":"to_sql","12":"to_gbq"}}' pd.read_json(df1.to_json()) Format Type Data Description Reader Writer 0 text CSV read_csv to_csv 1 text JSON read_json to_json 10 binary Python Pickle Format read_pickle to_pickle 11 SQL SQL read_sql to_sql 12 SQL Google Big Query read_gbq to_gbq 2 text HTML read_html to_html 3 text Local clipboard read_clipboard to_clipboard 4 binary MS Excel read_excel to_excel 5 binary HDF5 Format read_hdf to_hdf 6 binary Feather Format read_feather to_feather 7 binary Msgpack read_msgpack to_msgpack 8 binary Stata read_stata to_stata 9 binary SAS read_sas None df1.to_html('df1.html') !dir 驱动器 C 中的卷是 OS 卷的序列号是 E28F-E81C C:\Users\Admin\Documents\jupyter notebook\数据挖掘学习 的目录 2018/10/01 16:58 <DIR> . 2018/10/01 16:58 <DIR> .. 2018/10/01 16:34 <DIR> .ipynb_checkpoints 2018/10/01 16:47 508 df1.csv 2018/10/01 16:58 2,061 df1.html 2018/09/30 16:22 8,896 numpy_array.ipynb 2018/09/30 17:03 7,648 numpy_Array的input和output.ipynb 2018/09/30 21:33 154,056 numpy_example.ipynb 2018/10/01 13:11 20,844 numpy_example1.ipynb 2018/09/30 16:58 570 numpy_two_file.npz 2018/09/30 16:52 168 numpy_wb_array.npy 2018/09/30 16:31 13,759 numpy_数组与矩阵的运算.ipynb 2018/10/01 16:01 16,335 pandas_Dataframe.ipynb 2018/10/01 16:57 19,694 pandas_DataFrame_IO.ipynb 2018/10/01 15:42 9,681 pandas_series.ipynb 2018/09/30 16:45 195 x.pkl 2018/10/01 16:34 10,325 深入理解Series和DataFrame.ipynb 14 个文件 264,740 字节 3 个目录 37,539,487,744 可用字节 df1.to_excel('df1.xlsx',index=False) 还有很多其他的格式的读写操作,这里不一一列举了
Series和Dataframe的排序
import numpy as np import pandas as pd from pandas import Series,DataFrame # Series的排序 s1 = Series(np.random.randn(10)) s1 0 -0.424930 1 -0.279580 2 -0.763344 3 -0.018883 4 -0.217693 5 -1.367801 6 -1.402714 7 -1.125427 8 -0.560325 9 0.037897 dtype: float64 s1.values array([-0.42493031, -0.27958011, -0.76334433, -0.01888319, -0.21769267, -1.36780083, -1.40271395, -1.1254272 , -0.56032516, 0.03789725]) s1.index RangeIndex(start=0, stop=10, step=1) s2 = s1.sort_values(ascending=False) # 通过值排序,默认是升序 s2 9 0.037897 3 -0.018883 4 -0.217693 1 -0.279580 0 -0.424930 8 -0.560325 2 -0.763344 7 -1.125427 5 -1.367801 6 -1.402714 dtype: float64 s2.sort_index(ascending=False) # 通过index排序,默认升序 9 0.037897 8 -0.560325 7 -1.125427 6 -1.402714 5 -1.367801 4 -0.217693 3 -0.018883 2 -0.763344 1 -0.279580 0 -0.424930 dtype: float64 # DataFrame的排序 df1 = DataFrame(np.random.randn(40).reshape(8,5),columns=['A','B','C','D','E']) df1 A B C D E 0 -0.971041 -1.006900 1.824719 0.362461 0.010793 1 -1.202866 -0.520920 0.603463 -2.222645 0.085995 2 0.803201 1.664502 0.866629 -3.014892 -0.026222 3 -2.737011 -0.844403 -0.565923 0.956517 2.366085 4 0.183618 -0.731790 1.449652 0.440694 0.749678 5 -0.852958 -2.110449 -0.872135 2.816473 0.402317 6 -0.064767 -0.461569 -1.090588 -1.441483 0.029359 7 -0.620219 -0.350775 -0.190072 -1.475141 -1.147963 # 可以通colums的值进行排序也可以按照index排序 df1['A'].sort_values() # 只调整一列series,无法调整根据要求调整整个dataframe 3 -2.737011 1 -1.202866 0 -0.971041 5 -0.852958 7 -0.620219 6 -0.064767 4 0.183618 2 0.803201 Name: A, dtype: float64 df2 = df1.sort_values('A') # 这次可以整体排序 df2 A B C D E 3 -2.737011 -0.844403 -0.565923 0.956517 2.366085 1 -1.202866 -0.520920 0.603463 -2.222645 0.085995 0 -0.971041 -1.006900 1.824719 0.362461 0.010793 5 -0.852958 -2.110449 -0.872135 2.816473 0.402317 7 -0.620219 -0.350775 -0.190072 -1.475141 -1.147963 6 -0.064767 -0.461569 -1.090588 -1.441483 0.029359 4 0.183618 -0.731790 1.449652 0.440694 0.749678 2 0.803201 1.664502 0.866629 -3.014892 -0.026222 df2.sort_index() # 应该和df1相同 A B C D E 0 -0.971041 -1.006900 1.824719 0.362461 0.010793 1 -1.202866 -0.520920 0.603463 -2.222645 0.085995 2 0.803201 1.664502 0.866629 -3.014892 -0.026222 3 -2.737011 -0.844403 -0.565923 0.956517 2.366085 4 0.183618 -0.731790 1.449652 0.440694 0.749678 5 -0.852958 -2.110449 -0.872135 2.816473 0.402317 6 -0.064767 -0.461569 -1.090588 -1.441483 0.029359 7 -0.620219 -0.350775 -0.190072 -1.475141 -1.147963 df1 A B C D E 0 -0.971041 -1.006900 1.824719 0.362461 0.010793 1 -1.202866 -0.520920 0.603463 -2.222645 0.085995 2 0.803201 1.664502 0.866629 -3.014892 -0.026222 3 -2.737011 -0.844403 -0.565923 0.956517 2.366085 4 0.183618 -0.731790 1.449652 0.440694 0.749678 5 -0.852958 -2.110449 -0.872135 2.816473 0.402317 6 -0.064767 -0.461569 -1.090588 -1.441483 0.029359 7 -0.620219 -0.350775 -0.190072 -1.475141 -1.147963

浙公网安备 33010602011771号