padas 操作及数据库2

Series的创建

使用Python的列表创建Series:

import numpy as np
import pandas as pd
 
# 使用list创建
s1 = pd.Series([1,2,3,4]) # 可以发现索引index默认从0开始进行自动索引
s1
0    1
1    2
2    3
3    4
dtype: int64
 
# 值属性,可以方便查看Series的值
s1.values
array([1, 2, 3, 4], dtype=int64)
 
# 索引index属性,返回的是索引从开始到结束和间隔的值
s1.index 
RangeIndex(start=0, stop=4, step=1)
 
 
# 使用numpy的数组进行创建
s2 = pd.Series(np.arange(10))
s2
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32
 
 
 
# 通过字典进行创建
s3 = pd.Series({'1':1, '2':2, '3':3})
s3
1    1
2    2
3    3
dtype: int64
 
s3.values
array([1, 2, 3], dtype=int64)
 
s3.index
Index(['1', '2', '3'], dtype='object')
 
# 手动赋值索引
s4 = pd.Series([1,2,3,4], index=['A','B','C','D'])
s4
A    1
B    2
C    3
D    4
dtype: int64
 
 
s4.values
array([1, 2, 3, 4], dtype=int64)
s4.index
Index(['A', 'B', 'C', 'D'], dtype='object')
s4['A']  # 根据索引取值
1
s4[s4>1] # 根据值得范围取值
B    2
C    3
D    4
dtype: int64
 
s4.to_dict() # 把Series转换为字典输出,也就是说可以通过字典创建Series,也可以通过Series转换为字典
{'A': 1, 'B': 2, 'C': 3, 'D': 4}
 
s5 = pd.Series(s4.to_dict()) # 来回转
s5
A    1
B    2
C    3
D    4
dtype: int64
 
index_1 = ['A','B','C','D','E'] # 可单独把索引写出,再赋值给Series,同时多增加一个索引
s6 = pd.Series(s5, index=index_1)
s6  # 多增加的索引的值为NAN
A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
dtype: float64
 
pd.isnull(s6) # 根据pd.isnall()判断Series的元素是否有空值,如果有返回Ture,反之False
A    False
B    False
C    False
D    False
E     True
dtype: bool
 
pd.notnull(s6) # 类似的操作 
A     True
B     True
C     True
D     True
E    False
dtype: bool
 
s6.name = 'demo' # 给Series赋予名字
s6
A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
Name: demo, dtype: float64
 
s6.index.name = 'demo_index' # 给索引起个名字
s6
demo_index
A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
Name: demo, dtype: float64
 
s6.index 
Index(['A', 'B', 'C', 'D', 'E'], dtype='object', name='demo_index')
 

 

pandas的Dataframe

pandas的Dataframe学习
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
 
import webbrowser
link = 'https://www.tiobe.com//tiobe-index/'
webbrowser.open(link)
True
 
df = pd.read_clipboard() # 读取粘贴板的内容
df
Sep    2018    Sep.1    2017    Change    Programming    Language    Ratings    Change.1
0    0    1    1    NaN    Java    17.436%    +4.75%    NaN    NaN
1    1    2    2    NaN    C    15.447%    +8.06%    NaN    NaN
2    2    3    5    change    Python    7.653%    +4.67%    NaN    NaN
3    3    4    3    change    C++    7.394%    +1.83%    NaN    NaN
4    4    5    8    change    Visual    Basic    .NET    5.308%    +3.33%
5    5    6    4    change    C#    3.295%    -1.48%    NaN    NaN
6    6    7    6    change    PHP    2.775%    +0.57%    NaN    NaN
7    7    8    7    change    JavaScript    2.131%    +0.11%    NaN    NaN
8    8    9    -    change    SQL    2.062%    +2.06%    NaN    NaN
9    9    10    18    change    Objective-C    1.509%    +0.00%    NaN    NaN
 
type(df)
pandas.core.frame.DataFrame
 
df.columns # 读取索引,该操作作用很大,可以读取行的索引,以此可以通过该索引进行访问
Index(['Sep', '2018', 'Sep.1', '2017', 'Change', 'Programming', 'Language',
       'Ratings', 'Change.1'],
      dtype='object')
 
df.Ratings # 通过标签读取该列内容
0    17.436%
1    15.447%
2     7.653%
3     7.394%
4     5.308%
5     3.295%
6     2.775%
7     2.131%
8     2.062%
9     1.509%
Name: Ratings, dtype: object
 
# 从原先的dataframe中提取想要的数据,并赋给新的dataframe,该操作很有用
df_new = DataFrame(df, columns=['Sep 2019','Sep 2018', 'Change', 'Programming Language'])
# 同时增加新的标签,并自动赋值nan
df_new
Sep 2019    Sep 2018    Change    Programming Language
0    NaN    1    NaN    Java
1    NaN    2    NaN    C
2    NaN    3    change    Python
3    NaN    4    change    C++
4    NaN    5    change    Visual Basic .NET
5    NaN    6    change    C#
6    NaN    7    change    PHP
7    NaN    8    change    JavaScript
8    NaN    9    change    SQL
9    NaN    10    change    Objective-C
 
df_new['Sep 2019'] = range(10) # 给新的列标签赋值
type(df['Sep 2018'])
pandas.core.series.Series
df_new
Sep 2019    Sep 2018    Change    Programming Language
0    0    1    NaN    Java
1    1    2    NaN    C
2    2    3    change    Python
3    3    4    change    C++
4    4    5    change    Visual Basic .NET
5    5    6    change    C#
6    6    7    change    PHP
7    7    8    change    JavaScript
8    8    9    change    SQL
9    9    10    change    Objective-C

pandas_DataFrame常用读取各种格式的数据操作

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
 
import webbrowser
link = 'https://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)
True
 
df1 = pd.read_clipboard() # 读取粘贴板的内容
 
df1
Format Type    Data Description    Reader    Writer
0    text    CSV    read_csv    to_csv
1    text    JSON    read_json    to_json
2    text    HTML    read_html    to_html
3    text    Local clipboard    read_clipboard    to_clipboard
4    binary    MS Excel    read_excel    to_excel
5    binary    HDF5 Format    read_hdf    to_hdf
6    binary    Feather Format    read_feather    to_feather
7    binary    Msgpack    read_msgpack    to_msgpack
8    binary    Stata    read_stata    to_stata
9    binary    SAS    read_sas    None
10    binary    Python Pickle Format    read_pickle    to_pickle
11    SQL    SQL    read_sql    to_sql
12    SQL    Google Big Query    read_gbq    to_gbq
 
df1.to_clipboard()
df1.to_csv('df1.csv',index=False) # 把内容写到名为df1.csv的文件中,把索引序号去除
 
!type df1.csv # window安装的jupyter使用该操作查看文件内容
Format Type,Data Description,Reader,Writer
text,CSV,read_csv,to_csv
text,JSON,read_json,to_json
text,HTML,read_html,to_html
text,Local clipboard,read_clipboard,to_clipboard
binary,MS Excel,read_excel,to_excel
binary,HDF5 Format,read_hdf,to_hdf
binary,Feather Format,read_feather,to_feather
binary,Msgpack,read_msgpack,to_msgpack
binary,Stata,read_stata,to_stata
binary,SAS,read_sas,
binary,Python Pickle Format,read_pickle,to_pickle
SQL,SQL,read_sql,to_sql
SQL,Google Big Query,read_gbq,to_gbq
 
df2 = pd.read_csv('df1.csv') # 在通过pd.read_csv读出来
df2
Format Type    Data Description    Reader    Writer
0    text    CSV    read_csv    to_csv
1    text    JSON    read_json    to_json
2    text    HTML    read_html    to_html
3    text    Local clipboard    read_clipboard    to_clipboard
4    binary    MS Excel    read_excel    to_excel
5    binary    HDF5 Format    read_hdf    to_hdf
6    binary    Feather Format    read_feather    to_feather
7    binary    Msgpack    read_msgpack    to_msgpack
8    binary    Stata    read_stata    to_stata
9    binary    SAS    read_sas    NaN
10    binary    Python Pickle Format    read_pickle    to_pickle
11    SQL    SQL    read_sql    to_sql
12    SQL    Google Big Query    read_gbq    to_gbq
 
df1.to_json()
'{"Format Type":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},"Data Description":{"0":"CSV","1":"JSON","2":"HTML","3":"Local clipboard","4":"MS Excel","5":"HDF5 Format","6":"Feather Format","7":"Msgpack","8":"Stata","9":"SAS","10":"Python Pickle Format","11":"SQL","12":"Google Big Query"},"Reader":{"0":"read_csv","1":"read_json","2":"read_html","3":"read_clipboard","4":"read_excel","5":"read_hdf","6":"read_feather","7":"read_msgpack","8":"read_stata","9":"read_sas","10":"read_pickle","11":"read_sql","12":"read_gbq"},"Writer":{"0":"to_csv","1":"to_json","2":"to_html","3":"to_clipboard","4":"to_excel","5":"to_hdf","6":"to_feather","7":"to_msgpack","8":"to_stata","9":null,"10":"to_pickle","11":"to_sql","12":"to_gbq"}}'
 
pd.read_json(df1.to_json())
Format Type    Data Description    Reader    Writer
0    text    CSV    read_csv    to_csv
1    text    JSON    read_json    to_json
10    binary    Python Pickle Format    read_pickle    to_pickle
11    SQL    SQL    read_sql    to_sql
12    SQL    Google Big Query    read_gbq    to_gbq
2    text    HTML    read_html    to_html
3    text    Local clipboard    read_clipboard    to_clipboard
4    binary    MS Excel    read_excel    to_excel
5    binary    HDF5 Format    read_hdf    to_hdf
6    binary    Feather Format    read_feather    to_feather
7    binary    Msgpack    read_msgpack    to_msgpack
8    binary    Stata    read_stata    to_stata
9    binary    SAS    read_sas    None
df1.to_html('df1.html')
!dir
 驱动器 C 中的卷是 OS
 卷的序列号是 E28F-E81C
 
 C:\Users\Admin\Documents\jupyter notebook\数据挖掘学习 的目录
 
2018/10/01  16:58    <DIR>          .
2018/10/01  16:58    <DIR>          ..
2018/10/01  16:34    <DIR>          .ipynb_checkpoints
2018/10/01  16:47               508 df1.csv
2018/10/01  16:58             2,061 df1.html
2018/09/30  16:22             8,896 numpy_array.ipynb
2018/09/30  17:03             7,648 numpy_Array的input和output.ipynb
2018/09/30  21:33           154,056 numpy_example.ipynb
2018/10/01  13:11            20,844 numpy_example1.ipynb
2018/09/30  16:58               570 numpy_two_file.npz
2018/09/30  16:52               168 numpy_wb_array.npy
2018/09/30  16:31            13,759 numpy_数组与矩阵的运算.ipynb
2018/10/01  16:01            16,335 pandas_Dataframe.ipynb
2018/10/01  16:57            19,694 pandas_DataFrame_IO.ipynb
2018/10/01  15:42             9,681 pandas_series.ipynb
2018/09/30  16:45               195 x.pkl
2018/10/01  16:34            10,325 深入理解Series和DataFrame.ipynb
              14 个文件        264,740 字节
               3 个目录 37,539,487,744 可用字节
df1.to_excel('df1.xlsx',index=False)
还有很多其他的格式的读写操作,这里不一一列举了

Series和Dataframe的排序

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
 
# Series的排序
s1 = Series(np.random.randn(10))
s1
0   -0.424930
1   -0.279580
2   -0.763344
3   -0.018883
4   -0.217693
5   -1.367801
6   -1.402714
7   -1.125427
8   -0.560325
9    0.037897
dtype: float64
 
s1.values
array([-0.42493031, -0.27958011, -0.76334433, -0.01888319, -0.21769267,
       -1.36780083, -1.40271395, -1.1254272 , -0.56032516,  0.03789725])
 
s1.index
RangeIndex(start=0, stop=10, step=1)
 
s2 = s1.sort_values(ascending=False) # 通过值排序,默认是升序
s2
9    0.037897
3   -0.018883
4   -0.217693
1   -0.279580
0   -0.424930
8   -0.560325
2   -0.763344
7   -1.125427
5   -1.367801
6   -1.402714
dtype: float64
 
s2.sort_index(ascending=False) # 通过index排序,默认升序
9    0.037897
8   -0.560325
7   -1.125427
6   -1.402714
5   -1.367801
4   -0.217693
3   -0.018883
2   -0.763344
1   -0.279580
0   -0.424930
dtype: float64
 
# DataFrame的排序
 
df1 = DataFrame(np.random.randn(40).reshape(8,5),columns=['A','B','C','D','E'])
df1
A    B             C                         D             E
0    -0.971041    -1.006900    1.824719    0.362461    0.010793
1    -1.202866    -0.520920    0.603463    -2.222645    0.085995
2    0.803201    1.664502    0.866629    -3.014892    -0.026222
3    -2.737011    -0.844403    -0.565923    0.956517    2.366085
4    0.183618    -0.731790    1.449652    0.440694    0.749678
5    -0.852958    -2.110449    -0.872135    2.816473    0.402317
6    -0.064767    -0.461569    -1.090588    -1.441483    0.029359
7    -0.620219    -0.350775    -0.190072    -1.475141    -1.147963
 # 可以通colums的值进行排序也可以按照index排序
    
df1['A'].sort_values()  # 只调整一列series,无法调整根据要求调整整个dataframe
3   -2.737011
1   -1.202866
0   -0.971041
5   -0.852958
7   -0.620219
6   -0.064767
4    0.183618
2    0.803201
Name: A, dtype: float64
 
df2 = df1.sort_values('A') # 这次可以整体排序
df2
A    B    C    D    E
3    -2.737011    -0.844403    -0.565923    0.956517    2.366085
1    -1.202866    -0.520920    0.603463    -2.222645    0.085995
0    -0.971041    -1.006900    1.824719    0.362461    0.010793
5    -0.852958    -2.110449    -0.872135    2.816473    0.402317
7    -0.620219    -0.350775    -0.190072    -1.475141    -1.147963
6    -0.064767    -0.461569    -1.090588    -1.441483    0.029359
4    0.183618    -0.731790    1.449652    0.440694    0.749678
2    0.803201    1.664502    0.866629    -3.014892    -0.026222
 
df2.sort_index() # 应该和df1相同
A    B    C    D    E
0    -0.971041    -1.006900    1.824719    0.362461    0.010793
1    -1.202866    -0.520920    0.603463    -2.222645    0.085995
2    0.803201    1.664502    0.866629    -3.014892    -0.026222
3    -2.737011    -0.844403    -0.565923    0.956517    2.366085
4    0.183618    -0.731790    1.449652    0.440694    0.749678
5    -0.852958    -2.110449    -0.872135    2.816473    0.402317
6    -0.064767    -0.461569    -1.090588    -1.441483    0.029359
7    -0.620219    -0.350775    -0.190072    -1.475141    -1.147963
df1
A    B    C    D    E
0    -0.971041    -1.006900    1.824719    0.362461    0.010793
1    -1.202866    -0.520920    0.603463    -2.222645    0.085995
2    0.803201    1.664502    0.866629    -3.014892    -0.026222
3    -2.737011    -0.844403    -0.565923    0.956517    2.366085
4    0.183618    -0.731790    1.449652    0.440694    0.749678
5    -0.852958    -2.110449    -0.872135    2.816473    0.402317
6    -0.064767    -0.461569    -1.090588    -1.441483    0.029359
7    -0.620219    -0.350775    -0.190072    -1.475141    -1.147963

 

posted @ 2021-05-26 12:54  7411  阅读(324)  评论(0)    收藏  举报