Pandas：Index使用练习

下文操作案例均来自: https://pandas.pydata.org/pandas-docs/stable/reference/indexing.html

一. Index

1. 创建索引以及索引属性

import pandas as pd
import numpy as np
  
def create_index():
    """Build a small named integer Index and print it followed by its basic attributes."""
    # Create the Index object.
    sample = pd.Index([1, 2, 3, 4, 5], name='test')
    print(sample)  # Index([1, 2, 3, 4, 5], dtype='int64', name='test')

    # Attributes are printed in exactly this order; the trailing comments give
    # the meaning and the output recorded for this sample index.
    attribute_names = (
        'T',                        # transpose (same Index): Index([1, 2, 3, 4, 5], dtype='int64', name='test')
        'array',                    # backing ExtensionArray: <NumpyExtensionArray> [1, 2, 3, 4, 5] Length: 5, dtype: int64
        'dtype',                    # dtype of the underlying elements: int64
        'empty',                    # whether the index is empty: False
        'has_duplicates',           # whether any value is repeated: False
        'hasnans',                  # whether any value is NaN: False
        'inferred_type',            # type name inferred from the values: integer
        'is_monotonic_decreasing',  # monotonically decreasing?: False
        'is_monotonic_increasing',  # monotonically increasing?: True
        'is_unique',                # are all values unique?: True
        'name',                     # index name: test
        'nbytes',                   # total bytes of the underlying data: 40
        'ndim',                     # number of dimensions: 1
        'nlevels',                  # number of levels: 1
        'shape',                    # shape tuple: (5,)
        'size',                     # number of elements: 5
        'values',                   # array holding the index data: [1 2 3 4 5]
    )
    for attribute in attribute_names:
        print(getattr(sample, attribute))

2. 修改和计算函数

def modify_and_computations():
    """Print the results of Index modification and computation helpers.

    Dtype predicates are expressed with ``pandas.api.types`` functions and
    ``isinstance`` checks on the dtype instead of the ``Index.is_*()`` methods,
    which pandas has deprecated since 2.0. The printed values are unchanged.
    """
    index = pd.Index([1, 2, 3, 4, 5], name='test')
    types = pd.api.types
    # True only if every element is truthy.
    print(index.all())                                           # True
    # True if at least one element is truthy.
    print(index.any())                                           # True
    # Position of the smallest element.
    print(index.argmin())                                        # 0
    # Position of the largest element.
    print(index.argmax())                                        # 4
    # Copy of the object.
    print(index.copy())                                          # Index([1, 2, 3, 4, 5], dtype='int64', name='test')
    # New index with the element at the given position removed.
    print(index.delete(0))                                       # Index([2, 3, 4, 5], dtype='int64', name='test')
    # New index with the given label removed.
    print(index.drop(5))                                         # Index([1, 2, 3, 4], dtype='int64', name='test')
    # De-duplicate the index values.
    print(pd.Index([1, 1, 1, 2, 2, 2]).drop_duplicates())        # Index([1, 2], dtype='int64')
    # Flag the positions holding duplicated values.
    print(pd.Index([1, 1, 2, 3, 3]).duplicated())                # [False  True False False  True]
    # Whether two index objects hold equal elements.
    print(index.equals(pd.Index([1, 2, 3, 4, 5.0])))             # True
    # Encode the object as integer codes plus the unique values.
    # (array([0, 0, 1, 2, 2]), Index(['a', 'b', 'c'], dtype='object'))
    print(pd.Index(['a', 'a', 'b', 'c', 'c']).factorize())
    # Like equals, but also compares attributes and type.
    print(index.identical(pd.Index([1, 2, 3, 4, 5.0])))          # False
    # New index with a value inserted at the given position.
    print(index.insert(2, 2))                                    # Index([1, 2, 2, 3, 4, 5], dtype='int64', name='test')

    idx = pd.Index(list('ABCDFEMD'))
    # Identity check; a view counts as the same object, a copy does not.
    print(idx.is_(idx.view()), idx.is_(idx.copy()))              # True False
    # Is the index made of booleans?
    print(types.is_bool_dtype(index))                            # False
    # Does the index hold categorical data?
    print(isinstance(index.dtype, pd.CategoricalDtype))          # False
    # Is the index of floating type?
    print(types.is_float_dtype(index))                           # False
    # Is the index made of integers?
    print(types.is_integer_dtype(index))                         # True
    # Does the index hold Interval objects?
    print(isinstance(index.dtype, pd.IntervalDtype))             # False
    # Is the index made only of numeric data?
    print(types.is_any_real_numeric_dtype(index))                # True
    # Is the index of object dtype?
    print(types.is_object_dtype(index))                          # False
    # Maximum value.
    print(index.max())                                           # 5
    # Minimum value.
    print(index.min())                                           # 1
    # Build an index from the target values, plus the indexer into the original.
    # (Index([1, 123, 3, 4], dtype='int64'), array([0, 7, 2, 3]))
    print(pd.Index([1, 2, 3, 4, 34, 51, 23, 123, 12321]).reindex(pd.Index([1, 123, 3, 4])))
    # Rename.
    print(index.rename("test2"))                                 # Index([1, 2, 3, 4, 5], dtype='int64', name='test2')
    # Repeat elements; a single number repeats every element that many times.
    print(index.repeat(2))                                       # Index([1, 1, 2, 2, 3, 3, 4, 4, 5, 5], dtype='int64', name='test')
    # A list as long as the index gives a per-element repeat count.
    # Index([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5], dtype='int64', name='test')
    print(index.repeat([1, 2, 3, 4, 5]))
    # Where the condition is False, substitute the given value.
    print(index.where(index > 2, 2))                             # Index([2, 2, 3, 4, 5], dtype='int64', name='test')
    # New index made of the values at the given positions.
    print(pd.Index(['a', 'b', 'c']).take([0, 2]))                # Index(['a', 'c'], dtype='object')
    # Masked assignment: where the mask is 1 take the value from the passed
    # index at that position, where it is 0 keep the original value.
    print(index.putmask(np.array([1, 1, 1, 0, 0]),
                        pd.Index([5, 4, 3, 2, 1])))              # Index([5, 4, 3, 4, 5], dtype='int64', name='test')
    # Unique values of the index.
    print(pd.Index([1, 1, 1, 2, 2, 2]).unique())                 # Index([1, 2], dtype='int64')
    # Number of unique elements.
    print(index.nunique())                                       # 5
    # Series with the count of each unique value.
    # 1    3
    # 2    3
    # Name: count, dtype: int64
    print(pd.Index([1, 1, 1, 2, 2, 2]).value_counts())

3. MultiIndex兼容性函数

def compatibility_with_multi_index():
    """Print results of Index methods that exist for MultiIndex compatibility.

    ``set_names`` is called without ``inplace=True`` and therefore returns a new
    index, so the returned object is captured and printed; the original index
    is printed as well to show that it keeps its old name.
    """
    # Set the index name.
    index = pd.Index([1, 2, 3, 4, 5], name='test')
    renamed = index.set_names('test3')
    # Index([1, 2, 3, 4, 5], dtype='int64', name='test3')
    print(renamed)
    # The original is untouched:
    # Index([1, 2, 3, 4, 5], dtype='int64', name='test')
    print(index)
    # Drop the requested level from a MultiIndex.
    # Index([3, 4], dtype='int64', name='y')
    print(pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['x', 'y']).droplevel(0))

4.缺失值相关函数

def missing_values():
    """Print the results of the missing-value helpers fillna, dropna, isna and notna."""
    complete = pd.Index([1, 2, 3, 4, 5], name='test')
    gaps_at_both_ends = pd.Index([None, 1, 2, np.nan])
    gaps_at_tail = pd.Index([1, 1, None, np.nan])
    gaps_at_head = pd.Index([None, np.nan, 1, 2])

    # Replace NA/NaN with the given value.
    filled = gaps_at_both_ends.fillna(0)
    print(filled)                       # Index([0.0, 1.0, 2.0, 0.0], dtype='float64')

    # Index with the NA/NaN entries removed.
    cleaned = gaps_at_tail.dropna()
    print(cleaned)                      # Index([1.0, 1.0], dtype='float64')

    # Boolean mask of missing entries.
    print(gaps_at_head.isna())          # [ True  True False False]

    # Boolean mask of present entries.
    print(complete.notna())             # [ True  True  True  True  True]

5.转换

def conversion():
    """Print the results of Index conversion helpers.

    ``to_frame`` is called with explicit keywords. Its first parameter is the
    boolean ``index`` flag, so a positional ``0`` silently drops the index from
    the resulting frame, which contradicts the documented output below.
    """
    index = pd.Index([1, 2, 3, 4, 5], name='test')
    # Index with the values cast to the given dtype.
    print(index.astype(float))                # Index([1.0, 2.0, 3.0, 4.0, 5.0], dtype='float64', name='test')
    # First element as a Python scalar; the index must have length 1.
    print(pd.Index(['a']).item(), pd.Index([1]).item())   # a 1
    # Map the values through a dict or function; unmapped values become nan.
    print(index.map({1: 'a', 2: 'b'}))        # Index(['a', 'b', nan, nan, nan], dtype='object', name='test')
    # Return a view.
    print(index.ravel())                      # Index([1, 2, 3, 4, 5], dtype='int64', name='test')
    # Values as a list.
    print(index.to_list())                    # [1, 2, 3, 4, 5]
    # Values as a list (alias).
    print(index.tolist())                     # [1, 2, 3, 4, 5]
    # DataFrame with a column holding the index values, indexed by the index itself.
    #       x
    # test
    # 1     1
    # 2     2
    # 3     3
    # 4     4
    # 5     5
    print(index.to_frame(index=True, name='x'))
    # Series whose index and values are both the index keys.
    # test
    # 1    1
    # 2    2
    # 3    3
    # 4    4
    # 5    5
    # Name: test, dtype: int64
    print(index.to_series())
    # Return a view.
    print(index.view())                      # Index([1, 2, 3, 4, 5], dtype='int64', name='test')

6.排序

def sorting():
    """Print the results of Index sorting helpers.

    ``searchsorted`` performs a binary search and is only meaningful on sorted
    data, so it is demonstrated on a sorted copy of the sample index rather
    than on the unsorted original.
    """
    index = pd.Index([11, 2, 3, 4, 5], name='test')
    # Integer positions that would sort the index.
    print(index.argsort())                                                         # [1 2 3 4 0]
    # Position at which a value must be inserted to keep the index ordered.
    ordered = index.sort_values()                                                  # [2, 3, 4, 5, 11]
    print(ordered.searchsorted(4))                                                 # 2
    print(ordered.searchsorted(123))                                               # 5
    # side='right' returns the position after any equal values; the default is 'left'.
    print(pd.Index([1, 1, 1, 1, 2, 2, 2, 2]).searchsorted(1, 'right'))  # 4
    print(pd.Index([1, 1, 1, 1, 2, 2, 2, 2]).searchsorted(1))      # 0
    # Sorted copy of the index, ascending.
    print(pd.Index([1, 3, 2, 4, 0]).sort_values())                 # Index([0, 1, 2, 3, 4], dtype='int64')
    # Descending.
    print(pd.Index([1, 3, 2, 4, 0]).sort_values(ascending=False))  # Index([4, 3, 2, 1, 0], dtype='int64')

7.时间操作

def time_specific_operations():
    """Print a DatetimeIndex shifted forward by one day.

    Uses the canonical uppercase 'D' day alias instead of lowercase 'd'.
    """
    import datetime
    # Shift the index by the requested number of frequency increments.
    # Only datetime-like indexes support this.
    # DatetimeIndex(['2024-01-02', '2024-01-03'], dtype='datetime64[ns]', freq=None)
    print(pd.DatetimeIndex([datetime.datetime(2024, 1, 1),
                            datetime.datetime(2024, 1, 2)]).shift(1, freq='D'))

8.组合、连接、设置操作

def combining_joining_set_operations():
    """Print the results of append, join and the set operations on Index objects."""
    base = pd.Index([1, 2, 3, 4, 5], name='test')

    # Append another index.
    extra = pd.Index([6])
    print(base.append(extra))                        # Index([1, 2, 3, 4, 5, 6], dtype='int64')

    # Join with each supported mode, in this order:
    #   left  -> Index([1, 2, 3, 4], dtype='int64')
    #   right -> Index([3, 4, 5], dtype='int64')
    #   inner -> Index([3, 4], dtype='int64')
    #   outer -> Index([1, 2, 3, 4, 5], dtype='int64')
    for mode in ('left', 'right', 'inner', 'outer'):
        joined = pd.Index([1, 2, 3, 4]).join(pd.Index([3, 4, 5]), how=mode)
        print(joined)

    # Intersection of two indexes.
    common = base.intersection(pd.Index([1, 2, 3, 6, 7, 8]))
    print(common)                                    # Index([1, 2, 3], dtype='int64')

    # Union of two indexes.
    combined = base.union(pd.Index([11, 1]))
    print(combined)                                  # Index([1, 2, 3, 4, 5, 11], dtype='int64')

    # Values of this index that are absent from the other one.
    remaining = base.difference(pd.Index([2, 3, 4, 5]))
    print(remaining)                                 # Index([1], dtype='int64')

    # Symmetric difference: values that appear in exactly one of the two indexes.
    one_sided = pd.Index([1, 3, 4]).symmetric_difference(pd.Index([3, 4, 5]))
    print(one_sided)                                 # Index([1, 5], dtype='int64')

9. 数据选择

def selecting():
    """Print the results of the label-lookup and slicing helpers on Index objects."""
    numbers = pd.Index([1, 2, 3, 4, 55], name='test')

    # asof: return the label itself if present, otherwise the previous one.
    for label in (9, 0):
        print(numbers.asof(label))                   # 4, then nan

    scrambled = pd.Index(list('ABCDFEMD'))
    # The `where` argument must be an Index; it may hold one or many values.
    targets = pd.Index(['A', 'B', 'V'])
    keep_all = np.ones(len(scrambled), dtype=bool)
    # Positions at which the `where` labels would be located (asof semantics).
    print(scrambled.asof_locs(targets, keep_all))    # [0 1 7]

    wanted = pd.Index(['a', 'b', 'c', 'e'])
    # Indexer for a new index against the current one: the current index
    # must be unique, and labels that are not found map to -1.
    unique_source = pd.Index(['a', 'c', 'd', 'b'])
    print(unique_source.get_indexer(wanted))         # [ 0  3  1 -1]

    repeated_source = pd.Index(['a', 'c', 'd', 'b', 'c'])
    # Same idea, but the current index may contain duplicates.
    print(repeated_source.get_indexer_for(wanted))   # [ 0  3  1  4 -1]
    # Duplicates allowed; additionally returns the positions (in the target)
    # of the labels that were not found.
    # (array([ 0,  3, -1, -1]), array([2, 3]))
    print(repeated_source.get_indexer_non_unique(pd.Index(['a', 'b', 'f', 'e'])))

    # Values of the requested level.
    print(numbers.get_level_values(0))               # Index([1, 2, 3, 4, 55], dtype='int64', name='test')
    two_level = pd.MultiIndex.from_arrays([[1, 2, 3], [3, 4, 5]])
    print(two_level.get_level_values(0))             # Index([1, 2, 3], dtype='int64')

    letters = pd.Index(['a', 'b', 'c'])
    # Position of a label.
    print(numbers.get_loc(2))                        # 1
    print(letters.get_loc('c'))                      # 2
    # Slice bound of a label, computed from the requested side of the label.
    for side in ('left', 'right'):
        print(letters.get_slice_bound('b', side))    # 1, then 2

    # Boolean array: True where the label is among the given values.
    print(numbers.isin([1, 2]))                      # [ True  True False False False]

    alphabet = pd.Index(list('abcdef'))
    # Slice indexer for the given labels and step.
    print(alphabet.slice_indexer('a', 'c', 1))       # slice(0, 3, 1)
    # Slice positions for the given labels.
    print(alphabet.slice_locs('a', 'c', 1))          # (0, 3)

10. Index对象的其他函数

def other():
    index = pd.Index([1, 2, 3, 4, 5], name='test')
    # 按顺序求差值
    print(index.diff())                                 # Index([nan, 1.0, 1.0, 1.0, 1.0], dtype='float64', name='test')
    # 返回一个索引的字符串表示形式
    print(index.format())                                       # ['1', '2', '3', '4', '5']
    # 判断索引的元素是否为None或者NaN
    print(pd.Index([None, np.nan, 1, 2]).isnull())              # [ True  True False False]
    # 返回索引占用的内存大小
    print(index.memory_usage())                                 # 40
    # 判断索引的元素是否不为None或者NaN
    print(index.notnull())                                      # [ True  True  True  True  True]
    # 对索引中的元素进行四舍五入
    print(pd.Index([1.12312, 1.323123, 2.3232]).round(2))       # Index([1.12, 1.32, 2.32], dtype='float64')
    # 返回排序后的索引，以及转成排序对应的索引器
    # (Index([0, 1, 2, 3, 4], dtype='int64'), array([4, 0, 2, 1, 3], dtype=int64))
    print(pd.Index([1, 3, 2, 4, 0]).sortlevel()) 
    # 与子类兼容实现的函数
    print(pd.Index([[1, 2], [4, 5]]).to_flat_index())           # Index([[1, 2], [4, 5]], dtype='object')
    # 求转置，对于Index对象来说，就是本身
    print(index.transpose())                                    # Index([1, 2, 3, 4, 5], dtype='int64', name='test')

二.RangeIndex索引

def numeric_index():
    """Print a RangeIndex, its start/stop/step, and a RangeIndex built from a range."""
    # Immutable index over a monotonic integer range.
    span = pd.RangeIndex(1, 10, 1)
    print(span)                         # RangeIndex(start=1, stop=10, step=1)

    # start (defaults to 0), stop, and step (defaults to 1), printed in that order.
    for bound in ('start', 'stop', 'step'):
        print(getattr(span, bound))     # 1, 10, 1

    # Build a RangeIndex from a range object.
    source = range(10)
    from_builtin = pd.RangeIndex.from_range(source)
    print(from_builtin)                 # RangeIndex(start=0, stop=10, step=1)

三.CategoricalIndex 分类索引

def categorical_index():
    """Print the results of category inspection and manipulation on a CategoricalIndex."""
    # Index backed by a Categorical.
    base = pd.CategoricalIndex(['a', 'a', 'c', 'c', 'b', 'b'])
    print(base)                    # values a a c c b b, categories=['a', 'b', 'c'], ordered=False
    print(base.codes)              # category codes: [0 0 2 2 1 1]
    print(base.categories)         # categories: Index(['a', 'b', 'c'], dtype='object')
    print(base.ordered)            # whether the categories are ordered: False

    # Rename categories; category names must stay unique.
    renamed = base.rename_categories({'a': 'cc'})
    print(renamed.categories)      # Index(['cc', 'b', 'c'], dtype='object')

    # Reorder the categories. The list must name every category exactly once;
    # `ordered` sets the ordered flag of the result.
    reordered = base.reorder_categories(['c', 'a', 'b'], ordered=True)
    print(reordered.sort_values())  # values c c a a b b, categories=['c', 'a', 'b'], ordered=True
    print(reordered.ordered)        # True
    print(base.sort_values())       # values a a b b c c, categories=['a', 'b', 'c'], ordered=False

    # Add categories.
    widened = base.add_categories(['d', 'e'])
    print(widened)                 # categories=['a', 'b', 'c', 'd', 'e']
    # Remove a category.
    narrowed = widened.remove_categories(['d'])
    print(narrowed)                # categories=['a', 'b', 'c', 'e']
    # Drop categories that no value uses.
    trimmed = narrowed.remove_unused_categories()
    print(trimmed)                 # categories=['a', 'b', 'c']

    # Replace the category list; values outside it become nan.
    # values nan nan c c b b, categories=['b', 'c'], ordered=False
    print(trimmed.set_categories(['b', 'c']))
    print(trimmed.ordered)         # False

    # Mark as ordered, then as unordered again.
    as_ordered = trimmed.as_ordered()
    print(as_ordered.ordered)      # True
    final = as_ordered.as_unordered()
    print(final.ordered)           # False

    # Map values through a dict or function; unmapped values become nan.
    print(final.map({'a': 'aa'}))  # Index(['aa', 'aa', nan, nan, nan, nan], dtype='object')
    # Whether two CategoricalIndex objects hold the same elements.
    same_values = pd.CategoricalIndex(['a', 'a', 'c', 'c', 'b', 'b'])
    print(final.equals(same_values))   # True

四.IntervalIndex 区间索引

def interval_index():
    """Print construction variants, attributes and lookups of an IntervalIndex."""
    # Immutable index of intervals that are all closed on the same side.
    spans = pd.interval_range(start=0, end=5)
    print(spans)       # IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]], dtype='interval[int64, right]')

    # From arrays of left and right bounds: (0, 1], (1, 2], (2, 3]
    left_edges, right_edges = [0, 1, 2], [1, 2, 3]
    print(pd.IntervalIndex.from_arrays(left_edges, right_edges))
    # From an array-like of tuples: (0, 1], (1, 2]
    print(pd.IntervalIndex.from_tuples([(0, 1), (1, 2)]))
    # From an array of break points: (0, 1], (1, 2], (2, 3]
    print(pd.IntervalIndex.from_breaks([0, 1, 2, 3]))

    print(spans.left)      # left bounds:  Index([0, 1, 2, 3, 4], dtype='int64')
    print(spans.right)     # right bounds: Index([1, 2, 3, 4, 5], dtype='int64')
    print(spans.mid)       # midpoints:    Index([0.5, 1.5, 2.5, 3.5, 4.5], dtype='float64')
    print(spans.closed)    # closed side ('left' or 'right'): right
    print(spans.length)    # Index([1, 1, 1, 1, 1], dtype='int64')
    # Array representing the data in the index:
    # <IntervalArray>
    # [(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]]
    # Length: 5, dtype: interval[int64, right]
    print(spans.values)
    # Whether each interval is empty.
    print(spans.is_empty)  # [False False False False False]

    duplicated = pd.IntervalIndex.from_tuples([(0, 1), (0, 1)])
    # Non-overlapping and monotonic?
    print(spans.is_non_overlapping_monotonic)       # True
    print(duplicated.is_non_overlapping_monotonic)  # False
    # Any overlap between intervals?
    print(spans.is_overlapping)                     # False
    duplicated = pd.IntervalIndex.from_tuples([(0, 1), (0, 1)])
    print(duplicated.is_overlapping)                # True

    # Integer position (or slice / boolean mask) of the requested label.
    for label in (3, 2.5):
        print(spans.get_loc(label))                 # 2, then 2
    # Indexer for new labels against the current index.
    print(spans.get_indexer([1, 3, 4]))             # [0 2 3]
    # Positional selection: (0, 1], (2, 3], (3, 4]
    print(spans[[0, 2, 3]])
    # Close the intervals on the given side instead: [0, 1), [1, 2), ... dtype='interval[int64, left]'
    print(spans.set_closed('left'))
    # Element-wise test whether each interval contains the value.
    for value in (2, 1.5):
        print(spans.contains(value))                # [False  True False False False] both times
    # Element-wise test for overlap with the given interval.
    probe = pd.Interval(1, 3)
    print(spans.overlaps(probe))                    # [False  True  True False False]
    # Convert to an index of (left, right) tuples.
    # Index([(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)], dtype='object')
    print(spans.to_tuples())

五. MultiIndex

def multi_index():
    """Print MultiIndex construction, inspection, reshaping, lookup and DataFrame selection results."""
    # Two-level hierarchical index with levels named A and B.
    mi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['A', 'B'])
    print(mi)                           # MultiIndex([(1, 3), (2, 4)], names=['A', 'B'])

    # Structural attributes, printed in this order:
    #   levels    -> [[1, 2], [3, 4]]   (the index of each level)
    #   codes     -> [[0, 1], [0, 1]]
    #   sortorder -> None
    #   names     -> ['A', 'B']         (level names)
    #   nlevels   -> 2                  (number of levels)
    #   levshape  -> (2, 2)             (length of each level)
    #   dtypes    -> Series: A int64, B int64 (dtype of each level)
    for attribute in ('levels', 'codes', 'sortorder', 'names', 'nlevels', 'levshape', 'dtypes'):
        print(getattr(mi, attribute))

    level_names = ['A', 'B']
    pairs = ((1, 2), (2, 3))
    # From a list of tuples: [(1, 2), (2, 3)]
    print(pd.MultiIndex.from_tuples(pairs, names=level_names))
    # From the cartesian product of iterables: [(1, 2), (1, 3), (2, 2), (2, 3)]
    print(pd.MultiIndex.from_product(pairs, names=level_names))
    # From a DataFrame: [(1, 3), (2, 4)], names taken from the columns.
    source_frame = pd.DataFrame(data={'A': [1, 2], 'B': [3, 4]})
    print(pd.MultiIndex.from_frame(source_frame))

    # Replace the levels: [(1, 2), (2, 3)]
    print(mi.set_levels([[1, 2], [2, 3]]))
    # Replace the codes: [(1, 4), (2, 3)]
    print(mi.set_codes([[0, 1], [1, 0]]))
    # DataFrame whose columns are the levels (indexed by the MultiIndex itself).
    print(mi.to_frame())
    # Flat index of tuples: Index([(1, 3), (2, 4)], dtype='object')
    print(mi.to_flat_index())
    # Sort by the requested level; returns (sorted index, indexer array([0, 1])).
    print(mi.sortlevel())
    # Index with the requested level removed: Index([3, 4], dtype='int64', name='B')
    print(mi.droplevel(0))
    # Swap level i with level j: [(3, 1), (4, 2)], names=['B', 'A']
    print(mi.swaplevel(0, 1))
    # Rearrange levels in the given order: [(3, 1), (4, 2)], names=['B', 'A']
    print(mi.reorder_levels([1, 0]))
    # New MultiIndex without unused level values: [(1, 3), (2, 4)]
    print(mi.remove_unused_levels())
    # Slice between two labels/tuples: [(1, 3)]
    print(mi.truncate(0, 1))
    # Append another index: Index([(1, 3), (2, 4), (1,), (2,)], dtype='object')
    single_level = pd.MultiIndex.from_arrays([[1, 2]])
    print(mi.append(single_level))

    # Location of a label or tuple of labels: slice(1, 2, None)
    print(mi.get_loc(2))
    # Label values of the requested level: Index([1, 2], dtype='int64', name='A')
    print(mi.get_level_values(0))
    # Indexer for new labels against the current index: [-1 -1]
    print(mi.get_indexer([1, 2]))
    # Locations of a sequence of labels: [0]
    print(mi.get_locs([1]))
    # Location plus remaining index for a label/level:
    # (slice(1, 2, None), Index([4], dtype='int64', name='B'))
    print(mi.get_loc_level(2))
    # Positional slices: [(1, 3)] and then [(1, 3), (2, 4)]
    for stop in (1, 2):
        print(mi[0:stop])
    # New MultiIndex with the given label dropped; the original codes are printed
    # alongside it: [(1, 3)] and [[0, 1], [0, 1]]
    print(mi.drop(2), mi.codes)
    # Dropping a label from an unnamed MultiIndex: [('a', 'c')]
    print(pd.MultiIndex.from_arrays([['a', 'b'], ['c', 'd']]).drop('b'))

    # Selecting rows from a DataFrame whose index is a MultiIndex.
    row_labels = pd.MultiIndex.from_arrays([['1', '1', '2', '2', '2', '2'], ['a', 'b', 'c', 'd', 'e', 'f']])
    frame = pd.DataFrame(
        data={'A': [11, 22, 33, 44, 55, 66], 'B': [111, 222, 333, 444, 555, 666]},
        index=row_labels,
    )
    #       A    B
    # 1 a  11  111
    #   b  22  222
    # 2 c  33  333
    #   d  44  444
    #   e  55  555
    #   f  66  666
    print(frame)
    rows = frame.loc
    first_key = ('1', 'a')
    # A single label must be a level-0 value; returns a DataFrame (rows a and b).
    print(rows['1'])
    # A full key tuple returns a Series (A 11, B 111, Name: (1, a)).
    print(rows[first_key])
    # One value per level behaves like the key tuple and returns the same Series.
    print(rows['1', 'a'])
    # A list holding the key tuple returns a one-row DataFrame.
    print(rows[[first_key]])
    # Key tuple plus a single column returns the scalar at that cell: 11
    print(rows[first_key, 'A'])
    # Key tuple plus several columns returns a Series.
    print(rows[first_key, ['A', 'B']])
    # Slices: an endpoint may be given as a level-0 value only.
    # Each of the next three prints shows rows (1, a) and (1, b).
    print(rows[first_key: '1'])
    print(rows['1':('1', 'b')])
    # Slice between two key tuples.
    print(rows[first_key: ('1', 'b')])

六. DatetimeIndex

def datetime_index():
    import datetime
    # 类似于 datetime64 数据的不可变 ndarray。使用时间字符串或者datetime、date等构建的日期时间索引。
    index = pd.DatetimeIndex(["1/1/2020 10:01:11+00:00", "3/1/2020 11:02:22+00:00", "2/1/2020 20:03:33+00:00"])
    # DatetimeIndex(['2020-01-01 10:01:11+00:00', '2020-03-01 11:02:22+00:00',
    #                '2020-02-01 20:03:33+00:00'],
    #               dtype='datetime64[ns, UTC]', freq=None)
    print(index)
    # [Timestamp('2020-01-01 10:01:11+0000', tz='UTC'), Timestamp('2020-03-01 11:02:22+0000', tz='UTC'),
    # Timestamp('2020-02-01 20:03:33+0000', tz='UTC')]
    print(index.tolist())
    # 使用日期字符串也可以创建DatetimeIndex
    index2 = pd.DatetimeIndex(["2023-01-01", "2024-01-01", "2024-01-02"])
    # DatetimeIndex(['2023-01-01', '2024-01-01', '2024-01-02'], dtype='datetime64[ns]', freq=None)
    print(index2)
    # 使用datetime.date对象也可以创建DatetimeIndex
    index3 = pd.DatetimeIndex([datetime.date.today(), datetime.date.today() + datetime.timedelta(1)])
    # DatetimeIndex(['2024-12-19', '2024-12-20'], dtype='datetime64[ns]', freq=None)
    print(index3)
    # 使用时间字串创建时，会自动补充日期为今天
    index4 = pd.DatetimeIndex(['10:00:00', '11:00:00', '20:00:00', '20:30:00'])
    # DatetimeIndex(['2024-12-19 10:00:00', '2024-12-19 11:00:00',
    #                '2024-12-19 20:00:00', '2024-12-19 20:30:00'],
    #               dtype='datetime64[ns]', freq=None)
    print(index4)
    # 年份组成的索引。
    print(index2.year)                  # Index([2023, 2024, 2024], dtype='int32')
    # 月份组成的索引。
    print(index.month)                  # Index([1, 3, 2], dtype='int32')
    # 当月的第几天组成的索引。
    print(index.day)                    # Index([1, 1, 1], dtype='int32')
    # 小时数组成的索引
    print(index.hour)                   # Index([10, 11, 20], dtype='int32')
    # 分钟数
    print(index.minute)                 # Index([1, 2, 3], dtype='int32')
    # 秒数
    print(index.second)                 # Index([11, 22, 33], dtype='int32')
    # 毫秒数
    print(index.microsecond)            # Index([0, 0, 0], dtype='int32')
    # 纳秒
    print(index.nanosecond)             # Index([0, 0, 0], dtype='int32')
    # 返回 pythondatetime.date对象的 numpy 数组。
    print(index.date)                 # [datetime.date(2020, 1, 1) datetime.date(2020, 3, 1) datetime.date(2020, 2, 1)]
    # 返回datetime.time对象的 numpy 数组。
    print(index.time)                   # [datetime.time(10, 1, 11) datetime.time(11, 2, 22) datetime.time(20, 3, 33)]
    # 返回带有时区的对象 numpy 数组datetime.time。
    # [datetime.time(10, 1, 11, tzinfo=datetime.timezone.utc)
    #  datetime.time(11, 2, 22, tzinfo=datetime.timezone.utc)
    #  datetime.time(20, 3, 33, tzinfo=datetime.timezone.utc)]
    print(index.timetz)
    # 一年中的序数日。
    print(index.dayofyear)              # Index([1, 61, 32], dtype='int32')
    # 一年中的序数日。
    print(index.day_of_year)            # Index([1, 61, 32], dtype='int32')
    # 星期几，星期一=0，星期日=6。
    print(index.dayofweek)              # Index([2, 6, 5], dtype='int32')
    # 星期几，星期一=0，星期日=6。
    print(index.day_of_week)            # Index([2, 6, 5], dtype='int32')
    # 星期几，星期一=0，星期日=6。
    print(index.weekday)                # Index([2, 6, 5], dtype='int32')
    # 季度
    print(index.quarter)                # Index([1, 1, 1], dtype='int32')
    # 时区
    print(index.tz)                     # UTC
    # 指示该日期是否是该月的第一天。
    print(index.is_month_start)         # [ True  True  True]
    # 指示该日期是否是该月的最后一天。
    print(index.is_month_end)           # [False False False]
    # 指示该日期是否为某个季度的第一天。
    print(index.is_quarter_start)       # [ True False False]
    # 指示该日期是否为某个季度的最后一天。
    print(index.is_quarter_end)         # [False False False]
    # 指示该日期是否是一年的第一天。
    print(index.is_year_start)          # [ True False False]
    # 指示该日期是否是一年中的最后一天。
    print(index.is_year_end)            # [False False False]
    # 布尔指示该日期是否属于闰年。
    print(index.is_leap_year)           # [ True  True  True]
    # 尝试返回表示由 infer_freq 生成的频率的字符串。
    print(index.inferred_freq)          # None
    # 返回一天中特定时间的值的索引位置。
    print(index.indexer_at_time('10:01:11'))        # [0]
    # 返回一天中特定时间之间的值的索引位置。
    print(index.indexer_between_time('10:00:00', '13:00:00'))       # [0 1]
    # 将所有元素的时间部分转换成00：00:00
    # DatetimeIndex(['2020-01-01 00:00:00+00:00', '2020-03-01 00:00:00+00:00',
    #                '2020-02-01 00:00:00+00:00'],
    #               dtype='datetime64[ns, UTC]', freq=None)
    print(index.normalize())
    # 按照传入的date_format参数将所有日期时间转换形成一个字符串索引
    # Index(['2020-01-01 10:01:11', '2020-03-01 11:02:22', '2020-02-01 20:03:33'], dtype='object')
    print(index.strftime("%Y-%m-%d %H:%M:%S"))
 
    # 将时间戳捕捉到最近发生的频率。
    # 可选项有: Y、YS、YE、M、MS、ME、D、QS、QE、W、B、min、S、s等
    index = pd.DatetimeIndex(["1/3/2020 10:01:11+00:00", "3/3/2020 11:02:22+00:00", "2/2/2020 20:03:33+00:00"])
    # DatetimeIndex(['2019-12-31 10:01:11+00:00', '2019-12-31 11:02:22+00:00',
    #                '2019-12-31 20:03:33+00:00'],
    #               dtype='datetime64[ns, UTC]', freq=None)
    print(index.snap("Y"))
    print(index.snap("YS"))
    print(index.snap("YE"))
    print(index.snap("M"))
    print(index.snap("MS"))
    print(index.snap("ME"))
    print(index.snap("D"))
    print(index.snap("Q"))
    print(index.snap("QS"))
    print(index.snap("QE"))
    print(index.snap("W"))
    print(index.snap("B"))
    print(index.snap("min"))
    print(index.snap("S"))
    print(index.snap("s"))
 
    # 将支持 tz 的日期时间数组/索引从一个时区转换为另一个时区。
    # DatetimeIndex(['2020-01-03 19:01:11+09:00', '2020-03-03 20:02:22+09:00',
    #                '2020-02-03 05:03:33+09:00'],
    #               dtype='datetime64[ns, Asia/Tokyo]', freq=None)
    print(index.tz_convert("Asia/Tokyo"))
 
    # 将 tz-naive 日期时间数组/索引本地化为 tz-aware 日期时间数组/索引。
    tz_naive = pd.date_range('2018-03-01 09:00', periods=3)
    # DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00',
    #                '2018-03-03 09:00:00'],
    #               dtype='datetime64[ns]', freq='D')
    print(tz_naive)
    # DatetimeIndex(['2018-03-01 09:00:00-05:00', '2018-03-02 09:00:00-05:00',
    #                '2018-03-03 09:00:00-05:00'],
    #               dtype='datetime64[ns, US/Eastern]', freq=None)
    print(tz_naive.tz_localize('US/Eastern'))
    # 使用tz=None实现本地化。
    # DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00',
    #                '2018-03-03 09:00:00'],
    #               dtype='datetime64[ns]', freq=None)
    print(tz_naive.tz_localize('US/Eastern').tz_localize(None))
 
    # 对数据执行舍入运算到指定的频率。
    # DatetimeIndex(['2020-01-03 00:00:00+00:00', '2020-03-03 00:00:00+00:00',
    #                '2020-02-03 00:00:00+00:00'],
    #               dtype='datetime64[ns, UTC]', freq=None)
    print(index.round('d'))
    # 对数据执行 floor 操作，直到达到指定的频率。
    # DatetimeIndex(['2020-01-03 00:00:00+00:00', '2020-03-03 00:00:00+00:00',
    #                '2020-02-02 00:00:00+00:00'],
    #               dtype='datetime64[ns, UTC]', freq=None)
    print(index.floor('d'))
    # 对数据执行 ceil 运算，直到达到指定的频率。
    # DatetimeIndex(['2020-01-04 00:00:00+00:00', '2020-03-04 00:00:00+00:00',
    #                '2020-02-03 00:00:00+00:00'],
    #               dtype='datetime64[ns, UTC]', freq=None)
    print(index.ceil('d'))
 
    # 返回具有指定语言环境的月份名称。默认是英语
    print(index.month_name())  # Index(['January', 'March', 'February'], dtype='object')
 
    # 返回具有指定语言环境的星期名称。默认是英语
    print(index.day_name())  # Index(['Friday', 'Tuesday', 'Sunday'], dtype='object')
 
    index = pd.DatetimeIndex(["1/3/2020 10:01:11.001", "3/3/2020 11:02:22.002", "2/2/2020 20:03:33.003"])
    # DatetimeIndex(['2020-01-03 10:01:11.001000', '2020-03-03 11:02:22.002000',
    #                '2020-02-02 20:03:33.003000'],
    #               dtype='datetime64[ns]', freq=None)
    print(index)
    # 转换为具有给定单位分辨率的数据类型。
    # DatetimeIndex(['2020-01-03 10:01:11', '2020-03-03 11:02:22',
    #                '2020-02-02 20:03:33'],
    #               dtype='datetime64[s]', freq=None)
    print(index.as_unit('s'))
 
    # 以特定频率转换为PeriodArray/PeriodIndex。
    print(index.to_period('d'))  # PeriodIndex(['2020-01-03', '2020-03-03', '2020-02-02'], dtype='period[D]')
 
    # 返回一个datetime.datetime对象的 ndarray。
    # [datetime.datetime(2020, 1, 3, 10, 1, 11, 1000)
    #  datetime.datetime(2020, 3, 3, 11, 2, 22, 2000)
    #  datetime.datetime(2020, 2, 2, 20, 3, 33, 3000)]
    print(index.to_pydatetime())
    print(type(index.to_pydatetime()))   # <class 'numpy.ndarray'>
 
    # 创建一个索引和值都等于索引键的Series对象。
    # 2020-01-03 10:01:11.001   2020-01-03 10:01:11.001
    # 2020-03-03 11:02:22.002   2020-03-03 11:02:22.002
    # 2020-02-02 20:03:33.003   2020-02-02 20:03:33.003
    # dtype: datetime64[ns]
    print(index.to_series())
 
    # 创建一个包含索引的列的 DataFrame。
    #                                            test
    # 2020-01-03 10:01:11.001 2020-01-03 10:01:11.001
    # 2020-03-03 11:02:22.002 2020-03-03 11:02:22.002
    # 2020-02-02 20:03:33.003 2020-02-02 20:03:33.003
    print(index.to_frame(name='test'))
 
    # 返回数组的平均值。
    print(index.mean())    #  2020-02-02 13:42:22.001999872
 
    # 返回请求轴上的样本标准差。
    print(index.std())     # 30 days 00:31:51.123317254

七. TimedeltaIndex

def timedelta_index():
    """Print a tour of ``pd.TimedeltaIndex`` attributes and methods.

    Nothing is returned; every result goes to stdout. The outputs quoted in
    the comments are the ones recorded in the original tutorial.
    """
    # TimedeltaIndex is an immutable index of timedelta64 data.
    whole_days = pd.TimedeltaIndex(['0 days', '1 days', '2 days'])
    # TimedeltaIndex(['0 days', '1 days', '2 days'], dtype='timedelta64[ns]', freq=None)
    print(whole_days)

    # Field accessors, printed in this order:
    #   days          number of days per element                      -> Index([0, 1, 2], dtype='int64')
    #   seconds       seconds (>= 0 and less than 1 day)              -> Index([0, 0, 0], dtype='int32')
    #   microseconds  microseconds (>= 0 and less than 1 second)      -> Index([0, 0, 0], dtype='int32')
    #   nanoseconds   nanoseconds (>= 0 and less than 1 microsecond)  -> Index([0, 0, 0], dtype='int32')
    for field in ('days', 'seconds', 'microseconds', 'nanoseconds'):
        print(getattr(whole_days, field))

    # DataFrame holding the individual resolution components of each Timedelta.
    #    days  hours  minutes  seconds  milliseconds  microseconds  nanoseconds
    # 0     0      0        0        0             0             0            0
    # 1     1      0        0        0             0             0            0
    # 2     2      0        0        0             0             0            0
    components = whole_days.components
    print(components)

    # String form of the frequency guessed by infer_freq.
    guessed_freq = whole_days.inferred_freq
    print(guessed_freq)  # D

    # Convert to a dtype with the given unit resolution.
    # TimedeltaIndex(['0 days', '1 days', '2 days'], dtype='timedelta64[s]', freq=None)
    in_seconds = whole_days.as_unit('s')
    print(in_seconds)

    # ndarray of datetime.timedelta objects.
    # [datetime.timedelta(0) datetime.timedelta(days=1)
    #  datetime.timedelta(days=2)]
    py_deltas = whole_days.to_pytimedelta()
    print(py_deltas)

    # Series whose index and values both equal the index keys.
    # 0 days   0 days
    # 1 days   1 days
    # 2 days   2 days
    # dtype: timedelta64[ns]
    as_series = whole_days.to_series()
    print(as_series)

    # A second index, offset by one hour, so that rounding has a visible effect.
    offset_days = pd.TimedeltaIndex(['0 days 1 hours', '1 days 1 hours', '2 days 1 hours'])

    # Snap the data to the given frequency, printed in this order:
    #   round -> TimedeltaIndex(['0 days', '1 days', '2 days'], dtype='timedelta64[ns]', freq=None)
    #   floor -> TimedeltaIndex(['0 days', '1 days', '2 days'], dtype='timedelta64[ns]', freq=None)
    #   ceil  -> TimedeltaIndex(['1 days', '2 days', '3 days'], dtype='timedelta64[ns]', freq=None)
    for snap in (offset_days.round, offset_days.floor, offset_days.ceil):
        print(snap('D'))

    # DataFrame with a single column containing the index.
    #                            test
    # 0 days 01:00:00 0 days 01:00:00
    # 1 days 01:00:00 1 days 01:00:00
    # 2 days 01:00:00 2 days 01:00:00
    as_frame = offset_days.to_frame(name='test')
    print(as_frame)

    # Mean of the values.
    # 1 days 01:00:00
    average = offset_days.mean()
    print(average)

八. PeriodIndex

def period_index():
    # 不可变的 ndarray保存表示时间周期的序数值。
    index = pd.PeriodIndex.from_fields(year=[2000, 2002], quarter=[1, 3], freq='Q')
    print(index)                # PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]')
    # 每个元素的天数
    print(index.day)            # Index([31, 30], dtype='int64')
    # 星期几，星期一=0，星期日=6。
    print(index.dayofweek)      # Index([4, 0], dtype='int64')
    # 星期几，星期一=0，星期日=6。
    print(index.day_of_week)    # Index([4, 0], dtype='int64')
    # 一年中的序数日。
    print(index.dayofyear)      # Index([91, 273], dtype='int64')
    # 一年中的序数日。
    print(index.day_of_year)    # Index([91, 273], dtype='int64')
    # 该月的天数。
    print(index.days_in_month)  # Index([31, 30], dtype='int64')
    # 该月的天数。
    print(index.daysinmonth)    # Index([31, 30], dtype='int64')
    # 获取该期间结束的时间戳。
    # DatetimeIndex(['2000-03-31 23:59:59.999999999', '2002-09-30 23:59:59.999999999'],
    # dtype='datetime64[ns]', freq=None)
    print(index.end_time)
    print(index.freq)           # <QuarterEnd: startingMonth=12>
    # 如果已设置，则将频率对象作为字符串返回，否则返回 None。
    print(index.freqstr)        # Q-DEC
    # 小时。
    print(index.hour)           # Index([0, 0], dtype='int64')
    # 逻辑表明该日期是否属于闰年。
    print(index.is_leap_year)   # [ True False]
    # 该时间段的分钟。
    print(index.minute)         # Index([0, 0], dtype='int64')
    # 月份、为一月=1，十二月=12。
    print(index.month)          # Index([3, 9], dtype='int64')
    # 该日期的季度。
    print(index.quarter)        # Index([1, 3], dtype='int64')
    # 年份
    print(index.qyear)          # Index([2000, 2002], dtype='int64')
    # 秒
    print(index.second)         # Index([0, 0], dtype='int64')
    # 获取该区间开始的时间戳。
    print(index.start_time)     # DatetimeIndex(['2000-01-01', '2002-07-01'], dtype='datetime64[ns]', freq=None)
    # 一年中第几周。
    print(index.week)           # Index([13, 40], dtype='int64')
    # 星期几，星期一=0，星期日=6。
    print(index.weekday)        # Index([4, 0], dtype='int64')
    # 一年中第几周。
    print(index.weekofyear)     # Index([13, 40], dtype='int64')
    # 区间元素的年份。
    print(index.year)           # Index([2000, 2002], dtype='int64')
 
    # 将PeriodArray 转换为指定频率freq的PeriodIndex。
    print(index.asfreq('d'))    # PeriodIndex(['2000-03-31', '2002-09-30'], dtype='period[D]')
 
    # 使用指定的 date_format 将PeriodIndex转换为Index。
    print(index.strftime("%Y-%m-%d"))   # Index(['2000-03-31', '2002-09-30'], dtype='object')
 
    # DatetimeIndex(['2000-01-01', '2002-07-01'], dtype='datetime64[ns]', freq=None)
    print(index.to_timestamp())
 
    # PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]')
    print(pd.PeriodIndex.from_fields(year=[2000, 2002], quarter=[1, 3], freq='Q'))
 
    # 可以根据数字从1970-01-01日开始创建索引
    # PeriodIndex(['1970-01-01', '1970-01-02', '1970-01-03', '1970-01-04'], dtype='period[D]')
    print(pd.PeriodIndex.from_ordinals([0, 1, 2, 3], freq='d'))

posted @ 2025-01-01 21:57 桑胡 阅读(214) 评论(0) 收藏 举报

刷新页面返回顶部

桑胡

Pandas：Index使用练习

一. Index

1. 创建索引以及索引属性

2. 修改和计算函数

3. multiIndex兼容性函数

4.缺失值相关函数

5.转换

6.排序

7.时间操作

8.组合、连接、设置操作

9. 数据选择

10. Index对象的其他函数

二.RangeIndex索引

三.CategoricalIndex 分类索引

四.IntervalIndex 区间索引

五. MultiIndex

六. DatetimeIndex

七. TimedeltaIndex

八.PeriodIndex

公告