pd.get_dummies() 详细用法及源码解析 - XZY3031

公告

源代码分析

pandas：encoding.py get_dummies() 解析

def get_dummies(
    data,
    prefix=None,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    columns=None,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Convert categorical variable into dummy/indicator variables.

    A dummy variable is a binary column that stands for one possible value
    of a categorical variable. Each encoded variable is expanded into as
    many 0/1 columns as it has distinct values, and every output column is
    named after the value it represents. When the input is a DataFrame the
    name of the source column is placed in front of the value.

    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.
    prefix : str, list of str, or dict of str, default None
        Prefix for the generated column names. For a DataFrame, a single
        string is reused for every encoded column, a dict is looked up by
        column name, and ``None`` falls back to the names of the encoded
        columns. Any list-like value (as judged by ``is_list_like``) must
        contain exactly one entry per encoded column.
    prefix_sep : str, default '_'
        Separator placed between the prefix and the value. A list or a
        dictionary may be passed, following the same rules as `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs; if False, NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object`, `string`, or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True), which can reduce memory use, or by
        a regular NumPy array (False).
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing
        the first level. This is commonly done to avoid multicollinearity.
    dtype : dtype, default bool
        Data type for new columns. Only a single dtype is allowed. With
        bool the indicators are False/True, with float they are 0.0/1.0.
        The value is forwarded unchanged to ``_get_dummies_1d``.

    Returns
    -------
    DataFrame
        Dummy-coded data. If `data` contains other columns than the
        dummy-coded one(s), these will be prepended, unaltered, to the
        result.

    Raises
    ------
    TypeError
        If `data` is a DataFrame and `columns` is neither None nor
        list-like.
    ValueError
        If `data` is a DataFrame and a list-like `prefix` or `prefix_sep`
        does not have one entry per encoded column.

    See Also
    --------
    Series.str.get_dummies : Convert Series of strings to dummy codes.
    :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.dummies>` for more examples.

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> pd.get_dummies(s)
           a      b      c
    0   True  False  False
    1  False   True  False
    2  False  False   True
    3   True  False  False

    >>> s1 = ['a', 'b', np.nan]

    >>> pd.get_dummies(s1)
           a      b
    0   True  False
    1  False   True
    2  False  False

    >>> pd.get_dummies(s1, dummy_na=True)
           a      b    NaN
    0   True  False  False
    1  False   True  False
    2  False  False   True

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})

    In the next call ``col1`` is the prefix for column ``A`` (whose rows
    are 'a', 'b', 'a') and ``col2`` is the prefix for column ``B``.

    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1    True   False   False    True   False
    1  2   False    True    True   False   False
    2  3    True   False   False   False    True

    >>> pd.get_dummies(pd.Series(list('abcaa')))
           a      b      c
    0   True  False  False
    1  False   True  False
    2  False  False   True
    3   True  False  False
    4   True  False  False

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
           b      c
    0  False  False
    1   True  False
    2  False   True
    3  False  False
    4  False  False

    >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        # Anything that is not a DataFrame is encoded as a single variable.
        return _get_dummies_1d(
            data,
            prefix,
            prefix_sep,
            dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )

    encodable_dtypes = ["object", "string", "category"]

    # Pick the sub-frame to encode: either every column whose dtype is in
    # ``encodable_dtypes`` or exactly the columns the caller asked for.
    if columns is None:
        to_encode = data.select_dtypes(include=encodable_dtypes)
    elif is_list_like(columns):
        to_encode = data[columns]
    else:
        raise TypeError("Input must be a list-like for parameter `columns`")

    # Validate prefixes and separators up front: the ``zip`` below stops at
    # the shortest input, so a short list would silently drop columns.
    n_encoded = to_encode.shape[1]
    for arg_name, arg in (("prefix", prefix), ("prefix_sep", prefix_sep)):
        if is_list_like(arg) and len(arg) != n_encoded:
            raise ValueError(
                f"Length of '{arg_name}' ({len(arg)}) did not match the "
                "length of the columns being encoded "
                f"({n_encoded})."
            )

    # Turn ``prefix`` into an iterable yielding one prefix per encoded column.
    if isinstance(prefix, str):
        # Endless iterator that hands out the same prefix for every column.
        prefixes = itertools.repeat(prefix)
    elif isinstance(prefix, dict):
        prefixes = [prefix[name] for name in to_encode.columns]
    elif prefix is None:
        prefixes = to_encode.columns
    else:
        prefixes = prefix

    # Same normalisation for the separators.
    if isinstance(prefix_sep, str):
        separators = itertools.repeat(prefix_sep)
    elif isinstance(prefix_sep, dict):
        separators = [prefix_sep[name] for name in to_encode.columns]
    else:
        separators = prefix_sep

    # ``pieces`` collects the frames that are concatenated column-wise at the
    # end; the untouched columns (if any) go first so they are prepended.
    pieces: list[DataFrame]
    if to_encode.shape == data.shape:
        # The whole frame is being encoded, nothing is left to prepend.
        pieces = []
    elif columns is not None:
        # Only the requested columns are encoded; keep all the others.
        pieces = [data.drop(columns, axis=1)]
    else:
        # Only columns of an encodable dtype are encoded; keep the rest.
        pieces = [data.select_dtypes(exclude=encodable_dtypes)]

    # ``items()`` yields (column_name, column) pairs; only the column data is
    # handed to ``_get_dummies_1d``, which performs the actual conversion.
    for (_, column), pre, sep in zip(to_encode.items(), prefixes, separators):
        pieces.append(
            _get_dummies_1d(
                column,
                prefix=pre,
                prefix_sep=sep,
                dummy_na=dummy_na,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
        )
    return concat(pieces, axis=1)

pandas：encoding.py get_dummies() 举例：

>>> train_data.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

>>> train_data['Pclass'].dtype
dtype('int64')
>>> train_data['Sex'].dtype
dtype('O')  # 'O' 即 object 的缩写
>>> train_data['SibSp'].dtype 
dtype('int64')
>>> train_data['Parch'].dtype 
dtype('int64')

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
# 只对 object、string、category 三种类型的列做 dummy 编码；`Pclass`(int64)、`SibSp`(int64)、`Parch`(int64)
# 都不属于这三种类型，只有 `Sex`(object 类型) 属于，因此只有它被编码。新问题：Sex_female 这个名字是怎么来的？
# 答：见上文源码中的 `if prefix is None: prefix = data_to_encode.columns`：若 prefix 为 None，则使用被编码列的列名作为前缀。
# 此处前缀即原有的列名 Sex，分隔符使用默认的 '_'，后缀为该列的取值（female、male），即 object 类型的值（这个值是转换后的字符串吗？）
X.head() 
# 运行结果如下，X 的行数与 train_data 的行数相同
# 	Pclass	SibSp	Parch	Sex_female	Sex_male
#	3		1		0		False		True
#	1		1		0		True		False
#	3		0		0		True		False
#	1		1		0		True		False
#	3		0		0		False		True

# 该 get_dummies 的转换的逻辑为：
# 若某列为: A :[a, b, c, c, a]
# 则转换后有三列（因为 A 的值有三种类型：a，b，c 每种类型对应一个列）
# 第一列为 A_a : [true, false, false, false, true] 
#（因为原 A 列只有第一行和第五行为a，因此 A_a 第一行和第五行为 true）,同理如下：
# 第二列为 A_b : [false, true, false, false, false]
# 第三列为 A_c : [false, false, true, true, false]
# （可以发现，若把false看作0，true看作1，三列加起来，正好为一个全 1（true） 的列）

posted on 2023-08-17 08:42 XZY3031 阅读(45) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

23-zyXian

公告

源代码分析