DataFrame去重drop_duplicates
from pandas import DataFrame
import pandas as pd

# Sample frame: rows 0/1 and rows 3/4 are exact duplicates of each other.
frame = DataFrame({"k1": ["one"] * 3 + ["two"] * 4,
                   "k2": [1, 1, 2, 3, 3, 4, 5]})

# Check for duplicates: boolean Series, True where a row repeats an earlier row.
is_duplicate = frame.duplicated()

# De-duplicate into a new DataFrame; `frame` itself is left untouched.
deduplicated = frame.drop_duplicates()

# keep=False: drop every row that has a duplicate, keeping none of the copies.
without_any_repeated = frame.drop_duplicates(keep=False)

# keep='first' (the default): keep the first occurrence of each duplicated row.
keep_first = frame.drop_duplicates(keep='first')

# keep='last': keep the last occurrence of each duplicated row.
keep_last = frame.drop_duplicates(keep='last')

# Keep only the rows that are duplicated (one copy of each).
# duplicated(keep=False) flags every member of a duplicate group; the trailing
# drop_duplicates() then reduces each group to its first occurrence.
only_repeated = frame[frame.duplicated(keep=False)].drop_duplicates()

# De-duplicate using a single column as the key.
by_k1 = frame.drop_duplicates(['k1'])

# De-duplicate using several columns as the key.
by_k1_k2 = frame.drop_duplicates(['k1', 'k2'])

# inplace=True mutates `frame` itself and returns None. It is done last on
# purpose: running it earlier would strip the duplicates from `frame` and make
# every example above operate on already de-duplicated data.
frame.drop_duplicates(inplace=True)
from pandas import DataFrame, Series
import pandas as pd
import numpy as np

# Removing duplicated rows.
data = DataFrame(
    {
        "k1": ["one", "one", "one", "two", "two", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)
print(data)
#     k1  k2
# 0  one   1
# 1  one   1
# 2  one   2
# 3  two   3
# 4  two   3
# 5  two   4
# 6  two   4

# duplicated() reports, for each row, whether it repeats an earlier row.
print(data.duplicated())
# 0    False
# 1     True
# 2    False
# 3    False
# 4     True
# 5    False
# 6     True
# dtype: bool

# Remove the duplicated rows.
print(data.drop_duplicates())
#     k1  k2
# 0  one   1
# 2  one   2
# 3  two   3
# 5  two   4

# All columns are compared by default; a subset of columns can be given instead.
# The first occurrence of each value combination is kept by default;
# pass keep='last' to keep the last occurrence.
print(data.drop_duplicates(subset=["k1"]))
#     k1  k2
# 0  one   1
# 3  two   3

print(data.drop_duplicates(subset=["k1"], keep="last"))
#     k1  k2
# 2  one   2
# 6  two   4
本文来自博客园,作者:OTAKU_nicole,转载请注明原文链接:https://www.cnblogs.com/nicole-zhang/p/14434303.html