DataFrame 去重:drop_duplicates

from pandas import DataFrame
import pandas as pd

# Sample data: rows 0/1 ("one", 1) and rows 3/4 ("two", 3) are exact duplicates.
frame = DataFrame({"k1": ["one"] * 3 + ["two"] * 4,
                   "k2": [1, 1, 2, 3, 3, 4, 5]})

# Every example below (except the last) returns a new object and leaves
# ``frame`` untouched, so each one is evaluated against the full 7-row data.

# Check for duplicates: True for each row that repeats an earlier row.
is_duplicate = frame.duplicated()
# Drop duplicates, producing a new DataFrame.
deduped = frame.drop_duplicates()
# keep=False: drop every row that has a duplicate (keep none of the copies).
without_any_duplicated = frame.drop_duplicates(keep=False)
# keep='first' (the default): keep the first occurrence of each duplicate.
deduped_keep_first = frame.drop_duplicates(keep='first')
# keep='last': keep the last occurrence of each duplicate.
deduped_keep_last = frame.drop_duplicates(keep='last')
# Keep only the rows that are duplicated (one copy of each): select every row
# that has a duplicate, then collapse the copies down to the first occurrence.
only_duplicated = frame[frame.duplicated(keep=False)].drop_duplicates()
# Drop duplicates judged on a single column.
deduped_by_k1 = frame.drop_duplicates(subset=['k1'])
# Drop duplicates judged on several columns.
deduped_by_k1_k2 = frame.drop_duplicates(subset=['k1', 'k2'])

# inplace=True modifies ``frame`` itself and returns None. It is run last on
# purpose: running it earlier would strip the duplicates that the examples
# above are meant to demonstrate.
frame.drop_duplicates(inplace=True)

 

from pandas import DataFrame,Series
import pandas as pd
import numpy as np

# Removing duplicate rows from a DataFrame.
data = DataFrame({
    "k1": ["one", "one", "one", "two", "two", "two", "two"],
    "k2": [1, 1, 2, 3, 3, 4, 4],
})
print(data)
'''
    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4
'''
# duplicated() reports, row by row, whether the row is a duplicate.
repeat_mask = data.duplicated()
print(repeat_mask)
'''
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
'''
# Remove the duplicate rows.
unique_rows = data.drop_duplicates()
print(unique_rows)
'''
    k1  k2
0  one   1
2  one   2
3  two   3
5  two   4
'''
# All columns are compared by default; a subset of columns can be given instead.
# The first occurrence of each value combination is kept by default;
# passing keep='last' keeps the last occurrence.
first_per_k1 = data.drop_duplicates(subset=["k1"])
print(first_per_k1)
'''
    k1  k2
0  one   1
3  two   3
'''
last_per_k1 = data.drop_duplicates(subset=["k1"], keep='last')
print(last_per_k1)
'''
    k1  k2
2  one   2
6  two   4
'''

 

posted @ 2021-02-23 09:53  OTAKU_nicole  阅读(244)  评论(0)  编辑  收藏  举报