import pandas as pd
if __name__ == '__main__':
    # Load the workbook that contains repeated student records and show it as-is.
    workbook_path = "C:/Users/18124/Desktop/pandas/020_定位_消除重复数据/副本Students_Duplicates.xlsx"
    student = pd.read_excel(workbook_path, engine="openpyxl")
    print(student)

    # Alternative ways to remove duplicates, kept for reference:
    #   1. match on a single column:
    #        student.drop_duplicates(subset="Name", inplace=True)
    #   2. match on several columns at once:
    #        student.drop_duplicates(subset=["Name", "Test_1", "Test_2"], inplace=True)

    # 3. Match on a single column and drop the earlier repeats:
    #    keep="last" retains the last row seen for each Name.
    student = student.drop_duplicates(subset="Name", keep="last")
    print(student)
import pandas as pd
if __name__ == '__main__':
    # Load the workbook that contains repeated student records and show it as-is.
    student = pd.read_excel(
        "C:/Users/18124/Desktop/pandas/020_定位_消除重复数据/副本Students_Duplicates.xlsx",
        engine="openpyxl",
    )
    print(student)

    # 1. Check whether duplicates exist: duplicated() yields a boolean Series
    #    with one entry per row, computed on the "Name" column only.
    dupe = student.duplicated(subset="Name")
    print(dupe.any())  # True when at least one row is flagged as a duplicate

    # 2. Print the duplicated rows. Select them with the boolean mask itself
    #    rather than feeding index labels to the positional .iloc indexer,
    #    which is only correct while the frame has a default 0..n-1 index.
    print(student[dupe])