import pandas as pd
# 读取excel文件,并保存为DataFrame
df1 = pd.read_excel(r"C:\Users\admin\Desktop\不规则动词\不规则动词_irregular verbs\Sheet1.xlsx",
names=["基本形态", "过去式", "过去分词"])
df2 = pd.read_excel(r"C:\Users\admin\Desktop\不规则动词\不规则动词_irregular verbs\Sheet2.xlsx",
names=["基本形态", "过去式", "过去分词"])
df3 = pd.read_excel(r"C:\Users\admin\Desktop\不规则动词\不规则动词_irregular verbs\Sheet3.xlsx",
names=["基本形态", "过去式", "过去分词"])
df4 = pd.read_excel(r"C:\Users\admin\Desktop\不规则动词\不规则动词_irregular verbs\Sheet4.xlsx",
names=["基本形态", "过去式", "过去分词"])
# 把4个frame合并成一个,并去重
frame = [df1, df2, df3, df4]
frame = pd.concat(frame).drop_duplicates()
# 用正则替换指定列内容
# 1. 使用regex=True
frame["过去式"].replace(r"\s?,\s?|\sor\s", " / ", regex=True, inplace=True)
# frame["过去分词"].replace("\s?,\s?|\sor\s", " / ", regex=True, inplace=True)
# 2. 也可以简单地传递一个字典,其中key是要替换的目标内容,value是要替换的内容
replacement_mapping_dict = {
r'\s?,\s?': ' / ',
r'\sor\s': ' / '
}
frame['过去分词'].replace(replacement_mapping_dict, inplace=True)
# 3. 如果有多个想要匹配的正则表达式,可以在列表中定义它们,并将其作为关键字参数传递给 replace 方法。
regex_list = [r'\s?,\s?', r'\sor\s']
frame['过去分词'].replace(regex=regex_list, value=r' / ', inplace=True)
# 根据某列去重
frame.drop_duplicates(subset=['过去式'], keep='first', inplace=True)
# 根据某列值进行排序
frame.sort_values(by=['基本形态'], ascending=False)
# 处理好的数据保存到excel文件中
frame.to_excel(r"C:\Users\admin\Desktop\不规则动词\不规则动词_irregular verbs\Sheet01.xlsx")