Extracting Words
1. Background
At work I run into a lot of English vocabulary, and copying words down one by one in order to memorize them is far too slow.
Python can do this instead: tokenize the text, lemmatize each token, and check whether a string is actually an English word.
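As a small taste of those three steps before the full scripts below, here is a minimal sketch; the sample sentence is invented for illustration.

# Minimal sketch: tokenize, lemmatize, and test whether a string is a word.
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('wordnet')

tokens = word_tokenize('The economies grew faster than qzxvb expected')
lemmatizer = WordNetLemmatizer()
for t in tokens:
    t = t.lower()
    if wordnet.synsets(t):                       # has a WordNet entry -> a real word
        print(t, '->', lemmatizer.lemmatize(t))  # e.g. economies -> economy
    # 'qzxvb' has no synsets, so it is silently skipped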
1.1 Preparation
An Excel spreadsheet with the English text to process, plus an optional hand-made stop-word list (stop_word.xlsx in the scripts below).
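To just try the pipeline, a throwaway test workbook can be generated first; the two rows below are made up, and the sheet and column names match what the script in 2.1 expects.

# Build a tiny made-up test workbook: sheet 'wbwdi', text column 'WBWDI'.
import pandas as pd

sample = pd.DataFrame({'WBWDI': [
    'World Bank development indicators cover poverty and growth',
    'Renewable energy capacity expanded rapidly last year',
]})
sample.to_excel(r'E:\英语\工作单词.xlsx', sheet_name='wbwdi', index=False)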

2. Code
2.1 Extracting words from Excel
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import words
from googletrans import Translator
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Download the required NLTK data packages
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')   # needed for the words.words() dictionary check below

df = pd.read_excel(r'E:\英语\工作单词.xlsx', sheet_name='wbwdi')
# A hand-made stop-word list that supplements NLTK's built-in one
df_stop = pd.read_excel(r'E:\英语\stop_word.xlsx', sheet_name='Sheet1', names=['stop_word'])

result_word_set = set()

# Translate an English word into Chinese. If it cannot be translated,
# return None so the token is treated as "not an English word" and dropped.
def get_translation(word):
    translator = Translator()
    try:
        translation = translator.translate(word, src='en', dest='zh-cn')
        return translation.text
    except Exception:
        return None

# Build the loop invariants once, not once per row
stop_words = set(stopwords.words('english'))
stop_words.update(set(df_stop['stop_word']))
lemmatizer = WordNetLemmatizer()
english_vocab = set(words.words())   # set membership is far faster than a list scan

for index, row in df.iterrows():
    text = row['WBWDI']
    tokens = word_tokenize(text)
    # Remove stop words
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    # Lemmatize, keeping only tokens that WordNet recognizes
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens if wordnet.synsets(word)]
    # Check that the string really is an English word: 3+ letters and present
    # in the NLTK dictionary. (A PorterStemmer pass was tried here originally,
    # but its result was immediately overwritten, so the stemming step is dropped.)
    real_words = [w for w in lemmatized_tokens if len(w) >= 3 and w in english_vocab]
    result_word_set.update(real_words)

result_word_list = sorted(result_word_set)
df_result = pd.DataFrame(data={'WBWDI': result_word_list})
df_result['translation'] = df_result['WBWDI'].map(get_translation)
df_result.dropna(subset=['translation'], inplace=True)   # discard untranslatable tokens
df_result.to_excel(excel_writer=r'E:\英语\word.xlsx', index_label='index')
df_result.to_csv(path_or_buf=r'E:\英语\wbwdi.txt', index=False, header=False, columns=['WBWDI'])
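One design note: the script keeps lemmas rather than Porter stems because stems are frequently not dictionary words, so they would fail the words.words() membership check. A quick comparison (outputs shown in comments):

# Why lemmas are kept instead of Porter stems: stems often aren't real words.
from nltk.stem import PorterStemmer, WordNetLemmatizer

ps = PorterStemmer()
lem = WordNetLemmatizer()
for w in ['studies', 'expanded']:
    print(w, '| stem:', ps.stem(w), '| lemma:', lem.lemmatize(w))
# studies  | stem: studi  | lemma: study
# expanded | stem: expand | lemma: expanded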
2.2 Extracting words from a PDF
import re
import warnings
import pdfplumber
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import words
from googletrans import Translator
import pandas as pd
warnings.filterwarnings('ignore')

# Assumes the NLTK data packages from 2.1 have already been downloaded

pdf_path = r'E:\英语\EN_Hyperimperialism_RGB_240224.pdf'
column_name = 'Hyperimperialism'
result_word_set = set()

# Concatenate the extracted text of every page
with pdfplumber.open(pdf_path) as pdf_file:
    content = ''
    for page in pdf_file.pages:
        page_content = page.extract_text()
        if page_content:
            content += page_content

# Pull out every run of letters and deduplicate
str_list = list(set(re.findall(r'[A-Za-z]+', content)))

# Lemmatize, keeping only tokens that WordNet recognizes
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in str_list if wordnet.synsets(word)]

# Check that the string really is an English word: 3+ letters and present in
# the NLTK dictionary (the overwritten PorterStemmer pass is dropped, as in 2.1)
english_vocab = set(words.words())
real_words = [w for w in lemmatized_tokens if len(w) >= 3 and w in english_vocab]
result_word_set.update(real_words)

# Translate an English word into Chinese; return None on failure so the
# token is treated as "not an English word" and dropped.
def get_translation(word):
    translator = Translator()
    try:
        translation = translator.translate(word, src='en', dest='zh-cn')
        return translation.text
    except Exception:
        return None

result_word_list = sorted(result_word_set)
df_result = pd.DataFrame(data={column_name: result_word_list})
df_result['translation'] = df_result[column_name].map(get_translation)
df_result.dropna(subset=['translation'], inplace=True)   # discard untranslatable tokens
df_result.to_excel(excel_writer=r'E:\英语\word.xlsx', index_label='index')
df_result.to_csv(path_or_buf='E:\\英语\\' + column_name + '.txt', index=False, header=False, columns=[column_name])
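Before running the full pipeline on a large PDF, it is worth checking that pdfplumber can extract text at all, since scanned, image-only PDFs yield nothing. A small illustrative check:

# Sanity check: scanned/image-only pages return None from extract_text().
import pdfplumber

pdf_path = r'E:\英语\EN_Hyperimperialism_RGB_240224.pdf'
with pdfplumber.open(pdf_path) as pdf_file:
    print('pages:', len(pdf_file.pages))
    first_page_text = pdf_file.pages[0].extract_text() or ''
    print(first_page_text[:200])   # preview the first 200 characters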
3. Results


4. Pick a vocabulary app
Upload the processed words to an app that supports custom word lists for memorization; I use Shanbay (扇贝). The one-word-per-line .txt file written at the end of each script is the list to upload.
