Extracting Words

1. Background

At work I run into large numbers of English words, and copying them out one by one in order to memorize them is far too slow.

Python can do the heavy lifting instead: tokenize the text, lemmatize each token back to its dictionary form, and check whether a string is actually an English word.
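Here is a minimal sketch of those three steps with NLTK (it assumes the punkt, wordnet, and words corpora are already downloaded, and the sample sentence is made up):

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words

# Tokenize a sample sentence into individual tokens
tokens = word_tokenize("The cats are running fast")

# Lemmatize each token back to its dictionary form ('cats' -> 'cat')
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(t.lower()) for t in tokens]

# Keep only strings that appear in NLTK's English word list
english = set(words.words())
print([w for w in lemmas if w in english])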


1.1 Preparation

An Excel spreadsheet containing the text to process (used in section 2.1).

A PDF file (used in section 2.2).

The nltk, pandas, googletrans, and pdfplumber packages.
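If any of these packages are missing, a one-time install looks roughly like this (the googletrans version pin is an assumption; 4.0.0rc1 is the commonly used release with a synchronous API, and openpyxl is what pandas uses to read and write .xlsx files):

pip install nltk pandas pdfplumber openpyxl googletrans==4.0.0rc1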

2. Code

2.1 Extracting words from Excel

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import words
from googletrans import Translator
import pandas as pd
import warnings
warnings.filterwarnings('ignore')



# Download the required NLTK data packages (one-time)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')   # needed for the words.words() vocabulary check below


df = pd.read_excel(r'E:\英语\工作单词.xlsx', sheet_name='wbwdi')
df_stop = pd.read_excel(r'E:\英语\stop_word.xlsx', sheet_name='Sheet1', names=['stop_word'])  # a custom stop-word list
result_word_set = set()


# Translate an English word into Chinese; if it cannot be translated,
# treat it as not a real English word (return None so it is dropped later)
def get_translation(word):
    translator = Translator()
    try:
        translation = translator.translate(word, src='en', dest='zh-cn')
        return translation.text
    except Exception:
        return None

# Build the stop-word set once: NLTK's English list plus the custom list
stop_words = set(stopwords.words('english'))
stop_words.update(set(df_stop['stop_word']))

# Turn the vocabulary into a set once: membership tests against a set are
# far faster than scanning words.words() for every token
english_words = set(words.words())

lemmatizer = WordNetLemmatizer()

for index, row in df.iterrows():
    text = row['WBWDI']
    tokens = word_tokenize(text)

    # Drop stop words
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Lemmatize, keeping only tokens that WordNet recognizes
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens if wordnet.synsets(word)]

    # Keep strings of length >= 3 that are real English words
    result_words = [w for w in lemmatized_tokens if len(w) >= 3 and w in english_words]
    result_word_set.update(result_words)


result_word_list = sorted(result_word_set)
result_word_dict = {'WBWDI': result_word_list}

df_result = pd.DataFrame(data=result_word_dict)
df_result['translation'] = df_result['WBWDI'].map(get_translation)
df_result.dropna(subset=['translation'], inplace=True)  # drop words googletrans could not translate

df_result.to_excel(excel_writer=r'E:\英语\word.xlsx', index_label='index')
df_result.to_csv(path_or_buf=r'E:\英语\wbwdi.txt', index=False, header=False, columns=['WBWDI'])
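A note on the translation step: googletrans wraps an unofficial Google Translate web endpoint, so it needs network access and can be throttled on long word lists. That is why get_translation above returns None on failure, letting dropna discard those rows instead of aborting the whole run.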

2.2 Extracting words from a PDF

import pdfplumber
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import words
from googletrans import Translator
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')

# The 'wordnet' and 'words' corpora downloaded in section 2.1 are required here too


pdf_path = r'E:\英语\EN_Hyperimperialism_RGB_240224.pdf'
column_name = 'Hyperimperialism'

result_word_set = set()
with pdfplumber.open(pdf_path) as pdf_file:
    content = ''
    for page in pdf_file.pages:
        page_content = page.extract_text()
        if page_content:
            content += page_content

# Extract alphabetic strings and deduplicate them
str_list = list(set(re.findall(r'[A-Za-z]+', content)))
# Lemmatize, keeping only tokens that WordNet recognizes
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in str_list if wordnet.synsets(word)]

# Keep strings of length >= 3 that are real English words
english_words = set(words.words())
result_word_set.update(w for w in lemmatized_tokens if len(w) >= 3 and w in english_words)


# Translate an English word into Chinese; return None when the request
# fails so the word can be dropped later
def get_translation(word):
    translator = Translator()
    try:
        translation = translator.translate(word, src='en', dest='zh-cn')
        return translation.text
    except Exception:
        return None


result_word_list = sorted(result_word_set)
result_word_dict = {column_name: result_word_list}

df_result = pd.DataFrame(data=result_word_dict)
df_result['translation'] = df_result[column_name].map(get_translation)
df_result.dropna(subset=['translation'], inplace=True)  # drop words googletrans could not translate

df_result.to_excel(excel_writer=r'E:\英语\word.xlsx', index_label='index')
df_result.to_csv(path_or_buf='E:\\英语\\' + column_name + '.txt', index=False, header=False, columns=[column_name])
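Note that both scripts write their Excel output to the same E:\英语\word.xlsx, so running the PDF script overwrites the spreadsheet produced in section 2.1; only the .txt word list, which is named after column_name, is kept separate.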

3. Results

The run produces word.xlsx, listing every extracted word next to its Chinese translation, plus a plain-text word list.

4. Choosing a vocabulary app

Upload the processed word list to an app for memorizing vocabulary; I use Shanbay (扇贝).
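The .txt files written by the scripts above contain one word per line, with no header or index column, which is the kind of plain word list such apps typically accept for batch import.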