【502】gensim实现Word2Vec
参考:Word Embedding Tutorial: word2vec using Gensim [EXAMPLE]
参考:NLP入门(三)词形还原(Lemmatization)
参考:Implementing Word2Vec with Gensim Library in Python
文本预处理
- 分词
- 单词转化为小写字母
- 去除单词中的标点符号
- 去除单词中的数字
- 去除空字符
- 去掉停用词
- 去掉空的list
- 词形还原
首先导入必要的 libraries
import gensim
import nltk
from gensim.models import Word2Vec
# Stop words
# NOTE(review): presumably needs the NLTK 'stopwords' corpus downloaded beforehand
# (nltk.download('stopwords')) — confirm in the target environment
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Punctuation
import string
# string.punctuation
# Lemmatization
# NOTE(review): presumably needs the NLTK 'wordnet' corpus downloaded beforehand — confirm
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
加载数据并显示
# Sample intents. Each entry carries a tag, the user "patterns" that serve as
# the training corpus below, and canned "responses" (not used for training).
data = [
    {
        "tag": "welcome",
        "patterns": ["Hi", "How are you", "Is any one to talk?", "Hello", "hi are you available"],
        "responses": ["Hello, thanks for contacting us", "Good to see you here", " Hi there, how may I assist you?"],
    },
    {
        "tag": "goodbye",
        "patterns": ["Bye", "See you later", "Goodbye", "I will come back soon"],
        "responses": ["See you later, thanks for visiting", "have a great day ahead", "Wish you Come back again soon."],
    },
    {
        "tag": "thankful",
        "patterns": ["Thanks for helping me", "Thank your guidance", "That's helpful and kind from you"],
        "responses": ["Happy to help!", "Any time!", "My pleasure", "It is my duty to help you"],
    },
    {
        "tag": "hoursopening",
        "patterns": ["What hours are you open?", "Tell your opening time?", "When are you open?", "Just your timing please"],
        "responses": ["We're open every day 8am-7pm", "Our office hours are 8am-7pm every day", "We open office at 8 am and close at 7 pm"],
    },
    {
        "tag": "payments",
        "patterns": ["Can I pay using credit card?", " Can I pay using Mastercard?", " Can I pay using cash only?"],
        "responses": ["We accept VISA, Mastercard and credit card", "We accept credit card, debit cards and cash. Please don’t worry"],
    },
]

# Tokenize every pattern into a list of words (one inner list per pattern).
# The split is on a single space on purpose: a pattern with a leading space
# yields an empty-string token, which a later cleaning step removes.
bigger_list = [
    pattern.split(" ")
    for intent in data
    for pattern in intent["patterns"]
]
bigger_list
输出结果如下:
[['Hi'], ['How', 'are', 'you'], ['Is', 'any', 'one', 'to', 'talk?'], ['Hello'], ['hi', 'are', 'you', 'available'], ['Bye'], ['See', 'you', 'later'], ['Goodbye'], ['I', 'will', 'come', 'back', 'soon'], ['Thanks', 'for', 'helping', 'me'], ['Thank', 'your', 'guidance'], ["That's", 'helpful', 'and', 'kind', 'from', 'you'], ['What', 'hours', 'are', 'you', 'open?'], ['Tell', 'your', 'opening', 'time?'], ['When', 'are', 'you', 'open?'], ['Just', 'your', 'timing', 'please'], ['Can', 'I', 'pay', 'using', 'credit', 'card?'], ['', 'Can', 'I', 'pay', 'using', 'Mastercard?'], ['', 'Can', 'I', 'pay', 'using', 'cash', 'only?']]
将单词都转换为小写字母:
# Convert every word to lowercase
bigger_list = [[w.lower() for w in s] for s in bigger_list]
bigger_list
输出结果如下:
[['hi'], ['how', 'are', 'you'], ['is', 'any', 'one', 'to', 'talk?'], ['hello'], ['hi', 'are', 'you', 'available'], ['bye'], ['see', 'you', 'later'], ['goodbye'], ['i', 'will', 'come', 'back', 'soon'], ['thanks', 'for', 'helping', 'me'], ['thank', 'your', 'guidance'], ["that's", 'helpful', 'and', 'kind', 'from', 'you'], ['what', 'hours', 'are', 'you', 'open?'], ['tell', 'your', 'opening', 'time?'], ['when', 'are', 'you', 'open?'], ['just', 'your', 'timing', 'please'], ['can', 'i', 'pay', 'using', 'credit', 'card?'], ['', 'can', 'i', 'pay', 'using', 'mastercard?'], ['', 'can', 'i', 'pay', 'using', 'cash', 'only?']]
删除单词里面的标点符号
import string

# string.punctuation holds the punctuation characters as a single string.
# Removing them from one word works like this:
#   ''.join([x for x in 'alex?' if x not in string.punctuation])  # -> 'alex'

# Remove punctuation characters from every word
bigger_list = [
    [''.join([x for x in w if x not in string.punctuation]) for w in s]
    for s in bigger_list
]
bigger_list
输出结果如下:
[['hi'], ['how', 'are', 'you'], ['is', 'any', 'one', 'to', 'talk'], ['hello'], ['hi', 'are', 'you', 'available'], ['bye'], ['see', 'you', 'later'], ['goodbye'], ['i', 'will', 'come', 'back', 'soon'], ['thanks', 'for', 'helping', 'me'], ['thank', 'your', 'guidance'], ['thats', 'helpful', 'and', 'kind', 'from', 'you'], ['what', 'hours', 'are', 'you', 'open'], ['tell', 'your', 'opening', 'time'], ['when', 'are', 'you', 'open'], ['just', 'your', 'timing', 'please'], ['can', 'i', 'pay', 'using', 'credit', 'card'], ['', 'can', 'i', 'pay', 'using', 'mastercard'], ['', 'can', 'i', 'pay', 'using', 'cash', 'only']]
去掉空字符
# Drop empty-string tokens (e.g. those produced by a leading space in a pattern)
bigger_list = [[w for w in s if w != ''] for s in bigger_list]
bigger_list
输出结果如下:
[['hi'], ['how', 'are', 'you'], ['is', 'any', 'one', 'to', 'talk'], ['hello'], ['hi', 'are', 'you', 'available'], ['bye'], ['see', 'you', 'later'], ['goodbye'], ['i', 'will', 'come', 'back', 'soon'], ['thanks', 'for', 'helping', 'me'], ['thank', 'your', 'guidance'], ['thats', 'helpful', 'and', 'kind', 'from', 'you'], ['what', 'hours', 'are', 'you', 'open'], ['tell', 'your', 'opening', 'time'], ['when', 'are', 'you', 'open'], ['just', 'your', 'timing', 'please'], ['can', 'i', 'pay', 'using', 'credit', 'card'], ['can', 'i', 'pay', 'using', 'mastercard'], ['can', 'i', 'pay', 'using', 'cash', 'only']]
去掉停用词
from nltk.corpus import stopwords

# English stop-word list
stop = stopwords.words('english')
# Build a set once: membership tests against the list would rescan it for every token
stop_set = set(stop)
# Remove stop words from every sentence
bigger_list = [[w for w in s if w not in stop_set] for s in bigger_list]
bigger_list
输出结果如下:
[['hi'], [], ['one', 'talk'], ['hello'], ['hi', 'available'], ['bye'], ['see', 'later'], ['goodbye'], ['come', 'back', 'soon'], ['thanks', 'helping'], ['thank', 'guidance'], ['thats', 'helpful', 'kind'], ['hours', 'open'], ['tell', 'opening', 'time'], ['open'], ['timing', 'please'], ['pay', 'using', 'credit', 'card'], ['pay', 'using', 'mastercard'], ['pay', 'using', 'cash']]
去掉空的 list
# Drop sentences that have no tokens left (e.g. after stop-word removal)
bigger_list = [s for s in bigger_list if s]
bigger_list
输出结果如下:
[['hi'], ['one', 'talk'], ['hello'], ['hi', 'available'], ['bye'], ['see', 'later'], ['goodbye'], ['come', 'back', 'soon'], ['thanks', 'helping'], ['thank', 'guidance'], ['thats', 'helpful', 'kind'], ['hours', 'open'], ['tell', 'opening', 'time'], ['open'], ['timing', 'please'], ['pay', 'using', 'credit', 'card'], ['pay', 'using', 'mastercard'], ['pay', 'using', 'cash']]
词形还原
# Lemmatization
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
# lemmatize() is called without a part-of-speech argument here.
# NOTE(review): NLTK then appears to treat each word as a noun, which would explain
# why 'hours' becomes 'hour' while 'helping' stays unchanged — confirm against NLTK docs
bigger_list = [[wnl.lemmatize(w) for w in s] for s in bigger_list]
bigger_list
输出结果如下:
[['hi'], ['one', 'talk'], ['hello'], ['hi', 'available'], ['bye'], ['see', 'later'], ['goodbye'], ['come', 'back', 'soon'], ['thanks', 'helping'], ['thank', 'guidance'], ['thats', 'helpful', 'kind'], ['hour', 'open'], ['tell', 'opening', 'time'], ['open'], ['timing', 'please'], ['pay', 'using', 'credit', 'card'], ['pay', 'using', 'mastercard'], ['pay', 'using', 'cash']]
模型训练并存储以及调用
# Train the model on the cleaned token lists
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.x appears to have renamed it
# to `vector_size` — confirm against the installed gensim version
model= Word2Vec(bigger_list,min_count=1,size=300,workers=4)
# Save the model (the same save() call is used for both file names;
# the second file is the one reloaded below)
model.save("word2vec.model")
model.save('word2vec.bin')
# Load the model
model = Word2Vec.load('word2vec.bin')
# Vocabulary
# NOTE(review): `wv.vocab` is gensim 3.x API; gensim 4.x reportedly replaces it with
# `wv.key_to_index` — confirm
list(model.wv.vocab)
# Vector for the word 'thanks'
model.wv.word_vec('thanks')
word2vec API讲解
在gensim中,word2vec 相关的API都在包gensim.models.word2vec中。和算法有关的参数都在类gensim.models.word2vec.Word2Vec中。算法需要注意的参数有:
- sentences:我们要分析的语料,可以是一个列表,或者从文件中遍历读出(word2vec.LineSentence(filename) )。
- size:词向量的维度,默认值是100。这个维度的取值一般与我们的语料的大小相关,如果是不大的语料,比如小于100M的文本语料,则使用默认值一般就可以了。如果是超大的语料,建议增大维度。(注意:gensim 4.0 起该参数更名为 vector_size。)
- window:即词向量上下文最大距离,window越大,则和某一词较远的词也会产生上下文关系。默认值为5,在实际使用中,可以根据实际的需求来动态调整这个window的大小。如果是小语料则这个值可以设的更小。对于一般的语料这个值推荐在[5,10]之间。
- sg:即我们的word2vec两个模型的选择了。如果是0, 则是CBOW模型;是1则是Skip-Gram模型;默认是0即CBOW模型。
- hs:即我们的word2vec两个解法的选择了。如果是0并且负采样个数negative大于0,则是Negative Sampling;是1则是Hierarchical Softmax。默认是0即Negative Sampling。
- negative:即使用Negative Sampling时负采样的个数,默认是5。推荐在[3,10]之间。这个参数在我们的算法原理篇中标记为neg。
- cbow_mean:仅用于CBOW在做投影的时候,为0,则算法中的xw为上下文的词向量之和,为1则为上下文的词向量的平均值。在我们的原理篇中,是按照词向量的平均值来描述的。个人比较喜欢用平均值来表示xw,默认值也是1,不推荐修改默认值。
- min_count:需要计算词向量的最小词频。这个值可以去掉一些很生僻的低频词,默认是5。如果是小语料,可以调低这个值。
- iter:随机梯度下降法中迭代的最大次数,默认是5。对于大语料,可以增大这个值。(注意:gensim 4.0 起该参数更名为 epochs。)
- alpha:在随机梯度下降法中迭代的初始步长。算法原理篇中标记为η,默认是0.025。
- min_alpha: 由于算法支持在迭代的过程中逐渐减小步长,min_alpha给出了最小的迭代步长值。随机梯度下降中每轮的迭代步长可以由iter,alpha, min_alpha一起得出。这部分由于不是word2vec算法的核心内容,因此在原理篇我们没有提到。
利用json和pandas处理
#list of libraries used by the code
import string
from gensim.models import Word2Vec
import logging
from nltk.corpus import stopwords
from textblob import Word
import json
import pandas as pd
# Load the intents from JSON.
# NOTE(review): assumes the file holds a list of intent objects, each with a
# "patterns" list of strings — confirm against the real intents.json
json_file = 'intents.json'
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# English stop words; the set is built once so each membership test is O(1)
stop = stopwords.words('english')
stop_set = set(stop)

# One row per intent; all patterns of an intent are merged into a single string
df = pd.DataFrame(data)
df['patterns'] = df['patterns'].apply(', '.join)
print(df)

# Clean the text step by step
# 1. lowercase
df['patterns'] = df['patterns'].apply(lambda x: ' '.join(w.lower() for w in x.split()))
# 2. drop tokens that consist only of punctuation
df['patterns'] = df['patterns'].apply(lambda x: ' '.join(w for w in x.split() if w not in string.punctuation))
# 3. strip remaining punctuation characters inside tokens; regex=True is explicit
#    because newer pandas versions treat the pattern as a literal string by default
df['patterns'] = df['patterns'].str.replace(r'[^\w\s]', '', regex=True)
# 4. drop purely numeric tokens
df['patterns'] = df['patterns'].apply(lambda x: ' '.join(w for w in x.split() if not w.isdigit()))
# 5. drop stop words
df['patterns'] = df['patterns'].apply(lambda x: ' '.join(w for w in x.split() if w not in stop_set))
# 6. lemmatize
df['patterns'] = df['patterns'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# Build the list-of-token-lists structure expected by Word2Vec.
# split() without an argument avoids an empty-string token when a row was cleaned down to nothing.
bigger_list = [sentence.split() for sentence in df['patterns']]

# Structure of the data handed to Word2Vec
print("Data format for the overall list:",bigger_list)

# Train on the custom data
# NOTE(review): `size` is the gensim 3.x keyword (reportedly `vector_size` in gensim 4.x) — confirm
model = Word2Vec(bigger_list, min_count=1, size=300, workers=4)
# print(model)
浙公网安备 33010602011771号