二阶HMM词性标注
HMM在自然语言处理中十分有用,这里介绍其在词性标注中的应用。
首先NLP中需要对数据稀疏问题进行处理,常用方法包括加一平滑、留存估计、Good-Turing平滑以及线性插值。本文介绍Good-Turing平滑方法,它把数据分布看成多项式分布。根据Zipf定律,设在语料库中出现r次的词共有N_r个,那么r越大,N_r越小,并且两者的乘积大致是一个常数,即 r·N_r ≈ (r+1)·N_{r+1}。这给我们一个启示:可以通过已知事件去估计未知事件,例如在语料库中出现0次的词,其估计出现次数为 N_1/N_0。为了保证估计的准确性,一般只对低频词进行重新估计。
def good_turing(counts):
    """Turn raw occurrence counts into a Good-Turing smoothed distribution.

    counts -- list of non-negative integer occurrence counts, one per event.

    Returns a list of probabilities of the same length that sums to 1,
    or a list of zeros (same length) when all counts are zero, so the
    empty input yields the empty list.

    Good-Turing replaces a raw count r by r* = (r+1) * N[r+1] / N[r],
    where N[r] is the number of events seen exactly r times.  Only
    low-frequency counts (r < 8) are re-estimated; higher counts are
    considered reliable and kept as-is.
    """
    total_occurrences = sum(counts)
    if total_occurrences == 0:
        # Nothing was observed: no distribution can be estimated.
        return [0] * len(counts)
    # freq_of_freq[r] = number of events that occurred exactly r times.
    freq_of_freq = [0] * (max(counts) + 1)
    for r in counts:
        freq_of_freq[r] += 1
    # Re-estimate only counts below this boundary (at most 8).
    smooth_boundary = min(len(freq_of_freq) - 1, 8)
    # Adjusted count for each r: the Good-Turing estimate when both N[r]
    # and N[r+1] are available, otherwise fall back to the raw r.
    adjusted = [
        (r + 1) * freq_of_freq[r + 1] / freq_of_freq[r]
        if r < smooth_boundary and freq_of_freq[r] != 0 and freq_of_freq[r + 1] != 0
        else r
        for r in range(len(freq_of_freq))
    ]
    # Map every event's raw count to its adjusted count, then normalize.
    # The normalizer is > 0 here: at least one count is >= 1 and its
    # adjusted value is positive.
    prob = [adjusted[c] for c in counts]
    norm = sum(prob)
    return [p / norm for p in prob]
接下来对数据进行训练,计算HMM的参数(发射矩阵和转移矩阵)。
def train(self, wrd_path, tag_path):
    """Train the second-order HMM: count events from two aligned corpora
    and build Good-Turing-smoothed emission and transition matrices.

    wrd_path -- path to the word corpus (read via
                sent_iterator(corpus_iterator(...))).
    tag_path -- path to the tag corpus, aligned sentence-by-sentence
                and token-by-token with wrd_path.

    Side effects: populates self.tag2num / self.num2tag (tag ids,
    0-based), self.word2num (word ids, 1-based; index 0 unused),
    self.ntags / self.nwords, self.em_prob and self.tr_prob.
    """
    emission_counts = defaultdict(int)
    trigram_counts = defaultdict(int)
    tags = set()
    words = set()
    wrd_iter = sent_iterator(corpus_iterator(wrd_path))
    tag_iter = sent_iterator(corpus_iterator(tag_path))
    # Count (word, tag) emissions and tag trigrams.
    for (wrd_sent, tag_sent) in zip(wrd_iter, tag_iter):
        for (wrd, tag) in zip(wrd_sent, tag_sent):
            words.add(wrd)
            tags.add(tag)
            emission_counts[(wrd, tag)] += 1
        # Pad with two '*' start markers and a final 'STOP' so every
        # tag position (first and last included) is in some trigram.
        tag_boundary = 2 * ['*']
        tag_boundary.extend(tag_sent)
        tag_boundary.append('STOP')
        for i in range(len(tag_boundary) - 2):
            trigram_counts[tuple(tag_boundary[i:i+3])] += 1
    # Assign numeric ids: tags get 0 .. ntags-1 ...
    for tag in tags:
        self.tag2num[tag] = self.ntags
        self.num2tag.append(tag)
        self.ntags += 1
    # ... words get 1 .. nwords (slot 0 of each emission row is unused).
    for wrd in words:
        self.nwords += 1
        self.word2num[wrd] = self.nwords
    print(self.ntags, ' ', self.nwords)
    # Build the emission and transition matrices.
    nt = self.ntags
    nw = self.nwords
    # tr_prob has an extra row/column at index nt for the '*' boundary
    # tag; the last entry of each transition row stands for 'STOP'.
    self.em_prob = [None for i in range(nt)]
    self.tr_prob = [[None for i in range(nt+1)] for j in range(nt+1)]
    # Emission matrix: one smoothed distribution over words per tag.
    for i in range(nt):
        tag = self.num2tag[i]
        counts = [0] * (nw+1)
        for wrd in words:
            counts[self.word2num[wrd]] = emission_counts[(wrd, tag)]
        self.em_prob[i] = good_turing(counts)
    # Transition rows for (u, v, w) and (u, v, 'STOP').
    for i in range(nt):
        u = self.num2tag[i]
        for j in range(nt):
            v = self.num2tag[j]
            counts = [0] * (nt+1)
            for w in tags:
                counts[self.tag2num[w]] = trigram_counts[(u, v, w)]
            counts[nt] = trigram_counts[(u, v, 'STOP')]
            self.tr_prob[i][j] = good_turing(counts)
    # Transition rows for ('*', v, w) -- the second word of a sentence.
    for j in range(nt):
        v = self.num2tag[j]
        counts = [0] * (nt+1)
        for w in tags:
            counts[self.tag2num[w]] = trigram_counts[('*', v, w)]
        counts[nt] = trigram_counts[('*', v, 'STOP')]
        self.tr_prob[nt][j] = good_turing(counts)
    # Transition row for ('*', '*', w) -- the first word of a sentence.
    # NOTE(review): this row has length nt, not nt+1 -- no 'STOP' slot,
    # so "STOP immediately after start" (empty sentence) is not modeled;
    # confirm this asymmetry is intentional before relying on it.
    counts = [0] * nt
    for w in tags:
        counts[self.tag2num[w]] = trigram_counts[('*', '*', w)]
    self.tr_prob[nt][nt] = good_turing(counts)
最后利用发射矩阵和转移矩阵预测新句子的词性,使用的算法就是经典的Viterbi算法。
def viterbi(self, sent):
    """Return the most likely tag sequence for `sent` under the trained
    second-order HMM, via the Viterbi dynamic program.

    sent -- list of words; every word must be present in self.word2num
            (unknown words raise KeyError -- no OOV handling here,
            TODO if needed).

    Returns the list of tag names, one per word (empty list for an
    empty sentence).

    Index convention: tag ids 0..nt-1 are real tags; index nt stands
    for the '*' boundary tag in tr_prob's rows/columns, and the last
    entry of each transition row stands for 'STOP'.
    """
    n = len(sent)
    nt = self.ntags
    if n == 0:
        return []  # nothing to tag (the original code would crash here)
    y = [None] * n
    # val[k][u][v]: best score with tag u at word k and tag v at word k+1.
    # path[k][u][v]: argmax tag at word k-1 achieving val[k][u][v].
    path = [[[0] * nt for _ in range(nt)] for _ in range(n - 1)]
    val = [[[0] * nt for _ in range(nt)] for _ in range(n - 1)]
    # One-word sentence: score start -> v -> STOP directly.
    if n == 1:
        wid0 = self.word2num[sent[0]]
        max_val = -100000
        for v in range(nt):
            tmp = self.tr_prob[nt][nt][v] * self.em_prob[v][wid0] * self.tr_prob[nt][v][nt]
            if tmp > max_val:
                max_val = tmp
                y[0] = v
        return [self.num2tag[y[0]]]
    # Initialization: the first two words, preceded by the ('*', '*') pair.
    wid0 = self.word2num[sent[0]]
    wid1 = self.word2num[sent[1]]
    for u in range(nt):
        for v in range(nt):
            val[0][u][v] = self.tr_prob[nt][nt][u] * self.em_prob[u][wid0] * \
                self.tr_prob[nt][u][v] * self.em_prob[v][wid1]
            path[0][u][v] = -1
    # DP over the remaining words.  Loop-invariant lookups (word id and
    # emission probability) are hoisted out of the innermost w-loop;
    # multiplication order is unchanged, so scores are identical.
    for k in range(1, n - 1):
        wid = self.word2num[sent[k + 1]]
        for u in range(nt):
            for v in range(nt):
                em = self.em_prob[v][wid]
                max_val = -100000
                best_tag = -1
                for w in range(nt):
                    tmp = val[k - 1][w][u] * self.tr_prob[w][u][v] * em
                    if tmp > max_val:
                        max_val = tmp
                        best_tag = w
                val[k][u][v] = max_val
                path[k][u][v] = best_tag
    # Termination: add the transition to STOP and pick the best last pair.
    max_val = -100000
    for u in range(nt):
        for v in range(nt):
            tmp = val[n - 2][u][v] * self.tr_prob[u][v][nt]
            if tmp > max_val:
                max_val = tmp
                y[-1] = v
                y[-2] = u
    # Backtrace through the stored argmax pointers.
    for k in range(n - 3, -1, -1):
        y[k] = path[k + 1][y[k + 1]][y[k + 2]]
    return [self.num2tag[t] for t in y]
浙公网安备 33010602011771号