使用 RNN(循环神经网络),根据不同国家(语言)的名字特点和首字母生成名字

数据地址:https://download.pytorch.org/tutorial/data.zip

  1 """
  2 @File    :HomeWork6_RNN_names.py
  3 @Author  : silvan
  4 @Time    : 2022/11/28 10:56
  5 本文对应官网地址:https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html
  6 不同国家名字特点+首字符 => 生成英文名字
  7 参考博客:https://blog.csdn.net/weixin_30731305/article/details/96217033
  8 """
  9 
 10 # 数据处理##################################################################################
 11 from __future__ import unicode_literals, print_function, division
 12 from io import open
 13 import glob
 14 import os
 15 import unicodedata
 16 import string
 17 from torch.nn.modules import rnn
 18 
# Name files live in data/names/[language].txt, one name per line. Each file is read into a list
# and converted to ASCII characters, producing a dict {language: [name1, name2, ...]}.
all_letters = string.ascii_letters + " .,;'-"  # every character allowed in a name
n_letters = len(all_letters) + 1  # Plus EOS marker
 23 
 24 def findFiles(path): return glob.glob(path)
 25 
 26 # Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
 27 def unicodeToAscii(s):
 28     return ''.join(
 29         c for c in unicodedata.normalize('NFD', s)
 30         if unicodedata.category(c) != 'Mn'
 31         and c in all_letters
 32     )
 33 
 34 
 35 # Read a file and split into lines
 36 def readLines(filename):
 37     lines = open(filename, encoding='utf-8').read().strip().split('\n')
 38     return [unicodeToAscii(line) for line in lines]
 39 
 40 
 41 # Build the category_lines dictionary, a list of lines per category
 42 category_lines = {}
 43 all_categories = []
 44 for filename in findFiles('data/names/*.txt'):
 45     category = os.path.splitext(os.path.basename(filename))[0]
 46     all_categories.append(category)
 47     lines = readLines(filename)
 48     category_lines[category] = lines
 49 
 50 n_categories = len(all_categories)
 51 
 52 if n_categories == 0:
 53     raise RuntimeError('Data not found. Make sure that you downloaded data '
 54                        'from https://download.pytorch.org/tutorial/data.zip and extract it to '
 55                        'the current directory.')
 56 
 57 print('# categories:', n_categories, all_categories)
 58 print(unicodeToAscii("O'Néàl"))
 59 
 60 # 模型网络搭建#######################################################################################
 61 import torch
 62 import torch.nn as nn
 63 
 64 
 65 class RNN(nn.Module):
 66     def __init__(self, input_size, hidden_size, output_size):
 67         super(RNN, self).__init__()
 68         self.hidden_size = hidden_size
 69 
 70         self.i2h = nn.Linear(n_categories + input_size + hidden_size, hidden_size)
 71         self.i2o = nn.Linear(n_categories + input_size + hidden_size, output_size)
 72         self.o2o = nn.Linear(hidden_size + output_size, output_size)
 73         self.dropout = nn.Dropout(0.1)
 74         self.softmax = nn.LogSoftmax(dim=1)
 75 
 76     def forward(self, category, input, hidden):
 77         input_combined = torch.cat([category, input, hidden], dim=1)
 78         hidden = self.i2h(input_combined)
 79         output = self.i2o(input_combined)
 80         output_combined = torch.cat([hidden, output], 1)
 81         output = self.o2o(output_combined)
 82         output = self.dropout(output)
 83         output = self.softmax(output)
 84         return output, hidden
 85 
 86     def initHidden(self):
 87         return torch.zeros(1, self.hidden_size)
 88 
 89 # 训练 ################################################################################
 90 # 训练准备###############
 91 import random
 92 
 93 
 94 # Random item from a list
 95 def randomChoice(l):
 96     return l[random.randint(0, len(l) - 1)]
 97 
 98 
 99 # Get a random category and random line from that category
100 def randomTrainingPair():
101     category = randomChoice(all_categories)
102     line = randomChoice(category_lines[category])
103     return category, line
104 
105 
# one-hot vector for category
def categoryTensor(category):
    """Return a ``(1, n_categories)`` tensor with a 1 at the index of ``category``.

    Raises ValueError if ``category`` is not in ``all_categories``.
    """
    position = all_categories.index(category)
    one_hot = torch.zeros(1, n_categories)
    one_hot[0, position] = 1
    return one_hot
112 
113 
# one-hot matrix of first to last letters (not including EOS) for input
def inputTensor(line):
    """Encode ``line`` as a ``(len(line), 1, n_letters)`` one-hot tensor.

    Row ``i`` has a 1 at the position of ``line[i]`` within ``all_letters``.
    NOTE: ``str.find`` returns -1 for a character that is not in
    ``all_letters``, which sets the last slot of that row.
    """
    encoded = torch.zeros(len(line), 1, n_letters)
    for position, ch in enumerate(line):
        encoded[position, 0, all_letters.find(ch)] = 1
    return encoded
121 
122 
# LongTensor of second letter to end(EOS) for target
def targetTensor(line):
    """Return the training targets for ``line`` as a 1-D ``LongTensor``.

    The targets are the ``all_letters`` indexes of every character after the
    first, followed by the EOS index ``n_letters - 1``.
    """
    following = [all_letters.find(ch) for ch in line[1:]]
    following.append(n_letters - 1)  # EOS
    return torch.LongTensor(following)
128 
129 
# make category, input, and target tensors from a random category, line pair
def randomTrainingExample():
    """Draw a random (category, name) pair and encode it for training.

    Returns:
        ``(category_tensor, input_line_tensor, target_line_tensor)`` built by
        ``categoryTensor``, ``inputTensor`` and ``targetTensor`` respectively.
    """
    picked_category, picked_line = randomTrainingPair()
    return (
        categoryTensor(picked_category),
        inputTensor(picked_line),
        targetTensor(picked_line),
    )
137 
# Training the network
criterion = nn.NLLLoss()

learning_rate = 0.0005


def train(category_tensor, input_line_tensor, target_line_tensor):
    """Run one gradient-descent step of the module-level ``rnn`` on one name.

    Args:
        category_tensor: category encoding passed unchanged to every step.
        input_line_tensor: per-step inputs; dim 0 is the step index.
        target_line_tensor: per-step target class indexes; it is not
            modified by this function.

    Returns:
        ``(output, loss)``: the network output of the final step and the
        summed loss divided by the number of steps.

    Raises:
        ValueError: if ``input_line_tensor`` contains no steps.
    """
    n_steps = input_line_tensor.size(0)
    if n_steps == 0:
        # With no steps there is no loss tensor to backpropagate through.
        raise ValueError('input_line_tensor must contain at least one step')

    # Out-of-place unsqueeze so the caller's tensor keeps its shape and can
    # be passed to train() again.
    targets = target_line_tensor.unsqueeze(-1)
    hidden = rnn.initHidden()

    rnn.zero_grad()

    loss = 0

    for step in range(n_steps):
        output, hidden = rnn(category_tensor, input_line_tensor[step], hidden)
        loss += criterion(output, targets[step])

    loss.backward()

    # Plain SGD update, done under no_grad so autograd does not record it.
    with torch.no_grad():
        for p in rnn.parameters():
            if p.grad is not None:
                p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item() / n_steps
163 
164 
165 import time
166 import math
167 
168 
def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<M>m <S>s'``.

    Args:
        since: a timestamp as returned by ``time.time()``.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)
175 
176 
# Rebinds the module-level name `rnn` (also imported from torch.nn.modules
# at the top of the file) to the model instance used by train() and sample().
rnn = RNN(n_letters, 128, n_letters)

n_iters = 100000
print_every = 5000
plot_every = 500
all_losses = []
total_loss = 0  # Reset every plot_every iters

start = time.time()

# The counter is named `iteration` so the builtin iter() is not shadowed
# for the rest of the module.
for iteration in range(1, n_iters + 1):
    output, loss = train(*randomTrainingExample())
    total_loss += loss

    # Progress line: elapsed time, iteration, percent done, latest loss.
    if iteration % print_every == 0:
        print('%s (%d %d%%) %.4f' % (timeSince(start), iteration, iteration / n_iters * 100, loss))

    # Record the mean loss of the last plot_every iterations.
    if iteration % plot_every == 0:
        all_losses.append(total_loss / plot_every)
        total_loss = 0
# Plot the loss history
import matplotlib.pyplot as plt

# One point per plot_every iterations, as collected in all_losses.
plt.figure()
plt.plot(all_losses)
plt.show()
203 
# Sampling from the network
max_length = 20


# sample from a category and starting letter
def sample(category, start_letter='A'):
    """Generate one name for ``category`` that begins with ``start_letter``.

    At each step the highest-scoring output index is taken. Generation stops
    when that index is the EOS index (``n_letters - 1``) or after
    ``max_length`` steps.

    NOTE(review): the visible code never calls ``rnn.eval()``, so the Dropout
    layer presumably stays in training mode here -- confirm this is intended.

    Returns:
        The generated name, including ``start_letter``.
    """
    eos_index = n_letters - 1
    with torch.no_grad():  # no need to track history in sampling
        category_tensor = categoryTensor(category)
        step_input = inputTensor(start_letter)
        hidden = rnn.initHidden()

        output_name = start_letter

        for _ in range(max_length):
            output, hidden = rnn(category_tensor, step_input[0], hidden)
            _, best = output.topk(1)
            best_index = best[0][0].item()
            if best_index == eos_index:
                break
            next_letter = all_letters[best_index]
            output_name += next_letter
            # The letter just produced becomes the next step's input.
            step_input = inputTensor(next_letter)

        return output_name
229 
230 
# get multiple samples from one category and multiple starting letters
def samples(category, start_letters='ABC'):
    """Print one generated name per character of ``start_letters``."""
    for initial in start_letters:
        generated = sample(category, initial)
        print(generated)
235 
236 
# Demo: print generated names for a few categories and starting letters.
for demo_category, demo_letters in (
    ('Russian', 'RUS'),
    ('German', 'GER'),
    ('Spanish', 'SPA'),
    ('Irish', 'O'),
):
    samples(demo_category, demo_letters)

结果:

D:\program\anaconda\envs\nlp\python.exe

D:\learn\PycharmProject\202210\pytorch\HomeWork6_RNN_names.py

# categories: 18 ['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German', 'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese']

O'Neal

0m 8s (5000 5%) 3.0088

0m 15s (10000 10%) 3.1125

0m 23s (15000 15%) 3.3304

0m 31s (20000 20%) 2.5129

0m 39s (25000 25%) 2.2520

0m 47s (30000 30%) 2.5019

0m 54s (35000 35%) 1.9176

1m 2s (40000 40%) 2.3475

1m 10s (45000 45%) 2.4141

1m 18s (50000 50%) 2.1674

1m 26s (55000 55%) 2.3683

1m 34s (60000 60%) 2.5937

1m 41s (65000 65%) 3.1281

1m 48s (70000 70%) 2.8330

1m 55s (75000 75%) 3.2577

2m 2s (80000 80%) 1.8182

2m 9s (85000 85%) 2.2047

2m 16s (90000 90%) 1.6273

2m 23s (95000 95%) 1.5616

2m 30s (100000 100%) 1.6004

Rovallov

Uallov

Sakinov

Gerter

Erere

Roure

Salla

Para

Alana

Olinan

 

Process finished with exit code 0

 

posted @ 2022-11-28 11:36  silvan_happy  阅读(73)  评论(0)  编辑  收藏  举报