MindSpore实现语音指令识别(迁移tf入门教程)
概述
语音识别是人工智能的一个重要领域。这里我实现的是孤立词语音识别(英语),把tf教程的例子,用MindSpore实现。https://tensorflow.google.cn/tutorials/audio/simple_audio
环境准备
from mindspore import context
# Run MindSpore in graph mode on CPU (the tutorial was reproduced on x86 / Windows).
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
我是在x86, windows操作系统下完成的。
生成数据集
数据集是http://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip
每个音频文件1s,一个单词。每个单词都有1000个录音。
训练的数据就是音频文件,但是需要先转换成频谱信息。将音频分帧,每一帧加Hamming窗后做短时傅里叶变换,得到频谱,再把每一帧的频谱拼在一起,就得到了语谱图。由于MindSpore还没有tf.signal.stft对标的算子,这里先用np实现。
import numpy as np
import scipy.io.wavfile as wav

def get_spectrogram(file_path):
    """Read a <=1 s, 16 kHz mono WAV file and return its magnitude spectrogram.

    The waveform is zero-padded (or truncated) to 16000 samples, split into
    124 frames of 255 samples with a hop of 128, Hamming-windowed, and each
    frame is transformed with a 256-point real FFT (129 frequency bins),
    mirroring tf.signal.stft(frame_length=255, frame_step=128) in the
    original TF tutorial.

    Args:
        file_path: path to a WAV file (assumed mono, 16 kHz — TODO confirm;
            stereo input would make the frames 2-D and break the windowing).

    Returns:
        np.float32 array of shape (124, 129) with non-negative magnitudes.
    """
    fs, waveform = wav.read(file_path)
    # Truncate to at most 16000 samples so the padding size cannot go negative,
    # then pad short clips up to exactly one second.
    waveform = waveform.astype(np.float32)[:16000]
    zero_padding = np.zeros(16000 - waveform.shape[0], dtype=np.float32)
    equal_length = np.concatenate([waveform, zero_padding])
    # np.hamming(255) is exactly 0.54 - 0.46*cos(2*pi*n/254), n = 0..254.
    window = np.hamming(255).astype(np.float32)
    spectrogram = np.zeros([124, 129], dtype=np.float32)
    for i in range(124):
        p_start = i * 128                      # hop size 128 samples
        frame_data = equal_length[p_start:p_start + 255] * window
        # BUG FIX: the original overwrote the whole `spectrogram` array each
        # iteration and returned only the last frame's 129-bin spectrum;
        # each frame must be written into its own row.
        spectrogram[i] = np.abs(np.fft.rfft(frame_data, n=256))
    return spectrogram
标注,由于只有一个单词,可以直接通过音频文件所在路径获得。
为了复现效果,我把tf划分训练、验证、测试,并打散后的数据顺序保存下来,用于训练的文件名保存在train_file.txt中。
我借用了tf的代码
import os
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import layers
from tensorflow.keras import models
#from IPython import display
# Set seed for experiment reproducibility
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
# Dataset layout: data/mini_speech_commands/<word>/<clip>.wav
data_dir = pathlib.Path('data/mini_speech_commands')
commands = np.array(tf.io.gfile.listdir(str(data_dir)))
commands = commands[commands != 'README.md']
#print('Commands:', commands)
filenames = tf.io.gfile.glob(str(data_dir) + '/*/*')
# Shuffle once with the fixed seed so the exact same train/val/test split
# can be replayed on the MindSpore side.
filenames = tf.random.shuffle(filenames)
num_samples = len(filenames)
# 6400 train / 800 validation / 800 test — same split as the TF tutorial.
train_files = filenames[:6400]
val_files = filenames[6400: 6400 + 800]
test_files = filenames[-800:]
# eager tensor
for f in train_files:
    # str(bytes) produces "b'...'"; s[2:-1] strips the leading b' and the
    # trailing quote. NOTE(review): on Windows this repr also doubles each
    # backslash, which is why get_data() below splits on a two-backslash
    # separator — confirm if the file list is regenerated another way.
    s = str(f.numpy())
    print(s[2:-1])
然后把输出重定向到txt文件就好了。
具体要识别的就是这8个词。
commands = ['yes', 'no', 'up', 'down', 'right', 'left', 'go', 'stop']

def get_data(file_list='train_file.txt'):
    """Yield (spectrogram, label_id) pairs for every path listed in *file_list*.

    Args:
        file_list: text file with one WAV path per line (default keeps the
            original behavior of reading 'train_file.txt').

    Yields:
        (np.float32 array of shape (124, 129), int label index into `commands`).

    Raises:
        ValueError: if a path's parent directory is not one of `commands`.
    """
    with open(file_list, 'r', encoding='utf8') as f:
        files = f.readlines()
    for line in files:
        line = line.strip()
        if not line:
            continue  # tolerate blank/trailing lines in the list file
        data = get_spectrogram(line)
        # The list file stores repr()'d byte paths, so Windows separators
        # appear as TWO backslashes; the second-to-last component is the
        # spoken word (the clip's parent directory).
        label = line.split('\\\\')[-2]
        label_id = commands.index(label)
        yield data, label_id
构建数据管道(本例未做额外的数据增强,仅做分批处理)
与tf一样,batch_size=64
import mindspore.dataset as ds
# Materialize the generator into a list so the dataset can be iterated for
# multiple epochs (a bare generator object would be exhausted after one pass).
# NOTE(review): this loads all 6400 spectrograms into memory; passing the
# callable `get_data` itself to GeneratorDataset would stream instead — confirm.
ds_train = ds.GeneratorDataset(list(get_data()), column_names=['data', 'label'] )
# Same batch size as the TF tutorial.
ds_train = ds_train.batch(64)
定义网络
把音频转换为语谱图后,其实就是把语音问题转化为图像问题,图像中就包含了声音的特征,这里定义了一个CV类的网络。
tf2中有些算子不确定在MindSpore中该对应哪个算子,于是凭经验做了一些替换:resize、Normalization,其余结构保持一致。
from mindspore.nn import Conv2d
from mindspore.nn import MaxPool2d
def conv2d(in_channels, out_channels):
    """Build a 3x3, stride-1, valid-padding conv layer with bias and He init."""
    cfg = dict(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=3,
        stride=1,
        pad_mode='valid',
        has_bias=True,
        weight_init='he_normal',
    )
    return Conv2d(**cfg)
def maxpool():
    """Build a 2x2 max-pooling layer with stride 2 and no padding."""
    pool_cfg = {'kernel_size': (2, 2), 'stride': (2, 2), 'pad_mode': 'valid'}
    return MaxPool2d(**pool_cfg)
from mindspore.nn import Cell
import mindspore.ops.operations as P
from mindspore.nn import Dense
from mindspore.nn import ReLU
from mindspore.nn import Flatten
from mindspore.nn import Dropout
from mindspore.nn import BatchNorm2d
class Net(Cell):
def __init__(self, batch_size):
    """Build the CNN layers, mirroring the TF simple_audio tutorial model.

    Args:
        batch_size: fixed batch size, needed for the explicit Reshape in
            construct() (graph mode cannot infer it dynamically here).
    """
    super(Net, self).__init__()
    self.batch_size = batch_size
    self.reshape = P.Reshape()
    # Shrink the (124, 129) spectrogram to 32x32, as tf's Resizing layer does.
    self.resize = P.ResizeNearestNeighbor(size=(32, 32))
    # Stand-in for tf's preprocessing.Normalization (per-batch normalization).
    self.norm = BatchNorm2d(num_features=1)
    self.conv1 = conv2d(1, 32)
    self.relu1 = ReLU()
    self.conv2 = conv2d(32, 64)
    self.relu2 = ReLU()
    self.maxpool = maxpool()
    # BUG FIX: tf's layers.Dropout(0.25) DROPS 25% of activations, but
    # MindSpore's keep_prob is the probability of KEEPING a unit, so the
    # original keep_prob=0.25 dropped 75%. The equivalent is 1 - 0.25 = 0.75.
    self.dropout1 = Dropout(keep_prob=0.75)
    self.flatten = Flatten()
    # 12544 = 64 channels * 14 * 14: 32x32 input -> 30x30 (conv1, valid)
    # -> 28x28 (conv2, valid) -> 14x14 (2x2 max pool).
    self.dense1 = Dense(in_channels=12544, out_channels=128)
    self.relu3 = ReLU()
    # keep_prob=0.5 coincides with tf's Dropout(0.5), so this one was correct.
    self.dropout2 = Dropout(keep_prob=0.5)
    self.dense2 = Dense(in_channels=128, out_channels=8)
def construct(self, input_x):
x = self.reshape(input_x, (self.batch_size, 1, 124, 129))
x = self.resize(x)
x = self.norm(x)
x = self.conv1(x)
x = self.relu1(x)
x = self.conv2(x)
x = self.relu2(x)
x = self.maxpool(x)
x = self.dropout1(x)
x = self.flatten(x)
x = self.dense1(x)
x