
A Summary of the Speech Recognition Workflow


Relatively simple speech recognition problems identify the emotion, the type of utterance, or the speaker from audio. These are classification tasks, so the whole problem can be cast as a standard classification problem.

Below, the commonly used CASIA speech dataset is taken as an example to walk through the overall speech recognition workflow.

The dataset is laid out as one folder per speaker (liuchanhg, wangzhe, zhaoquanyin, ZhaoZuoxiang), each containing .wav files.

Import third-party libraries


  
    # Data loading
    import pandas as pd
    import numpy as np
    # Feature extraction
    import librosa
    import librosa.display
    # Plotting
    import matplotlib.pyplot as plt
    # Progress bars and timing
    from tqdm import tqdm
    from time import time
    # Model building
    import sklearn
    import tensorflow as tf
    from tensorflow.keras.preprocessing import sequence
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Embedding
    from tensorflow.keras.layers import LSTM
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.utils import to_categorical
    from tensorflow.keras.layers import Input, Flatten, Dropout, Activation
    from tensorflow.keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
    from tensorflow.keras.models import Model
    from tensorflow.keras.callbacks import ModelCheckpoint
    # System
    import os
    import datetime
    from keras.utils import np_utils
    from sklearn.preprocessing import LabelEncoder

    # Print a separator line followed by the current time
    def printbar():
        nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print("\n" + "==========" * 8 + "%s" % nowtime)

    # On macOS, running PyTorch and matplotlib together in Jupyter may require:
    # os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
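printbar is a small notebook helper: it prints a separator of 80 '=' characters followed by the current timestamp, which is handy for marking the start of long-running cells. For example:

    printbar()
    # prints: ================...================2021-05-01 12:00:00  (timestamp will vary)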

Data preprocessing

For feature extraction, the librosa library can compute many different audio features. To keep the problem simple, only one basic feature (the MFCC) is extracted here; a sketch of a few alternatives follows.
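For reference, librosa exposes several other frame-level features that could be stacked with or substituted for the MFCCs used below. A minimal sketch of standard librosa calls (the file name is hypothetical, not part of this post's pipeline):

    # A few alternative librosa features; each returns a 2-D array (n_features x n_frames)
    import librosa

    y, sr = librosa.load('some_clip.wav', duration=2.5, offset=0.5)   # hypothetical file
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)          # 12 pitch-class energies
    mel = librosa.feature.melspectrogram(y=y, sr=sr)          # mel-scaled spectrogram
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)  # spectral contrast per band
    zcr = librosa.feature.zero_crossing_rate(y)               # zero-crossing rate per frame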

Read the audio files


  
    # -*- coding: utf-8 -*-
    import os

    # Recursively collect every .wav file under a directory
    def listdir(path, list_name):
        for file in os.listdir(path):
            file_path = os.path.join(path, file)
            if os.path.isdir(file_path):
                listdir(file_path, list_name)
            elif os.path.splitext(file_path)[1] == '.wav':
                list_name.append(file_path)

    list_name = list()
    listdir('RawData/CASIA database', list_name)
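As a side note, the same recursive search can be done in one call with the standard library's glob module; a minimal equivalent sketch:

    import glob

    # '**' with recursive=True matches .wav files at any directory depth
    list_name = glob.glob('RawData/CASIA database/**/*.wav', recursive=True)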

Get the category labels


  
    label_list = list()
    name_list = ["liuchanhg", "wangzhe", "zhaoquanyin", "ZhaoZuoxiang"]
    number = range(len(name_list))
    d = dict(zip(name_list, number))  # speaker name -> integer label
    for i in range(len(list_name)):
        name = list_name[i]
        if "liuchanhg" in name:
            label_list.append(0)
        if "wangzhe" in name:
            label_list.append(1)
        if "zhaoquanyin" in name:
            label_list.append(2)
        if "ZhaoZuoxiang" in name:
            label_list.append(3)
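Since the dict d built above already maps each speaker name to its integer label, the chain of if statements can be collapsed into a loop. A minimal sketch, assuming each file path contains exactly one of the four speaker names:

    label_list = list()
    for name in list_name:
        for speaker, idx in d.items():
            if speaker in name:
                label_list.append(idx)  # same labels as the explicit ifs above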

Audio feature extraction


  
    labels = pd.DataFrame(label_list)
    t = time()
    df = pd.DataFrame(columns=['feature'])
    bookmark = 0
    for index, y in enumerate(tqdm(list_name)):
        # Load 2.5 s of each clip at 44100 Hz, skipping the first 0.5 s
        X, sample_rate = librosa.load(y, res_type='kaiser_fast',
                                      duration=2.5, sr=22050 * 2, offset=0.5)
        sample_rate = np.array(sample_rate)
        # 13 MFCCs per frame, averaged over the coefficient axis,
        # leaving one value per frame
        mfccs = np.mean(librosa.feature.mfcc(y=X,
                                             sr=sample_rate,
                                             n_mfcc=13),
                        axis=0)
        feature = mfccs
        df.loc[bookmark] = [feature]
        bookmark = bookmark + 1
    print('Feature extraction took: {} mins'.format(round((time() - t) / 60, 2)))

    # Combine features and labels
    newdf = pd.DataFrame(df['feature'].values.tolist())
    newdf["label"] = label_list
    newdf.head()
    # Pad missing values with 0 (clips shorter than 2.5 s produce fewer frames)
    newdf = newdf.fillna(0)
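The length of these feature vectors is where the model's input_shape=(216, 1) below comes from: at sr = 22050 * 2 = 44100 Hz, a 2.5 s clip has 110250 samples, and with librosa's default hop_length of 512 that gives 1 + 110250 // 512 = 216 MFCC frames; averaging the 13 coefficients per frame leaves a 216-dimensional vector. A quick check, assuming those defaults:

    # 216 feature columns plus one label column
    print(newdf.shape)  # expected: (n_clips, 217)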

 

Build the input pipeline


  
    from sklearn.model_selection import train_test_split

    X1 = np.array(newdf.iloc[:, :-1])
    X = np.expand_dims(X1, axis=2)  # (samples, 216, 1) for Conv1D
    y_tmp = np.array(newdf.iloc[:, -1:])
    lb = LabelEncoder()
    y = np_utils.to_categorical(lb.fit_transform(y_tmp))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

    BATCH_SIZE = 32
    ds_train = tf.data.Dataset.from_tensor_slices((X_train, y_train)) \
        .shuffle(buffer_size=1000).batch(BATCH_SIZE) \
        .prefetch(tf.data.experimental.AUTOTUNE).cache()
    ds_test = tf.data.Dataset.from_tensor_slices((X_test, y_test)) \
        .shuffle(buffer_size=1000).batch(BATCH_SIZE) \
        .prefetch(tf.data.experimental.AUTOTUNE).cache()
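One detail worth flagging: because .cache() is applied after .shuffle(), the shuffled order of the first epoch is what gets cached, so every later epoch replays the same order. The usual tf.data pattern caches first so each epoch reshuffles; a minimal alternative sketch:

    ds_train = tf.data.Dataset.from_tensor_slices((X_train, y_train)) \
        .cache() \
        .shuffle(buffer_size=1000) \
        .batch(BATCH_SIZE) \
        .prefetch(tf.data.experimental.AUTOTUNE)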

Build the model


  
    model = Sequential()
    model.add(Conv1D(256, 5, padding='same',
                     input_shape=(216, 1)))
    model.add(Activation('relu'))
    model.add(Conv1D(128, 5, padding='same'))
    model.add(Activation('relu'))
    model.add(Dropout(0.1))
    model.add(MaxPooling1D(pool_size=8))
    model.add(Conv1D(128, 5, padding='same'))
    model.add(Activation('relu'))
    # model.add(Conv1D(128, 5, padding='same'))
    # model.add(Activation('relu'))
    # model.add(Conv1D(128, 5, padding='same'))
    # model.add(Activation('relu'))
    # model.add(Dropout(0.2))
    model.add(Conv1D(128, 5, padding='same'))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(len(d)))  # one output per speaker
    model.add(Activation('softmax'))
    model.summary()
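As a quick sanity check on the architecture: MaxPooling1D(8) reduces the 216 frames to 27, so Flatten yields 27 * 128 = 3456 features feeding the softmax over the four speakers. This can be confirmed on a dummy batch:

    import numpy as np

    dummy = np.zeros((1, 216, 1), dtype='float32')
    print(model.predict(dummy).shape)  # expected: (1, 4), one probability per speaker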

Training


  
    from tensorflow.keras.optimizers import RMSprop

    opt = RMSprop(learning_rate=0.00001, decay=1e-6)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

    import os
    import datetime

    # An Adam optimizer could be used instead:
    # optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

    # TensorBoard logging
    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    logdir = os.path.join('model', 'autograph', stamp)
    # On Python 3, pathlib can be used to normalize paths across operating systems:
    # from pathlib import Path
    # logdir = str(Path('./data/autograph/' + stamp))
    tb_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
    # Halve the learning rate if the loss has not improved for 50 epochs.
    lr_callback = tf.keras.callbacks.ReduceLROnPlateau(monitor="loss", factor=0.5, patience=50)
    # Stop training early if accuracy has not improved for 500 epochs.
    stop_callback = tf.keras.callbacks.EarlyStopping(monitor="accuracy", patience=500)
    # Save the model after each epoch
    save_dir = os.path.join(os.getcwd(), 'model', 'speech', 'saved_models')
    filepath = "model_{epoch:02d}-{val_accuracy:.2f}.hdf5"
    # A per-epoch checkpoint could be written instead:
    # tf.keras.callbacks.ModelCheckpoint(os.path.join(save_dir, filepath),
    #                                    monitor='val_loss', verbose=0,
    #                                    save_best_only=False, save_weights_only=False,
    #                                    mode='auto', period=1)
    mc_callback = tf.keras.callbacks.ModelCheckpoint(filepath='./model/speech/weights.hdf5',
                                                     monitor='val_loss',
                                                     verbose=1, save_best_only=True)
    # Stream per-epoch results to a CSV file.
    csv_callback = tf.keras.callbacks.CSVLogger('./model/speech/training.log')

    callbacks_list = [tb_callback, lr_callback, stop_callback, mc_callback, csv_callback]
    history = model.fit(ds_train, validation_data=ds_test, epochs=200, callbacks=callbacks_list)
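Before plotting the curves, the trained model can be scored directly on the held-out split; a quick check (evaluate returns the loss and the accuracy metric configured in compile):

    loss, acc = model.evaluate(ds_test)
    print('test loss: %.4f, test accuracy: %.4f' % (loss, acc))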

Inspecting the results

1. Check the loss and accuracy


  
    %matplotlib inline
    %config InlineBackend.figure_format = 'svg'
    import matplotlib.pyplot as plt

    # Plot a training metric and its validation counterpart per epoch
    def plot_metric(history, metric):
        train_metrics = history.history[metric]
        val_metrics = history.history['val_' + metric]
        epochs = range(1, len(train_metrics) + 1)
        plt.plot(epochs, train_metrics, 'bo--')
        plt.plot(epochs, val_metrics, 'ro-')
        plt.title('Training and validation ' + metric)
        plt.xlabel("Epochs")
        plt.ylabel(metric)
        plt.legend(["train_" + metric, 'val_' + metric])
        plt.show()

(1) Loss

    plot_metric(history, "loss")

(2) Accuracy

    plot_metric(history, "accuracy")

 

2. The results as a table


  
    dfhistory = pd.DataFrame(history.history)
    dfhistory.index = range(1, len(dfhistory) + 1)
    dfhistory.index.name = 'epoch'
    dfhistory

Saving the model


  
    # Model file name
    model_name = 'Emotion_Voice_Detection_Model20201122.h5'
    # Save location: current working directory + saved_models
    save_dir = os.path.join(os.getcwd(), 'saved_models')
    # Create the directory if it does not exist
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    # Full model path
    model_path = os.path.join(save_dir, model_name)
    # Save model and weights
    model.save(model_path)
    print('Saved trained model at %s ' % model_path)
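To reuse the saved model later, it can be loaded back with tf.keras.models.load_model; a minimal sketch using the same model_path as above:

    from tensorflow.keras.models import load_model

    restored = load_model(model_path)
    # The restored model maps a (batch, 216, 1) input to speaker probabilities
    pred = restored.predict(X_test[:1])
    print(pred.argmax(axis=1))  # predicted speaker index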

 


Reposted from: https://blog.csdn.net/yingdajun/article/details/116274146