################################################################################
# The Neural Network (NN) based Speech Synthesis System
# https://svn.ecdf.ed.ac.uk/repo/inf/dnn_tts/
#
# Centre for Speech Technology Research
# University of Edinburgh, UK
# Copyright (c) 2014-2015
# All Rights Reserved.
#
# The system as a whole and most of the files in it are distributed
# under the following copyright and conditions
#
# Permission is hereby granted, free of charge, to use and distribute
# this software and its documentation without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of this work, and to
# permit persons to whom this work is furnished to do so, subject to
# the following conditions:
#
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
# - The authors' names may not be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK
# DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
# ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
# SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE
# FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
# ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
# THIS SOFTWARE.
################################################################################
import numpy, theano, random
from io_funcs.binary_io import BinaryIOCollection
import logging
class ListDataProvider(object):
    """Load training/evaluation data into CPU/GPU memory, utterance by utterance or block by block.

    In speech synthesis we are usually not able to load all the training or
    evaluation data into RAM, so the following three steps are used:

    - Step 1: a data provider loads part of the data into a buffer
    - Step 2: a DNN is trained using the data from the buffer
    - Step 3: steps 1 and 2 are iterated until all the data has been used for
      DNN training; at that point one epoch of DNN training is finished.

    Utterance-by-utterance loading is useful when sequential training is used,
    while block-by-block loading is used when the order of frames is not
    important.

    This provider assumes binary format with float32 precision without any
    header (e.g. HTK header).
    """

    def __init__(self, x_file_list, y_file_list, n_ins=0, n_outs=0, buffer_size=500000, sequential=False, shuffle=False):
        """Initialise a data provider.

        :param x_file_list: list of file names for the input files to DNN
        :type x_file_list: python list
        :param y_file_list: list of files for the output files to DNN
        :param n_ins: the dimensionality for input feature
        :param n_outs: the dimensionality for output features
        :param buffer_size: the size of the buffer, indicating the number of frames in the buffer. The value depends on the memory size of RAM/GPU.
        :param sequential: True/False. When True, :meth:`load_one_partition` loads one utterance at a time instead of one block.
        :param shuffle: True/False. To indicate whether the file list will be shuffled. When loading data block by block, the data in the buffer will be shuffled no matter whether this value is True or False.
        :raises AssertionError: if either list is empty or the two lists differ in length
        """
        self.logger = logging.getLogger("ListDataProvider")

        self.n_ins = n_ins
        self.n_outs = n_outs
        self.buffer_size = buffer_size
        self.sequential = sequential

        # Explicit checks instead of `assert` so validation is not stripped
        # under `python -O`; AssertionError is kept as the raised type so
        # existing callers see the same exception.
        if len(x_file_list) == 0:
            self.logger.critical('first list is empty')
            raise AssertionError('first list is empty')

        if len(y_file_list) == 0:
            self.logger.critical('second list is empty')
            raise AssertionError('second list is empty')

        if len(x_file_list) != len(y_file_list):
            self.logger.critical('two lists are of differing lengths: %d versus %d', len(x_file_list), len(y_file_list))
            raise AssertionError('two lists are of differing lengths: %d versus %d' % (len(x_file_list), len(y_file_list)))

        # Work on private copies so that shuffling below does not reorder the
        # caller's lists behind their back.
        self.x_files_list = list(x_file_list)
        self.y_files_list = list(y_file_list)

        self.logger.debug('first list of items from ...%s to ...%s' % (self.x_files_list[0].rjust(20)[-20:], self.x_files_list[-1].rjust(20)[-20:]))
        self.logger.debug('second list of items from ...%s to ...%s' % (self.y_files_list[0].rjust(20)[-20:], self.y_files_list[-1].rjust(20)[-20:]))

        if shuffle:
            # Re-seeding before each shuffle applies the same permutation to
            # both (equal-length) lists, so x/y pairs stay aligned.
            random.seed(271638)
            random.shuffle(self.x_files_list)
            random.seed(271638)
            random.shuffle(self.y_files_list)

        self.file_index = 0
        self.list_size = len(self.x_files_list)

        # Frames of a partially consumed utterance, carried over to the next block.
        self.remain_data_x = numpy.empty((0, self.n_ins))
        self.remain_data_y = numpy.empty((0, self.n_outs))
        self.remain_frame_number = 0

        self.end_reading = False

        self.logger.debug('initialised')

    def __iter__(self):
        """Return the provider itself (this class defines no ``next``/``__next__`` of its own)."""
        return self

    def reset(self):
        """When all the files in the file list have been used for DNN training, reset the data provider to start a new epoch.
        """
        self.file_index = 0
        self.end_reading = False

        self.remain_frame_number = 0

        self.logger.debug('reset')

    def make_shared(self, data_set, data_name):
        """To make data shared for theano implementation. If you want to know why we make it shared, please refer the theano documentation: http://deeplearning.net/software/theano/library/compile/shared.html

        :param data_set: normal data in CPU memory
        :param data_name: indicate the name of the data (e.g., 'x', 'y', etc)
        :returns: shared dataset -- data_set
        """
        data_set = theano.shared(numpy.asarray(data_set, dtype=theano.config.floatX), name=data_name, borrow=True)

        return data_set

    def load_one_partition(self):
        """Load the next chunk of data: one utterance if ``sequential`` is set, one block otherwise.

        :returns: ``(shared_set_xy, temp_set_x, temp_set_y)`` as produced by
                  :meth:`load_next_utterance` or :meth:`load_next_partition`
        """
        if self.sequential:
            shared_set_xy, temp_set_x, temp_set_y = self.load_next_utterance()
        else:
            shared_set_xy, temp_set_x, temp_set_y = self.load_next_partition()

        return shared_set_xy, temp_set_x, temp_set_y

    def _common_frame_number(self, lab_frame_number, out_frame_number):
        """Return the number of frames usable from both streams of one utterance.

        A difference of fewer than 5 frames is tolerated (there is sometimes a
        one/two frame difference between the streams); the shorter length is used.

        :raises ValueError: if the two frame counts differ by 5 or more
        """
        if abs(lab_frame_number - out_frame_number) >= 5:
            message = "the number of frames in label and acoustic features are different: %d vs %d" % (lab_frame_number, out_frame_number)
            self.logger.critical(message)
            # The original used a bare `raise` here, which has no active
            # exception to re-raise and so never reported this condition.
            raise ValueError(message)

        return min(lab_frame_number, out_frame_number)

    def _fill_buffer(self, buffer_x, buffer_y, start, features_x, features_y, frame_number):
        """Copy as many frames as fit into the block buffers and stash the rest for the next block.

        :param buffer_x: block buffer for input features (written in place)
        :param buffer_y: block buffer for output features (written in place)
        :param start: first free row in the buffers
        :param features_x: input frames to add
        :param features_y: output frames to add
        :param frame_number: number of frames available in ``features_x``/``features_y``
        :returns: the new first free row in the buffers
        """
        used_frame_number = min(frame_number, self.buffer_size - start)
        end = start + used_frame_number

        buffer_x[start:end, ] = features_x[0:used_frame_number, ]
        buffer_y[start:end, ] = features_y[0:used_frame_number, ]

        # Whatever did not fit (possibly nothing) is left for the next block.
        self.remain_data_x = features_x[used_frame_number:frame_number, ]
        self.remain_data_y = features_y[used_frame_number:frame_number, ]
        self.remain_frame_number = frame_number - used_frame_number

        return end

    def load_next_utterance(self):
        """Load the data for one utterance. This function will be called when utterance-by-utterance loading is required (e.g., sequential training).

        :returns: ``(shared_set_xy, temp_set_x, temp_set_y)`` where
                  ``shared_set_xy`` is the pair of theano shared variables made
                  from the two arrays
        :raises ValueError: if input and output frame counts differ by 5 or more
        """
        io_fun = BinaryIOCollection()

        in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

        frame_number = self._common_frame_number(lab_frame_number, out_frame_number)

        temp_set_y = out_features[0:frame_number, ]
        temp_set_x = in_features[0:frame_number, ]

        self.file_index += 1

        # Wrap around after the last file and flag the end of the epoch.
        if self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')

        shared_set_xy = (shared_set_x, shared_set_y)

        return shared_set_xy, temp_set_x, temp_set_y

    def load_next_partition(self):
        """Load one block of data. The number of frames will be at most the buffer size set during initialisation.

        :returns: ``(shared_set_xy, temp_set_x, temp_set_y)`` where
                  ``shared_set_xy`` is the pair of theano shared variables made
                  from the two (row-shuffled) arrays
        :raises ValueError: if input and output frame counts of a file differ by 5 or more
        """
        self.logger.debug('loading next partition')

        temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
        temp_set_y = numpy.empty((self.buffer_size, self.n_outs))
        current_index = 0

        # First use up frames left over from the previous block. The helper
        # copes with a remainder that is itself larger than the buffer.
        if self.remain_frame_number > 0:
            current_index = self._fill_buffer(temp_set_x, temp_set_y, current_index,
                                              self.remain_data_x, self.remain_data_y,
                                              self.remain_frame_number)

        io_fun = BinaryIOCollection()
        while current_index < self.buffer_size:
            if self.file_index >= self.list_size:
                # All files consumed: flag the end of the epoch and rewind.
                self.end_reading = True
                self.file_index = 0
                break

            in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
            out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)

            frame_number = self._common_frame_number(lab_frame_number, out_frame_number)

            # If the utterance does not fit, the helper keeps the remaining
            # part for the next block.
            current_index = self._fill_buffer(temp_set_x, temp_set_y, current_index,
                                              in_features[0:frame_number, ],
                                              out_features[0:frame_number, ],
                                              frame_number)

            self.file_index += 1

        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]

        # Same seed before each shuffle => same row permutation for x and y.
        numpy.random.seed(271639)
        numpy.random.shuffle(temp_set_x)
        numpy.random.seed(271639)
        numpy.random.shuffle(temp_set_y)

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')

        shared_set_xy = (shared_set_x, shared_set_y)

        return shared_set_xy, temp_set_x, temp_set_y

    def is_finish(self):
        """Return True once the last file of the list has been reached in the current epoch."""
        return self.end_reading
class ListDataProviderWithProjectionIndex(ListDataProvider):
    """Block-wise data provider that splits one input column off as a projection index.

    Extends :class:`ListDataProvider` with the keyword arguments
    ``index_to_project``, ``projection_insize`` and ``indexes_only``.
    """

    def __init__(self, x_file_list, y_file_list, n_ins=0, n_outs=0,
                 buffer_size=500000, shuffle=False, index_to_project=1,
                 projection_insize=10000, indexes_only=False):
        """Initialise the provider.

        :param x_file_list: list of file names for the input files
        :param y_file_list: list of file names for the output files
        :param n_ins: the dimensionality for input feature
        :param n_outs: the dimensionality for output features
        :param buffer_size: number of frames in one block
        :param shuffle: whether the file lists are shuffled
        :param index_to_project: column of the input that holds the index to project
        :param projection_insize: exclusive upper bound on the index values
        :param indexes_only: if True, hand back the raw indices rather than one-hot rows
        """
        super(ListDataProviderWithProjectionIndex, self).__init__(
            x_file_list, y_file_list,
            n_ins=n_ins, n_outs=n_outs,
            buffer_size=buffer_size, shuffle=shuffle)

        self.index_to_project = index_to_project
        self.projection_insize = projection_insize
        self.indexes_only = indexes_only

    def load_next_partition_with_projection(self):
        """Load one block and separate the projection column from the inputs.

        :returns: ``(shared_set_xy, temp_set_x, projection, temp_set_y)`` where
                  ``projection`` is the index vector when ``indexes_only`` is
                  set and the one-hot matrix otherwise, and ``shared_set_xy``
                  is the triple of theano shared variables ``(x, x_proj, y)``
        """
        # The shared pair built by the parent is discarded; it is rebuilt
        # below once the projection column has been zeroed out of x.
        _, temp_set_x, temp_set_y = self.load_next_partition()

        if self.indexes_only:
            temp_set_x, projection = get_unexpanded_projection_inputs(
                temp_set_x, self.index_to_project, self.projection_insize)
            shared_set_x_proj = theano.shared(projection, name='x_proj', borrow=True)
        else:
            temp_set_x, projection = expand_projection_inputs(
                temp_set_x, self.index_to_project, self.projection_insize)
            shared_set_x_proj = self.make_shared(projection, 'x_proj')

        shared_set_xy = (self.make_shared(temp_set_x, 'x'),
                         shared_set_x_proj,
                         self.make_shared(temp_set_y, 'y'))

        return shared_set_xy, temp_set_x, projection, temp_set_y
## Put this function at global level so it can be imported for use in dnn_generation
def expand_projection_inputs(temp_set_x, index_to_project, projection_insize):
    """Turn one column of indexes (to words, syllables etc.) into one-hot data.

    The index column of ``temp_set_x`` is overwritten with 0.0 **in place**,
    which effectively removes the index from the original data.

    :param temp_set_x: 2-D array of input frames (modified in place)
    :param index_to_project: column of ``temp_set_x`` holding the indexes
    :param projection_insize: width of the one-hot matrix; every index must be smaller
    :returns: ``(temp_set_x, one_hot)`` with ``one_hot`` of shape ``(rows, projection_insize)``
    :raises AssertionError: if an index is not smaller than ``projection_insize``
    """
    m, _ = numpy.shape(temp_set_x)
    projection_indices = temp_set_x[:, index_to_project]

    one_hot = numpy.zeros((m, projection_insize))

    # With no rows there is nothing to validate or switch on (and .max() on
    # an empty array would raise).
    if m > 0:
        largest = projection_indices.max()
        # Explicit raise rather than `assert` so the check survives `python -O`;
        # `not (a < b)` also rejects NaN, exactly as the original assert did.
        if not largest < projection_insize:
            raise AssertionError('projection_insize is %s but there is an index %s in the data' % (projection_insize, largest))

        # Float -> int conversion truncates towards zero. The copy must be
        # taken before the column is zeroed below.
        projection_indices = projection_indices.astype(int)

        # Advanced indexing switches on one cell per row.
        one_hot[numpy.arange(m), projection_indices] = 1.0

    # Effectively remove the index from the original data by setting to 0:
    temp_set_x[:, index_to_project] = 0.0

    return temp_set_x, one_hot
def get_unexpanded_projection_inputs(temp_set_x, index_to_project, projection_insize):
    """Split one column of indexes (to words, syllables etc.) off the input as int32 values.

    Unlike :func:`expand_projection_inputs`, the indexes are returned as they
    are rather than expanded to one-hot rows. The index column of
    ``temp_set_x`` is overwritten with 0.0 **in place**.

    :param temp_set_x: 2-D array of input frames (modified in place)
    :param index_to_project: column of ``temp_set_x`` holding the indexes
    :param projection_insize: exclusive upper bound every index must respect
    :returns: ``(temp_set_x, projection_indices)`` with ``projection_indices`` a 1-D int32 array
    :raises AssertionError: if an index is not smaller than ``projection_insize``
    """
    m, _ = numpy.shape(temp_set_x)
    projection_indices = temp_set_x[:, index_to_project]

    # Nothing to validate for an empty block (and .max() on an empty array would raise).
    if m > 0:
        largest = projection_indices.max()
        # Explicit raise rather than `assert` so the check survives `python -O`;
        # `not (a < b)` also rejects NaN, exactly as the original assert did.
        if not largest < projection_insize:
            raise AssertionError('projection_insize is %s but there is an index %s in the data' % (projection_insize, largest))

    # Float -> int32 conversion truncates towards zero. astype() copies, so
    # the indexes survive the zeroing of the column below.
    projection_indices = projection_indices.astype('int32')

    temp_set_x[:, index_to_project] = 0.0

    return temp_set_x, projection_indices