Source code for utils.providers

################################################################################
#           The Neural Network (NN) based Speech Synthesis System
#                https://svn.ecdf.ed.ac.uk/repo/inf/dnn_tts/
#                
#                Centre for Speech Technology Research                 
#                     University of Edinburgh, UK                       
#                      Copyright (c) 2014-2015
#                        All Rights Reserved.                           
#                                                                       
# The system as a whole and most of the files in it are distributed
# under the following copyright and conditions
#
#  Permission is hereby granted, free of charge, to use and distribute  
#  this software and its documentation without restriction, including   
#  without limitation the rights to use, copy, modify, merge, publish,  
#  distribute, sublicense, and/or sell copies of this work, and to      
#  permit persons to whom this work is furnished to do so, subject to   
#  the following conditions:
#  
#   - Redistributions of source code must retain the above copyright  
#     notice, this list of conditions and the following disclaimer.   
#   - Redistributions in binary form must reproduce the above         
#     copyright notice, this list of conditions and the following     
#     disclaimer in the documentation and/or other materials provided 
#     with the distribution.                                          
#   - The authors' names may not be used to endorse or promote products derived 
#     from this software without specific prior written permission.   
#                                  
#  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        
#  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      
#  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   
#  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     
#  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    
#  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   
#  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          
#  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       
#  THIS SOFTWARE.
################################################################################


import numpy, theano, random
from io_funcs.binary_io import BinaryIOCollection
import logging

[docs]class ListDataProvider(object):
    """ This class provides an interface to load data into CPU/GPU memory utterance by utterance or block by block.
    
    In speech synthesis, usually we are not able to load all the training data/evaluation data into RAMs, we will do the following three steps:
    
    - Step 1: a data provide will load part of the data into a buffer
    
    - Step 2: training a DNN by using the data from the buffer
    
    - Step 3: Iterate step 1 and 2 until all the data are used for DNN training. Until now, one epoch of DNN training is finished.
    
    The utterance-by-utterance data loading will be useful when sequential training is used, while block-by-block loading will be used when the order of frames is not important.
    
    This provide assumes binary format with float32 precision without any header (e.g. HTK header).
    
    """
[docs]    def __init__(self, x_file_list, y_file_list, n_ins=0, n_outs=0, buffer_size = 500000, sequential=False, shuffle=False):
        """Initialise a data provider
        
        :param x_file_list: list of file names for the input files to DNN
        :type x_file_list: python list
        :param y_file_list: list of files for the output files to DNN
        :param n_ins: the dimensionality for input feature
        :param n_outs: the dimensionality for output features
        :param buffer_size: the size of the buffer, indicating the number of frames in the buffer. The value depends on the memory size of RAM/GPU.
        :param shuffle: True/False. To indicate whether the file list will be shuffled. When loading data block by block, the data in the buffer will be shuffle no matter this value is True or False.
        """

        self.logger = logging.getLogger("ListDataProvider")

        self.n_ins = n_ins
        self.n_outs = n_outs

        self.buffer_size = buffer_size
        
        self.sequential = sequential
            
        #remove potential empty lines and end of line signs


        try:
            assert len(x_file_list) > 0
        except AssertionError:
            self.logger.critical('first list is empty')
            raise

        try:
            assert len(y_file_list) > 0
        except AssertionError:
            self.logger.critical('second list is empty')
            raise

        try:
            assert len(x_file_list) == len(y_file_list)
        except AssertionError:
            self.logger.critical('two lists are of differing lengths: %d versus %d',len(x_file_list),len(y_file_list))
            raise

        self.x_files_list = x_file_list
        self.y_files_list = y_file_list
        
        self.logger.debug('first  list of items from ...%s to ...%s' % (self.x_files_list[0].rjust(20)[-20:],self.x_files_list[-1].rjust(20)[-20:]) )
        self.logger.debug('second list of items from ...%s to ...%s' % (self.y_files_list[0].rjust(20)[-20:],self.y_files_list[-1].rjust(20)[-20:]) )
        
        if shuffle: 
            random.seed(271638)
            random.shuffle(self.x_files_list)
            random.seed(271638)
            random.shuffle(self.y_files_list)

        self.file_index = 0
        self.list_size = len(self.x_files_list)

        self.remain_data_x = numpy.empty((0, self.n_ins))
        self.remain_data_y = numpy.empty((0, self.n_outs))
        self.remain_frame_number = 0

        self.end_reading = False
    
        self.logger.debug('initialised')
    
    def __iter__(self):
        return self 
    
[docs]    def reset(self):
        """When all the files in the file list have been used for DNN training, reset the data provider to start a new epoch.
        
        """
        self.file_index = 0
        self.end_reading = False

        self.remain_frame_number = 0

        self.logger.debug('reset')

[docs]    def make_shared(self, data_set, data_name):
        """To make data shared for theano implementation. If you want to know why we make it shared, please refer the theano documentation: http://deeplearning.net/software/theano/library/compile/shared.html
        
        :param data_set: normal data in CPU memory
        :param data_name: indicate the name of the data (e.g., 'x', 'y', etc)
        :returns: shared dataset -- data_set
        """
        data_set = theano.shared(numpy.asarray(data_set, dtype=theano.config.floatX), name=data_name, borrow=True)

        return  data_set

    def load_one_partition(self):
        if self.sequential:
            shared_set_xy, temp_set_x, temp_set_y = self.load_next_utterance()
        else:
            shared_set_xy, temp_set_x, temp_set_y = self.load_next_partition()
            
        return  shared_set_xy, temp_set_x, temp_set_y

[docs]    def load_next_utterance(self):
        """Load the data for one utterance. This function will be called when utterance-by-utterance loading is required (e.g., sequential training).
        
        """

        temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
        temp_set_y = numpy.empty((self.buffer_size, self.n_outs))

        io_fun = BinaryIOCollection()

        in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
        out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)
            
        frame_number = lab_frame_number
        if abs(lab_frame_number - out_frame_number) < 5:    ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
            if lab_frame_number > out_frame_number:
                frame_number = out_frame_number
        else:
            self.logger.critical("the number of frames in label and acoustic features are different: %d vs %d" %(lab_frame_number, out_frame_number))
            raise

        temp_set_y = out_features[0:frame_number, ]
        temp_set_x = in_features[0:frame_number, ]

        self.file_index += 1

        if  self.file_index >= self.list_size:
            self.end_reading = True
            self.file_index = 0

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')

        shared_set_xy = (shared_set_x, shared_set_y)

        return shared_set_xy, temp_set_x, temp_set_y
        

[docs]    def load_next_partition(self):
        """Load one block data. The number of frames will be the buffer size set during intialisation.
        
        """
        
        self.logger.debug('loading next partition')
        
        temp_set_x = numpy.empty((self.buffer_size, self.n_ins))
        temp_set_y = numpy.empty((self.buffer_size, self.n_outs))
        current_index = 0

        ### first check whether there are remaining data from previous utterance
        if self.remain_frame_number > 0:
            temp_set_x[current_index:self.remain_frame_number, ] = self.remain_data_x
            temp_set_y[current_index:self.remain_frame_number, ] = self.remain_data_y
            current_index += self.remain_frame_number
            
            self.remain_frame_number = 0
        
        io_fun = BinaryIOCollection()
        while True:
            if current_index >= self.buffer_size:
                break
            if  self.file_index >= self.list_size:
                self.end_reading = True
                self.file_index = 0
                break

            in_features, lab_frame_number = io_fun.load_binary_file_frame(self.x_files_list[self.file_index], self.n_ins)
            out_features, out_frame_number = io_fun.load_binary_file_frame(self.y_files_list[self.file_index], self.n_outs)
            
            frame_number = lab_frame_number
            if abs(lab_frame_number - out_frame_number) < 5:    ## we allow small difference here. may not be correct, but sometimes, there is one/two frames difference
                if lab_frame_number > out_frame_number:
                    frame_number = out_frame_number
            else:
                self.logger.critical("the number of frames in label and acoustic features are different: %d vs %d" %(lab_frame_number, out_frame_number))
                print lab_frame_number
                print out_frame_number
                raise

            out_features = out_features[0:frame_number, ]
            in_features = in_features[0:frame_number, ]

            if current_index + frame_number <= self.buffer_size:   
                temp_set_x[current_index:current_index+frame_number, ] = in_features
                temp_set_y[current_index:current_index+frame_number, ] = out_features
                    
                current_index = current_index + frame_number
            else:   ## if current utterance cannot be stored in the block, then leave the remaining part for the next block
                used_frame_number = self.buffer_size - current_index
                temp_set_x[current_index:self.buffer_size, ] = in_features[0:used_frame_number, ]
                temp_set_y[current_index:self.buffer_size, ] = out_features[0:used_frame_number, ]
                current_index = self.buffer_size

                self.remain_data_x = in_features[used_frame_number:frame_number, ]
                self.remain_data_y = out_features[used_frame_number:frame_number, ]
                self.remain_frame_number = frame_number - used_frame_number

            self.file_index += 1

        temp_set_x = temp_set_x[0:current_index, ]
        temp_set_y = temp_set_y[0:current_index, ]

        numpy.random.seed(271639)
        numpy.random.shuffle(temp_set_x)
        numpy.random.seed(271639)
        numpy.random.shuffle(temp_set_y)

        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')

        shared_set_xy = (shared_set_x, shared_set_y)
#        temp_set_x = self.make_shared(temp_set_x, 'x')
#        temp_set_y = self.make_shared(temp_set_y, 'y')

        return shared_set_xy, temp_set_x, temp_set_y
        
    def is_finish(self):
        return self.end_reading


class ListDataProviderWithProjectionIndex(ListDataProvider):
    '''
    Added kwarg index_to_project to __init__
    '''

    def __init__(self, x_file_list, y_file_list, n_ins=0, n_outs=0, \
            buffer_size = 500000, shuffle=False, index_to_project=1, projection_insize=10000, indexes_only=False):
        ##ListDataProvider.__init__(x_file_list, \
        ##         y_file_list, n_ins=0, n_outs=0, buffer_size = 500000, shuffle=False) 
        super( ListDataProviderWithProjectionIndex, self ).__init__(x_file_list, \
                 y_file_list, n_ins=n_ins, n_outs=n_outs, buffer_size=buffer_size, shuffle=shuffle)                               
        self.index_to_project = index_to_project
        self.projection_insize = projection_insize
        self.indexes_only = indexes_only

    def load_next_partition_with_projection(self):
    
        shared_set_xy, temp_set_x, temp_set_y = self.load_next_partition()
        
        if self.indexes_only:
            temp_set_x, p_indexes = get_unexpanded_projection_inputs(temp_set_x, self.index_to_project, \
                                                            self.projection_insize)
            shared_set_x_proj = theano.shared(p_indexes, name='x_proj', borrow=True)                                            
        else:
            temp_set_x, one_hot = expand_projection_inputs(temp_set_x, self.index_to_project, \
                                                            self.projection_insize)                                                                        
            shared_set_x_proj = self.make_shared(one_hot, 'x_proj')
            
        shared_set_x = self.make_shared(temp_set_x, 'x')
        shared_set_y = self.make_shared(temp_set_y, 'y')

        shared_set_xy = (shared_set_x, shared_set_x_proj, shared_set_y)

        if self.indexes_only:
            return shared_set_xy, temp_set_x, p_indexes, temp_set_y
        else:
            return shared_set_xy, temp_set_x, one_hot, temp_set_y
            
## Put this function at global level so it can be imported for use in dnn_generation
def expand_projection_inputs(temp_set_x, index_to_project, projection_insize):
    ## Turn indexes to words, syllables etc. to one-hot data:
    m,n = numpy.shape(temp_set_x)
    projection_indices = temp_set_x[:, index_to_project]
    #print projection_indices.tolist()
    assert projection_indices.max() < projection_insize,'projection_insize is %s but there is an index %s in the data'%(projection_insize, projection_indices.max())
    one_hot = numpy.zeros((m, projection_insize))
    
    ## Used advanced indexing to turn the relevant features on:
    projection_indices = projection_indices.astype(int) ## check conversion???!?!?!
    #     print projection_indices.tolist()
    #     print '            ^--- proj indices'
    #     print 
    one_hot[range(m),projection_indices] = 1.0
    ## Effectively remove the index from the original data by setting to 0:
    temp_set_x[:, index_to_project] = 0.0
    return temp_set_x, one_hot
    
def get_unexpanded_projection_inputs(temp_set_x, index_to_project, projection_insize):
    ## Turn indexes to words, syllables etc. to one-hot data:
    m,n = numpy.shape(temp_set_x)
    projection_indices = temp_set_x[:, index_to_project]
    #print projection_indices.tolist()
    assert projection_indices.max() < projection_insize,'projection_insize is %s but there is an index %s in the data'%(projection_insize, projection_indices.max())
    
    projection_indices = projection_indices.astype('int32') ## check conversion???!?!?!
    
    temp_set_x[:, index_to_project] = 0.0
    return temp_set_x, projection_indices