Source code for frontend.label_normalisation


import os
import numpy, re, sys
from multiprocessing import Pool
from io_funcs.binary_io import BinaryIOCollection
from linguistic_base import LinguisticBase

import lxml
from lxml import etree
from lxml.etree import * 
MODULE_PARSER = etree.XMLParser()

import logging
# from logplot.logging_plotting import LoggerPlotter #, MultipleTimeSeriesPlot, SingleWeightMatrixPlot

class LabelNormalisation(LinguisticBase):
    """Shared entry point for the label normalisers defined in this module.

    Subclasses supply ``load_labels_with_state_alignment``; this class only
    decides whether the resulting feature matrix is written to disk or
    handed back to the caller.
    """

    # this class only knows how to deal with a single style of labels (XML or HTS)
    # (to deal with composite labels, use LabelComposer instead)

    def __init__(self, question_file_name=None,xpath_file_name=None):
        """Accept (and ignore) the file names; subclasses do the real set-up."""

    def extract_linguistic_features(self, in_file_name, out_file_name=None, add_frame_features=False):
        """Turn one label file into a feature matrix.

        When ``out_file_name`` is given (and truthy) the matrix is written to
        that file with ``BinaryIOCollection.array_to_binary_file`` and nothing
        is returned; otherwise the matrix itself is returned.
        """
        features = self.load_labels_with_state_alignment(in_file_name, add_frame_features=add_frame_features)

        if not out_file_name:
            return features

        writer = BinaryIOCollection()
        writer.array_to_binary_file(features, out_file_name)

#  -----------------------------



[docs]class HTSLabelNormalisation(LabelNormalisation):
    """This class converts HTS format labels into continuous or binary values, and stores them in binary format with float32 precision.
    
    The class supports two kinds of questions: QS and CQS.
        **QS**: is the same as that used in HTS
        
        **CQS**: is a newly defined question type in this system.  Here is an example of such a question: CQS C-Syl-Tone {_(\d+)+}. A regular expression is used to extract continuous values.
    
    Time alignments are expected in the HTS labels. Here is an example of the HTS labels:
    
    3050000 3100000 xx~#-p+l=i:1_4/A/0_0_0/B/1-1-4:1-1&1-4#1-3$1-4>0-1<0-1|i/C/1+1+3/D/0_0/E/content+1:1+3&1+2#0+1/F/content_1/G/0_0/H/4=3:1=1&L-L%/I/0_0/J/4+3-1[2]

    3100000 3150000 xx~#-p+l=i:1_4/A/0_0_0/B/1-1-4:1-1&1-4#1-3$1-4>0-1<0-1|i/C/1+1+3/D/0_0/E/content+1:1+3&1+2#0+1/F/content_1/G/0_0/H/4=3:1=1&L-L%/I/0_0/J/4+3-1[3]

    3150000 3250000 xx~#-p+l=i:1_4/A/0_0_0/B/1-1-4:1-1&1-4#1-3$1-4>0-1<0-1|i/C/1+1+3/D/0_0/E/content+1:1+3&1+2#0+1/F/content_1/G/0_0/H/4=3:1=1&L-L%/I/0_0/J/4+3-1[4]

    3250000 3350000 xx~#-p+l=i:1_4/A/0_0_0/B/1-1-4:1-1&1-4#1-3$1-4>0-1<0-1|i/C/1+1+3/D/0_0/E/content+1:1+3&1+2#0+1/F/content_1/G/0_0/H/4=3:1=1&L-L%/I/0_0/J/4+3-1[5]

    3350000 3900000 xx~#-p+l=i:1_4/A/0_0_0/B/1-1-4:1-1&1-4#1-3$1-4>0-1<0-1|i/C/1+1+3/D/0_0/E/content+1:1+3&1+2#0+1/F/content_1/G/0_0/H/4=3:1=1&L-L%/I/0_0/J/4+3-1[6]

    3050000 3100000 are the starting and ending times.
    [2], [3], [4], [5], [6] are the HMM state indices.

    """
    
    # this subclass support HTS labels, which include time alignments
    
    def __init__(self, question_file_name=None, subphone_feats='full', continuous_flag=True):

        logger = logging.getLogger("labels")

        self.question_dict = {}
        self.ori_question_dict = {}
        self.dict_size = 0
        self.continuous_flag = continuous_flag
        try:    
#            self.question_dict, self.ori_question_dict = self.load_question_set(question_file_name)
            self.discrete_dict, self.continuous_dict = self.load_question_set_continous(question_file_name)
        except:
            logger.critical('error whilst loading HTS question set')
            raise
            
        ###self.dict_size = len(self.question_dict)
        
        self.dict_size = len(self.discrete_dict) + len(self.continuous_dict)
        self.subphone_feats = subphone_feats
        if self.subphone_feats == 'full':
            self.frame_feature_size = 9   ## zhizheng's original 5 state features + 4 phoneme features
        elif self.subphone_feats == 'state_only':
            self.frame_feature_size = 1   ## this is equivalent to a state-based system
        elif self.subphone_feats == 'minimal_frame':
            self.frame_feature_size = 2   ## the minimal features necessary to go from a state-level to frame-level model 
        else:
            sys.exit('Unknown value for subphone_feats: %s'%(subphone_feats))
        
        
        
        
        ##self.frame_feature_size = 9 
        ##question number + 5 state features + 4 phoneme features
        self.dimension = self.dict_size + self.frame_feature_size   
        
        ### if user wants to define their own input, simply set the question set to empty.
        if self.dict_size == 0:
            self.dimension = 0

        logger.debug('HTS-derived input feature dimension is %d + %d = %d' % (self.dict_size, self.frame_feature_size, self.dimension) )
        

#    def perform_normalisation(self, ori_file_list, output_file_list):
#        '''
#        converting discrete full context label to binary features and numerical features.
#        '''
#        utt_number = len(ori_file_list)
#        if utt_number != len(output_file_list):
#            print   "the number of input and output files should be the same!\n";
#            sys.exit(1)
                
#        for i in xrange(utt_number):
#            self.extract_linguistic_features(ori_file_list[i], output_file_list[i])
            

    def load_label_phone_alignment(self, file_name):

        # this is not currently used ???
        logger = logging.getLogger("labels")
        logger.critical('unused function ???')
        raise Exception
        
        assert self.dimension == self.dict_size+self.frame_feature_size
        
        # label_feature_matrix = numpy.empty((100000, self.dict_size+self.frame_feature_size))
        label_feature_matrix = numpy.empty((100000, self.dimension))

        label_feature_index = 0
        fid = open(file_name)
        for line in fid.readlines():
            line = line.strip()
            if len(line) < 1:
                continue
            temp_list = re.split('\s+', line)
            start_time = int(temp_list[0])
            end_time = int(temp_list[1])
            full_label = temp_list[2]

            # to do - support different frame shift - currently hardwired to 5msec
            frame_number = int((end_time - start_time)/50000)

            label_binary_vector = self.pattern_matching(full_label)
            current_block_binary_array = numpy.zeros((frame_number, self.dict_size+self.frame_feature_size))

            for i in xrange(frame_number):
                current_block_binary_array[i, 0:self.dict_size] = label_binary_vector
                current_block_binary_array[i, self.dict_size] = float(i+1)/float(frame_number)
                current_block_binary_array[i, self.dict_size+1] = float(frame_number - i)/float(frame_number)
                current_block_binary_array[i, self.dict_size+2] = float(frame_number)

            label_feature_matrix[label_feature_index:label_feature_index+frame_number,] = current_block_binary_array
            label_feature_index = label_feature_index + frame_number

        fid.close()

        label_feature_matrix = label_feature_matrix[0:label_feature_index,]

        return  label_feature_matrix



    def load_labels_with_state_alignment(self, file_name, add_frame_features=True): 
        ## add_frame_features not used in HTSLabelNormalisation -- only in XML version
 
        logger = logging.getLogger("labels")

        assert self.dimension == self.dict_size+self.frame_feature_size
        
        # label_feature_matrix = numpy.empty((100000, self.dict_size+self.frame_feature_size))
        label_feature_matrix = numpy.empty((100000, self.dimension))

        label_feature_index = 0

        state_number = 5

        lab_binary_vector = numpy.zeros((1, self.dict_size))
        fid = open(file_name)
        utt_labels = fid.readlines()
        fid.close()
        current_index = 0
        label_number = len(utt_labels)
        logger.info('loaded %s, %3d labels' % (file_name, label_number) )
		
        phone_duration = 0
        state_duration_base = 0
        for line in utt_labels:
            line = line.strip()
            
            if len(line) < 1:
                continue
            temp_list = re.split('\s+', line)
            start_time = int(temp_list[0])
            end_time = int(temp_list[1])
            full_label = temp_list[2]
            full_label_length = len(full_label) - 3  # remove state information [k]
            state_index = full_label[full_label_length + 1]
            
#            print state_index
            state_index = int(state_index) - 1
            state_index_backward = 6 - state_index
            full_label = full_label[0:full_label_length]

            frame_number = int((end_time - start_time)/50000)
            
            if state_index == 1:
                phone_duration = frame_number
                state_duration_base = 0
                
#                label_binary_vector = self.pattern_matching(full_label)
                label_binary_vector = self.pattern_matching_binary(full_label)

                # if there is no CQS question, the label_continuous_vector will become to empty
                label_continuous_vector = self.pattern_matching_continous_position(full_label) 
                label_vector = numpy.concatenate([label_binary_vector, label_continuous_vector], axis = 1)

                for i in xrange(state_number - 1):
                    line = utt_labels[current_index + i + 1].strip()
                    temp_list = re.split('\s+', line)
                    phone_duration += int((int(temp_list[1]) - int(temp_list[0]))/50000)

            current_block_binary_array = numpy.zeros((frame_number, self.dict_size+self.frame_feature_size))
            for i in xrange(frame_number):
                current_block_binary_array[i, 0:self.dict_size] = label_vector
		
                if self.subphone_feats == 'full':
                    ## Zhizheng's original 9 subphone features:
                    current_block_binary_array[i, self.dict_size] = float(i+1) / float(frame_number)   ## fraction through state (forwards)
                    current_block_binary_array[i, self.dict_size+1] = float(frame_number - i) / float(frame_number)  ## fraction through state (backwards)
                    current_block_binary_array[i, self.dict_size+2] = float(frame_number)  ## length of state in frames
                    current_block_binary_array[i, self.dict_size+3] = float(state_index)   ## state index (counting forwards)
                    current_block_binary_array[i, self.dict_size+4] = float(state_index_backward) ## state index (counting backwards)

                    current_block_binary_array[i, self.dict_size+5] = float(phone_duration)   ## length of phone in frames
                    current_block_binary_array[i, self.dict_size+6] = float(frame_number) / float(phone_duration)   ## fraction of the phone made up by current state
                    current_block_binary_array[i, self.dict_size+7] = float(phone_duration - i - state_duration_base) / float(phone_duration) ## fraction through phone (forwards)
                    current_block_binary_array[i, self.dict_size+8] = float(state_duration_base + i + 1) / float(phone_duration)  ## fraction through phone (backwards)
                    
                elif self.subphone_feats == 'state_only':
                    ## features which only distinguish state:
                    #current_block_binary_array[i, self.dict_size] = float(i+1) / float(frame_number)   ## fraction through state (forwards)
                    current_block_binary_array[i, self.dict_size] = float(state_index)   ## state index (counting forwards)
                    
                elif self.subphone_feats == 'minimal_frame':
                    ## features which distinguish state and minimally frame position in state:
                    current_block_binary_array[i, self.dict_size] = float(i+1) / float(frame_number)   ## fraction through state (forwards)
                    current_block_binary_array[i, self.dict_size+1] = float(state_index)   ## state index (counting forwards)
                else:
                    sys.exit('unknown subphone_feats type')


            label_feature_matrix[label_feature_index:label_feature_index+frame_number,] = current_block_binary_array
            label_feature_index = label_feature_index + frame_number

            state_duration_base += frame_number
            
            current_index += 1

        label_feature_matrix = label_feature_matrix[0:label_feature_index,]
        logger.debug('made label matrix of %d frames x %d labels' % label_feature_matrix.shape )
        return  label_feature_matrix


    ### this function is not used now
    def pattern_matching(self, label):
        # this function is where most time is spent during label preparation
        #
        # it might be possible to speed it up by using pre-compiled regular expressions?
        # (not trying this now, since we may change to to XML tree format for input instead of HTS labels)
        #
        label_size = len(label)

        lab_binary_vector = numpy.zeros((1, self.dict_size))

        for i in xrange(self.dict_size):
            current_question_list = self.question_dict[str(i)]
            binary_flag = 0
            for iq in xrange(len(current_question_list)):
                current_question = current_question_list[iq]
                current_size = len(current_question)
                if current_question[0] == '*' and current_question[current_size-1] == '*':
                    temp_question = current_question[1:current_size-1]
                    for il in xrange(1, label_size-current_size+2):
                        if temp_question == label[il:il+current_size-2]:
                            binary_flag = 1
                elif current_question[current_size-1] != '*':
                    temp_question = current_question[1:current_size]
                    if temp_question == label[label_size-current_size+1:label_size]:
                        binary_flag = 1
                elif current_question[0] != '*':
                    temp_question = current_question[0:current_size-1]
                    if temp_question == label[0:current_size-1]:
                        binary_flag = 1
                if binary_flag == 1:
                    break
            lab_binary_vector[0, i] = binary_flag
        
        return  lab_binary_vector
        
    def pattern_matching_binary(self, label):
        
        dict_size = len(self.discrete_dict)
        lab_binary_vector = numpy.zeros((1, dict_size))
        
        for i in xrange(dict_size):
            current_question_list = self.discrete_dict[str(i)]
            binary_flag = 0
            for iq in xrange(len(current_question_list)):
                current_compiled = current_question_list[iq]
                
                ms = current_compiled.search(label)
                if ms is not None:
                    binary_flag = 1
                    break
            lab_binary_vector[0, i] = binary_flag
            
        return   lab_binary_vector
        

    def pattern_matching_continous_position(self, label):
        
        dict_size = len(self.continuous_dict)

        lab_continuous_vector = numpy.zeros((1, dict_size))
        
        for i in xrange(dict_size):
            continuous_value = -1.0

            current_compiled = self.continuous_dict[str(i)]
            
            ms = current_compiled.search(label)
            if ms is not None:
#                assert len(ms.group()) == 1
                continuous_value = ms.group(1)
            
            lab_continuous_vector[0, i] = continuous_value

        return  lab_continuous_vector
        
    def load_question_set(self, qs_file_name):
        """Load an HTS question file without compiling the patterns.

        Lines of five characters or fewer are skipped.  For every other line
        the comma-separated patterns between the first ``{`` and the following
        ``}`` are collected.

        Returns ``(question_dict, ori_question_dict)``; both are keyed by the
        question number as a string.  ``question_dict`` holds the list of
        patterns, ``ori_question_dict`` the complete original line.

        Raises IndexError when a (long enough) line contains no ``{``.
        """
        question_dict = {}
        ori_question_dict = {}

        # the with-block closes the file even when a malformed line raises
        with open(qs_file_name) as fid:
            for line in fid:
                line = line.replace('\n', '')
                if len(line) <= 5:
                    continue
                key = str(len(question_dict))
                pattern_text = line.split('{')[1].split('}')[0]
                question_dict[key] = pattern_text.split(',')
                ori_question_dict[key] = line

        logger = logging.getLogger("labels")
        logger.debug('loaded question set with %d questions' % len(question_dict))

        return  question_dict, ori_question_dict


    def load_question_set_continous(self, qs_file_name):
        
        logger = logging.getLogger("labels")
        
        fid = open(qs_file_name)
        binary_qs_index = 0
        continuous_qs_index = 0
        binary_dict = {}
        continuous_dict = {}
        
        for line in fid.readlines():
            line = line.replace('\n', '')
            
            if len(line) > 5:
                temp_list = line.split('{')
                temp_line = temp_list[1]
                temp_list = temp_line.split('}')
                temp_line = temp_list[0]
                temp_line = temp_line.strip()
                question_list = temp_line.split(',')
                            
                temp_list = line.split(' ')
                question_key = temp_list[1]
#                print   line
                if temp_list[0] == 'CQS':
                    assert len(question_list) == 1
                    processed_question = self.wildcards2regex(question_list[0], convert_number_pattern=True)
                    continuous_dict[str(continuous_qs_index)] = re.compile(processed_question) #save pre-compiled regular expression 
                    continuous_qs_index = continuous_qs_index + 1
                elif temp_list[0] == 'QS':
                    re_list = []
                    for temp_question in question_list:
                        processed_question = self.wildcards2regex(temp_question)
#                        print   processed_question
                        re_list.append(re.compile(processed_question))
                        
                    binary_dict[str(binary_qs_index)] = re_list
                    binary_qs_index = binary_qs_index + 1
                else:
                    logger.critical('The question set is not defined correctly: %s' %(line))
                    raise Exception
                
#                question_index = question_index + 1       
        return  binary_dict, continuous_dict


[docs]    def wildcards2regex(self, question, convert_number_pattern=False):
        """
        Convert HTK-style question into regular expression for searching labels.
        If convert_number_pattern, keep the following sequences unescaped for 
        extracting continuous values):
            (\d+)       -- handles digit without decimal point
            ([\d\.]+)   -- handles digits with and without decimal point
        """
        
        ## handle HTK wildcards (and lack of them) at ends of label:
        if '*' in question:
            if not question.startswith('*'):
                question = '\A' + question
            if not question.endswith('*'):
                question = question + '\Z'
        question = question.strip('*')
        question = re.escape(question)
        ## convert remaining HTK wildcards * and ? to equivalent regex:
        question = question.replace('\\*', '.*')
        question = question.replace('\\?', '.')

        if convert_number_pattern:
            question = question.replace('\\(\\\\d\\+\\)', '(\d+)')
            question = question.replace('\\(\\[\\\\d\\\\\\.\\]\\+\\)', '([\d.]+)')
        return question
                        
#  -----------------------------












class XMLLabelNormalisation(LabelNormalisation):
    
    # this subclass supports XML trees (from Ossian) with time alignments embedded as features of the nodes
        
    def __init__(self, xpath_file_name=None, xpath=None, mapper=None, get_frame_feats=False, target_nodes = "//segment", fill_missing_values=False, use_compiled_xpath=False, iterate_over_frames=False):

        logger = logging.getLogger("labels")

        # specify which nodes in the loaded XML trees will be the targets for the xpaths used to extract features
        self.target_nodes = target_nodes

        self.use_compiled_xpath = use_compiled_xpath
        self.iterate_over_frames = iterate_over_frames
        
        
        if self.use_compiled_xpath:
            ## osw -- compile these once per normaliser -- could do it only once??
            self.start_time_xpath = etree.XPath('./attribute::start')
            self.end_time_xpath   = etree.XPath('./attribute::end')
        else:
            # how to retreive timings
            self.start_time_xpath = './attribute::start'
            self.end_time_xpath   = './attribute::end'

        # to do - make this user-settable via the config file ?
        self.unseen_string='_UNSEEN_'

        self.xpath_dict = [] ## NB 'dict' is now list to ensure feature order
        
        # should rename this variable - it's the feature dimensionality (excluding frame-level features)
        self.dict_size = 0

        # behaviour regarding filling in values for missing frames
        self.fill_missing_values = fill_missing_values
        

        # can read XPATHs from a file OR accept a single XPATH (but not both).
        # In the case of a single XPath, this can be a string or compiled version.
        
        
        # xpath now a list of xpaths [(name, xpath),(name, xpath)...] and
        #      a same length list of mappers -- entries should be None where
        #      feature isn't to be mapped.
        if xpath_file_name:
            assert not xpath
            assert not self.use_compiled_xpath
            self.xpath_dict = self.load_xpath_set(xpath_file_name)
            
            # if we are using a list of XPATHs, then
            # the dictionary size will determine the number of features (excluding frame-level features)
            # each xpath expression will extract a single feature
            self.dict_size = len(self.xpath_dict)
            logger.debug('using XPATH list - feature dimension will be %d' % self.dict_size)
            
            # to be implemented later: using mapping when there is a list of XPATHs
            # for now, do not allow a mapper in this case
            assert not mapper
            
        elif xpath:    ## osw -- now a list
            assert type(xpath) == list,'xpath must be a list of xpath expressions'
            assert not xpath_file_name
            self.xpath_dict = [('dummy_name', xp) for xp in xpath] #   [('dummy_name', xpath)]
            
            self.mapper = [None] * len(self.xpath_dict)
            if mapper:
                self.mapper = mapper
            assert len(self.mapper) == len(self.xpath_dict)
            
            self.dict_size = 0
            for map in self.mapper:
            # if we are using a single XPATH, then the number of features (excluding frame-level features)
            # is either 1, or is determined by the mapper
                if map:
                # pick an arbitrary item in the mapper and measure the length of the feature vector it will provide
                    
                    self.dict_size += len(map.itervalues().next())
                    logger.debug('using mapping - increment feature dimension by %d' % self.dict_size)
                else:
                    self.dict_size += 1
                    logger.debug('no mapping - increment feature dimension by 1')
        
     
                
            
        else:
            logger.critical('must provide one or more XPATHs')
            raise Exception
            


        
        # no access to sub-phonetic alignments in Ossian trees ???
        self.frame_feature_size = 0
        self.dimension = self.dict_size + self.frame_feature_size   

        logger.debug('XPATH feature dimension is %d + %d = %d' % (self.dict_size, self.frame_feature_size, self.dimension) )

        
    def convert_time_to_frames(self, t, frame_shift_ms=5):
        """Convert a time in milliseconds into a (rounded) number of frames.

        t              -- time as stored in the XML trees (milliseconds)
        frame_shift_ms -- duration of one frame in milliseconds (default 5)

        The division is always carried out in floating point, so the result is
        rounded to the nearest frame by numpy.rint for integer input too.
        """
        return numpy.rint(t / float(frame_shift_ms))

    def convert_frames_to_time(self, f, frame_shift_ms=5):
        """Convert a number of frames into a time in milliseconds.

        f              -- number of frames
        frame_shift_ms -- duration of one frame in milliseconds (default 5)

        The product is rounded to the nearest integer value with numpy.rint.
        """
        return numpy.rint(f * frame_shift_ms)

    def load_labels_with_state_alignment(self, file_name_or_descriptor, add_frame_features=False):

        logger = logging.getLogger("labels")

        logger.debug('extracting features using this dictionary of XPATHs:')
        logger.debug('%s' % self.xpath_dict)
        
        assert self.dimension == self.dict_size+self.frame_feature_size

        if add_frame_features:
            self.dimension += 1 ## TODO: remove hard coding

        label_feature_matrix = numpy.empty((100000, self.dimension))
        
        # fill the label_feature_matrix with a special value that we can test for later
        label_feature_matrix.fill(numpy.nan)
        
        # label_feature_index = 0
        # state_number = 5

        # each frame will be a vector of features
        lab_binary_vector = numpy.zeros((1, self.dict_size))




        # load XML format labels from file_name
        
        ## Set the UtteranceElement Element as a default element class
        ## (http://lxml.de/element_classes.html):
        # not yet sure if we need to do this ???
        # parser_lookup = etree.ElementDefaultClassLookup(element=UtteranceElement)

        ## Ensure pretty printing
        ## (http://lxml.de/FAQ.html#why-doesn-t-the-pretty-print-option-reformat-my-xml-output):
        parser = XMLParser(remove_blank_text=True)
        # parser.set_element_class_lookup(parser_lookup)

        if type(file_name_or_descriptor) == file:
            fid = file_name_or_descriptor
            # rewind the file - it may have been left open from a previous call to this function
            fid.seek(0)
        else:
            try:
                fid = open(file_name_or_descriptor)
            except IOError:
                logger.critical('failed to open file %s' % file_name_or_descriptor)
                raise

        try:    
            tree = parse(fid, parser)
        except lxml.etree.XMLSyntaxError:
            logger.critical('failed to parse file %s' % file_name_or_descriptor)
            raise
            
        if type(file_name_or_descriptor) != file:
            fid.close()

    
        # the target nodes in the XML tree
        # e.g., segments or sub-phonetic states
        # for each of these, we will extract features using the xpath expressions
        targets = tree.getroot().xpath(self.target_nodes)
        
        # make sure there are some targets
        if len(targets)==0:
            logger.critical('pattern %s matches no nodes of utterance %s' % (target_nodes, file_name_or_descriptor) )
            raise Exception

        # iterate over the target nodes (e.g., segments or states)
        label_number = len(targets)
        total_number_of_frames=-1
        frame_num = 0  # osw
        previous_end_time = 0 ## to check target nodes are contiguous segments 
        for node in targets:
            
            logger.debug('extracting features for node %s' % node)

            try:
                if self.use_compiled_xpath:
                    this_segment_start_time = int(self.start_time_xpath(node)[0])
                    this_segment_end_time   = int(self.end_time_xpath(node)[0])            
                else:
                    this_segment_start_time = int(node.xpath(self.start_time_xpath)[0])
                    this_segment_end_time   = int(node.xpath(self.end_time_xpath)[0])
                    logger.debug(' start time: %d    end time: %d' % (this_segment_start_time,this_segment_end_time))
                    assert this_segment_start_time == previous_end_time,'segments not contiguous'
                    previous_end_time = this_segment_end_time
            except:
                logger.critical('problem obtaining start or end time for: %s' % node)
                raise

            segment_data = []
            for ((name, path), map) in zip(self.xpath_dict, self.mapper):
                if self.use_compiled_xpath:
                    pathstring = path.path
                    logger.debug(' evaluating PRECOMPILED xpath %s' % pathstring)
                    try:
                
                        data = path(node)
                    except lxml.etree.XPathEvalError:
                        logger.critical('problem evaluating this precompiled XPATH: %s' % pathstring)
                        raise
                
                else:  
                    pathstring = path   
                    #print 'evaluating PLAIN xpath'         
                    logger.debug(' evaluating xpath %s' % pathstring)
                    try:
                        data = node.xpath(path)
                    except lxml.etree.XPathEvalError:
                        logger.critical('problem evaluating this XPATH: %s' % pathstring)
                        raise


                if data == []:
                    # this means that nothing was matched by the XPATH
                    # so we construct a default value here (following the method used within Ossian)
                    # this will happen (only?) beyond utterance boundaries, where padding is required

## OSW: for now, don't handle padding with padding attributes -- just use _NA_ 
## This means that vectors etc have to be handled with a mapper rather than having 
## their features contained within the XML trees.

#                     fragments = re.split("[/:@\(\)]+", pathstring)  
#                     attribute_name = fragments[-1]          
#                     path_for_padding='ancestor::utt[1]/attribute::%s_PADDING'%(attribute_name)
#                     data = node.xpath(path_for_padding)
                    data = []
                    if data == []:
                        ## No padding attribute was found, use the default _NA_:
#                         logger.warning('failed to find a padding value')
#                         logger.warning(' original XPATH: %s' % pathstring)
#                         logger.warning(' attribute_name: %s' % attribute_name)
#                         logger.warning('  padding XPATH: %s' % path_for_padding)
                        data = ["_NA_"]
                
                if type(data) == list:
                    # currently we do not support lists of features stored in the trees
                    # only single values (which may be returned as a list with a single entry)
                    try:
                        assert len(data) == 1
                    except AssertionError:
                        logger.critical('data extracted using XPATH %s is a list with too many (%d) elements' % (path,len(data)) )
                        raise
                else:
                    # make it into a list with a single entry, because everything after this point
                    # assumes that data is a list of items
                    data=[data]
                    
                # if we are using a mapping, apply it now
                if map != None:  ## 
                    try:
                        data = map[data[0]]
                    except KeyError:
                        data = map[self.unseen_string]
                    except:
                        logger.critical('failed to map %s using mapper %s' % (data,map) )
                        raise
                
                elif (type(data[0]) == lxml.etree._ElementStringResult) or (type(data[0]) == str) \
                                                                or type(data[0] == float): 
                    # any strings coming out of the tree (that have not been mapped already)
                    # must now be co-erced to a numerical value
                    # note that data will always be a list with one entry, by this point.
                    # osw: might already be float -- values returned e.g. by xpath('count(...)') are floats
                    
                    
                    try:
                        data[0] = float(data[0])
                    except ValueError:
                        logger.critical('could not convert %s to a numerical value - problem with tree or mapping?' % data[0])
                        raise
                else:
                    logger.critical('internal error - data of unsupported type %s with value %s' % (type(data[0]) , data[0]) )
                    raise
                    
                    
                # at this point, data is a list of values

                # quick sanity check - just on the first item in the list
                try:
                    assert type(data[0]) in [int, float, bool]
                except AssertionError:
                    logger.critical('data extracted using XPATH %s is of unsupported type %s' % (path,type(data[0])) )
                    raise
                
                logger.debug(' features were %s' % data)

                segment_data.extend(data)

            logger.debug(' stacked features were %s' % segment_data)

            # we now have a numerical feature vector for the current segment
            # and all we need to do is insert those values into the corresponding frames
            # from the start time to the end time of that segment

            # there are potential problems here converting between times and frame
            # currently, we tick through time in whatever units the XML trees use (msec)
            # and so will write to the same frame many times (wasteful, but avoids skipping any frames)
            # TO DO - be more careful and test for skipped frames 
            
            state_length_ms = this_segment_end_time - this_segment_start_time 
            state_length_frame = self.convert_time_to_frames(state_length_ms)   
          
            if self.iterate_over_frames:         
                for t in xrange(int(state_length_frame)):

                    t_in_frames = frame_num

                    frame_num += 1
                    if add_frame_features:  ## osw
                        since_state_start = t / float(state_length_frame) ## just in one direction -- 2 is redundant
                        #frames_till_state_end = state_length_frame - frames_since_state_start
                        extended_data = segment_data + [state_length_frame, since_state_start]
                    else:
                        extended_data = segment_data
                    assert len(extended_data) == self.dimension,'%s %s'%(len(extended_data),self.dimension)
                    label_feature_matrix[t_in_frames,] = extended_data
                total_number_of_frames = max(total_number_of_frames,t_in_frames)
            else:
                ###             
                for t in xrange(this_segment_start_time,this_segment_end_time):
                    # add 1 to the time in milliseconds before converting (and rounding) to frames
                    # this is to ensure the last frame is written to
                    # there may still be some offset between times and frames - STILL TO VERIFY THIS IS CORRECT
                
                    ## osw: time stamps obtained from HVite, -o flag lets you change how
                    # stamps are made, I used the default (HTKBook):
                    # 'By default start times are set to the start time of the frame and 
                    # end times are set to the end time of the frame.'
                
                    t_in_frames = self.convert_time_to_frames(t+1)
                
                    if add_frame_features:  ## osw -- these features are NOT CORRECT: t_in_frames is 
                                    ## time since utt start, not state start
                        frames_since_state_start = t_in_frames
                        frames_till_state_end = state_length_frame - frames_since_state_start
                        extended_data = segment_data + [state_length_frame, frames_since_state_start, frames_till_state_end]
                    else:
                        extended_data = segment_data
                
                    assert len(extended_data) == self.dimension
                
                    label_feature_matrix[t_in_frames,] = extended_data

                    # print "wrote into frame %d (time=%d)" % (t_in_frames,t)

                total_number_of_frames = max(total_number_of_frames,t_in_frames)
                # print "num frames=",total_number_of_frames
                
        logger.debug('loaded %s, %3d elements matching %s' % (file_name_or_descriptor, label_number, self.target_nodes) )

        total_number_of_frames += 1  ## osw: wrote last frame to row with this index -- need to increment
                                     ## to catch this row in the slice that follows

        # trim the matrix to the correct size
        label_feature_matrix = label_feature_matrix[0:total_number_of_frames,]
        
        
        # optionally, set the values for any frames that were missed thus far
        # (this will be because they were missing in the XML tree for some reason)
        #
        # osw -- checked for contiguous segments above -- can skip this 
        if self.fill_missing_values and numpy.isnan(label_feature_matrix).any():
            # currently naive - just fills with _UNSEEN_ (if there is a mapper), or zero (if there is no mapper)
            if self.mapper:
                data = self.mapper[self.unseen_string]
                # replace entire row if any element in that row is nan
                nan_rows=numpy.isnan(label_feature_matrix).any(axis=1)
                logger.debug('before:\n%s' % label_feature_matrix[nan_rows])
                label_feature_matrix[nan_rows,] = data
                logger.debug('after:\n%s' % label_feature_matrix[nan_rows])
            else:
                data=0
                label_feature_matrix[numpy.isnan(label_feature_matrix)] = data

            logger.debug('XPATH %s left some frames undefined; filled them with %s' % (path,data) )

        
        try:
            assert not numpy.isnan(label_feature_matrix).any()
        except AssertionError:
            logger.critical('XPATH %s left some frames undefined' % path )
            # work out which frames this occurred in, then log that information
            l=label_feature_matrix.shape[0]
            frame_times = [self.convert_frames_to_time(f) for f in range(1,l+1)]
            # append frame times as the first column of the matrix, just for logging purposes
            debug_label_feature_matrix = numpy.hstack( (numpy.array(frame_times).reshape((l,1)) , label_feature_matrix) )
            logger.critical(' here are the problem frames with timings:\n%s' % debug_label_feature_matrix[numpy.isnan(debug_label_feature_matrix).any(axis=1)])
            raise
        
        logger.debug('made label matrix of %d frames x %d labels' % label_feature_matrix.shape )
        return label_feature_matrix


    def load_xpath_set(self, xpath_file_name):
        """Read a file of named XPATH expressions.

        Each useful line has the form ``name = xpath``, for example::

            l_segment_vsm_d1 =      preceding::segment[1]/attribute::segment_vsm_d1

        Lines that start with ``#`` or that contain no ``=`` are ignored.
        A line is split at its first ``=`` only, so the XPATH itself may
        contain ``=``.  All space characters are removed from both the name
        and the XPATH.

        :param xpath_file_name: path of the text file to read
        :returns: list of ``(question_name, xpath)`` tuples, in file order
        :raises IOError: if the file cannot be opened (a critical message is
            logged before the exception is re-raised)
        """

        logger = logging.getLogger("labels")

        try:
            fid = open(xpath_file_name)
        except IOError:
            logger.critical('failed to open XPATHs file %s' % xpath_file_name)
            raise

        # an ordered list of (name, xpath) pairs, one per accepted line
        xpath_list = []

        # the context manager closes the file even if parsing fails part-way
        with fid:
            for line in fid:
                # drop the line terminator; '\r' is stripped as well so that a
                # file with Windows line endings does not leave a stray
                # carriage return at the end of the XPATH
                line = line.rstrip('\r\n')

                # skip comment lines and anything that is not an assignment
                if line.startswith('#') or '=' not in line:
                    continue

                # split on the first '=' only: the XPATH may itself contain '='
                question_name, xpath = line.split('=', 1)
                question_name = question_name.replace(' ', '')
                xpath = xpath.replace(' ', '')
                logger.debug('loaded question %s with XPATH %s' % (question_name, xpath))

                xpath_list.append((question_name, xpath))

        logger.debug('loaded XPATH set with %d paths' % len(xpath_list))

        return xpath_list


if __name__ == '__main__':
    # Ad-hoc driver: build an HTS label normaliser from a question file,
    # run it over a single hard-coded label file, and print the resulting
    # `dimension` attribute of the normaliser.
    question_file = './questions_dnn.hed'
    print(question_file)

    input_label_files = ['/group/project/dnn_tts/lstm/fresh/00001.lab']
    normalised_label_files = ['/group/project/dnn_tts/00001.lab']

    normaliser = HTSLabelNormalisation(question_file)
    normaliser.perform_normalisation(input_label_files, normalised_label_files)
    print(normaliser.dimension)