/// <summary>
/// Reads one candidate group from the reader: a header line followed by its candidate lines.
/// </summary>
/// <param name="reader">The StreamReader positioned at the header line of a candidate group.</param>
/// <param name="sentenceSet">Sentence set in which the listed candidates are looked up.</param>
public void Load(StreamReader reader, TrainingSentenceSet sentenceSet)
{
    if (reader.EndOfStream)
    {
        throw new InvalidDataException("Unexpected end of stream");
    }

    char[] separators = new[] { ' ' };

    // The header line carries exactly three fields: group name, group id and candidate count.
    string header = reader.ReadLine();
    string[] headerFields = header.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    if (headerFields.Length != 3)
    {
        throw new InvalidDataException(Helper.NeutralFormat("Unsupported format here \"{0}\"", header));
    }

    Name = headerFields[0];
    Id = int.Parse(headerFields[1]);

    // Counts down only on non-empty lines, so blank lines between candidates are skipped.
    int remaining = int.Parse(headerFields[2]);
    while (remaining != 0)
    {
        if (reader.EndOfStream)
        {
            throw new InvalidDataException("Unexpected end of stream");
        }

        string entry = reader.ReadLine();
        if (!string.IsNullOrEmpty(entry))
        {
            // Each candidate line is expected to look like:
            // sentID IndexOfNonSilence FullContextLabel MustHoldFlag
            string[] fields = entry.Split(separators, StringSplitOptions.RemoveEmptyEntries);
            if (fields.Length != 4)
            {
                throw new InvalidDataException(Helper.NeutralFormat("Unsupported format here \"{0}\"", entry));
            }

            UnitCandidate found = FindCandidate(sentenceSet, fields[0], int.Parse(fields[1]), fields[2]);

            // The flag is only ever raised here; a "false" value leaves MustHold untouched.
            if (bool.Parse(fields[3]))
            {
                found.MustHold = true;
            }

            _candidates.Add(found);
            --remaining;
        }
    }
}
/// <summary>
/// Extracts features for every sentence listed in the file list map and collects them in a training sentence set.
/// Sentences whose processing raises an <see cref="InvalidDataException"/> are logged, removed from
/// <paramref name="script"/> and, after the loop, removed from <paramref name="fileListMap"/>.
/// </summary>
/// <param name="script">
/// The xml script file; its ItemDic is indexed by each sentence id of the file list map.
/// </param>
/// <param name="fileListMap">
/// The file list map that supplies the sentence ids and builds the file paths.
/// </param>
/// <param name="alignmentDir">
/// The alignment directory, from which the "txt" segmentation files are loaded.
/// </param>
/// <param name="waveDir">
/// The wave directory, from which the waveform files are loaded.
/// </param>
/// <returns>
/// The training sentence set holding one extracted sentence per successfully processed id.
/// </returns>
/// <exception cref="ArgumentNullException">
/// Thrown when any of the four arguments is null.
/// </exception>
public TrainingSentenceSet Extract(XmlScriptFile script, FileListMap fileListMap, string alignmentDir, string waveDir)
{
    if (script == null)
    {
        throw new ArgumentNullException("script");
    }

    if (fileListMap == null)
    {
        throw new ArgumentNullException("fileListMap");
    }

    if (alignmentDir == null)
    {
        throw new ArgumentNullException("alignmentDir");
    }

    if (waveDir == null)
    {
        throw new ArgumentNullException("waveDir");
    }

    TrainingSentenceSet sentenceSet = new TrainingSentenceSet { FileListMap = fileListMap };

    // Ids of failed sentences; removed from the map only after the enumeration has finished.
    List<string> errList = new List<string>();

    foreach (string sid in fileListMap.Map.Keys)
    {
        ScriptItem item = script.ItemDic[sid];

        try
        {
            // Loads the segmentation file.
            SegmentFile segmentFile = new SegmentFile();
            segmentFile.Load(fileListMap.BuildPath(alignmentDir, sid, "txt"));

            // Loads the waveform file to set the end time of the last segmentation.
            WaveFile waveFile = new WaveFile();
            waveFile.Load(fileListMap.BuildPath(waveDir, sid, FileExtensions.Waveform));
            segmentFile.WaveSegments[segmentFile.WaveSegments.Count - 1].EndTime = waveFile.Duration;

            // Extracts the single script item.
            Sentence sentence = Extract(item, segmentFile);
            sentence.TrainingSet = sentenceSet;
            sentenceSet.Sentences.Add(sid, sentence);
        }
        catch (InvalidDataException e)
        {
            // Only invalid data is treated as recoverable; any other exception propagates untouched.
            // Removes the error sentences.
            Logger.Log(Helper.BuildExceptionMessage(e));
            script.Remove(sid);
            errList.Add(sid);
        }
    }

    fileListMap.RemoveItems(errList);

    return sentenceSet;
}
/// <summary>
/// Initializes a new instance of the PreSelectionData class from a decision forest and a sentence set.
/// One empty candidate group is created per leaf node of the forest, then every non-silence candidate
/// of the sentence set is added to the group named after the leaf node it is filtered to.
/// </summary>
/// <param name="forest">The decision forest whose leaf nodes define the candidate groups.</param>
/// <param name="sentenceSet">The sentence set whose candidates are distributed into the groups.</param>
/// <param name="fullFeatureNameSet">The feature name set assigned to each non-silence candidate label before filtering.</param>
/// <exception cref="ArgumentNullException">Thrown when any argument is null.</exception>
/// <exception cref="InvalidDataException">Thrown when a candidate group receives no candidate.</exception>
public PreSelectionData(DecisionForest forest, TrainingSentenceSet sentenceSet, LabelFeatureNameSet fullFeatureNameSet)
{
    if (forest == null)
    {
        throw new ArgumentNullException("forest");
    }

    if (sentenceSet == null)
    {
        throw new ArgumentNullException("sentenceSet");
    }

    if (fullFeatureNameSet == null)
    {
        throw new ArgumentNullException("fullFeatureNameSet");
    }

    _decisionForest = forest;
    _sentenceSet = sentenceSet;
    _nameIndexedCandidateGroup = new Dictionary<string, CandidateGroup>();

    // One empty group per leaf node; the id is the number of groups registered so far.
    foreach (DecisionTree decisionTree in forest.TreeList)
    {
        foreach (DecisionTreeNode leaf in decisionTree.LeafNodeMap.Values)
        {
            CandidateGroup emptyGroup = new CandidateGroup();
            emptyGroup.Name = leaf.Name;
            emptyGroup.Id = _nameIndexedCandidateGroup.Count;
            _nameIndexedCandidateGroup.Add(emptyGroup.Name, emptyGroup);
        }
    }

    // Walk the training sentences and drop each non-silence candidate into its group.
    foreach (Sentence sentence in sentenceSet.Sentences.Values)
    {
        foreach (UnitCandidate candidate in sentence.Candidates)
        {
            if (candidate.SilenceCandidate)
            {
                continue;
            }

            candidate.Label.FeatureNameSet = fullFeatureNameSet;

            // The tree is selected by name equality with the candidate.
            DecisionTree[] matchingTrees = forest.TreeList.Where(t => t.Name == candidate.Name).ToArray();
            Debug.Assert(
                matchingTrees.Length == 1,
                Helper.NeutralFormat("Invalidated: More than 1 {0} Preselection tree are linked to unit {1}", matchingTrees.Length, candidate.Name));

            // Filtering starts from the first node in the tree's node list.
            DecisionTreeNode startNode = matchingTrees[0].NodeList[0];
            DecisionTreeNode reachedLeaf = DecisionForestExtension.FilterTree(startNode, forest.Questions, candidate.Label);
            Debug.Assert(
                reachedLeaf != null,
                Helper.NeutralFormat("cannot find leaf node for candidate {0} in sentence {1}", candidate.Name, sentence.Id));

            _nameIndexedCandidateGroup[reachedLeaf.Name].Candidates.Add(candidate);
        }
    }

    // Every group must have received at least one candidate.
    foreach (CandidateGroup filledGroup in _nameIndexedCandidateGroup.Values)
    {
        if (filledGroup.Candidates.Count > 0)
        {
            continue;
        }

        throw new InvalidDataException(
            Helper.NeutralFormat("There is no candidate in candidate group \"{0}\"", filledGroup.Name));
    }
}
/// <summary>
/// Loads the pre-selection data from text files: the decision forest from one file and the
/// candidate groups from another, then cross-checks the two against each other.
/// </summary>
/// <param name="forestFile">The file name of decision forest.</param>
/// <param name="candidateGroupFile">The file name of candidate group data.</param>
/// <param name="sentenceSet">The sentence set in which the candidates of each group are looked up.</param>
/// <exception cref="InvalidDataException">
/// Thrown when a leaf node has no candidate group, when the group ids fail the comparison against
/// 0..N-1, or when the number of groups differs from the number of leaf nodes.
/// </exception>
public void LoadFromText(string forestFile, string candidateGroupFile, TrainingSentenceSet sentenceSet)
{
    _sentenceSet = sentenceSet;

    _decisionForest = new DecisionForest("pre-selection");
    _decisionForest.Load(forestFile);

    // Read candidate groups back to back until the file is exhausted.
    using (StreamReader groupReader = new StreamReader(candidateGroupFile))
    {
        while (!groupReader.EndOfStream)
        {
            CandidateGroup loadedGroup = new CandidateGroup();
            loadedGroup.Load(groupReader, sentenceSet);
            _nameIndexedCandidateGroup.Add(loadedGroup.Name, loadedGroup);
        }
    }

    // Each leaf node must have a candidate group of the same name.
    int leafNodeTotal = 0;
    foreach (DecisionTree decisionTree in _decisionForest.TreeList)
    {
        leafNodeTotal += decisionTree.LeafNodeMap.Count;

        foreach (DecisionTreeNode leaf in decisionTree.LeafNodeMap.Values)
        {
            if (_nameIndexedCandidateGroup.ContainsKey(leaf.Name))
            {
                continue;
            }

            throw new InvalidDataException(
                Helper.NeutralFormat("Mismatched between file \"{0}\" and \"{1}\"", forestFile, candidateGroupFile));
        }
    }

    // The group ids are compared against the sequence 0, 1, ..., N-1.
    List<int> expectedIds = Enumerable.Range(0, _nameIndexedCandidateGroup.Count).ToList();
    int[] loadedIds = _nameIndexedCandidateGroup.Select(pair => pair.Value.Id).ToArray();
    bool idsMatch = Helper.Compare(expectedIds, loadedIds, true);
    if (!idsMatch)
    {
        throw new InvalidDataException("The candidate group id should be continuous and starts with zero");
    }

    // The count of candidate groups must equal the count of leaf nodes.
    if (leafNodeTotal != _nameIndexedCandidateGroup.Count)
    {
        throw new InvalidDataException(
            Helper.NeutralFormat("Mismatched between file \"{0}\" and \"{1}\"", forestFile, candidateGroupFile));
    }
}
/// <summary>
/// Gathers every label file found in a directory into one sentence set and saves it as a master label file.
/// </summary>
/// <param name="alignmentDir">The directory that is searched for label files.</param>
/// <param name="mlfFileName">The name of the master label file to write.</param>
/// <exception cref="InvalidDataException">Thrown when loading a label file returns true.</exception>
public static void ConvertLabelFilesToMlf(string alignmentDir, string mlfFileName)
{
    TrainingSentenceSet sentenceSet = new TrainingSentenceSet();

    string searchPattern = Helper.NeutralFormat("*.{0}", FileExtensions.LabelFile);
    string[] labelFiles = Directory.GetFiles(alignmentDir, searchPattern);

    foreach (string labelFile in labelFiles)
    {
        Sentence loaded = new Sentence();

        using (StreamReader labelReader = new StreamReader(labelFile))
        {
            bool loadResult = loaded.Load(labelReader);
            if (loadResult)
            {
                throw new InvalidDataException("Sentence end is not expected");
            }
        }

        // The sentence id is the label file name without directory and extension.
        sentenceSet.Sentences.Add(Path.GetFileNameWithoutExtension(labelFile), loaded);
    }

    sentenceSet.Save(mlfFileName, LabelTypeOptions.FullContext, LabelAlignOptions.StateAlign, true);
}
/// <summary>
/// Looks up a candidate by sentence id and non-silence index, and checks that its central phoneme
/// agrees with the one of the supplied label text.
/// </summary>
/// <param name="sentenceSet">The sentence set to search.</param>
/// <param name="sentId">The id of the sentence that should contain the candidate.</param>
/// <param name="indexOfNonSilence">The non-silence unit index the candidate must have.</param>
/// <param name="label">The label text whose central phoneme is compared with the candidate's.</param>
/// <returns>The first candidate of the sentence with the requested non-silence index.</returns>
/// <exception cref="InvalidDataException">
/// Thrown when the sentence or the candidate cannot be found, or when the central phonemes differ.
/// </exception>
private static UnitCandidate FindCandidate(TrainingSentenceSet sentenceSet, string sentId, int indexOfNonSilence, string label)
{
    if (!sentenceSet.Sentences.ContainsKey(sentId))
    {
        throw new InvalidDataException(Helper.NeutralFormat("Cannot find the sentence \"{0}\"", sentId));
    }

    // Take the first candidate whose non-silence index matches.
    UnitCandidate match = null;
    foreach (UnitCandidate current in sentenceSet.Sentences[sentId].Candidates)
    {
        if (current.IndexOfNonSilence != indexOfNonSilence)
        {
            continue;
        }

        match = current;
        break;
    }

    if (match == null)
    {
        throw new InvalidDataException(Helper.NeutralFormat("Cannot find the candidate \"{0}:{1}\"", sentId, indexOfNonSilence));
    }

    // Only the central phoneme of the two labels is compared.
    Label parsedLabel = new Label { Text = label };
    if (match.Label.CentralPhoneme != parsedLabel.CentralPhoneme)
    {
        throw new InvalidDataException(
            Helper.NeutralFormat(
                "Mismatched full-context label, expected current phone \"{0}\" but \"{1}\"",
                match.Label.CentralPhoneme,
                parsedLabel.CentralPhoneme));
    }

    return match;
}
/// <summary>
/// Splits a master label file into one label file per sentence.
/// </summary>
/// <param name="mlfFileName">The name of the master label file to read.</param>
/// <param name="alignmentDir">The directory in which the per-sentence label files are written.</param>
/// <param name="alignOption">The alignment option passed to each sentence when it is saved.</param>
public static void ConvertMlfToLabelFiles(string mlfFileName, string alignmentDir, LabelAlignOptions alignOption)
{
    TrainingSentenceSet sentenceSet = new TrainingSentenceSet();
    sentenceSet.Load(mlfFileName);

    foreach (KeyValuePair<string, Sentence> entry in sentenceSet.Sentences)
    {
        // The output file name is the sentence id with the label-file extension appended.
        string fileName = FileExtensions.AppendExtensionName(entry.Key, FileExtensions.LabelFile);
        string targetPath = Path.Combine(alignmentDir, fileName);

        using (StreamWriter labelWriter = new StreamWriter(targetPath))
        {
            entry.Value.Save(labelWriter, LabelTypeOptions.FullContext, alignOption, true);
        }
    }
}
/// <summary>
/// Loads a master label file into a training sentence set and has that set write its file list.
/// </summary>
/// <param name="masterLabelFile">The file name of the master label file to load.</param>
/// <param name="corpusPath">The corpus path forwarded to the file list generation.</param>
/// <param name="fileListFile">The path of the file list to produce.</param>
/// <param name="extension">The corpus file extension forwarded to the file list generation.</param>
public static void GenerateFileListFile(string masterLabelFile, string corpusPath, string fileListFile, string extension)
{
    TrainingSentenceSet loadedSet = new TrainingSentenceSet();

    loadedSet.Load(masterLabelFile);

    // Delegates to the instance overload, which takes the remaining three arguments.
    loadedSet.GenerateFileListFile(corpusPath, fileListFile, extension);
}
/// <summary>
/// Adds the given sentences to the target set together with every not-yet-used sentence that shares a
/// full-context label with one of them, then recurses on the newly added sentences while the target
/// set is below the size limit and unused sentences remain.
/// </summary>
/// <param name="sentenceSet">The target sentence set that receives the sentences.</param>
/// <param name="currentSentences">Ids (keys only) of the sentences whose labels are examined in this pass.</param>
/// <param name="usedSentences">Ids (keys only) of the sentences already placed; updated by this method.</param>
/// <param name="fullContextLabeledSentenceIds">Map from full-context label to the ids of sentences containing it.</param>
/// <param name="subSetSize">Sentence count at which the recursion stops expanding the target set.</param>
private void TouchDuplicateLabels(
    TrainingSentenceSet sentenceSet,
    Dictionary<string, object> currentSentences,
    IDictionary<string, object> usedSentences,
    IDictionary<string, Collection<string>> fullContextLabeledSentenceIds,
    int subSetSize)
{
    // Sentences pulled in during this pass; they seed the next recursion level.
    Dictionary<string, object> pulledIn = new Dictionary<string, object>();

    foreach (string sentenceId in currentSentences.Keys)
    {
        Sentence source = _idKeyedSentences[sentenceId];

        // Seed sentences from the caller are not in the target set yet; recursion seeds already are.
        if (!sentenceSet._idKeyedSentences.ContainsKey(sentenceId))
        {
            sentenceSet._idKeyedSentences.Add(sentenceId, source);
            usedSentences.Add(sentenceId, null);
        }

        foreach (PhoneSegment segment in source.PhoneSegments)
        {
            Collection<string> sharingIds = fullContextLabeledSentenceIds[segment.FullContextLabel];
            foreach (string sharingId in sharingIds)
            {
                if (usedSentences.ContainsKey(sharingId))
                {
                    continue;
                }

                sentenceSet._idKeyedSentences.Add(sharingId, _idKeyedSentences[sharingId]);
                usedSentences.Add(sharingId, null);
                pulledIn.Add(sharingId, null);
            }
        }
    }

    // Stop when nothing new was added, the target set is full, or all sentences are used.
    if (pulledIn.Keys.Count <= 0)
    {
        return;
    }

    if (sentenceSet.Sentences.Count >= subSetSize)
    {
        return;
    }

    if (usedSentences.Count >= FileListMap.Map.Count)
    {
        return;
    }

    TouchDuplicateLabels(sentenceSet, pulledIn, usedSentences, fullContextLabeledSentenceIds, subSetSize);
}
/// <summary>
/// Splits the training set into several pieces, typically so that computation can run in parallel.
/// The number of pieces comes from CalculatePiecesNumber, and the target size of each piece is the
/// sentence count divided by the piece count, rounded up.
/// Sentences that share a full-context label are pulled into the same piece by TouchDuplicateLabels,
/// so a piece may end up holding more sentences than the target size.
/// </summary>
/// <param name="maxSentenceInPiece">Max sentence count in one piece, passed to CalculatePiecesNumber.</param>
/// <returns>Array of training sentence sets, one per piece; all share this set's FileListMap.</returns>
public TrainingSentenceSet[] LabelFreeSplit(int maxSentenceInPiece)
{
    int pieceCount = CalculatePiecesNumber(maxSentenceInPiece);
    int subSetSize = (int)Math.Ceiling((double)Sentences.Count / pieceCount);

    // The key is the full-context label, and the value is the collection of ids of sentences that contain it.
    Dictionary<string, Collection<string>> fullContextLabeledSentenceIds = new Dictionary<string, Collection<string>>();
    foreach (Sentence sentence in Sentences.Values)
    {
        foreach (PhoneSegment candidate in sentence.PhoneSegments)
        {
            string fullContextLabel = candidate.FullContextLabel;

            // Single lookup; the collection is created the first time a label is seen.
            Collection<string> sentenceIds;
            if (!fullContextLabeledSentenceIds.TryGetValue(fullContextLabel, out sentenceIds))
            {
                sentenceIds = new Collection<string>();
                fullContextLabeledSentenceIds.Add(fullContextLabel, sentenceIds);
            }

            sentenceIds.Add(candidate.Sentence.Id);
        }
    }

    TrainingSentenceSet[] trainingSets = new TrainingSentenceSet[pieceCount];
    Dictionary<string, object> currentSentences = new Dictionary<string, object>();
    Dictionary<string, object> usedSentences = new Dictionary<string, object>();

    // One enumerator is shared by all pieces so each piece resumes where the previous one stopped.
    // IEnumerator<T> is IDisposable, so it is released deterministically here.
    using (IEnumerator<KeyValuePair<string, Sentence>> mapSentenceEnum = _idKeyedSentences.GetEnumerator())
    {
        for (int pieceIndex = 0; pieceIndex < pieceCount; pieceIndex++)
        {
            trainingSets[pieceIndex] = new TrainingSentenceSet { FileListMap = FileListMap };

            while (trainingSets[pieceIndex].Sentences.Count < subSetSize && usedSentences.Count < Sentences.Count)
            {
                if (!mapSentenceEnum.MoveNext())
                {
                    break;
                }

                // Sentences already placed by an earlier label-sharing expansion are skipped.
                if (!usedSentences.ContainsKey(mapSentenceEnum.Current.Key))
                {
                    currentSentences.Clear();
                    currentSentences.Add(mapSentenceEnum.Current.Key, null);
                    TouchDuplicateLabels(trainingSets[pieceIndex], currentSentences, usedSentences, fullContextLabeledSentenceIds, subSetSize);
                }
            }
        }
    }

    return trainingSets;
}
/// <summary>
/// Copies the Id and GlobalId of each matching full-set candidate onto the candidates of a subset.
/// The full-set candidate is looked up by sentence id and candidate index.
/// </summary>
/// <param name="subset">Subset of training sentences whose candidates are tagged.</param>
/// <exception cref="InvalidDataException">Thrown when a subset candidate has no counterpart in this set.</exception>
public void TagSubsetIds(TrainingSentenceSet subset)
{
    foreach (Sentence sentence in subset.Sentences.Values)
    {
        foreach (UnitCandidate target in sentence.Candidates)
        {
            var source = GetCandidate(sentence.Id, target.Index);
            if (source == null)
            {
                throw new InvalidDataException("Invalid candidate info.");
            }

            target.Id = source.Id;
            target.GlobalId = source.GlobalId;
        }
    }
}