/// <summary>
/// Shifts each segment file listed under the source directory by the given
/// silence duration and saves the result under the target directory.
/// Files that already exist in the target directory are left untouched.
/// </summary>
/// <param name="silenceDuration">Silence duration in second.</param>
/// <param name="sourceDir">Source segment directory.</param>
/// <param name="targetDir">Target segment directory.</param>
/// <returns>Data error set found.</returns>
public static DataErrorSet ShiftSegmentFiles(float silenceDuration, string sourceDir, string targetDir)
{
    const string SegmentExtension = ".txt";

    DataErrorSet errors = new DataErrorSet();
    SegmentFile segments = new SegmentFile();
    Dictionary<string, string> fileMap =
        Microsoft.Tts.Offline.FileListMap.Build(sourceDir, SegmentExtension);

    foreach (KeyValuePair<string, string> entry in fileMap)
    {
        // Stays null until the source path is resolved, so an error raised
        // earlier is reported without a file path.
        string inputPath = null;

        try
        {
            string relativeName = entry.Value + SegmentExtension;
            string outputPath = Path.Combine(targetDir, relativeName);

            // Only produce outputs that are not there yet.
            if (!File.Exists(outputPath))
            {
                inputPath = Path.Combine(sourceDir, relativeName);
                Helper.EnsureFolderExistForFile(outputPath);

                segments.Load(inputPath);
                segments.Shift(silenceDuration);
                segments.Save(outputPath);
            }
        }
        catch (InvalidDataException ide)
        {
            // Record the failure against the sentence id and carry on.
            errors.Errors.Add(
                new DataError(inputPath, Helper.BuildExceptionMessage(ide), entry.Key));
        }
    }

    return errors;
}
/// <summary>
/// Builds an utterance from the words of a script item, inserting silence
/// words where the segment file marks a silence segment.
/// </summary>
/// <param name="item">Script item to convert; must not be null.</param>
/// <param name="segmentFile">
/// Segment file consulted for silence segments. May be null, in which case
/// no silence word is inserted.
/// </param>
/// <param name="buildAllWords">
/// True to append every script word as a normal word; false to append only
/// pronounceable normal words (plus punctuation when NeedPunctuation is set).
/// </param>
/// <param name="subSentenceIndex">
/// Index of the single sentence to convert, or -1 to convert all sentences.
/// </param>
/// <returns>The built utterance; the caller is responsible for disposing it.</returns>
/// <exception cref="InvalidDataException">
/// An EspException was raised while building the utterance.
/// </exception>
public TtsUtterance Build(ScriptItem item, SegmentFile segmentFile, bool buildAllWords, int subSentenceIndex)
{
    Helper.ThrowIfNull(item);

    TtsUtterance utterance = new TtsUtterance();

    // Set only once the utterance is completely built; on any failure path
    // the utterance is disposed in the finally block instead of leaking.
    bool built = false;
    int phoneIndex = 0;

    try
    {
        // Silence indicates a silence word.
        if (segmentFile != null && segmentFile.WaveSegments[phoneIndex].IsSilenceFeature)
        {
            phoneIndex += AppendSilenceWord(utterance, segmentFile.WaveSegments[phoneIndex].Label);
        }

        // Creates a words map for ToBI accent.
        Dictionary<ScriptWord, TtsWord> mapWords = new Dictionary<ScriptWord, TtsWord>();

        int sentenceIndex = 0;
        foreach (ScriptSentence scriptSentence in item.Sentences)
        {
            // Only add certain sentence in the scriptItem.
            // sentenceIndex advances only when a specific sentence was requested.
            if (subSentenceIndex != -1 && sentenceIndex++ != subSentenceIndex)
            {
                continue;
            }

            // Treats unknown sentence type as declarative.
            if (scriptSentence.SentenceType != SentenceType.Unknown)
            {
                utterance.SentenceType = (TtsSentenceType)scriptSentence.SentenceType;
            }
            else
            {
                utterance.SentenceType = (TtsSentenceType)SentenceType.Declarative;
            }

            utterance.SentenceEmotionType = (EmotionmlCategory)scriptSentence.Emotion;

            // Converts each word in script sentence.
            foreach (ScriptWord scriptWord in scriptSentence.Words)
            {
                if (buildAllWords || scriptWord.IsPronouncableNormalWord)
                {
                    phoneIndex += AppendNormalWord(utterance, scriptWord);

                    // Adds into words map.
                    mapWords.Add(scriptWord, utterance.Words[utterance.Words.Count - 1]);

                    // Breaks if meets the end of the utterance.
                    if (segmentFile != null && phoneIndex >= segmentFile.WaveSegments.Count)
                    {
                        break;
                    }

                    if (segmentFile != null && segmentFile.WaveSegments[phoneIndex].IsSilenceFeature)
                    {
                        phoneIndex += AppendSilenceWord(utterance, segmentFile.WaveSegments[phoneIndex].Label);
                    }
                }
                else if (buildAllWords || (NeedPunctuation && scriptWord.WordType == WordType.Punctuation))
                {
                    phoneIndex += AppendPunctuationWord(utterance, scriptWord);
                }
            }
        }

        // Builds phone list. A freshly allocated int array is already
        // zero-filled, so no explicit clearing is needed.
        int[] pauseDurations = new int[(int)TtsPauseLevel.PAU_IDX_SENTENCE + 1];
        utterance.BuildPhoneList(Phoneme, pauseDurations, 0, 0);

        // Builds ToBI accent, which must happen after the phone list is built.
        BuildToBIInformation(mapWords);

        // Builds phrase list.
        utterance.BuildPhraseList();

        // Builds character list.
        utterance.BuildContextCharacters();

        built = true;
        return utterance;
    }
    catch (EspException e)
    {
        throw new InvalidDataException(
            Helper.NeutralFormat("Build utterance error on sentence \"{0}\"", item.Id), e);
    }
    finally
    {
        if (!built)
        {
            utterance.Dispose();
        }
    }
}
/// <summary>
/// Extracts the features of the given script item and copies the segment
/// start/end times from the segmentation file onto the phone segments.
/// </summary>
/// <param name="item">
/// The script item.
/// </param>
/// <param name="segmentFile">
/// The segmentation file.
/// </param>
/// <returns>
/// The sentence contains all the features.
/// </returns>
/// <exception cref="InvalidDataException">
/// The segmentation file has fewer wave segments than the sentence has
/// phone segments.
/// </exception>
private Sentence Extract(ScriptItem item, SegmentFile segmentFile)
{
    UtteranceBuilder builder = new UtteranceBuilder(PhoneSet, PosSet, Phoneme)
    {
        NeedPos = NeedPos,
        NeedToBI = NeedToBI,
    };

    // Builds a utterance first.
    using (TtsUtterance utterance = builder.Build(item, segmentFile, false, -1))
    {
        // Extract ToneIndex if the language is zh-CN
        if (Language.ZhCN == (Language)PhoneSet.Language)
        {
            ChineseToneIndexExtractor.Process(utterance, item);
        }

        if (UtteranceExtenders != null)
        {
            // Uses the utterance extender here.
            foreach (IUtteranceExtender extender in UtteranceExtenders)
            {
                extender.Process(utterance, item);
            }
        }

        // Creates a sentence to store all the features.
        Sentence sentence = Extract(item.Id, utterance);

        // Phone segment i takes its timing from wave segment i, so the
        // segmentation file must provide at least as many segments as there
        // are phones. Report a shortfall as a data error rather than letting
        // the indexer fail with an out-of-range exception.
        int phoneCount = sentence.PhoneSegments.Count;
        if (phoneCount > 0 && phoneCount > segmentFile.WaveSegments.Count)
        {
            throw new InvalidDataException(string.Format(
                System.Globalization.CultureInfo.InvariantCulture,
                "Sentence \"{0}\" has {1} phone segments but its segmentation file only has {2} wave segments.",
                item.Id,
                phoneCount,
                segmentFile.WaveSegments.Count));
        }

        for (int i = 0; i < phoneCount; ++i)
        {
            // Copy the aligned time span of each phone.
            sentence.PhoneSegments[i].StartTimeInSecond = (float)segmentFile.WaveSegments[i].StartTime;
            sentence.PhoneSegments[i].EndTimeInSecond = (float)segmentFile.WaveSegments[i].EndTime;
        }

        return sentence;
    }
}
/// <summary>
/// Extracts features from the given script.
/// Sentences whose data is invalid are logged, removed from the script and
/// removed from the file list map.
/// </summary>
/// <param name="script">
/// The xml script file.
/// </param>
/// <param name="fileListMap">
/// The file list map.
/// </param>
/// <param name="alignmentDir">
/// The alignment directory.
/// </param>
/// <param name="waveDir">
/// The wave directory.
/// </param>
/// <returns>
/// The extracted features in training sentence set.
/// </returns>
/// <exception cref="ArgumentNullException">
/// Any of the arguments is null.
/// </exception>
public TrainingSentenceSet Extract(XmlScriptFile script, FileListMap fileListMap,
    string alignmentDir, string waveDir)
{
    if (script == null)
    {
        throw new ArgumentNullException("script");
    }

    if (fileListMap == null)
    {
        throw new ArgumentNullException("fileListMap");
    }

    if (alignmentDir == null)
    {
        throw new ArgumentNullException("alignmentDir");
    }

    if (waveDir == null)
    {
        throw new ArgumentNullException("waveDir");
    }

    TrainingSentenceSet sentenceSet = new TrainingSentenceSet { FileListMap = fileListMap };

    // Failed ids are collected and removed from the map after the loop,
    // because the map's keys are being enumerated.
    List<string> errList = new List<string>();

    foreach (string sid in fileListMap.Map.Keys)
    {
        ScriptItem item = script.ItemDic[sid];

        try
        {
            // Loads the segmentation file.
            SegmentFile segmentFile = new SegmentFile();
            segmentFile.Load(fileListMap.BuildPath(alignmentDir, sid, "txt"));

            // An empty segmentation has no last segment to patch below.
            // Treat it as invalid data so only this sentence is dropped,
            // instead of failing the whole run with an index error.
            if (segmentFile.WaveSegments.Count == 0)
            {
                throw new InvalidDataException(string.Format(
                    System.Globalization.CultureInfo.InvariantCulture,
                    "The segmentation file of sentence \"{0}\" contains no segment.",
                    sid));
            }

            // Loads the waveform file to set the end time of the last segmentation.
            WaveFile waveFile = new WaveFile();
            waveFile.Load(fileListMap.BuildPath(waveDir, sid, FileExtensions.Waveform));
            segmentFile.WaveSegments[segmentFile.WaveSegments.Count - 1].EndTime = waveFile.Duration;

            // Extracts the single script item.
            Sentence sentence = Extract(item, segmentFile);
            sentence.TrainingSet = sentenceSet;
            sentenceSet.Sentences.Add(sid, sentence);
        }
        catch (InvalidDataException e)
        {
            // Removes the error sentences; any other exception type propagates.
            Logger.Log(Helper.BuildExceptionMessage(e));
            script.Remove(sid);
            errList.Add(sid);
        }
    }

    fileListMap.RemoveItems(errList);

    return sentenceSet;
}
/// <summary>
/// Checks that a script item agrees with its segmentation file and records
/// every inconsistency found in the error set. Structural problems (empty
/// alignment, missing trailing silence, count mismatch) stop the check
/// after one error; label mismatches are reported per segment.
/// </summary>
/// <param name="script">Script file instance.</param>
/// <param name="item">Script item.</param>
/// <param name="fileMap">File list map.</param>
/// <param name="segmentDir">Segment file directory.</param>
/// <param name="errorSet">Data error set found.</param>
/// <param name="phoneBasedSegment">Phone based alignment or unit based alignment.</param>
public static void ValidateDataAlignment(ScriptFile script, ScriptItem item,
    FileListMap fileMap, string segmentDir, DataErrorSet errorSet, bool phoneBasedSegment)
{
    string alignmentPath = Path.Combine(segmentDir, fileMap.Map[item.Id] + ".txt");

    SegmentFile alignment = new SegmentFile();
    alignment.Load(alignmentPath);

    // Nothing to compare against.
    if (alignment.WaveSegments.Count == 0)
    {
        errorSet.Errors.Add(new DataError(
            alignmentPath,
            "There is no valid alignment data into alignment file.",
            item.Id));
        return;
    }

    // The alignment is required to end with a silence segment.
    if (!alignment.WaveSegments[alignment.WaveSegments.Count - 1].IsSilencePhone)
    {
        errorSet.Errors.Add(new DataError(
            alignmentPath,
            "The alignment file is invalid, for without silence segment at the end.",
            item.Id));
        return;
    }

    if (phoneBasedSegment)
    {
        // Phone counts must agree before labels can be compared one by one.
        if (item.GetPhones().Length != alignment.NonSilenceWaveSegments.Count)
        {
            string countError = string.Format(CultureInfo.InvariantCulture,
                "script phones {0} do not match with non-silence segments {1} in segmentation file.",
                item.GetPhones().Length, alignment.NonSilenceWaveSegments.Count);
            errorSet.Errors.Add(new DataError(script.FilePath, countError, item.Id));
            return;
        }

        string[] phones = item.GetPhones();
        for (int index = 0; index < alignment.NonSilenceWaveSegments.Count; index++)
        {
            WaveSegment current = alignment.NonSilenceWaveSegments[index];
            if (current.Label == phones[index])
            {
                continue;
            }

            string labelError = string.Format(CultureInfo.InvariantCulture,
                "phone [{0}/{1}] at {2} does not match between script and segment.",
                WaveSegment.FormatLabel(phones[index]), current.Label, index);
            errorSet.Errors.Add(new DataError(script.FilePath, labelError, item.Id));
        }
    }
    else
    {
        // Unit counts must agree before labels can be compared one by one.
        if (item.Units.Count != alignment.NonSilenceWaveSegments.Count)
        {
            string countError = string.Format(CultureInfo.InvariantCulture,
                "script units {0} do not match with non-silence segments {1} in segmentation file.",
                item.Units.Count, alignment.NonSilenceWaveSegments.Count);
            errorSet.Errors.Add(new DataError(script.FilePath, countError, item.Id));
            return;
        }

        for (int index = 0; index < alignment.NonSilenceWaveSegments.Count; index++)
        {
            WaveSegment current = alignment.NonSilenceWaveSegments[index];
            TtsUnit unit = item.Units[index];
            if (current.Label == WaveSegment.FormatLabel(unit.MetaUnit.Name))
            {
                continue;
            }

            string labelError = string.Format(CultureInfo.InvariantCulture,
                "units [{0}/{1}] at {2} do not match between script and segment.",
                WaveSegment.FormatLabel(unit.MetaUnit.Name), current.Label, index);
            errorSet.Errors.Add(new DataError(script.FilePath, labelError, item.Id));
        }
    }
}
/// <summary>
/// Extract acoustic features for a given sentence and write one line per
/// non-silence segment to the writer.
/// </summary>
/// <param name="writer">Stream writer to write acoustic features.</param>
/// <param name="script">Script file instance.</param>
/// <param name="sid">Sentence id.</param>
/// <param name="fileMap">File list map.</param>
/// <param name="segmentDir">Segmentation file directory.</param>
/// <param name="wave16kDir">16k Hz waveform file directory.</param>
/// <param name="epochDir">Epoch file directory.</param>
/// <exception cref="InvalidDataException">
/// The unit count differs between script and alignment, the adjusted epoch
/// range is inverted, or the alignment runs past the end of the waveform.
/// </exception>
private static void ExtractAcoustic(StreamWriter writer, ScriptFile script, string sid,
    FileListMap fileMap, string segmentDir, string wave16kDir, string epochDir)
{
    ScriptItem scriptItem = script.Items[sid];

    // find the absolute file paths for each kind data file
    string wave16kFilePath = Path.Combine(wave16kDir, fileMap.Map[scriptItem.Id] + ".wav");
    string epochFilePath = Path.Combine(epochDir, fileMap.Map[scriptItem.Id] + ".epoch");
    string segmentFilePath = Path.Combine(segmentDir, fileMap.Map[scriptItem.Id] + ".txt");

    // load data files
    SegmentFile segFile = new SegmentFile();
    segFile.Load(segmentFilePath);

    EggAcousticFeature eggFile = new EggAcousticFeature();
    eggFile.LoadEpoch(epochFilePath);

    WaveAcousticFeature waveFile = new WaveAcousticFeature();
    waveFile.Load(wave16kFilePath);

    // calculate acoustic features for each segments in the files
    int totalCount = segFile.NonSilenceWaveSegments.Count;
    if (scriptItem.Units.Count != totalCount)
    {
        string str1 = "Unit number mis-matched between sentence [{0}] in ";
        string str2 = "script file [{1}] and in the alignment file [{2}]. ";
        string str3 = "There are {3} units in script but {4} units in alignment.";
        string message = string.Format(CultureInfo.InvariantCulture,
            str1 + str2 + str3,
            sid, script.FilePath, segmentFilePath,
            scriptItem.Units.Count, totalCount);
        throw new InvalidDataException(message);
    }

    for (int i = 0; i < totalCount; i++)
    {
        // for each wave segment
        WaveSegment ws = segFile.NonSilenceWaveSegments[i];

        // get unit sample scope
        int sampleOffset = (int)(ws.StartTime * waveFile.SamplesPerSecond);
        int sampleLength = (int)(ws.Duration * waveFile.SamplesPerSecond);
        int sampleEnd = sampleOffset + sampleLength;

        // calculate average pitch and pitch range on the unadjusted scope
        float averagePitch, pitchRange;
        eggFile.GetPitchAndRange(sampleOffset,
            sampleLength, out averagePitch, out pitchRange);
        ws.AveragePitch = averagePitch;
        ws.PitchRange = pitchRange;

        // calculate root mean square, and before that adjust the segment
        // alignment with the epoch data; AdjustAlignment takes the sample
        // positions by ref and may move them
        int epochOffset = eggFile.AdjustAlignment(ref sampleOffset);
        int epochEnd = eggFile.AdjustAlignment(ref sampleEnd);

        if (epochOffset > epochEnd)
        {
            string info = string.Format(CultureInfo.InvariantCulture,
                "epochOffset[{0}] should not be bigger than epochEnd[{1}]",
                epochOffset, epochEnd);
            throw new InvalidDataException(info);
        }

        if (sampleEnd > waveFile.SampleNumber)
        {
            string str1 = "Mis-match found between alignment file [{0}] and waveform file [{1}], ";
            string str2 = "for the end sample of alignment is [{2}] but";
            string str3 = " the total sample number of waveform file is [{3}].";

            // Report the end sample that was actually compared above,
            // not the epoch index.
            string info = string.Format(CultureInfo.InvariantCulture,
                str1 + str2 + str3,
                segmentFilePath, wave16kFilePath,
                sampleEnd, waveFile.SampleNumber);

            throw new InvalidDataException(info);
        }

        ws.RootMeanSquare = waveFile.CalculateRms(sampleOffset, sampleEnd - sampleOffset);

        // calculate epoch
        int epoch16KCompressLength = EpochFile.CompressEpoch(eggFile.Epoch,
            epochOffset, epochEnd - epochOffset, null);
        int epoch8KCompressLength = EpochFile.CompressEpoch(eggFile.Epoch8k,
            epochOffset, epochEnd - epochOffset, null);

        // leave (epoch offset in sentence) (epoch length)
        // (16k compressed epoch length) (8k compressed epoch length) as zero
        string message = string.Format(CultureInfo.InvariantCulture,
            "{0,12} {1,3} {2,9:0.000000} {3,9:0.000000} {4,7} {5,5} {6,4} {7,3} {8,3} {9,3} {10,7:0.0} {11,5:0.0} {12,4:0.0} {13}",
            scriptItem.Id, i,
            ws.StartTime, ws.Duration, sampleOffset, sampleEnd - sampleOffset,
            epochOffset, epochEnd - epochOffset,
            epoch16KCompressLength, epoch8KCompressLength,
            ws.RootMeanSquare, ws.AveragePitch, ws.PitchRange,
            scriptItem.Units[i].FullName);

        writer.WriteLine(message);
    }
}
/// <summary>
/// Validates an alignment file: it must exist, contain at least one
/// segment, and end with a silence segment. Problems are appended to the
/// string builder rather than thrown.
/// </summary>
/// <param name="alignmentFile">Alignment file to validate.</param>
/// <param name="builder">String builder for error message.</param>
/// <returns>
/// The sample position of the last silence alignment, or 0 when the file
/// failed validation.
/// </returns>
private static int ValidateAlignmentFile(string alignmentFile, StringBuilder builder)
{
    // Factor used to turn the segment start time into a sample position.
    const int SamplesPerSecond = 16000;

    // Validate alignment file existence
    if (!File.Exists(alignmentFile))
    {
        builder.AppendFormat(CultureInfo.InvariantCulture,
            "Alignment file [{0}] does not exist.", alignmentFile);
        return 0;
    }

    SegmentFile segFile = new SegmentFile();
    segFile.Load(alignmentFile);

    // Without this check an empty file would index WaveSegments at -1.
    if (segFile.WaveSegments.Count == 0)
    {
        builder.AppendFormat(CultureInfo.InvariantCulture,
            "Alignment file [{0}] contains no segment.", alignmentFile);
        return 0;
    }

    WaveSegment lastSeg = segFile.WaveSegments[segFile.WaveSegments.Count - 1];
    if (!lastSeg.IsSilencePhone)
    {
        builder.AppendFormat(CultureInfo.InvariantCulture,
            "The ending segment of alignment file [{0}] is not silence.", alignmentFile);
        return 0;
    }

    // the last one should be silence of the segment file
    return (int)(lastSeg.StartTime * SamplesPerSecond);
}
/// <summary>
/// Reads an MLF file and returns one segment file per entry.
/// The first line must be the "#!MLF!#" header; each entry starts with a
/// path line from which the file name (text between the last '/' and the
/// following '.') is taken, and the entry body is then loaded from the
/// same reader.
/// </summary>
/// <param name="filePath">MLF file path.</param>
/// <returns>Segment file dictionary, indexed by sentence id.</returns>
/// <exception cref="InvalidDataException">
/// The header is missing or an entry path line has an unexpected format.
/// </exception>
public static Dictionary<string, SegmentFile> ReadAllDataFromMlf(string filePath)
{
    const string MlfHeader = "#!MLF!#";

    Dictionary<string, SegmentFile> segmentFiles = new Dictionary<string, SegmentFile>();

    using (StreamReader reader = new StreamReader(filePath))
    {
        if (reader.ReadLine() != MlfHeader)
        {
            throw new InvalidDataException("Invalid file header " + filePath);
        }

        for (string entryLine = reader.ReadLine(); entryLine != null; entryLine = reader.ReadLine())
        {
            // The entry line should be the sentence file path.
            Match nameMatch = Regex.Match(entryLine, @".*/(\S*)\.");
            if (!nameMatch.Success)
            {
                throw new InvalidDataException(
                    "Invalid format in file " + filePath + ", line " + entryLine);
            }

            SegmentFile entry = new SegmentFile { FilePath = nameMatch.Groups[1].Value };

            // Load reads the entry body from the shared reader, so the next
            // ReadLine above continues after it.
            entry.Load(reader);
            segmentFiles.Add(entry.Id, entry);
        }
    }

    return segmentFiles;
}
/// <summary>
/// Loads an alignment file, reporting a missing file or invalid content
/// through the string builder instead of throwing.
/// </summary>
/// <param name="alignmentFile">Alignment file to validate.</param>
/// <param name="builder">String builder for error message.</param>
/// <returns>
/// The segment file instance; it is returned even when loading failed, in
/// which case an error message has been appended to the builder.
/// </returns>
private static SegmentFile ValidateAlignmentFile(string alignmentFile, StringBuilder builder)
{
    SegmentFile loaded = new SegmentFile();

    try
    {
        loaded.Load(alignmentFile);
    }
    catch (InvalidDataException invalidData)
    {
        // The content could not be parsed; pass the parser's message on.
        builder.Append(Helper.BuildExceptionMessage(invalidData));
    }
    catch (FileNotFoundException)
    {
        string missing = string.Format(CultureInfo.InvariantCulture,
            "Alignment file [{0}] does not exist.", alignmentFile);
        builder.Append(missing);
    }

    return loaded;
}
/// <summary>
/// Gets the last silence align of the given segment file, i.e. the start
/// time of its final segment converted to a sample position.
/// </summary>
/// <param name="segmentFile">The given segment file.</param>
/// <param name="builder">The string builder for error message.</param>
/// <returns>
/// The last silence align, or 0 when the segment file has no segments or
/// its final segment is not silence (an error is appended to the builder).
/// </returns>
private static int GetLastSilenceAlign(SegmentFile segmentFile, StringBuilder builder)
{
    // Factor used to turn the segment start time into a sample position.
    const int SamplesPerSecond = 16000;

    // A segment file that was never loaded successfully has no segments;
    // without this check the indexer below would be called with -1.
    if (segmentFile.WaveSegments.Count == 0)
    {
        builder.AppendFormat(CultureInfo.InvariantCulture,
            "Alignment file [{0}] contains no segment.", segmentFile.FilePath);
        return 0;
    }

    WaveSegment lastSeg = segmentFile.WaveSegments[segmentFile.WaveSegments.Count - 1];
    if (!lastSeg.IsSilencePhone)
    {
        builder.AppendFormat(CultureInfo.InvariantCulture,
            "The ending segment of alignment file [{0}] is not silence.", segmentFile.FilePath);
        return 0;
    }

    // The last one should be silence of the segment file
    return (int)(lastSeg.StartTime * SamplesPerSecond);
}