/// <summary>
/// Adds a sentence into wave inventory.
/// </summary>
/// <remarks>
/// Wave data is written for every candidate of the sentence, whether or not it has a valid id;
/// only candidates whose id differs from <c>UnitCandidate.InvalidId</c> are added to the indexing file.
/// </remarks>
/// <param name="sentence">The given sentence.</param>
/// <param name="waveFile">The corresponding wave form file.</param>
/// <exception cref="InvalidDataException">
/// Thrown when the sample length of an indexed candidate exceeds <see cref="ushort.MaxValue"/>, or when
/// the byte position computed from the candidate's frame index differs from the current write position
/// of the inventory.
/// </exception>
private void Add(Sentence sentence, WaveFile waveFile)
{
    Debug.Assert(
        waveFile.Format.SamplesPerSecond == _header.SamplesPerSecond &&
        waveFile.Format.Channels == 1 &&
        waveFile.Format.FormatTag == WaveFormatTag.Pcm,
        "Only supports source waveform with single channel, PCM and same sampling rate.");

    // Design note: the original design did not save the wave data of pruned candidates, but that introduces
    // a bug with the current frame shifting design. All wave data is therefore saved into the inventory file,
    // which increases the .WVE data size by about 30%. That is acceptable for M1. More candidates will be
    // pruned in M2, so the wave inventory creation module needs a refactor to keep the disk size minimal
    // without reintroducing the bug.
    int firstValidIndex = sentence.Candidates.Count;
    for (int candIdx = 0; candIdx < sentence.Candidates.Count; candIdx++)
    {
        UnitCandidate candidate = sentence.Candidates[candIdx];

        // Convert the candidate's time span into a sample offset/length, rounding to the nearest sample.
        int waveSampleOffsetInSentence = (int)((candidate.StartTimeInSecond * waveFile.Format.SamplesPerSecond) + 0.5f);
        int waveSampleLength = (int)(((candidate.EndTimeInSecond - candidate.StartTimeInSecond) * waveFile.Format.SamplesPerSecond) + 0.5f);

        if (candidate.Id == UnitCandidate.InvalidId)
        {
            // Candidates without a valid id are not indexed, but their samples are still written so that
            // the positions of the following candidates stay aligned with their frame indexes.
            WriteIntoInventory(ConvertsWaveDataFormat(waveFile, waveSampleOffsetInSentence, waveSampleLength));
            continue;
        }

        if (waveSampleLength > ushort.MaxValue)
        {
            throw new InvalidDataException(Helper.NeutralFormat(
                "The wave sample length of {0}-th candidate in file {1}.wav overflows.", candIdx, sentence.Id));
        }

        WaveCandidateInfo candidateInfo = new WaveCandidateInfo
        {
            Name = candidate.Name,
            Id = candidate.Id,
            GlobalId = candidate.GlobalId,
            SentenceId = candidate.Sentence.Id,
            IndexOfNonSilence = (ushort)candidate.IndexOfNonSilence,
            FrameIndexInSentence = (ushort)candidate.StartFrame,
            FrameNumber = (ushort)(candidate.EndFrame - candidate.StartFrame),
            FrameIndex = (uint)(sentence.GlobalFrameIndex + candidate.StartFrame),
        };

        if (firstValidIndex > candIdx && _indexingFile.SamplePerFrame == 0)
        {
            // First indexed candidate seen while the samples-per-frame value is still unset:
            // derive it from this candidate (skipped when the candidate spans zero frames).
            firstValidIndex = candIdx;
            if (candidateInfo.FrameNumber != 0)
            {
                _indexingFile.SamplePerFrame = (uint)(waveSampleLength / candidateInfo.FrameNumber);
            }
        }
        else
        {
            // Every other candidate is expected to yield the same samples-per-frame value.
            if (candidateInfo.FrameNumber != 0)
            {
                Debug.Assert(_indexingFile.SamplePerFrame == (uint)(waveSampleLength / candidateInfo.FrameNumber));
            }
        }

        // calc left/right extensible margin, shift at most 1 units to ensure less than 1 unit.
        int leftMarginUnitIdx = Math.Max(0, candIdx - 1);
        int rightMarginUnitIdx = Math.Min(candIdx + 1, sentence.Candidates.Count - 1);
        int leftMarginFrame = candidate.StartFrame - sentence.Candidates[leftMarginUnitIdx].StartFrame;
        int rightMarginFrame = sentence.Candidates[rightMarginUnitIdx].EndFrame - candidate.EndFrame;
        Debug.Assert(leftMarginFrame >= 0 && rightMarginFrame >= 0);
        candidateInfo.LeftMarginInFrame = (byte)Math.Min(leftMarginFrame, MaxMarginInFrame);
        candidateInfo.RightMarginInFrame = (byte)Math.Min(rightMarginFrame, MaxMarginInFrame);

        // Writes the current candidate, throw exception if unit index alignment is inconsistent with wave inventory.
        // The first operand is widened to 64-bit so that the whole product is evaluated as long; otherwise,
        // if all operands are 32-bit, the byte offset could wrap around before being stored into the long.
        // NOTE(review): (SamplesPerSecond / 1000) presumably is an integer division and would truncate for
        // sampling rates that are not a multiple of 1000 - confirm against the supported formats.
        long candidatePosition = (long)candidateInfo.FrameIndex * // frame
            _millisecondPerFrame * // convert frame to millisecond
            (waveFile.Format.SamplesPerSecond / 1000) * // get samples per milliseconds (1s == 1000ms), convert millisecond to sample
            _header.BytesPerSample; // convert sample to byte
        long wavePosition = _writer.BaseStream.Position - _dataOffset;
        if (candidatePosition != wavePosition)
        {
            throw new InvalidDataException(Helper.NeutralFormat(
                "Frame {0} in sentence {1} starts at {2}, which is inconsistent with position in wave inventory {3}.\r\nPossible cause: bad MLF alignment.",
                candidateInfo.FrameIndexInSentence,
                candidateInfo.SentenceId,
                candidateInfo.FrameIndex,
                wavePosition));
        }

        WriteIntoInventory(ConvertsWaveDataFormat(waveFile, waveSampleOffsetInSentence, waveSampleLength));
        _indexingFile.Add(candidateInfo);
    }
}
/// <summary>
/// Loads the given wave form file and adds the sentence into wave inventory.
/// </summary>
/// <param name="sentence">The given sentence.</param>
/// <param name="waveFileName">The corresponding wave form file name.</param>
/// <exception cref="NotSupportedException">
/// Thrown when the loaded waveform does not have the sampling rate of the inventory header,
/// is not single channel, or is not PCM.
/// </exception>
/// <exception cref="InvalidDataException">
/// Thrown when adding the sentence fails with an <see cref="InvalidDataException"/>; the original
/// exception is kept as the inner exception and the message names the wave file.
/// </exception>
public void Add(Sentence sentence, string waveFileName)
{
    WaveFile loadedWave = new WaveFile();
    loadedWave.Load(waveFileName);

    // Accept only waveforms matching the inventory: same sampling rate, mono, PCM.
    bool isSupportedFormat =
        loadedWave.Format.SamplesPerSecond == _header.SamplesPerSecond &&
        loadedWave.Format.Channels == 1 &&
        loadedWave.Format.FormatTag == WaveFormatTag.Pcm;
    if (!isSupportedFormat)
    {
        string unsupportedMessage = Helper.NeutralFormat(
            "The waveform format of file [{0}] is not supported.", waveFileName);
        throw new NotSupportedException(unsupportedMessage);
    }

    try
    {
        Add(sentence, loadedWave);
    }
    catch (InvalidDataException innerError)
    {
        // Re-wrap so that the failing wave file is reported to the caller.
        string failureMessage = Helper.NeutralFormat("It fails to process the file [{0}].", waveFileName);
        throw new InvalidDataException(failureMessage, innerError);
    }
}
/// <summary>
/// Loads full-context label or mono align label from master label file.
/// </summary>
/// <remarks>
/// The first line of the file must equal <c>MasterLabelFileHeader</c>. Each following section starts
/// with a sentence id line matched by <c>RegexOfSentId</c>; the rest of the section is consumed by
/// <c>Sentence.Load</c>. When a sentence id is already known, the section is loaded into the
/// existing sentence; otherwise a new sentence is created and registered after it has been loaded.
/// </remarks>
/// <param name="masterLabelFile">The file name of the master label file.</param>
/// <exception cref="InvalidDataException">
/// Thrown when the header line is missing, when a sentence id line is expected but not found,
/// or when loading a sentence reports that no sentence end was found.
/// </exception>
public void Load(string masterLabelFile)
{
    using (StreamReader reader = new StreamReader(masterLabelFile))
    {
        // This is the header of master label file.
        string line = reader.ReadLine();
        if (line != MasterLabelFileHeader)
        {
            throw new InvalidDataException(Helper.NeutralFormat("Master label file header expected, but input \"{0}\"", line));
        }

        while (!reader.EndOfStream)
        {
            // Read the line for sentence id.
            line = reader.ReadLine();
            Match match = RegexOfSentId.Match(line);
            if (!match.Success)
            {
                throw new InvalidDataException(Helper.NeutralFormat("Sentence id expected, but input \"{0}\"", line));
            }

            string sentId = match.Groups[1].Value;
            bool endOfSentenceExist;
            Sentence sentence;

            // Single lookup instead of ContainsKey followed by the indexer.
            if (_idKeyedSentences.TryGetValue(sentId, out sentence))
            {
                // Not the first time to load the sentence.
                endOfSentenceExist = sentence.Load(reader);
            }
            else
            {
                // Load the sentence in first time; it is registered only after loading returned.
                sentence = new Sentence { Id = sentId, TrainingSet = this };
                endOfSentenceExist = sentence.Load(reader);
                _idKeyedSentences[sentId] = sentence;
            }

            if (!endOfSentenceExist)
            {
                throw new InvalidDataException("Sentence end is expected");
            }
        }
    }
}
/// <summary>
/// Extracts the features of the given utterance and packs them into a sentence.
/// </summary>
/// <remarks>
/// One phone segment is created per entry of the first extracted vector, so the extraction result
/// must contain at least one vector. Every extracted value is also passed to the matching entry of
/// <c>FeatureValueRecords</c>.
/// </remarks>
/// <param name="sentId">
/// Sentence id, used as the id of the returned sentence and in error messages.
/// </param>
/// <param name="utterance">
/// Service Provider utterance object.
/// </param>
/// <returns>
/// The sentence contains all the features.
/// </returns>
/// <exception cref="InvalidDataException">
/// Thrown when the extraction engine fails, when the number of extracted vectors differs from the
/// number of feature metas, or when a vector length differs from the number of phones.
/// </exception>
public Sentence Extract(string sentId, TtsUtterance utterance)
{
    List<FeatureVector> vectors;
    try
    {
        // Then, extracts the features.
        vectors = ExtractionEngine.Extract(utterance, FeatureMetas);
    }
    catch (EspException e)
    {
        throw new InvalidDataException(Helper.NeutralFormat("Extract feature error on sentence \"{0}\"", sentId), e);
    }

    // One vector per feature meta is expected.
    if (vectors.Count != FeatureMetas.Count)
    {
        throw new InvalidDataException(
            Helper.NeutralFormat("Length of result is mismatch on sentence \"{0}\"", sentId));
    }

    // Each vector must hold one value per phone of the utterance.
    foreach (FeatureVector vector in vectors)
    {
        if (vector.Count != utterance.Phones.Count)
        {
            throw new InvalidDataException(
                Helper.NeutralFormat("Length of vector is mismatch on sentence \"{0}\"", sentId));
        }
    }

    // Creates a sentence to store all the features.
    Sentence result = new Sentence { Id = sentId };
    int phoneCount = vectors[0].Count;
    for (int phoneIndex = 0; phoneIndex < phoneCount; ++phoneIndex)
    {
        // Create the segment for this phoneme; its features skip the mandatory leading ones.
        PhoneSegment segment = new PhoneSegment
        {
            Sentence = result,
            Index = phoneIndex,
            Features = vectors.Select(v => v[phoneIndex])
                .Skip(LabelFeatureNameSet.MandatoryFeatureNames.Length).ToArray(),
        };

        // Create the label to store the features.
        Label segmentLabel = new Label(FeatureNameSet);
        for (int featureIndex = 0; featureIndex < vectors.Count; ++featureIndex)
        {
            var featureValue = vectors[featureIndex][phoneIndex];

            if (featureValue.ValueType == FeatureValueType.FEATURE_VALUE_TYPE_UNKOWN)
            {
                // Unknown values are stored as "not applicable".
                segmentLabel.SetFeatureValue(FeatureNameSet.FeatureNames[featureIndex], Label.NotApplicableFeatureValue);
            }
            else if (FeatureMetas[featureIndex].Property == TtsFeatureProperty.TTS_FEATURE_PROPERTY_PHONE_ID)
            {
                // Phone ids are stored as the HTK form of the phone name.
                Phone phone = PhoneSet.GetPhone(featureValue.IntValue);
                segmentLabel.SetFeatureValue(FeatureNameSet.FeatureNames[featureIndex], Offline.Phoneme.ToHtk(phone.Name));
            }
            else
            {
                // Everything else is stored as a culture-invariant integer string.
                segmentLabel.SetFeatureValue(
                    FeatureNameSet.FeatureNames[featureIndex],
                    featureValue.IntValue.ToString(CultureInfo.InvariantCulture));
            }

            // Updates the corresponding value records.
            FeatureValueRecords[featureIndex].Update(featureValue);
        }

        segment.Label = segmentLabel;
        result.PhoneSegments.Add(segment);
    }

    return result;
}
/// <summary>
/// Converts label files to master label file.
/// </summary>
/// <remarks>
/// Every file in the alignment directory whose extension is <c>FileExtensions.LabelFile</c> is loaded
/// as one sentence, keyed by its file name without extension. The resulting set is saved with the
/// full-context label type and state alignment options.
/// </remarks>
/// <param name="alignmentDir">The directory of alignment files.</param>
/// <param name="mlfFileName">The name of target master label file.</param>
/// <exception cref="InvalidDataException">
/// Thrown when loading a label file returns true, which is reported as an unexpected sentence end.
/// </exception>
public static void ConvertLabelFilesToMlf(string alignmentDir, string mlfFileName)
{
    TrainingSentenceSet sentenceSet = new TrainingSentenceSet();

    string searchPattern = Helper.NeutralFormat("*.{0}", FileExtensions.LabelFile);
    string[] labelFilePaths = Directory.GetFiles(alignmentDir, searchPattern);
    foreach (string labelFilePath in labelFilePaths)
    {
        Sentence loadedSentence = new Sentence();
        using (StreamReader labelReader = new StreamReader(labelFilePath))
        {
            bool sentenceEndFound = loadedSentence.Load(labelReader);
            if (sentenceEndFound)
            {
                throw new InvalidDataException("Sentence end is not expected");
            }
        }

        // The file name without extension serves as the sentence key.
        string sentenceKey = Path.GetFileNameWithoutExtension(labelFilePath);
        sentenceSet.Sentences.Add(sentenceKey, loadedSentence);
    }

    sentenceSet.Save(mlfFileName, LabelTypeOptions.FullContext, LabelAlignOptions.StateAlign, true);
}