/// <summary>
/// Estimates the phoneme-stops in a specified word. Each phoneme gets
/// a weight that will be interpreted as its duration within the word.
/// </summary>
/// <param name="ar"><see cref="OrthographicResult"/></param>
static void phStops(OrthographicResult ar)
{
	//logfile.Log("phStops()");

	var stops = new List <decimal>();
	decimal tally = 0;
	foreach (var phon in ar.Phons)
	{
		switch (phon)
		{
			// curious where 0100010 got these - intuition perhaps.
			// TODO: French etc.

			case "aa": case "ae": case "ah": case "ax": case "ay":
			case "b":  case "eh": case "l":  case "r":  case "w":
				stops.Add(tally += 5);
				break;

			case "ao": case "aw": case "er": case "ey":
			case "ow": case "oy": case "uh": case "uw":
				stops.Add(tally += 6);
				break;

			default:
				stops.Add(tally += 3);
				break;
		}
	}

	if (stops.Count != 0)
	{
		decimal factor = (ar.Stop - ar.Start) / tally;

		int i = 0;
		for (; i != stops.Count - 1; ++i)
		{
			ar.phStops.Add(OrthographicResult.Truncate(stops[i] * factor) + ar.Start);
		}
		ar.phStops.Add(ar.Stop); // ensure the final phoneme-stop IS the word-stop.
	}
}
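// Worked example for phStops() above (the word, phonemes, and times are
// hypothetical, and it assumes Truncate() leaves these exact values unchanged):
// a word with Phons { "hh", "ah", "l", "ow" } gets weights 3 + 5 + 5 + 6, so
// tally = 19 and the cumulative stops are 3, 8, 13, 19. For Start = 0.00 and
// Stop = 0.38 the factor is 0.38 / 19 = 0.02, giving phStops of 0.06, 0.16,
// 0.26, and finally 0.38 (the last stop is forced to the word-stop).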
/// <summary>
/// Loads a table-file of words/phonemes with their start/stop times into
/// '_ars_alt' and applies it with AlternateData().
/// </summary>
/// <param name="pfe">path of the table-file to read</param>
void LoadTable(string pfe)
{
	_ars_alt = new List <OrthographicResult>();

	using (var fs = new FileStream(pfe, FileMode.Open, FileAccess.Read, FileShare.Read))
	{
		var sr = new StreamReader(fs, Encoding.ASCII);

		OrthographicResult ar = null;

		string l;
		while ((l = sr.ReadLine()) != null && l != String.Empty)
		{
			// TODO: WARNING Do error checks ->

			string[] a = l.Split(DELI);
			if (Utility.isWordstart(a[0]))
			{
				if (ar != null)
				{
					_ars_alt.Add(ar);
				}

				ar = new OrthographicResult();
				ar.Orthography = String.Empty;
				ar.Confi = 0f;
				ar.Level = String.Empty;
				ar.Phons = new List <string>();
				ar.Phons.Add(a[1]);
				ar.Start = Decimal.Parse(a[2]);
				ar.Stop  = Decimal.Parse(a[3]);
				ar.phStops.Add(Decimal.Parse(a[3]));
			}
			else
			{
				ar.Phons.Add(a[1]);
				ar.phStops.Add(Decimal.Parse(a[3]));
			}
		}

		if (ar != null)
		{
			_ars_alt.Add(ar);
		}

		sr.Close();
	}

	if (_ars_alt.Count != 0)
	{
		AlternateData();
	}
}
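// Table layout read by LoadTable() above, reconstructed from the parse (the
// separator is whatever 'DELI' holds). Each non-empty line carries four fields:
//   a[0] word-start marker (tested by Utility.isWordstart)
//   a[1] phoneme
//   a[2] start time (parsed only on a word-start line)
//   a[3] stop time  (added to phStops on every line; the word-start line's
//        value also seeds ar.Stop)
// ie. a word-start line followed by one line per remaining phoneme of the word.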
/// <summary>
/// Inserts any required silences in the phrase and estimates the
/// phoneme-stops in each word.
/// </summary>
void Orthography()
{
#if DEBUG
	logfile.Log();
	logfile.Log("Orthography() _generato= " + _generato);
#endif
	List <OrthographicResult> ars = null;
	switch (_generato)
	{
		case Generator.Dictati: ars = _ars_def; break;
		case Generator.Dialogi: ars = _ars_enh; break;
	}

	OrthographicResult ar;

	decimal stop = 0;
	for (int i = 0; i != ars.Count; ++i)
	{
		if ((ar = ars[i]).Start > stop) // TODO: use a tolerance Eg. 10..15 millisec
		{
#if DEBUG
			logfile.Log(". . insert silence");
#endif
			OrthographicResult sil = CreateSilence();
			sil.Start = stop;
			sil.phStops.Add(sil.Stop = ar.Start);

			ars.Insert(i, sil);
			++i;
		}
#if DEBUG
		logfile.Log(". ar.Orthography= " + ar.Orthography);

		string phons = String.Empty;
		foreach (var phon in ar.Phons)
		{
			if (phons != String.Empty) phons += " ";
			phons += phon;
		}
		logfile.Log(". ar.Phons= " + phons);
#endif
		phStops(ar);

		stop = ar.Stop;
	}
}
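// Silence-insertion example for Orthography() above (times hypothetical): if
// one word stops at 1.20 and the next word starts at 1.55, a silence
// OrthographicResult spanning 1.20 .. 1.55 is inserted between them; a word
// whose start does not exceed the previous stop gets no silence.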
/// <summary>
/// Creates a silence.
/// </summary>
/// <returns>an <see cref="OrthographicResult"/> w/ blank Orthography</returns>
static OrthographicResult CreateSilence()
{
	var sil = new OrthographicResult();
	sil.Orthography = String.Empty;
	sil.Phons = new List <string>();
	sil.Phons.Add(StaticData.SIL);
	sil.Confi = 1f;
	sil.Level = String.Empty;

	return sil;
}
/// <summary>
/// Builds a list of OrthographicResults from the edited DataTable.
/// </summary>
/// <param name="sender"></param>
/// <param name="e"></param>
void click_Accept(object sender, EventArgs e)
{
#if DEBUG
	logfile.Log();
	logfile.Log("click_Accept()");
#endif
	// TODO: only if changed

	_f._ars_alt = new List <OrthographicResult>();

	OrthographicResult result;
	bool decr;

	for (int r = 0; r != _dt.Rows.Count; ++r)
	{
		string pos = _dt.Rows[r][0] as String; // pos
#if DEBUG
		logfile.Log(". _dt.Rows[" + r + "][0]= " + _dt.Rows[r][0]);
#endif
		if (Utility.isWordstart(pos))
		{
			decr = false;

			result = new OrthographicResult();
			result.Orthography = String.Empty;
			result.Confi = 0f;
			result.Level = String.Empty;
			result.Phons = new List <string>();
#if DEBUG
			logfile.Log(". . _dt.Rows[" + r + "][1]= " + _dt.Rows[r][1]);
			logfile.Log(". . _dt.Rows[" + r + "][2]= " + _dt.Rows[r][2]);
			logfile.Log(". . _dt.Rows[" + r + "][3]= " + _dt.Rows[r][3]);
#endif
			result.Phons.Add(_dt.Rows[r][1] as String);                   // phon
			result.Start = Decimal.Parse(_dt.Rows[r][2].ToString());      // start
			result.phStops.Add(Decimal.Parse(_dt.Rows[r][3].ToString())); // stop - 1st phon

			if (r != _dt.Rows.Count - 1)
			{
				decr = true;

				pos = _dt.Rows[++r][0] as String;
				while (!Utility.isWordstart(pos))
				{
#if DEBUG
					logfile.Log(". . . _dt.Rows[" + r + "][1]= " + _dt.Rows[r][1]);
					logfile.Log(". . . _dt.Rows[" + r + "][3]= " + _dt.Rows[r][3]);
#endif
					result.Phons.Add(_dt.Rows[r][1] as String);                   // phon - 2+
					result.phStops.Add(Decimal.Parse(_dt.Rows[r][3].ToString())); // stop - 2+

					if (r == _dt.Rows.Count - 1)
					{
						break;
					}
					pos = _dt.Rows[++r][0] as String;
				}
			}
#if DEBUG
			logfile.Log(". . _dt.Rows[" + r + "][3]= " + _dt.Rows[r][3]);
#endif
			result.Stop = Decimal.Parse(_dt.Rows[r][3].ToString()); // stop - word

			_f._ars_alt.Add(result);

			if (decr)
			{
				--r;
			}
		}
	}

	if (_f._ars_alt.Count != 0)
	{
		_f.AlternateData();
	}
}
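// Row-grouping note for click_Accept() above: a word occupies one word-start
// row (pos, phon, start, stop) followed by zero or more non-word-start rows,
// each contributing a phoneme and its stop. The scan collects these into a
// single OrthographicResult and then, when the inner loop advanced 'r', steps
// 'r' back one row to compensate for the outer loop's increment.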
// ulong GetAudioStreamPositionSeconds(string pos)
// {
//	ulong sec = UInt64.Parse(pos);
//
//	sec /= 2uL;   // bytes per sample (16-bit)
//	sec /= 44100; // samples per second
//
//	return sec;
// }

/// <summary>
/// Handles 'SpInProcRecoContext.Recognition' event. Fires as the final
/// hypothesis for a phrase. Each word will be added to a list of
/// 'OrthographicResult's for the phrase.
/// WARNING: This can fire 2+ times on the same file-stream, causing the
/// engine to drop/reset important variables like 'PhraseInfo.StartTime',
/// 'word.AudioStreamOffset', and 'word.AudioTimeOffset'
/// TODO: a fact that is exceedingly annoying to try to compensate for.
/// </summary>
/// <param name="StreamNumber"></param>
/// <param name="StreamPosition"></param>
/// <param name="RecognitionType"></param>
/// <param name="Result"></param>
void rc_Recognition(int StreamNumber, object StreamPosition, SpeechRecognitionType RecognitionType, ISpeechRecoResult Result)
{
#if DEBUG
	logfile.Log();
	logfile.Log("rc_Recognition() #" + StreamNumber + " StreamPosition= " + StreamPosition + " _generato= " + _generato);
	logfile.Log(". RecognitionType= " + RecognitionType); // <- standard.
	logfile.Log(". _phoneConverter.LanguageId= " + _phoneConverter.LanguageId);
	logfile.Log(". " + Result.PhraseInfo.GetText()); // (0, -1, true)
	logfile.Log(". _offset = " + _offset);

	logfile.Log(". PhraseInfo.AudioStreamPosition= " + Result.PhraseInfo.AudioStreamPosition);
//	logfile.Log(". . sec= " + GetAudioStreamPositionSeconds(Result.PhraseInfo.AudioStreamPosition.ToString()));
	logfile.Log(". PhraseInfo.AudioSizeBytes = " + Result.PhraseInfo.AudioSizeBytes);
	logfile.Log(". PhraseInfo.StartTime      = " + Result.PhraseInfo.StartTime);
	logfile.Log(". PhraseInfo.AudioSizeTime  = " + Result.PhraseInfo.AudioSizeTime);

	logfile.Log(". Result.PhraseInfo.Rule.Name= " + Result.PhraseInfo.Rule.Name); // <- blank.
	logfile.Log(". Result.PhraseInfo.Rule.Id= " + Result.PhraseInfo.Rule.Id);
	logfile.Log(". Result.PhraseInfo.Rule.EngineConfidence= " + Result.PhraseInfo.Rule.EngineConfidence);
	logfile.Log(". Result.PhraseInfo.Rule.Confidence= " + Result.PhraseInfo.Rule.Confidence);

	logfile.Log(". wordcount= " + Result.PhraseInfo.Elements.Count);
#endif
	List <OrthographicResult> ars = null;
	switch (_generato)
	{
		case Generator.Dictati: ars = _ars_def; break;
		case Generator.Dialogi: ars = _ars_enh; break;
	}

	foreach (ISpeechPhraseElement word in Result.PhraseInfo.Elements)
	{
#if DEBUG
		logfile.Log(". . word= " + word.DisplayText);
		logfile.Log(". . LexicalForm= " + word.LexicalForm);
		logfile.Log(". . DisplayAttributes= " + word.DisplayAttributes);
		logfile.Log(". . EngineConfidence= " + word.EngineConfidence);
		logfile.Log(". . ActualConfidence= " + word.ActualConfidence);

		var ids = (ushort[])word.Pronunciation;
		foreach (var id in ids)
		{
			logfile.Log(". . . PhoneId= " + id + " - " + _phoneConverter.IdToPhone(id));
		}

		logfile.Log(". . word.AudioStreamOffset= " + word.AudioStreamOffset);
		logfile.Log(". . word.AudioSizeBytes   = " + word.AudioSizeBytes);
		logfile.Log(". . word.AudioTimeOffset  = " + word.AudioTimeOffset);
		logfile.Log(". . word.AudioSizeTime    = " + word.AudioSizeTime);
#endif
		var ar = new OrthographicResult();
		ar.Orthography = word.DisplayText;

		string phons = _phoneConverter.IdToPhone(word.Pronunciation); // NOTE: 'Pronunciation' is a ushort or ushort[]
		ar.Phons = new List <string>(phons.Split(' '));

		ar.Confi = word.EngineConfidence;
		ar.Level = word.ActualConfidence.ToString().Replace("SEC", String.Empty).Replace("Confidence", String.Empty);
		ar.Start = _offset + Utility.GarpstoSecs(word.AudioTimeOffset);
		ar.Stop  = _offset + Utility.GarpstoSecs(word.AudioTimeOffset + word.AudioSizeTime);

		ars.Add(ar);
	}

	// NOTE: Recognition could be fired before the entire audiofile has
	// completed, which means it's going to fire again but the AudioTimeOffsets
	// will be completely borked obviously. So add this time-offset to any
	// second or subsequent Recognition event that happens on this stream.
	_offset += Utility.GarpstoSecs(Result.PhraseInfo.AudioSizeTime); // TODO. is not accurate.

	if (_text == String.Empty)
	{
		++Confidence_def_count;
		Confidence_def += Result.PhraseInfo.Rule.EngineConfidence;
	}
#if DEBUG
	logfile.Log();
#endif
}