// === FixSpeakerTags method === // The LongRunningRecognizeResponse does not put SpeakerTag values // on the words initially until it has completed the transcription. // At that point, it creates one more result that has the entire text // in its Transcript field, and a word array that contains // every word in the entire text. The SpeakerTag fields now contains values. // FixSpeakerTags moves the SpeakerTag values from the last result in the response // to the corresponding words in the initial results and then removes the final result. public static Transcribed_Dto FixSpeakerTags(Transcribed_Dto transcribed) { int resultCount = transcribed.Talks.Count; TranscribedTalk_Dto lastResult = transcribed.Talks[resultCount - 1]; TranscribedTalk_Dto nextToLastResult = transcribed.Talks[resultCount - 2]; int lastWordnum = lastResult.Words[^ 1].WordNum;
/* === TransformResponse.Simplify method === * We want to extract all the useful data from the response that comes back from the cloud. * But we don't want the superlous fields that make it more complicated to use. * * The raw response structure contains: * A single unnamed object with a "Results" array. * The "Results" array consists of unnamed objects, each containing: * "Alternatives" array, "ChannelTag" integer, "LanguageCode" string * The "Alternatives" arrays appear to always consists of a single unnamed object containing: * "Transcript" string, "Confidence" decimal, "Words" array * WHEN DOES THIS EVER CONSIST OF MORE THEN ONE ALTERNATIVE? * The "Words" array consists of unnamed objects containing: * "StartTime" object, "EndTime" object, "Word" object * The "StartTime" and "EndTime" objects both contain: * "Seconds" int, "Nanos" integer * The "Word" objects contain: * "Word" string, "Confidence" decimal, "SpeakerTag" integer * * The new structure contains: * A single unnamed object with a "Results" array. * The "Results" array consists of unnamed objects, each containing: * "Transcript" string, "Confidence" decimal, "Words" array and "WordCount" integer * The "Words" array consists of unnamed objects, eash containing: * "Word" string, "Confidence" decimal, "StartTime" integer, "EndTime integer, "speakerTag" integer, * and "WordNum" integer. * Both StartTime and EndTime integers are in milliseconds. * "WordCount" and "WordNum" are new fields added to help in fixing speaker tags, * but we leave them in the final structure for possible future use. */ public static Transcribed_Dto Simpify(RepeatedField <SpeechRecognitionResult> recogResults) { Transcribed_Dto transcript = new Transcribed_Dto(); int altCount = 0; int wordNum = 0; foreach (SpeechRecognitionResult recogResult in recogResults) { if (recogResult.Alternatives.Count > 1) { altCount++; Console.WriteLine($"ERROR: more than 1 alternative - result {altCount}"); } ; SpeechRecognitionAlternative recogAlt = recogResult.Alternatives[0]; TranscribedTalk_Dto result = new TranscribedTalk_Dto(recogAlt.Transcript, recogAlt.Confidence) { // The new "WordCount" field in Result is populated with the total word count. WordCount = recogAlt.Words.Count, }; Console.WriteLine($"Next result: {recogAlt.Words.Count} words"); foreach (var item in recogAlt.Words) { long startTime = item.StartTime.Seconds * 1000 + item.StartTime.Nanos / 1000000; long endTime = item.EndTime.Seconds * 1000 + item.EndTime.Nanos / 1000000; // The new "WordNum" field in RespWord is popluated with the sequencial "wordnum" wordNum++; result.Words.Add(new TranscribedWord_Dto(item.Word, item.Confidence, startTime, endTime, item.SpeakerTag, wordNum)); } transcript.Talks.Add(result); } return(transcript); }