private async Task<MSTResult> performRecognitionAsync(string logId, string filePath, SpeechTranslationConfig speechConfig,
    TimeSpan restartOffset, string sourceLanguage,
    Dictionary<string, List<Caption>> captions, Dictionary<string, TimeSpan> startAfterMap)
{
    using (var audioInput = WavHelper.OpenWavFile(filePath))
    {
        var logOnce = new HashSet<string>();
        var stopRecognition = new TaskCompletionSource<int>();
        bool verboseLogging = false;
        TimeSpan lastSuccessfulTime = TimeSpan.Zero;
        string errorCode = "";

        using (var recognizer = new TranslationRecognizer(speechConfig, audioInput))
        {
            recognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.TranslatedSpeech)
                {
                    JObject jObject = JObject.Parse(e.Result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult));
                    var wordLevelCaptions = jObject["Words"]
                        .ToObject<List<MSTWord>>()
                        .OrderBy(w => w.Offset)
                        .ToList();

                    if (e.Result.Text == "" && wordLevelCaptions.Count == 0)
                    {
                        if (verboseLogging)
                        {
                            TimeSpan _offset = new TimeSpan(e.Result.OffsetInTicks);
                            TimeSpan _end = e.Result.Duration.Add(_offset);
                            _logger.LogInformation($"{logId}: Empty String: Begin={_offset.Minutes}:{_offset.Seconds},{_offset.Milliseconds}, End={_end.Minutes}:{_end.Seconds},{_end.Milliseconds}");
                        }
                        return;
                    }

                    if (wordLevelCaptions.Any())
                    {
                        // TODO/TOREVIEW: Is this a bug fix or a redefinition? Could this change in later versions of the SDK?
                        // Realign the word offsets so that the first word starts at the result's offset.
                        long offsetDifference = e.Result.OffsetInTicks - wordLevelCaptions.First().Offset;
                        wordLevelCaptions.ForEach(w => w.Offset += offsetDifference);
                    }

                    var sentenceLevelCaptions = MSTWord.WordLevelTimingsToSentenceLevelTimings(e.Result.Text, wordLevelCaptions);

                    // Convert back to time in the original untrimmed video.
                    // These timings are used to check whether we should be adding any captions.
                    // They are then used directly for sentence-level translations,
                    // but not for the word-level timings of the primary language.
                    TimeSpan begin = new TimeSpan(e.Result.OffsetInTicks).Add(restartOffset);
                    TimeSpan end = e.Result.Duration.Add(begin);

                    if (verboseLogging)
                    {
                        _logger.LogInformation($"{logId}: Begin={begin.Minutes}:{begin.Seconds},{begin.Milliseconds}");
                        _logger.LogInformation($"{logId}: End={end.Minutes}:{end.Seconds},{end.Milliseconds}");
                    }

                    // TODO/TOREVIEW:
                    // ToCaptionEntitiesWithWordTiming vs ToCaptionEntitiesInterpolate:
                    // Can this code be simplified to use a single function?
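                    // Editor's note (an assumption, not verified against the rest of the repo):
                    // the two helpers appear to differ in where their timings come from.
                    // ToCaptionEntitiesWithWordTiming consumes the per-word offsets the service
                    // returns for the source language, while ToCaptionEntitiesInterpolate (used
                    // for translations below) has only the sentence's begin/end and must spread
                    // timings across the text. Merging them would require word-level timing for
                    // translations, which the translation results do not appear to carry.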
                    // Also: Caution - it is possible that word timing data from MS may depend on the SDK version.
                    var newCaptions = MSTWord.ToCaptionEntitiesWithWordTiming(captions[sourceLanguage].Count, restartOffset, sentenceLevelCaptions);

                    if (begin >= startAfterMap[sourceLanguage])
                    {
                        captions[sourceLanguage].AddRange(newCaptions);
                        if (logOnce.Add("AddingMain"))
                        {
                            _logger.LogInformation($"{logId}: Adding primary language captions");
                        }
                    }
                    else if (logOnce.Add("SkippingMain"))
                    {
                        _logger.LogInformation($"{logId}: Skipping main captions because {begin} < {startAfterMap[sourceLanguage]}");
                    }

                    foreach (var element in e.Result.Translations)
                    {
                        var language = element.Key;
                        var startAfter = startAfterMap[language];
                        if (begin >= startAfter)
                        {
                            // Translations don't have word-level timing, so interpolate between start and end.
                            newCaptions = Caption.ToCaptionEntitiesInterpolate(captions[language].Count, begin, end, element.Value);
                            captions[language].AddRange(newCaptions);
                            if (logOnce.Add($"AddingTranslated {language}"))
                            {
                                _logger.LogInformation($"{logId}: Adding translation ({language}) captions");
                            }
                        }
                        else if (logOnce.Add($"SkippingTranslated {language}"))
                        {
                            _logger.LogInformation($"{logId}: Skipping ({language}) captions because {begin} < {startAfter}");
                        }
                    }
                }
                else if (e.Result.Reason == ResultReason.NoMatch)
                {
                    _logger.LogInformation($"{logId}: NOMATCH: Speech could not be recognized.");
                }
            };

            recognizer.Canceled += (s, e) =>
            {
                errorCode = e.ErrorCode.ToString();
                _logger.LogInformation($"{logId}: CANCELED: ErrorCode={e.ErrorCode} Reason={e.Reason}");
                if (e.Reason == CancellationReason.Error)
                {
                    _logger.LogError($"{logId}: CANCELED: ErrorCode={e.ErrorCode} Reason={e.Reason}");
                    if (e.ErrorCode == CancellationErrorCode.ServiceTimeout ||
                        e.ErrorCode == CancellationErrorCode.ServiceUnavailable ||
                        e.ErrorCode == CancellationErrorCode.ConnectionFailure)
                    {
                        // Transient failure: record how far we got so the caller can restart from there.
                        TimeSpan lastTime = TimeSpan.Zero;
                        if (captions[sourceLanguage].Count != 0)
                        {
                            var lastCaption = captions[sourceLanguage].OrderBy(c => c.End).Last();
                            lastTime = lastCaption.End;
                        }
                        _logger.LogInformation($"{logId}: Retrying, LastSuccessTime={lastTime}");
                        lastSuccessfulTime = lastTime;
                    }
                    else if (e.ErrorCode != CancellationErrorCode.NoError)
                    {
                        _slackLogger.PostErrorAsync(new Exception($"{logId}: Transcription Failure"), "Transcription Failure").GetAwaiter().GetResult();
                    }
                }
                stopRecognition.TrySetResult(0);
            };

            recognizer.SessionStarted += (s, e) =>
            {
                _logger.LogInformation($"{logId}: Session started event.");
            };

            recognizer.SessionStopped += (s, e) =>
            {
                _logger.LogInformation($"{logId}: Session stopped event. Stopping recognition.");
                stopRecognition.TrySetResult(0);
            };

            // Starts continuous recognition. StopContinuousRecognitionAsync() stops it.
            await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

            // Waits for a terminal event without blocking a thread pool thread.
            await stopRecognition.Task.ConfigureAwait(false);

            // Stops recognition.
            await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);

            _logger.LogInformation($"{logId}: Returning {captions.Count} languages, ErrorCode={errorCode}, LastSuccessTime={lastSuccessfulTime}");
            return new MSTResult
            {
                Captions = captions,
                ErrorCode = errorCode,
                LastSuccessTime = lastSuccessfulTime
            };
        }
    }
}
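
// ----------------------------------------------------------------------------
// Editor's sketch, not part of the original source: the Recognized handler
// above calls Caption.ToCaptionEntitiesInterpolate for translations, which
// arrive with no word-level timing. The repo's real implementation is not
// shown here; the hypothetical reconstruction below only illustrates the
// technique the inline comment describes - cut the text into caption-sized
// chunks and give each chunk a slice of [begin, end] proportional to its
// share of the characters. SketchCaption and maxChars are invented stand-ins.
private static class CaptionInterpolationSketch
{
    public class SketchCaption   // stand-in for the repo's Caption entity
    {
        public int Index;
        public TimeSpan Begin;
        public TimeSpan End;
        public string Text;
    }

    // startIndex continues the numbering of captions already collected for
    // this language; maxChars is an assumed per-caption character budget.
    public static List<SketchCaption> ToCaptionEntitiesInterpolate(
        int startIndex, TimeSpan begin, TimeSpan end, string text, int maxChars = 40)
    {
        var result = new List<SketchCaption>();
        if (string.IsNullOrWhiteSpace(text)) return result;

        // Greedily pack words into chunks of at most maxChars characters.
        var chunks = new List<string>();
        var current = "";
        foreach (var word in text.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
        {
            if (current.Length > 0 && current.Length + 1 + word.Length > maxChars)
            {
                chunks.Add(current);
                current = word;
            }
            else
            {
                current = current.Length == 0 ? word : current + " " + word;
            }
        }
        if (current.Length > 0)
        {
            chunks.Add(current);
        }

        // Interpolate: each chunk's duration is proportional to its length.
        double totalChars = 0;
        foreach (var chunk in chunks)
        {
            totalChars += chunk.Length;
        }
        var cursor = begin;
        var window = end - begin;
        foreach (var chunk in chunks)
        {
            var slice = TimeSpan.FromTicks((long)(window.Ticks * (chunk.Length / totalChars)));
            result.Add(new SketchCaption { Index = startIndex++, Begin = cursor, End = cursor + slice, Text = chunk });
            cursor += slice;
        }

        // Pin the final caption to the true end to absorb rounding drift.
        result[result.Count - 1].End = end;
        return result;
    }
}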