Exemplo n.º 1
0
        /// <summary>
        /// Runs continuous speech recognition + translation over the WAV file at
        /// <paramref name="filePath"/>, appending recognized captions per language
        /// into <paramref name="captions"/> (mutated in place).
        /// </summary>
        /// <param name="logId">Correlation identifier prefixed to every log line.</param>
        /// <param name="filePath">Path to the (possibly trimmed) WAV file to recognize.</param>
        /// <param name="speechConfig">Configured speech translation settings (languages, credentials).</param>
        /// <param name="restartOffset">Offset added to recognizer timestamps to map results back to the original untrimmed video.</param>
        /// <param name="sourceLanguage">Primary (spoken) language; key into <paramref name="captions"/> and <paramref name="startAfterMap"/>.</param>
        /// <param name="captions">Per-language caption lists accumulated so far; results are appended here.</param>
        /// <param name="startAfterMap">Per-language cutoff time; results that begin before it are skipped (presumably already captured by a prior pass — confirm with caller).</param>
        /// <returns>
        /// An <see cref="MSTResult"/> with the accumulated captions, the recognizer error code
        /// ("" when recognition completed without cancellation error), and the end time of the
        /// last successfully transcribed caption (used by the caller to retry/restart).
        /// </returns>
        private async Task <MSTResult> performRecognitionAsync(string logId, string filePath, SpeechTranslationConfig speechConfig, TimeSpan restartOffset,
                                                               string sourceLanguage, Dictionary <string, List <Caption> > captions, Dictionary <string, TimeSpan> startAfterMap)
        {
            using (var audioInput = WavHelper.OpenWavFile(filePath))
            {
                var logOnce = new HashSet <string>();
                // RunContinuationsAsynchronously: the TCS is completed from Speech SDK callback
                // threads; without it the awaiting continuation would run inline on that thread.
                var      stopRecognition    = new TaskCompletionSource <int>(TaskCreationOptions.RunContinuationsAsynchronously);
                bool     verboseLogging     = false;
                TimeSpan lastSuccessfulTime = TimeSpan.Zero;
                string   errorCode          = "";
                using (var recognizer = new TranslationRecognizer(speechConfig, audioInput))
                {
                    recognizer.Recognized += (s, e) =>
                    {
                        if (e.Result.Reason == ResultReason.TranslatedSpeech)
                        {
                            JObject jObject           = JObject.Parse(e.Result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult));
                            var     wordLevelCaptions = jObject["Words"]
                                                        .ToObject <List <MSTWord> >()
                                                        .OrderBy(w => w.Offset)
                                                        .ToList();

                            if (e.Result.Text == "" && wordLevelCaptions.Count == 0)
                            {
                                if (verboseLogging)
                                {
                                    TimeSpan _offset = new TimeSpan(e.Result.OffsetInTicks);
                                    TimeSpan _end    = e.Result.Duration.Add(_offset);
                                    _logger.LogInformation($"{logId}: Empty String: Begin={_offset.Minutes}:{_offset.Seconds},{_offset.Milliseconds}, End={_end.Minutes}:{_end.Seconds},{_end.Milliseconds}");
                                }
                                return;
                            }

                            if (wordLevelCaptions.Any())
                            {
                                // The word offsets in the JSON payload use a different origin than
                                // e.Result.OffsetInTicks; rebase them onto the result's offset.
                                // TODO/TOREVIEW: Is this a bug fix or redefinition? Could this change in later versions of the SDK?
                                long offsetDifference = e.Result.OffsetInTicks - wordLevelCaptions[0].Offset;

                                wordLevelCaptions.ForEach(w => w.Offset += offsetDifference);
                            }

                            var sentenceLevelCaptions = MSTWord.WordLevelTimingsToSentenceLevelTimings(e.Result.Text, wordLevelCaptions);

                            // Convert back to time in original untrimmed video.
                            // These timings are used to check if we should be adding any captions
                            // However they are then used directly for sentence level translations
                            // but not for the word-level timings of the primary language
                            TimeSpan begin = (new TimeSpan(e.Result.OffsetInTicks)).Add(restartOffset);
                            TimeSpan end   = e.Result.Duration.Add(begin);

                            if (verboseLogging)
                            {
                                _logger.LogInformation($"{logId}: Begin={begin.Minutes}:{begin.Seconds},{begin.Milliseconds}");
                                _logger.LogInformation($"{logId}: End={end.Minutes}:{end.Seconds},{end.Milliseconds}");
                            }
                            // TODO/TOREVIEW:
                            // ToCaptionEntitiesWithWordTiming vs ToCaptionEntitiesInterpolate
                            // Can this code be simplified to use a single function?
                            // Also: Caution - it is possible that word timing data from MS may depend on SDK version

                            var newCaptions = MSTWord.ToCaptionEntitiesWithWordTiming(captions[sourceLanguage].Count, restartOffset, sentenceLevelCaptions);

                            if (begin >= startAfterMap[sourceLanguage])
                            {
                                captions[sourceLanguage].AddRange(newCaptions);
                                // logOnce: emit each informational message only the first time.
                                if (logOnce.Add("AddingMain"))
                                {
                                    _logger.LogInformation($"{logId}: Adding Primary Language captions");
                                }
                            }
                            else
                            {
                                if (logOnce.Add("SkippingMain"))
                                {
                                    _logger.LogInformation($"{logId}: Skipping Main captions because {begin} < {startAfterMap[sourceLanguage]}");
                                }
                            }
                            foreach (var element in e.Result.Translations)
                            {
                                var language   = element.Key;
                                var startAfter = startAfterMap[language];
                                if (begin >= startAfter)
                                {
                                    // Translations dont have word level timing so
                                    // interpolate between start and end
                                    newCaptions = Caption.ToCaptionEntitiesInterpolate(captions[language].Count, begin, end, element.Value);
                                    captions[language].AddRange(newCaptions);

                                    if (logOnce.Add($"AddingTranslated {language}"))
                                    {
                                        _logger.LogInformation($"{logId}: Adding translation ({language}) captions");
                                    }
                                }
                                else
                                {
                                    if (logOnce.Add($"SkippingTranslated {language}"))
                                    {
                                        _logger.LogInformation($"{logId}: Skipping ({language}) captions because {begin} < {startAfter}");
                                    }
                                }
                            }
                        }
                        else if (e.Result.Reason == ResultReason.NoMatch)
                        {
                            _logger.LogInformation($"{logId}: NOMATCH: Speech could not be recognized.");
                        }
                    };

                    recognizer.Canceled += (s, e) =>
                    {
                        errorCode = e.ErrorCode.ToString();
                        _logger.LogInformation($"{logId}: CANCELED: ErrorCode={e.ErrorCode} Reason={e.Reason}");

                        if (e.Reason == CancellationReason.Error)
                        {
                            _logger.LogError($"{logId}: CANCELED: ErrorCode={e.ErrorCode} Reason={e.Reason}");

                            if (e.ErrorCode == CancellationErrorCode.ServiceTimeout ||
                                e.ErrorCode == CancellationErrorCode.ServiceUnavailable ||
                                e.ErrorCode == CancellationErrorCode.ConnectionFailure)
                            {
                                // Transient failure: report the end time of the last caption we
                                // captured so the caller can restart recognition from there.
                                // NOTE: guard the source-language caption list itself, not the
                                // dictionary's key count — an empty list would make Max() throw.
                                TimeSpan lastTime = TimeSpan.Zero;
                                if (captions.TryGetValue(sourceLanguage, out var sourceCaptions) && sourceCaptions.Count != 0)
                                {
                                    lastTime = sourceCaptions.Max(c => c.End);
                                }

                                _logger.LogInformation($"{logId}: Retrying, LastSuccessTime={lastTime}");
                                lastSuccessfulTime = lastTime;
                            }
                            else if (e.ErrorCode != CancellationErrorCode.NoError)
                            {
                                _logger.LogInformation($"{logId}: CANCELED: ErrorCode={e.ErrorCode} Reason={e.Reason}");
                                _slackLogger.PostErrorAsync(new Exception($"{logId}: Transcription Failure"),
                                                            "Transcription Failure").GetAwaiter().GetResult();
                            }
                        }

                        stopRecognition.TrySetResult(0);
                    };

                    recognizer.SessionStarted += (s, e) =>
                    {
                        _logger.LogInformation($"{logId}: Session started event.");
                    };

                    recognizer.SessionStopped += (s, e) =>
                    {
                        _logger.LogInformation($"{logId}: Session stopped event. Stopping recognition.");
                        stopRecognition.TrySetResult(0);
                    };

                    // Starts continuous recognition. Uses StopContinuousRecognitionAsync() to stop recognition.
                    await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

                    // Waits for completion without blocking a thread-pool thread.
                    // (Task.WhenAny keeps the task rooted, unlike a blocking Task.WaitAny.)
                    await Task.WhenAny(stopRecognition.Task).ConfigureAwait(false);

                    // Stops recognition.
                    await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);

                    _logger.LogInformation($"{logId}: Returning {captions.Count} languages, ErrorCode = {errorCode}, LastSuccessTime = {lastSuccessfulTime}");

                    return(new MSTResult
                    {
                        Captions = captions,
                        ErrorCode = errorCode,
                        LastSuccessTime = lastSuccessfulTime
                    });
                }
            }
        }