/// <summary>
/// Appends every word of <paramref name="arc"/> to <paramref name="builder"/> and then marks the arc as a
/// phrase, using the arc's source segment range and its (optionally widened) alignment matrix.
/// </summary>
/// <param name="builder">The result builder that receives the words and the phrase.</param>
/// <param name="arc">The arc providing the words, word confidences, alignment and source range.</param>
/// <param name="isPrefix">
/// When <c>true</c>, the flag passed as the last argument of <c>AppendWord</c> is always <c>false</c>;
/// otherwise it is the arc's <c>IsUnknown</c> value.
/// </param>
/// <param name="alignmentColsToAddCount">
/// Number of extra columns to insert in front of the arc's alignment columns. Values of zero or less leave
/// the alignment untouched.
/// </param>
private void UpdateCorrectionFromArc(TranslationResultBuilder builder, WordGraphArc arc, bool isPrefix,
    int alignmentColsToAddCount)
{
    bool flagAsUnknown = !isPrefix && arc.IsUnknown;
    int wordCount = arc.Words.Count;
    for (int wordIndex = 0; wordIndex < wordCount; wordIndex++)
    {
        builder.AppendWord(arc.Words[wordIndex], arc.WordConfidences[wordIndex], flagAsUnknown);
    }

    WordAlignmentMatrix phraseAlignment = arc.Alignment;
    if (alignmentColsToAddCount > 0)
    {
        // Copy the arc's alignment into a wider matrix, shifting every column to the right so that the
        // first alignmentColsToAddCount columns of the new matrix are left at their initial value.
        WordAlignmentMatrix original = phraseAlignment;
        var widened = new WordAlignmentMatrix(original.RowCount,
            original.ColumnCount + alignmentColsToAddCount);
        for (int row = 0; row < original.RowCount; row++)
        {
            for (int col = 0; col < original.ColumnCount; col++)
            {
                widened[row, col + alignmentColsToAddCount] = original[row, col];
            }
        }
        phraseAlignment = widened;
    }

    builder.MarkPhrase(arc.SourceSegmentRange, phraseAlignment);
}
/// <summary>
/// Builds the per-word score infos for every arc of the word graph, seeds the best scores of each arc's
/// next state, and records both end states of every arc in <c>_statesInvolvedInArcs</c>.
/// </summary>
private void InitArcs()
{
    for (int arcIndex = 0; arcIndex < _wordGraph.Arcs.Count; arcIndex++)
    {
        WordGraphArc currentArc = _wordGraph.Arcs[arcIndex];

        // Chain one score info per word: each one is set up from the score info that precedes it,
        // starting with the score info of the arc's previous state.
        var wordInfos = new List<EcmScoreInfo>();
        EcmScoreInfo previousInfo = _stateEcmScoreInfos[currentArc.PrevState];
        for (int wordIndex = 0; wordIndex < currentArc.Words.Count; wordIndex++)
        {
            var currentInfo = new EcmScoreInfo();
            _ecm.SetupEsi(currentInfo, previousInfo, currentArc.Words[wordIndex]);
            wordInfos.Add(currentInfo);
            previousInfo = currentInfo;
        }
        _arcEcmScoreInfos.Add(wordInfos);

        // The arc's score infos must be registered before the next state's best scores are initialised,
        // because UpdateStateBestScores reads them back by arc index.
        UpdateStateBestScores(arcIndex, 0);

        _statesInvolvedInArcs.Add(currentArc.PrevState);
        _statesInvolvedInArcs.Add(currentArc.NextState);
    }
}
/// <summary>
/// Follows the best-previous-arc back-pointers from <paramref name="state"/> down to state 0 and appends
/// the arcs found on the way to <paramref name="builder"/>, in path order (the arc closest to state 0
/// first).
/// </summary>
/// <param name="builder">The result builder that receives the arcs.</param>
/// <param name="procPrefixPos">The prefix position used to select the back-pointer of the start state.</param>
/// <param name="state">The state at which the backward walk starts.</param>
private void AddBestUncorrectedPrefixState(TranslationResultBuilder builder, int procPrefixPos, int state)
{
    var pathArcs = new Stack<WordGraphArc>();

    int prefixPos = procPrefixPos;
    int walkState = state;
    while (walkState != 0)
    {
        int bestArcIndex = _stateBestPrevArcs[walkState][prefixPos];
        WordGraphArc bestArc = _wordGraph.Arcs[bestArcIndex];

        // Step the prefix position back through the arc's words, last word first.
        for (int wordIndex = bestArc.Words.Count - 1; wordIndex >= 0; wordIndex--)
        {
            IReadOnlyList<int> previousPositions =
                _arcEcmScoreInfos[bestArcIndex][wordIndex].GetLastInsPrefixWordFromEsi();
            prefixPos = previousPositions[prefixPos];
        }

        pathArcs.Push(bestArc);
        walkState = bestArc.PrevState;
    }

    // Arcs were collected back-to-front; popping the stack replays them front-to-back.
    while (pathArcs.Count > 0)
    {
        UpdateCorrectionFromArc(builder, pathArcs.Pop(), true, 0);
    }
}
/// <summary>
/// Propagates the scores reachable through the arc at <paramref name="arcIndex"/> to the arc's next state:
/// updates the next state's best scores and best previous arcs per position, its score info positions, and
/// its best word graph score.
/// </summary>
/// <param name="arcIndex">Index of the arc in <c>_wordGraph.Arcs</c>.</param>
/// <param name="prefixDiffSize">
/// When 0, every score position is examined; otherwise only the last <paramref name="prefixDiffSize"/>
/// positions are examined.
/// </param>
private void UpdateStateBestScores(int arcIndex, int prefixDiffSize)
{
    WordGraphArc arc = _wordGraph.Arcs[arcIndex];
    int prevState = arc.PrevState;
    int nextState = arc.NextState;

    // Use the score info of the arc's last word, or the previous state's one when the arc has none.
    List<EcmScoreInfo> arcInfos = _arcEcmScoreInfos[arcIndex];
    EcmScoreInfo lastInfo;
    if (arcInfos.Count > 0)
    {
        lastInfo = arcInfos[arcInfos.Count - 1];
    }
    else
    {
        lastInfo = _stateEcmScoreInfos[prevState];
    }

    double pathScore = _stateWordGraphScores[prevState] + arc.Score;

    List<double> bestScores = _stateBestScores[nextState];
    List<int> bestPrevArcs = _stateBestPrevArcs[nextState];

    int firstPos = 0;
    if (prefixDiffSize != 0)
    {
        firstPos = lastInfo.Scores.Count - prefixDiffSize;
    }

    var improvedPositions = new List<int>();
    for (int pos = firstPos; pos < lastInfo.Scores.Count; pos++)
    {
        double candidate = (EcmWeight * -lastInfo.Scores[pos]) + (WordGraphWeight * pathScore);

        // A position is taken over when it does not exist yet (pos is one past the end of the list) or
        // when the stored score is lower than the candidate.
        bool isNewPosition = pos == bestScores.Count;
        if (!isNewPosition && !(bestScores[pos] < candidate))
        {
            continue;
        }

        AddOrReplace(bestScores, pos, candidate);
        improvedPositions.Add(pos);
        AddOrReplace(bestPrevArcs, pos, arcIndex);
    }

    _stateEcmScoreInfos[nextState].UpdatePositions(lastInfo, improvedPositions);

    if (pathScore > _stateWordGraphScores[nextState])
    {
        _stateWordGraphScores[nextState] = pathScore;
    }
}
/// <summary>
/// Fills <paramref name="builder"/> from a hypothesis: first the uncorrected prefix path leading to the
/// hypothesis start, then the prefix correction, then the hypothesis' own arcs.
/// </summary>
/// <param name="builder">The result builder to fill.</param>
/// <param name="prefix">The prefix words; its length is used as the starting prefix position.</param>
/// <param name="isLastWordComplete">Forwarded unchanged to <c>_ecm.CorrectPrefix</c>.</param>
/// <param name="hypothesis">
/// The hypothesis to expand. A <c>StartArcIndex</c> of -1 means it starts at a state; any other value
/// means it starts inside the arc with that index.
/// </param>
private void BuildCorrectionFromHypothesis(TranslationResultBuilder builder, string[] prefix,
    bool isLastWordComplete, Hypothesis hypothesis)
{
    int uncorrectedPrefixLen;
    bool startsInsideArc = hypothesis.StartArcIndex != -1;
    if (startsInsideArc)
    {
        AddBestUncorrectedPrefixSubState(builder, prefix.Length, hypothesis.StartArcIndex,
            hypothesis.StartArcWordIndex);
        WordGraphArc startArc = _wordGraph.Arcs[hypothesis.StartArcIndex];

        // The whole start arc was appended; only its words up to and including StartArcWordIndex count
        // towards the uncorrected prefix.
        uncorrectedPrefixLen = builder.Words.Count - (startArc.Words.Count - hypothesis.StartArcWordIndex) + 1;
    }
    else
    {
        AddBestUncorrectedPrefixState(builder, prefix.Length, hypothesis.StartState);
        uncorrectedPrefixLen = builder.Words.Count;
    }

    int colsToAdd = _ecm.CorrectPrefix(builder, uncorrectedPrefixLen, prefix, isLastWordComplete);

    // Only the first arc after the corrected prefix receives the extra alignment columns.
    for (int k = 0; k < hypothesis.Arcs.Count; k++)
    {
        UpdateCorrectionFromArc(builder, hypothesis.Arcs[k], false, k == 0 ? colsToAdd : 0);
    }
}
private WordGraphConfidences ComputeWordGraphConfidences(WordGraph wordGraph) { double normalizationFactor = LogSpace.Zero; var backwardProbs = new double[wordGraph.Arcs.Count]; for (int i = wordGraph.Arcs.Count - 1; i >= 0; i--) { WordGraphArc arc = wordGraph.Arcs[i]; double sum = LogSpace.One; if (!wordGraph.FinalStates.Contains(arc.NextState)) { sum = LogSpace.Zero; foreach (int nextArcIndex in wordGraph.GetNextArcIndices(arc.NextState)) { WordGraphArc nextArc = wordGraph.Arcs[nextArcIndex]; sum = LogSpace.Add(sum, LogSpace.Multiple(nextArc.Score, backwardProbs[nextArcIndex])); } } backwardProbs[i] = sum; if (arc.PrevState == WordGraph.InitialState) { normalizationFactor = LogSpace.Add(normalizationFactor, LogSpace.Multiple(arc.Score, backwardProbs[i])); } } var rawWpps = new Dictionary <string, Dictionary <int, double> >(); var forwardProbs = new (double Prob, int Index)[wordGraph.Arcs.Count];
/// <summary>
/// Computes, for every state, the best score of a path from <paramref name="state"/> to that state and the
/// index of the arc through which that best score was reached.
/// </summary>
/// <param name="state">The state the scores are measured from.</param>
/// <param name="prevScores">
/// Receives one score per state (<c>LogSpace.Zero</c> where no better score was found), or an empty array
/// when the graph is empty.
/// </param>
/// <param name="stateBestPrevArcs">
/// Receives, per state, the index of the arc that produced its score (0 where none was recorded), or an
/// empty array when the graph is empty.
/// </param>
private void ComputePrevScores(int state, out double[] prevScores, out int[] stateBestPrevArcs)
{
    if (IsEmpty)
    {
        prevScores = new double[0];
        stateBestPrevArcs = new int[0];
        return;
    }

    prevScores = Enumerable.Repeat(LogSpace.Zero, StateCount).ToArray();
    stateBestPrevArcs = new int[StateCount];

    // The initial state starts from the graph's initial score; any other start state starts from 0.
    prevScores[state] = state == InitialState ? InitialStateScore : 0;

    var reachableStates = new HashSet<int> { state };
    for (int arcIndex = 0; arcIndex < Arcs.Count; arcIndex++)
    {
        WordGraphArc arc = Arcs[arcIndex];

        if (!reachableStates.Contains(arc.PrevState))
        {
            // The arc does not start at a reachable state: reset its target unless the target is
            // itself already reachable.
            if (!reachableStates.Contains(arc.NextState))
            {
                prevScores[arc.NextState] = LogSpace.Zero;
            }
            continue;
        }

        double candidate = LogSpace.Multiple(arc.Score, prevScores[arc.PrevState]);
        if (candidate > prevScores[arc.NextState])
        {
            prevScores[arc.NextState] = candidate;
            stateBestPrevArcs[arc.NextState] = arcIndex;
        }
        reachableStates.Add(arc.NextState);
    }
}
/// <summary>
/// Appends to <paramref name="builder"/> the best uncorrected path that ends inside an arc: the path
/// leading to the arc's previous state, followed by the arc itself.
/// </summary>
/// <param name="builder">The result builder that receives the arcs.</param>
/// <param name="procPrefixPos">The prefix position at the word <paramref name="arcWordIndex"/>.</param>
/// <param name="arcIndex">Index of the arc in <c>_wordGraph.Arcs</c>.</param>
/// <param name="arcWordIndex">Index of the word within the arc from which the prefix position is walked back.</param>
private void AddBestUncorrectedPrefixSubState(TranslationResultBuilder builder, int procPrefixPos,
    int arcIndex, int arcWordIndex)
{
    WordGraphArc subStateArc = _wordGraph.Arcs[arcIndex];

    // Walk the prefix position back from the given word to the start of the arc.
    int prefixPos = procPrefixPos;
    int wordIndex = arcWordIndex;
    while (wordIndex >= 0)
    {
        IReadOnlyList<int> previousPositions =
            _arcEcmScoreInfos[arcIndex][wordIndex].GetLastInsPrefixWordFromEsi();
        prefixPos = previousPositions[prefixPos];
        wordIndex--;
    }

    AddBestUncorrectedPrefixState(builder, prefixPos, subStateArc.PrevState);
    UpdateCorrectionFromArc(builder, subStateArc, true, 0);
}
/// <summary>
/// Enumerates the arcs of the best-scoring path between <paramref name="state"/> and a final state,
/// starting at the final state and walking backwards (each yielded arc's <c>PrevState</c> is the next
/// state visited) until <paramref name="state"/> is reached.
/// </summary>
/// <remarks>
/// This is an iterator: the scores are computed when the result is first enumerated, not when the method
/// is called.
/// </remarks>
/// <param name="state">The state at which the backward walk stops.</param>
/// <returns>
/// The arcs of the path in final-to-start order, or an empty sequence when no final state has a score
/// greater than <c>LogSpace.Zero</c>.
/// </returns>
public IEnumerable<WordGraphArc> GetBestPathFromFinalStateToState(int state)
{
    double[] prevScores;
    int[] stateBestPredArcs;
    ComputePrevScores(state, out prevScores, out stateBestPredArcs);

    // -1 means "no final state found". A sentinel of 0 is not safe because 0 is a usable state index:
    // if state 0 were final but unreachable, it would be mistaken for a found state and the walk below
    // would follow default back-pointers that never lead to the requested state.
    const int NoState = -1;
    int bestFinalState = NoState;
    double bestFinalStateScore = LogSpace.Zero;
    foreach (int finalState in _finalStates)
    {
        double score = prevScores[finalState];
        if (bestFinalStateScore < score)
        {
            bestFinalState = finalState;
            bestFinalStateScore = score;
        }
    }

    if (bestFinalState == NoState)
    {
        yield break;
    }

    int curState = bestFinalState;
    while (curState != state)
    {
        WordGraphArc arc = Arcs[stateBestPredArcs[curState]];
        yield return arc;
        curState = arc.PrevState;
    }
}
/// <summary>
/// Enqueues one hypothesis for every word position strictly inside a multi-word, non-pruned arc (every
/// word except the arc's last one).
/// </summary>
/// <param name="queue">The queue that receives the hypotheses.</param>
private void GetSubStateHypotheses(PriorityQueue<Hypothesis> queue)
{
    for (int arcIndex = 0; arcIndex < _wordGraph.Arcs.Count; arcIndex++)
    {
        WordGraphArc arc = _wordGraph.Arcs[arcIndex];

        // Single-word arcs have no interior position; pruned arcs are ignored.
        if (arc.Words.Count <= 1 || IsArcPruned(arc))
        {
            continue;
        }

        double pathScore = _stateWordGraphScores[arc.PrevState] + arc.Score;
        int lastInteriorWord = arc.Words.Count - 1;
        for (int wordIndex = 0; wordIndex < lastInteriorWord; wordIndex++)
        {
            EcmScoreInfo wordInfo = _arcEcmScoreInfos[arcIndex][wordIndex];
            int lastPos = wordInfo.Scores.Count - 1;

            // weighted path score + weighted (negated) last score of the word + weighted rest score
            double score = (WordGraphWeight * pathScore)
                + (EcmWeight * -wordInfo.Scores[lastPos])
                + (WordGraphWeight * _restScores[arc.NextState]);

            queue.Enqueue(new Hypothesis(score, arc.NextState, arcIndex, wordIndex));
        }
    }
}
/// <summary>
/// Computes, for every state, the best score of a path from that state to a final state by sweeping the
/// arcs from last to first.
/// </summary>
/// <returns>
/// One score per state; states for which no better score was found keep <c>LogSpace.Zero</c>.
/// </returns>
public IEnumerable<double> ComputeRestScores()
{
    double[] scores = Enumerable.Repeat(LogSpace.Zero, StateCount).ToArray();

    // Final states are seeded with the graph's initial state score.
    foreach (int finalState in _finalStates)
    {
        scores[finalState] = InitialStateScore;
    }

    int arcIndex = Arcs.Count;
    while (arcIndex > 0)
    {
        arcIndex--;
        WordGraphArc arc = Arcs[arcIndex];
        double candidate = LogSpace.Multiple(arc.Score, scores[arc.NextState]);
        if (candidate > scores[arc.PrevState])
        {
            scores[arc.PrevState] = candidate;
        }
    }

    return scores;
}
/// <summary>
/// Extends the score infos of the initial state and of every arc with the newly added prefix words, then
/// refreshes the best scores of each arc's next state.
/// </summary>
/// <param name="prefixDiff">The prefix words added since the last call; nothing happens when it is empty.</param>
/// <param name="isLastWordComplete">Forwarded unchanged to <c>_ecm.ExtendEsi</c>.</param>
private void ProcessWordGraphForPrefixDiff(string[] prefixDiff, bool isLastWordComplete)
{
    if (prefixDiff.Length == 0)
    {
        return;
    }

    if (!_wordGraph.IsEmpty)
    {
        // The initial state's score info is extended in place: it is both the target and the predecessor.
        EcmScoreInfo initialInfo = _stateEcmScoreInfos[WordGraph.InitialState];
        _ecm.ExtendInitialEsi(initialInfo, initialInfo, prefixDiff);
        UpdateInitialStateBestScores();
    }

    for (int arcIndex = 0; arcIndex < _wordGraph.Arcs.Count; arcIndex++)
    {
        WordGraphArc arc = _wordGraph.Arcs[arcIndex];
        List<EcmScoreInfo> arcInfos = _arcEcmScoreInfos[arcIndex];

        // Make sure there is one score info per word of the arc.
        int missingCount = arc.Words.Count - arcInfos.Count;
        for (int k = 0; k < missingCount; k++)
        {
            arcInfos.Add(new EcmScoreInfo());
        }

        // Extend each word's score info from its predecessor; unknown arcs are scored with an empty word.
        EcmScoreInfo previousInfo = _stateEcmScoreInfos[arc.PrevState];
        for (int wordIndex = 0; wordIndex < arc.Words.Count; wordIndex++)
        {
            string word = arc.IsUnknown ? string.Empty : arc.Words[wordIndex];
            EcmScoreInfo currentInfo = arcInfos[wordIndex];
            _ecm.ExtendEsi(currentInfo, previousInfo, word, prefixDiff, isLastWordComplete);
            previousInfo = currentInfo;
        }

        // Only the positions added by the prefix diff need to be propagated to the next state.
        UpdateStateBestScores(arcIndex, prefixDiff.Length);
    }
}
/// <summary>
/// Verifies that <c>TranslateInteractively</c> turns a mocked POST response into a session whose word
/// graph and rule result match the serialized DTO.
/// </summary>
/// <param name="assert">The assertion object; also used to signal completion of the asynchronous test.</param>
private static void TranslateInteractively_Success(Assert assert)
{
    // Arrange: the four arcs of the word graph returned by the mocked server.
    var arcDtos = new[]
    {
        new WordGraphArcDto
        {
            PrevState = 0,
            NextState = 1,
            Score = -11.11f,
            Words = new[] { "This", "is" },
            Confidences = new[] { 0.4f, 0.5f },
            SourceSegmentRange = new RangeDto { Start = 0, End = 2 },
            IsUnknown = false,
            Alignment = new[]
            {
                new AlignedWordPairDto { SourceIndex = 0, TargetIndex = 0 },
                new AlignedWordPairDto { SourceIndex = 1, TargetIndex = 1 }
            }
        },
        new WordGraphArcDto
        {
            PrevState = 1,
            NextState = 2,
            Score = -22.22f,
            Words = new[] { "a" },
            Confidences = new[] { 0.6f },
            SourceSegmentRange = new RangeDto { Start = 2, End = 3 },
            IsUnknown = false,
            Alignment = new[]
            {
                new AlignedWordPairDto { SourceIndex = 0, TargetIndex = 0 }
            }
        },
        new WordGraphArcDto
        {
            PrevState = 2,
            NextState = 3,
            Score = 33.33f,
            Words = new[] { "prueba" },
            Confidences = new[] { 0.0f },
            SourceSegmentRange = new RangeDto { Start = 3, End = 4 },
            IsUnknown = true,
            Alignment = new[]
            {
                new AlignedWordPairDto { SourceIndex = 0, TargetIndex = 0 }
            }
        },
        new WordGraphArcDto
        {
            PrevState = 3,
            NextState = 4,
            Score = -44.44f,
            Words = new[] { "." },
            Confidences = new[] { 0.7f },
            SourceSegmentRange = new RangeDto { Start = 4, End = 5 },
            IsUnknown = false,
            Alignment = new[]
            {
                new AlignedWordPairDto { SourceIndex = 0, TargetIndex = 0 }
            }
        }
    };

    var wordGraphDto = new WordGraphDto
    {
        InitialStateScore = -111.111f,
        FinalStates = new[] { 4 },
        Arcs = arcDtos
    };

    var ruleResultDto = new TranslationResultDto
    {
        Target = new[] { "Esto", "es", "una", "test", "." },
        Confidences = new[] { 0.0f, 0.0f, 0.0f, 1.0f, 0.0f },
        Sources = new[]
        {
            TranslationSources.None,
            TranslationSources.None,
            TranslationSources.None,
            TranslationSources.Transfer,
            TranslationSources.None
        },
        Alignment = new[]
        {
            new AlignedWordPairDto { SourceIndex = 0, TargetIndex = 0 },
            new AlignedWordPairDto { SourceIndex = 1, TargetIndex = 1 },
            new AlignedWordPairDto { SourceIndex = 2, TargetIndex = 2 },
            new AlignedWordPairDto { SourceIndex = 3, TargetIndex = 3 },
            new AlignedWordPairDto { SourceIndex = 4, TargetIndex = 4 }
        }
    };

    var resultDto = new InteractiveTranslationResultDto
    {
        WordGraph = wordGraphDto,
        RuleResult = ruleResultDto
    };

    var httpClient = new MockHttpClient();
    httpClient.Requests.Add(new MockRequest
    {
        Method = HttpRequestMethod.Post,
        ResponseText = JsonConvert.SerializeObject(resultDto, RestClientBase.SerializerSettings)
    });

    var engine = new TranslationEngine("http://localhost/", "project1", httpClient);

    // Act + Assert: the callback runs asynchronously, so completion is signalled through done().
    Action done = assert.Async();
    engine.TranslateInteractively("Esto es una prueba.", 0.2, session =>
    {
        assert.NotEqual(session, null);

        // Word graph: header values and the first arc.
        WordGraph wordGraph = session.SmtWordGraph;
        assert.Equal(wordGraph.InitialStateScore, -111.111);
        assert.DeepEqual(wordGraph.FinalStates.ToArray(), new[] { 4 });
        assert.Equal(wordGraph.Arcs.Count, 4);

        WordGraphArc firstArc = wordGraph.Arcs[0];
        assert.Equal(firstArc.PrevState, 0);
        assert.Equal(firstArc.NextState, 1);
        assert.Equal(firstArc.Score, -11.11);
        assert.DeepEqual(firstArc.Words.ToArray(), new[] { "This", "is" });
        assert.DeepEqual(firstArc.WordConfidences.ToArray(), new[] { 0.4, 0.5 });
        assert.Equal(firstArc.SourceSegmentRange.Start, 0);
        assert.Equal(firstArc.SourceSegmentRange.End, 2);
        assert.Equal(firstArc.IsUnknown, false);
        for (int k = 0; k < 2; k++)
        {
            assert.Equal(firstArc.Alignment[k, k], AlignmentType.Aligned);
        }

        // The third arc is the one flagged as unknown in the DTO.
        WordGraphArc unknownArc = wordGraph.Arcs[2];
        assert.Equal(unknownArc.IsUnknown, true);

        // Rule result: words, confidences, sources and the diagonal alignment.
        TranslationResult ruleResult = session.RuleResult;
        assert.DeepEqual(ruleResult.TargetSegment.ToArray(), new[] { "Esto", "es", "una", "test", "." });
        assert.DeepEqual(ruleResult.WordConfidences.ToArray(), new[] { 0.0, 0.0, 0.0, 1.0, 0.0 });
        assert.DeepEqual(ruleResult.WordSources.ToArray(), new[]
        {
            TranslationSources.None,
            TranslationSources.None,
            TranslationSources.None,
            TranslationSources.Transfer,
            TranslationSources.None
        });
        for (int k = 0; k < 5; k++)
        {
            assert.Equal(ruleResult.Alignment[k, k], AlignmentType.Aligned);
        }

        done();
    });
}
/// <summary>
/// Removes redundant arcs from the word graph.
/// TODO: This seems to affect the results of an interactive translation session, so don't use it yet.
/// </summary>
/// <remarks>
/// Builds a new graph by processing a work queue of <c>DfaState</c>s, each of which groups a set of
/// <c>NfaState</c>s of this graph. For every DFA state, the outgoing arcs of its NFA states are grouped by
/// their next word, and for each group only the best-scoring path is copied into the new graph.
/// NOTE(review): judging by the type names this looks like an NFA-to-DFA subset construction — confirm.
/// </remarks>
/// <returns>The optimized word graph.</returns>
public WordGraph Optimize()
{
    var dfaArcs = new List <WordGraphArc>();
    var dfaStates = new DfaStateCollection();
    var dfaFinalStates = new HashSet <int>();
    // Index 0 is taken by the start state enqueued below, so fresh indices start at 1.
    int nextDfaStateIndex = 1;
    var unmarkedStates = new Queue <DfaState>();
    unmarkedStates.Enqueue(new DfaState(0, new[] { new NfaState(0) }));
    while (unmarkedStates.Count > 0)
    {
        DfaState dfaState = unmarkedStates.Dequeue();
        // Candidate outgoing arcs of this DFA state, keyed by the next word they consume.
        var candidateArcs = new Dictionary <string, DfaArc>();
        foreach ((int arcIndex, NfaState nfaState) in GetArcIndices(dfaState))
        {
            WordGraphArc arc = Arcs[arcIndex];
            int nextWordIndex = nfaState.WordIndex + 1;
            DfaArc candidateArc = candidateArcs.GetOrCreate(arc.Words[nextWordIndex]);
            if (nextWordIndex == arc.Words.Count - 1)
            {
                // The next word is the arc's last word: the candidate reaches the arc's next state.
                candidateArc.NfaStates.Add(new NfaState(arc.NextState));
                Path path;
                if (dfaState.Paths.TryGetValue(nfaState.StateIndex, out Path prevPath))
                {
                    // Continue the path already recorded for this NFA state.
                    path = new Path(prevPath.StartState, prevPath.Arcs.Concat(arcIndex),
                        LogSpace.Multiply(prevPath.Score, arc.Score));
                }
                else
                {
                    // No recorded path: start a new one at the current DFA state.
                    path = new Path(dfaState.Index, new[] { arcIndex }, arc.Score);
                }
                // Keep only the highest-scoring path per reached state.
                if (!candidateArc.Paths.TryGetValue(arc.NextState, out Path otherPath)
                    || path.Score > otherPath.Score)
                {
                    candidateArc.Paths[arc.NextState] = path;
                }
            }
            else
            {
                // The next word is inside the arc: the candidate leads to a sub-state of the arc and
                // carries over the path recorded for the NFA state, if any.
                candidateArc.NfaStates.Add(new NfaState(nfaState.StateIndex, arcIndex, nextWordIndex));
                candidateArc.IsNextSubState = true;
                if (dfaState.Paths.TryGetValue(nfaState.StateIndex, out Path prevPath))
                {
                    candidateArc.Paths[nfaState.StateIndex] = prevPath;
                }
            }
        }
        foreach (DfaArc candidateArc in candidateArcs.Values)
        {
            if (!dfaStates.TryGetValue(candidateArc.NfaStates, out DfaState nextDfaState))
            {
                // Sub-state candidates reuse the current DFA state's index and are not registered in
                // dfaStates; all other candidates get a fresh index and are registered.
                int stateIndex = candidateArc.IsNextSubState ?
                    dfaState.Index : nextDfaStateIndex++;
                nextDfaState = new DfaState(stateIndex, candidateArc.NfaStates);
                if (candidateArc.IsNextSubState)
                {
                    foreach (KeyValuePair <int, Path> kvp in candidateArc.Paths)
                    {
                        nextDfaState.Paths.Add(kvp);
                    }
                }
                else
                {
                    dfaStates.Add(nextDfaState);
                }
                unmarkedStates.Enqueue(nextDfaState);
            }
            // The target is final when any of its non-sub-state NFA states is a final state of this graph.
            bool isFinal = nextDfaState.NfaStates.Where(s => !s.IsSubState)
                .Any(s => FinalStates.Contains(s.StateIndex));
            if ((isFinal || !candidateArc.IsNextSubState) && candidateArc.Paths.Count > 0)
            {
                Path bestPath = candidateArc.Paths.Values.MaxBy(p => p.Score);
                int curState = bestPath.StartState;
                for (int i = 0; i < bestPath.Arcs.Count; i++)
                {
                    WordGraphArc nfaArc = Arcs[bestPath.Arcs[i]];
                    // The last arc of the path ends at the target DFA state unless the candidate is a
                    // sub-state; every other arc ends at a freshly numbered intermediate state.
                    int nextState = !candidateArc.IsNextSubState && i == bestPath.Arcs.Count - 1
                        ? nextDfaState.Index : nextDfaStateIndex++;
                    dfaArcs.Add(new WordGraphArc(curState, nextState, nfaArc.Score, nfaArc.Words,
                        nfaArc.Alignment, nfaArc.SourceSegmentRange, nfaArc.IsUnknown,
                        nfaArc.WordConfidences));
                    curState = nextState;
                }
                if (isFinal)
                {
                    dfaFinalStates.Add(curState);
                }
            }
        }
    }
    return(new WordGraph(dfaArcs, dfaFinalStates, InitialStateScore));
}
/// <summary>
/// Determines whether an arc is excluded from the search.
/// </summary>
/// <param name="arc">The arc to check.</param>
/// <returns>
/// <c>true</c> when the arc is not unknown and at least one of its word confidences is below
/// <c>ConfidenceThreshold</c>; otherwise <c>false</c>.
/// </returns>
private bool IsArcPruned(WordGraphArc arc)
{
    // Unknown arcs are never pruned, whatever their confidences are.
    if (arc.IsUnknown)
    {
        return false;
    }

    return arc.WordConfidences.Any(confidence => confidence < ConfidenceThreshold);
}
/// <summary>
/// Takes hypotheses from <paramref name="queue"/> until <paramref name="n"/> complete hypotheses have been
/// collected or the queue is empty.
/// </summary>
/// <remarks>
/// A dequeued hypothesis is handled in one of three ways:
/// <list type="bullet">
/// <item>If it already ends in a final state, it is returned as is.</item>
/// <item>If <c>ConfidenceThreshold</c> is zero or less, it is completed with the best path to a final state.</item>
/// <item>Otherwise it is extended by every non-pruned outgoing arc and the extensions are re-enqueued. When
/// no arc can be enqueued and the hypothesis is not an empty state hypothesis, it is completed with the best
/// path to a final state instead.</item>
/// </list>
/// A hypothesis score contains the word graph part weighted by <c>WordGraphWeight</c>, so an extension
/// replaces the weighted rest score of the old last state with the weighted arc score plus the weighted
/// rest score of the new last state.
/// </remarks>
/// <param name="n">The maximum number of hypotheses to return.</param>
/// <param name="queue">The queue of candidate hypotheses; it is consumed and refilled by the search.</param>
/// <returns>The collected hypotheses, in the order in which they were completed.</returns>
private IEnumerable<Hypothesis> NBestSearch(int n, PriorityQueue<Hypothesis> queue)
{
    var nbest = new List<Hypothesis>();
    while (!queue.IsEmpty)
    {
        Hypothesis hypothesis = queue.Dequeue();
        int lastState = hypothesis.Arcs.Count == 0
            ? hypothesis.StartState
            : hypothesis.Arcs[hypothesis.Arcs.Count - 1].NextState;

        if (_wordGraph.FinalStates.Contains(lastState))
        {
            nbest.Add(hypothesis);
            if (nbest.Count == n)
            {
                break;
            }
        }
        else if (ConfidenceThreshold <= 0)
        {
            // The best path is produced final-state-first, so it is reversed before being appended.
            hypothesis.Arcs.AddRange(_wordGraph.GetBestPathFromFinalStateToState(lastState).Reverse());
            nbest.Add(hypothesis);
            if (nbest.Count == n)
            {
                break;
            }
        }
        else
        {
            // Score of the hypothesis without the estimate for the rest of the path.
            double score = hypothesis.Score - (WordGraphWeight * _restScores[lastState]);
            IReadOnlyList<int> arcIndices = _wordGraph.GetNextArcIndices(lastState);
            bool enqueuedArc = false;
            for (int i = 0; i < arcIndices.Count; i++)
            {
                int arcIndex = arcIndices[i];
                WordGraphArc arc = _wordGraph.Arcs[arcIndex];
                if (IsArcPruned(arc))
                {
                    continue;
                }

                // The dequeued hypothesis object is reused for the last outgoing arc; every earlier
                // extension works on a clone.
                Hypothesis newHypothesis = hypothesis;
                if (i < arcIndices.Count - 1)
                {
                    newHypothesis = newHypothesis.Clone();
                }

                // Both word graph terms are weighted, matching the weighted rest score removed above.
                newHypothesis.Score = score
                    + (WordGraphWeight * arc.Score)
                    + (WordGraphWeight * _restScores[arc.NextState]);
                newHypothesis.Arcs.Add(arc);
                queue.Enqueue(newHypothesis);
                enqueuedArc = true;
            }

            if (!enqueuedArc && (hypothesis.StartArcIndex != -1 || hypothesis.Arcs.Count > 0))
            {
                hypothesis.Arcs.AddRange(_wordGraph.GetBestPathFromFinalStateToState(lastState).Reverse());
                nbest.Add(hypothesis);
                if (nbest.Count == n)
                {
                    break;
                }
            }
        }
    }
    return nbest;
}