/// <summary>
/// Builds a <see cref="WordAlignmentMatrix"/> from the aligned word pairs of a parallel segment.
/// </summary>
/// <param name="segment">The segment that supplies the matrix dimensions and the aligned word pairs.</param>
/// <param name="isUnknown">
/// When <c>true</c>, every cell starts as <see cref="AlignmentType.Unknown"/> and, for each aligned pair, the cells
/// that are still unknown in that pair's target column and source row are set to
/// <see cref="AlignmentType.NotAligned"/>. When <c>false</c>, every cell starts as
/// <see cref="AlignmentType.NotAligned"/>.
/// </param>
/// <returns>The populated matrix, or <c>null</c> when <c>segment.AlignedWordPairs</c> is <c>null</c>.</returns>
public static WordAlignmentMatrix CreateAlignmentMatrix(this ParallelTextSegment segment, bool isUnknown = true)
{
    if (segment.AlignedWordPairs == null)
    {
        return null;
    }

    int rowCount = segment.SourceSegment.Count;
    int columnCount = segment.TargetSegment.Count;
    var initialState = isUnknown ? AlignmentType.Unknown : AlignmentType.NotAligned;
    var result = new WordAlignmentMatrix(rowCount, columnCount, initialState);

    foreach (AlignedWordPair pair in segment.AlignedWordPairs)
    {
        result[pair.SourceIndex, pair.TargetIndex] = AlignmentType.Aligned;

        // Without the "unknown" state there is nothing left to resolve for this pair.
        if (!isUnknown)
        {
            continue;
        }

        // Resolve the cells that are still unknown in the aligned target column...
        for (int row = 0; row < rowCount; row++)
        {
            if (result[row, pair.TargetIndex] == AlignmentType.Unknown)
            {
                result[row, pair.TargetIndex] = AlignmentType.NotAligned;
            }
        }

        // ...and then in the aligned source row.
        for (int column = 0; column < columnCount; column++)
        {
            if (result[pair.SourceIndex, column] == AlignmentType.Unknown)
            {
                result[pair.SourceIndex, column] = AlignmentType.NotAligned;
            }
        }
    }

    return result;
}
/// <summary>
/// Aligns a parallel segment with the given aligner and renders the result through
/// <c>ToGizaFormat</c>.
/// </summary>
/// <param name="aligner">The aligner asked for the best alignment.</param>
/// <param name="segment">The segment whose source and target sides are aligned.</param>
/// <param name="sourcePreprocessor">Optional preprocessor passed to <c>Preprocess</c> for the source side.</param>
/// <param name="targetPreprocessor">Optional preprocessor passed to <c>Preprocess</c> for the target side.</param>
/// <returns>The string produced by <c>ToGizaFormat</c> for the preprocessed source and target.</returns>
public static string GetGizaFormatString(this ISegmentAligner aligner, ParallelTextSegment segment,
    Func<string, string> sourcePreprocessor = null, Func<string, string> targetPreprocessor = null)
{
    IReadOnlyList<string> sourceTokens = segment.SourceSegment.Preprocess(sourcePreprocessor);
    IReadOnlyList<string> targetTokens = segment.TargetSegment.Preprocess(targetPreprocessor);

    // The matrix built from the segment's own word pairs is passed along to the aligner.
    WordAlignmentMatrix hintMatrix = segment.CreateAlignmentMatrix();
    WordAlignmentMatrix bestAlignment = aligner.GetBestAlignment(sourceTokens, targetTokens, hintMatrix);

    return bestAlignment.ToGizaFormat(sourceTokens, targetTokens);
}
/// <summary>
/// Adds the preprocessed source and target sides of a parallel segment to the model.
/// Does nothing when <c>segment.IsEmpty</c> is <c>true</c>.
/// </summary>
/// <param name="model">The word alignment model that receives the segment pair.</param>
/// <param name="segment">The parallel segment to add.</param>
/// <param name="sourcePreprocessor">Optional preprocessor passed to <c>Preprocess</c> for the source side.</param>
/// <param name="targetPreprocessor">Optional preprocessor passed to <c>Preprocess</c> for the target side.</param>
public static void AddSegmentPair(this IWordAlignmentModel model, ParallelTextSegment segment,
    Func<string, string> sourcePreprocessor = null, Func<string, string> targetPreprocessor = null)
{
    if (!segment.IsEmpty)
    {
        IReadOnlyList<string> sourceTokens = segment.SourceSegment.Preprocess(sourcePreprocessor);
        IReadOnlyList<string> targetTokens = segment.TargetSegment.Preprocess(targetPreprocessor);
        model.AddSegmentPair(sourceTokens, targetTokens);
    }
}
/// <summary>
/// Adds the preprocessed source and target sides of a parallel segment to the model, together with the
/// alignment matrix built from the segment's aligned word pairs. Does nothing when <c>segment.IsEmpty</c>
/// is <c>true</c>.
/// </summary>
/// <param name="model">The word alignment model that receives the segment pair.</param>
/// <param name="segment">The parallel segment to add.</param>
/// <param name="sourcePreprocessor">Optional preprocessor passed to <c>Preprocess</c> for the source side.</param>
/// <param name="targetPreprocessor">Optional preprocessor passed to <c>Preprocess</c> for the target side.</param>
/// <param name="isUnknown">Forwarded unchanged to <c>CreateAlignmentMatrix</c>.</param>
public static void AddSegmentPair(this IWordAlignmentModel model, ParallelTextSegment segment,
    Func<string, string> sourcePreprocessor = null, Func<string, string> targetPreprocessor = null,
    bool isUnknown = true)
{
    if (!segment.IsEmpty)
    {
        IReadOnlyList<string> source = segment.SourceSegment.Preprocess(sourcePreprocessor);
        IReadOnlyList<string> target = segment.TargetSegment.Preprocess(targetPreprocessor);
        WordAlignmentMatrix hintMatrix = segment.CreateAlignmentMatrix(isUnknown);
        model.AddSegmentPair(source, target, hintMatrix);
    }
}
/// <summary>
/// Aligns a parallel segment with the model and returns the alignment as a string.
/// </summary>
/// <param name="model">The model asked for the best alignment; also passed to <c>ToString</c> when
/// <paramref name="includeProbs"/> is <c>true</c>.</param>
/// <param name="segment">The segment whose source and target sides are aligned.</param>
/// <param name="includeProbs">
/// When <c>true</c>, the <c>ToString(model, source, target)</c> overload is used; otherwise the parameterless
/// <c>ToString()</c> is used.
/// </param>
/// <param name="sourcePreprocessor">Optional preprocessor passed to <c>Preprocess</c> for the source side.</param>
/// <param name="targetPreprocessor">Optional preprocessor passed to <c>Preprocess</c> for the target side.</param>
/// <returns>The string form of the best alignment.</returns>
public static string GetAlignmentString(this IWordAlignmentModel model, ParallelTextSegment segment,
    bool includeProbs, Func<string, string> sourcePreprocessor = null,
    Func<string, string> targetPreprocessor = null)
{
    IReadOnlyList<string> sourceTokens = segment.SourceSegment.Preprocess(sourcePreprocessor);
    IReadOnlyList<string> targetTokens = segment.TargetSegment.Preprocess(targetPreprocessor);
    WordAlignmentMatrix hintMatrix = segment.CreateAlignmentMatrix();
    WordAlignmentMatrix bestAlignment = model.GetBestAlignment(sourceTokens, targetTokens, hintMatrix);

    return includeProbs
        ? bestAlignment.ToString(model, sourceTokens, targetTokens)
        : bestAlignment.ToString();
}
/// <summary>
/// Builds a <see cref="WordAlignmentMatrix"/> in which the cell of every aligned word pair of the segment
/// is set to <c>true</c>.
/// </summary>
/// <param name="segment">The segment that supplies the matrix dimensions and the aligned word pairs.</param>
/// <returns>The populated matrix, or <c>null</c> when <c>segment.AlignedWordPairs</c> is <c>null</c>.</returns>
public static WordAlignmentMatrix CreateAlignmentMatrix(this ParallelTextSegment segment)
{
    if (segment.AlignedWordPairs == null)
    {
        return null;
    }

    int rowCount = segment.SourceSegment.Count;
    int columnCount = segment.TargetSegment.Count;
    var result = new WordAlignmentMatrix(rowCount, columnCount);

    foreach (AlignedWordPair pair in segment.AlignedWordPairs)
    {
        result[pair.SourceIndex, pair.TargetIndex] = true;
    }

    return result;
}
/// <summary>
/// Checks that a segment is non-empty and that neither its source nor its target side has a
/// <c>Count</c> above <c>TranslationConstants.MaxSegmentLength</c>.
/// </summary>
/// <param name="segment">The segment to check.</param>
/// <returns><c>true</c> when all three conditions hold; otherwise <c>false</c>.</returns>
private static bool IsSegmentValid(ParallelTextSegment segment)
{
    if (segment.IsEmpty)
    {
        return false;
    }

    if (segment.SourceSegment.Count > TranslationConstants.MaxSegmentLength)
    {
        return false;
    }

    return segment.TargetSegment.Count <= TranslationConstants.MaxSegmentLength;
}
/// <summary>
/// Drives one interactive translation session for a segment: on every step it either accepts suggestion
/// words that match the target segment or appends the next character of the current target word, until
/// the session prefix covers the whole target segment. Action, suggestion and character counters on this
/// instance are updated along the way, and a trace is written when a writer is supplied.
/// </summary>
/// <param name="engine">Engine used to open the interactive translation session.</param>
/// <param name="suggester">Produces the suggestions evaluated on every step.</param>
/// <param name="n">Passed straight to <c>TranslateInteractively</c>; presumably the number of results the
/// session should track (confirm against the engine).</param>
/// <param name="segment">The parallel segment; both sides are preprocessed with
/// <c>Preprocessors.Lowercase</c> before use.</param>
/// <param name="traceWriter">Optional trace output; the direct writes here are skipped when it is <c>null</c>.</param>
private void TestSegment(IInteractiveSmtEngine engine, ITranslationSuggester suggester, int n, ParallelTextSegment segment, StreamWriter traceWriter)
{
    traceWriter?.WriteLine($"Segment: {segment.SegmentRef}");
    IReadOnlyList <string> sourceSegment = segment.SourceSegment.Preprocess(Preprocessors.Lowercase);
    traceWriter?.WriteLine($"Source: {string.Join(" ", sourceSegment)}");
    IReadOnlyList <string> targetSegment = segment.TargetSegment.Preprocess(Preprocessors.Lowercase);
    traceWriter?.WriteLine($"Target: {string.Join(" ", targetSegment)}");
    traceWriter?.WriteLine(new string('=', 120));
    // State carried across loop iterations: the previous step's suggestion words, whether the last
    // action was an accepted suggestion, and the outcome label that is pending for the trace.
    string[][] prevSuggestionWords = null;
    bool isLastWordSuggestion = false;
    string suggestionResult = null;
    using (IInteractiveTranslationSession session = engine.TranslateInteractively(n, sourceSegment))
    {
        // Keep going until the prefix has as many words as the target and its last word is complete.
        while (session.Prefix.Count < targetSegment.Count || !session.IsLastWordComplete)
        {
            // Index of the target word being worked on; an incomplete last prefix word is still "current".
            int targetIndex = session.Prefix.Count;
            if (!session.IsLastWordComplete)
            {
                targetIndex--;
            }
            bool match = false;
            TranslationSuggestion[] suggestions = suggester.GetSuggestions(session).ToArray();
            // For suggestion k, look up its target word indices in session.CurrentResults[k].TargetSegment.
            string[][] suggestionWords = suggestions.Select((s, k) => s.TargetWordIndices.Select(j => session.CurrentResults[k].TargetSegment[j]).ToArray()).ToArray();
            // Trace and count only when the suggestions differ from those of the previous step.
            if (prevSuggestionWords == null || !SuggestionsAreEqual(prevSuggestionWords, suggestionWords))
            {
                WritePrefix(traceWriter, suggestionResult, session.Prefix);
                WriteSuggestions(traceWriter, session, suggestions);
                suggestionResult = null;
                if (suggestions.Any(s => s.TargetWordIndices.Count > 0))
                {
                    _totalSuggestionCount++;
                }
            }
            // Try the suggestions in order; the first one with at least one matching word is accepted.
            for (int k = 0; k < suggestions.Length; k++)
            {
                TranslationSuggestion suggestion = suggestions[k];
                var accepted = new List <int>();
                // Collect a run of suggestion words that match consecutive target words starting at
                // targetIndex. Non-matching words before the first match are skipped; the first mismatch
                // after a match ends the run.
                for (int i = 0, j = targetIndex; i < suggestionWords[k].Length && j < targetSegment.Count; i++)
                {
                    if (suggestionWords[k][i] == targetSegment[j])
                    {
                        accepted.Add(suggestion.TargetWordIndices[i]);
                        j++;
                    }
                    else if (accepted.Count == 0)
                    {
                        j = targetIndex;
                    }
                    else
                    {
                        break;
                    }
                }
                if (accepted.Count > 0)
                { 
                    session.AppendSuggestionToPrefix(k, accepted);
                    isLastWordSuggestion = true;
                    _actionCount++;
                    _totalAcceptedSuggestionCount++;
                    // Classify the acceptance: whole suggestion, a run starting at its first word,
                    // a run ending at its last word, or a run strictly inside it.
                    if (accepted.Count == suggestion.TargetWordIndices.Count)
                    {
                        suggestionResult = "ACCEPT_FULL";
                        _fullSuggestionCount++;
                    }
                    else if (accepted[0] == suggestion.TargetWordIndices[0])
                    {
                        suggestionResult = "ACCEPT_INIT";
                        _initSuggestionCount++;
                    }
                    else if (accepted[accepted.Count - 1] == suggestion.TargetWordIndices[suggestion.TargetWordIndices.Count - 1])
                    {
                        suggestionResult = "ACCEPT_FIN";
                        _finalSuggestionCount++;
                    }
                    else
                    {
                        suggestionResult = "ACCEPT_MID";
                        _middleSuggestionCount++;
                    }
                    _acceptedSuggestionCounts[k]++;
                    match = true;
                    break;
                }
            }
            // No suggestion matched: advance the prefix by hand instead.
            if (!match)
            {
                // The previous step accepted a suggestion, so one extra action is counted here
                // (NOTE(review): presumably the keystroke that follows an accepted suggestion - confirm).
                if (isLastWordSuggestion)
                {
                    _actionCount++;
                    isLastWordSuggestion = false;
                    WritePrefix(traceWriter, suggestionResult, session.Prefix);
                    suggestionResult = null;
                }
                // Number of characters already present in the incomplete last prefix word (0 when complete).
                int len = session.IsLastWordComplete ? 0 : session.Prefix[session.Prefix.Count - 1].Length;
                string targetWord = targetSegment[targetIndex];
                if (len == targetWord.Length)
                {
                    // The whole word is already in the prefix; append nothing, passing true as the second
                    // argument (NOTE(review): presumably marks the last word as complete - confirm).
                    session.AppendToPrefix("", true);
                }
                else
                {
                    // Append the next single character of the target word.
                    string c = targetWord.Substring(len, 1);
                    session.AppendToPrefix(c, false);
                }
                suggestionResult = suggestions.Any(s => s.TargetWordIndices.Count > 0) ? "REJECT" : "NONE";
                _actionCount++;
            }
            prevSuggestionWords = suggestionWords;
        }
        WritePrefix(traceWriter, suggestionResult, session.Prefix);
        session.Approve(_approveAlignedOption.HasValue());
    }
    // Adds each target word's length plus one (NOTE(review): the +1 presumably stands for a separator).
    _charCount += targetSegment.Sum(w => w.Length + 1);
    traceWriter?.WriteLine();
}
/// <summary>
/// Preprocesses both sides of a parallel segment and asks the aligner for the best alignment, passing
/// along the alignment matrix built from the segment's aligned word pairs.
/// </summary>
/// <param name="aligner">The aligner asked for the best alignment.</param>
/// <param name="segment">The segment whose source and target sides are aligned.</param>
/// <param name="sourcePreprocessor">Optional preprocessor passed to <c>Preprocess</c> for the source side.</param>
/// <param name="targetPreprocessor">Optional preprocessor passed to <c>Preprocess</c> for the target side.</param>
/// <param name="isUnknown">Forwarded unchanged to <c>CreateAlignmentMatrix</c>.</param>
/// <returns>The matrix returned by the aligner.</returns>
public static WordAlignmentMatrix GetBestAlignment(this ISegmentAligner aligner, ParallelTextSegment segment,
    Func<string, string> sourcePreprocessor = null, Func<string, string> targetPreprocessor = null,
    bool isUnknown = true)
{
    IReadOnlyList<string> source = segment.SourceSegment.Preprocess(sourcePreprocessor);
    IReadOnlyList<string> target = segment.TargetSegment.Preprocess(targetPreprocessor);
    WordAlignmentMatrix hintMatrix = segment.CreateAlignmentMatrix(isUnknown);

    return aligner.GetBestAlignment(source, target, hintMatrix);
}