/// <summary>
/// Orders the given substitution candidates for the ranking task. Unlike <c>GetSubstitutions</c>, no candidate
/// is filtered out here: every (distinct) candidate is scored, ranked per metric, and the candidates are returned
/// sorted by their average rank over the active metrics ("context-sim" and "ic-diff").
/// </summary>
/// <param name="document">The document whose text is to be simplified</param>
/// <param name="substitutionCandidates">Substitution candidates. An entry may hold several comma-separated alternatives
/// or several words; it is then reduced to a single word for scoring, but the original entry is what gets returned.</param>
/// <param name="target">Target word; the last token of the document with exactly this text is used</param>
/// <param name="contextSize">The size of the context of the target word to be compared semantically with candidate replacements</param>
/// <returns>The candidates ordered by ascending average rank. Duplicate entries appear only once.</returns>
public List<string> OrderGivenSubstitutionCandidates(Document document, List<string> substitutionCandidates, string target, int contextSize)
{
    // Only these features take part in the final ranking. The other features ("sim", "length", "lm-*") are still
    // computed and stored per candidate so that they can be switched on by adding them to this list.
    List<string> metrics = new List<string> { "context-sim", "ic-diff" };

    var allTokens = document.AllTokens;
    var targetToken = allTokens.Where(t => t.Text == target).Last();
    var targetTokenIndex = allTokens.IndexOf(targetToken);
    var targetText = targetToken.Text.ToLower().Trim();

    // NOTE(review): the position of each token in AllTokens is compared with targetToken.SentenceIndex, which
    // presumably only lines up when the document consists of a single sentence - confirm against the callers.
    var targetContextTokens = allTokens.Where(t =>
    {
        var distance = Math.Abs(allTokens.IndexOf(t) - targetToken.SentenceIndex);
        return distance > 0 && distance <= contextSize && t.IsContent();
    }).ToList();

    // Lower-cased text of the token at the given offset from the target token.
    Func<int, string> neighbourText = offset => allTokens[targetTokenIndex + offset].Text.ToLower();

    var scores = new Dictionary<string, Dictionary<string, double>>();
    foreach (var candidate in substitutionCandidates)
    {
        // A repeated candidate would get exactly the same scores; skip it instead of failing on the duplicate key.
        if (scores.ContainsKey(candidate))
        {
            continue;
        }

        var candidateText = candidate;

        // Several comma-separated alternatives: keep the one with the highest relative information content.
        if (candidateText.Contains(","))
        {
            string selected = string.Empty;
            double maxIC = double.MinValue;
            foreach (var alternative in candidateText.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries))
            {
                var trimmedAlternative = alternative.Trim();
                var alternativeIC = InformationContent.GetRelativeInformationContent(trimmedAlternative.ToLower());
                if (alternativeIC > maxIC)
                {
                    selected = trimmedAlternative;
                    maxIC = alternativeIC;
                }
            }

            candidateText = selected;
        }

        // Multi-word candidate: represent it by its last content word, or by its first token if it has no content word.
        var trimmedCandidate = candidateText.Trim();
        if (trimmedCandidate.Contains(" "))
        {
            var tokens = (new EngPOSTagger()).Annotate(trimmedCandidate).ToList();
            var contentTokens = tokens.Where(x => ((TokenAnnotation)x).IsContent()).ToList();
            candidateText = contentTokens.Count > 0
                ? ((TokenAnnotation)contentTokens.Last()).Text.Trim()
                : ((TokenAnnotation)tokens.First()).Text.Trim();
        }

        var candidateLower = candidateText.ToLower();
        var candidateIC = InformationContent.GetRelativeInformationContent(candidateLower);

        // Average similarity between the candidate and the content words around the target.
        // Similarities below -1 are discarded; with nothing left the feature falls back to 0.
        var candidateContextSimilarities = targetContextTokens
            .Select(x => VectorSpace.Similarity(x.Lemma.ToLower(), candidateLower))
            .Where(x => x >= -1)
            .ToList();
        var candidateContextSimilarity = candidateContextSimilarities.Count > 0 ? candidateContextSimilarities.Average() : 0;

        var features = new Dictionary<string, double>();
        scores.Add(candidate, features);

        var sim = VectorSpace.Similarity(targetText, candidateLower.Trim());
        if (sim < 1)
        {
            features.Add("sim", sim);
        }

        // Despite its name, "ic-diff" holds the candidate's own information content in this method.
        features.Add("ic-diff", candidateIC);
        features.Add("context-sim", candidateContextSimilarity);
        features.Add("length", candidateText.Length);

        // Language-model features: -100 when the model returns no score, 0 when the required neighbours do not exist.
        // bigram LM
        if (targetTokenIndex > 0)
        {
            var lmScore = EnglishLanguageModel.Instance.GetBigramLMScore(neighbourText(-1), candidateText);
            features.Add("lm-bigram-pre", lmScore.HasValue ? lmScore.Value : -100);
        }
        else
        {
            features.Add("lm-bigram-pre", 0);
        }

        if (targetTokenIndex < allTokens.Count - 1)
        {
            var lmScore = EnglishLanguageModel.Instance.GetBigramLMScore(candidateText, neighbourText(1));
            features.Add("lm-bigram-post", lmScore.HasValue ? lmScore.Value : -100);
        }
        else
        {
            // Must be the "post" key: "lm-bigram-pre" has already been added above.
            features.Add("lm-bigram-post", 0);
        }

        // trigram LM
        if (targetTokenIndex > 1)
        {
            var lmScore = EnglishLanguageModel.Instance.GetTrigramLMScore(neighbourText(-2), neighbourText(-1), candidateText);
            features.Add("lm-trigram-pre", lmScore.HasValue ? lmScore.Value : -100);
        }
        else
        {
            features.Add("lm-trigram-pre", 0);
        }

        if (targetTokenIndex < allTokens.Count - 2)
        {
            var lmScore = EnglishLanguageModel.Instance.GetTrigramLMScore(candidateText, neighbourText(1), neighbourText(2));
            features.Add("lm-trigram-post", lmScore.HasValue ? lmScore.Value : -100);
        }
        else
        {
            features.Add("lm-trigram-post", 0);
        }
    }

    var allCandidates = scores.Keys.ToList();

    LastSubstitutionCandidates = new List<Tuple<TokenAnnotation, List<string>>>();
    LastSubstitutionCandidates.Add(new Tuple<TokenAnnotation, List<string>>(targetToken, allCandidates.ToList()));

    // One rank table per metric; a candidate lacking a metric is simply absent from that table.
    var allRanks = new List<Dictionary<string, int>>();
    foreach (var metric in metrics)
    {
        var featDict = scores.Where(x => x.Value.ContainsKey(metric)).ToDictionary(x => x.Key, x => x.Value[metric]);
        allRanks.Add(TrainingExample.RankExamplesByNumericFeature(featDict, metric == "length" || metric == "ic-diff"));
    }

    Dictionary<string, double> averageRankings = allCandidates.ToDictionary(
        c => c,
        c => allRanks.Where(r => r.ContainsKey(c)).Select(r => r[c]).Average());

    return averageRankings.OrderBy(r => r.Value).Select(r => r.Key).ToList();
}
/// <summary>
/// Chooses the substitutions for target words. For every target, the candidates that pass all prerequisites are
/// ranked according to several features (similarity with the target word, information content reduction,
/// similarity with context words, language-model scores) and the candidate with the best average rank is selected.
/// </summary>
/// <param name="document">The document whose text is to be simplified</param>
/// <param name="substCandidates">All the pairs of target words and collected candidate replacements (candidate text and its similarity to the target)</param>
/// <param name="contextSize">The size of the context of the target word to be considered for measuring the similarity between candidate words and target word context</param>
/// <param name="noSubstitutionWords">Stopwords, never to be considered for simplification (matched against the lower-cased target text)</param>
/// <param name="similarityTreshold">The threshold for semantic similarity between target word and candidate replacement</param>
/// <param name="icReplacementCandidateTreshold">Information content threshold the target lemma must exceed for the target word to be replaced</param>
/// <param name="word">When not null, only targets whose lower-cased text equals this value are processed</param>
/// <returns>The list of substitutions (tuple of target token and candidate replacement word)</returns>
public List<Tuple<TokenAnnotation, string>> GetSubstitutions(Document document, List<Tuple<TokenAnnotation, List<Tuple<string, double>>>> substCandidates, int contextSize, List<string> noSubstitutionWords, double similarityTreshold, double icReplacementCandidateTreshold, string word = null)
{
    EngMorphology morph = new EngMorphology();
    List<Tuple<TokenAnnotation, string>> substitutions = new List<Tuple<TokenAnnotation, string>>();
    List<string> metrics = new List<string> { "sim", "ic-diff", "context-sim", "lm-bigram-pre", "lm-bigram-post", "lm-trigram-pre", "lm-trigram-post" };

    // True when one of the two strings contains the other.
    Func<string, string, bool> overlaps = (a, b) => a.Contains(b) || b.Contains(a);

    foreach (var sc in substCandidates)
    {
        var targetToken = sc.Item1;
        var targetTextLower = targetToken.Text.ToLower();
        if (noSubstitutionWords.Contains(targetTextLower))
        {
            continue;
        }

        if (word != null && targetTextLower != word)
        {
            continue;
        }

        var scores = new Dictionary<string, Dictionary<string, double>>();

        // Locate the sentence holding the target and the sentence's own token object for it.
        var sentence = document.Sentences.Single(s => s.Tokens.Any(t => t.StartPosition == targetToken.StartPosition && t.Text == targetToken.Text));
        var sentenceTokens = sentence.Tokens;
        var targetTokenCopy = sentenceTokens.Single(t => t.StartPosition == targetToken.StartPosition && t.Text == targetToken.Text);
        var tokenIndex = sentenceTokens.IndexOf(targetTokenCopy);

        var preceedingSentencePart = sentence.Text.Substring(0, targetTokenCopy.StartPositionSentence);
        var followingSentencePart = sentence.Text.Substring(targetTokenCopy.StartPositionSentence + targetTokenCopy.Text.Length);

        var targetLemmaIC = InformationContent.GetRelativeInformationContent(targetToken.Lemma.ToLower());
        var targetWordIC = InformationContent.GetRelativeInformationContent(targetToken.Text.ToLower());
        // A word-form IC of exactly 1 makes the code fall back to the lemma's IC.
        var targetIC = targetWordIC == 1 ? targetLemmaIC : targetWordIC;

        var targetContextTokens = sentenceTokens.Where(t =>
        {
            var distance = Math.Abs(sentenceTokens.IndexOf(t) - targetTokenCopy.SentenceIndex);
            return distance > 0 && distance <= contextSize && t.IsContent();
        }).ToList();

        // Context similarity of the target itself; used as the fallback for candidates without any usable similarity.
        var targetCtxtSimilarities = targetContextTokens
            .Select(x => VectorSpace.Similarity(x.Lemma.ToLower(), targetToken.Lemma.ToLower()))
            .Where(x => x >= -1)
            .ToList();
        var targetContextSimilarity = targetCtxtSimilarities.Count > 0 ? targetCtxtSimilarities.Average() : 0;

        // Lower-cased text of the sentence token at the given offset from the target token.
        Func<int, string> neighbourText = offset => sentenceTokens[tokenIndex + offset].Text.ToLower();

        // The target is only replaceable when its lemma IC exceeds the threshold, so no candidate can qualify otherwise.
        if (sc.Item2 != null && targetLemmaIC > icReplacementCandidateTreshold)
        {
            foreach (var candidate in sc.Item2)
            {
                try
                {
                    var candidateLemmaIC = InformationContent.GetRelativeInformationContent(candidate.Item1.ToLower());

                    // Use the form recorded for the target's POS tag when the lookup has one, else the candidate as given.
                    string key = candidate.Item1 + "<->" + targetToken.POSTag;
                    var candidateInPOS = CandidateInPoSLookup.ContainsKey(key) ? CandidateInPoSLookup[key] : candidate.Item1;
                    if (string.IsNullOrEmpty(candidateInPOS))
                    {
                        continue;
                    }

                    // First prerequisite: the candidate must carry less information content than the target.
                    var candidateWordIC = InformationContent.GetRelativeInformationContent(candidateInPOS.ToLower());
                    var candidateIC = candidateWordIC == 1 ? candidateLemmaIC : candidateWordIC;
                    if (!(candidateIC < targetIC))
                    {
                        continue;
                    }

                    // Cheap prerequisites are checked before the artificial sentence is tagged.
                    if (!(candidate.Item2 >= similarityTreshold) || scores.ContainsKey(candidateInPOS))
                    {
                        continue;
                    }

                    bool sameWord = overlaps(candidate.Item1, targetToken.Text)
                        || overlaps(candidate.Item1, targetToken.Lemma)
                        || overlaps(candidateInPOS, targetToken.Text)
                        || overlaps(candidateInPOS, targetToken.Lemma);
                    if (sameWord)
                    {
                        continue;
                    }

                    bool sameAsContext = targetContextTokens.Any(ct =>
                        overlaps(candidate.Item1, ct.Text)
                        || overlaps(candidate.Item1, ct.Lemma)
                        || overlaps(candidateInPOS, ct.Text)
                        || overlaps(candidateInPOS, ct.Lemma));
                    if (sameAsContext)
                    {
                        continue;
                    }

                    // POS-tag compatibility is a second prerequisite: tag the sentence with the candidate put in
                    // place of the target and require the token at the target's position to keep the target's tag.
                    var artificialSentence = preceedingSentencePart + candidateInPOS + followingSentencePart;
                    var artTokens = (new EngPOSTagger()).Annotate(artificialSentence).Select(x => (TokenAnnotation)x).ToList();
                    morph.AnnotateMorphology(artTokens);
                    var candidateToken = artTokens.Single(x => x.StartPositionSentence == targetTokenCopy.StartPositionSentence);
                    if (candidateToken.POSTag != targetToken.POSTag)
                    {
                        continue;
                    }

                    var candidateContextSimilarities = targetContextTokens
                        .Select(x => VectorSpace.Similarity(x.Lemma.ToLower(), candidateToken.Lemma.ToLower()))
                        .Where(x => x >= -1)
                        .ToList();
                    var candidateContextSimilarity = candidateContextSimilarities.Count > 0 ? candidateContextSimilarities.Average() : targetContextSimilarity;

                    var features = new Dictionary<string, double>();
                    features.Add("sim", candidate.Item2);
                    features.Add("ic-diff", targetIC - candidateIC);
                    features.Add("context-sim", candidateContextSimilarity);
                    features.Add("length", candidateInPOS.Length);

                    // Language-model features: -100 when the model returns no score, 0 when the required neighbours do not exist.
                    // bigram LM
                    if (tokenIndex > 0)
                    {
                        var lmScore = EnglishLanguageModel.Instance.GetBigramLMScore(neighbourText(-1), candidateInPOS);
                        features.Add("lm-bigram-pre", lmScore.HasValue ? lmScore.Value : -100);
                    }
                    else
                    {
                        features.Add("lm-bigram-pre", 0);
                    }

                    if (tokenIndex < sentenceTokens.Count - 1)
                    {
                        var lmScore = EnglishLanguageModel.Instance.GetBigramLMScore(candidateInPOS, neighbourText(1));
                        features.Add("lm-bigram-post", lmScore.HasValue ? lmScore.Value : -100);
                    }
                    else
                    {
                        features.Add("lm-bigram-post", 0);
                    }

                    // trigram LM
                    if (tokenIndex > 1)
                    {
                        var lmScore = EnglishLanguageModel.Instance.GetTrigramLMScore(neighbourText(-2), neighbourText(-1), candidateInPOS);
                        features.Add("lm-trigram-pre", lmScore.HasValue ? lmScore.Value : -100);
                    }
                    else
                    {
                        features.Add("lm-trigram-pre", 0);
                    }

                    if (tokenIndex < sentenceTokens.Count - 2)
                    {
                        var lmScore = EnglishLanguageModel.Instance.GetTrigramLMScore(candidateInPOS, neighbourText(1), neighbourText(2));
                        features.Add("lm-trigram-post", lmScore.HasValue ? lmScore.Value : -100);
                    }
                    else
                    {
                        features.Add("lm-trigram-post", 0);
                    }

                    // Register the candidate only once every feature is present, so that a failure above can
                    // never leave a partially filled entry behind for the ranking step.
                    scores.Add(candidateInPOS, features);
                }
                catch (Exception ex)
                {
                    // Best effort: a candidate that cannot be processed is skipped, but the reason is no longer lost.
                    System.Diagnostics.Debug.WriteLine("GetSubstitutions: skipping candidate '" + (candidate != null ? candidate.Item1 : "<null>") + "': " + ex.Message);
                }
            }
        }

        // NOTE(review): the list is re-created for every target, so after the call it only describes the last
        // processed target - confirm whether it was meant to accumulate all targets of one call.
        LastSubstitutionCandidates = new List<Tuple<TokenAnnotation, List<string>>>();
        LastSubstitutionCandidates.Add(new Tuple<TokenAnnotation, List<string>>(targetToken, scores.Keys.ToList()));

        if (scores.Count == 0)
        {
            continue;
        }

        var allRanks = new List<Dictionary<string, int>>();
        foreach (var metric in metrics)
        {
            var featDict = scores.ToDictionary(x => x.Key, x => x.Value[metric]);
            allRanks.Add(TrainingExample.RankExamplesByNumericFeature(featDict, metric == "length"));
        }

        Dictionary<string, double> averageRankings = scores.Keys.ToDictionary(c => c, c => allRanks.Select(r => r[c]).Average());
        var finalRanking = averageRankings.OrderBy(r => r.Value).ToList();

        // Candidates sharing the best average rank are tie-broken by similarity to the target;
        // the earliest of them in ranking order wins when the similarity is equal as well.
        double topScore = finalRanking[0].Value;
        string finalChoice = finalRanking[0].Key;
        foreach (var ranked in finalRanking.Skip(1))
        {
            if (ranked.Value == topScore && scores[ranked.Key]["sim"] > scores[finalChoice]["sim"])
            {
                finalChoice = ranked.Key;
            }
        }

        substitutions.Add(new Tuple<TokenAnnotation, string>(targetToken, finalChoice));
    }

    return substitutions;
}