Ejemplo n.º 1
0
        /// <summary>
        /// Orders the given substitution candidates for the ranking task. Every candidate is scored on a set of
        /// features, ranked separately per active metric ("context-sim" and "ic-diff"), and the candidates are
        /// returned sorted by their average rank over those metrics.
        /// As a side effect, <c>LastSubstitutionCandidates</c> is replaced with a single entry holding the target
        /// token and all scored candidates.
        /// </summary>
        /// <param name="document">The document whose text is to be simplified</param>
        /// <param name="substitutionCandidates">Substitution candidates. An entry containing commas is treated as a list of alternatives (the alternative with the highest relative information content is scored), and a multi-word entry is scored by a single one of its words; the returned list always contains the original entries.</param>
        /// <param name="target">Target word; the last document token whose text equals this value is used</param>
        /// <param name="contextSize">The size of the context of the target word to be compared semantically with candidate replacements</param>
        /// <returns>The original candidate strings, ordered by ascending average rank</returns>
        /// <exception cref="InvalidOperationException">No token of the document has a text equal to <paramref name="target"/>.</exception>
        /// <exception cref="ArgumentException"><paramref name="substitutionCandidates"/> contains the same entry more than once.</exception>
        public List<string> OrderGivenSubstitutionCandidates(Document document, List<string> substitutionCandidates, string target, int contextSize)
        {
            // NOTE(review): this instance is never used below. It is kept only in case the EngMorphology
            // constructor performs initialization that other code relies on — confirm and remove.
            EngMorphology morph = new EngMorphology();

            // Only these features take part in the final ranking. The other features computed below
            // ("sim", "length", "lm-*") are stored so that they can be enabled by adding them here.
            List<string> metrics = new List<string> { "context-sim", "ic-diff" };

            var targetToken = document.AllTokens.Where(t => t.Text == target).Last();

            // Position of the target in the document; it does not depend on the candidate, so look it up once.
            var tokenIndex = document.AllTokens.IndexOf(targetToken);

            // Content tokens at most contextSize positions away from the target (the target itself excluded).
            // NOTE(review): this compares a position in document.AllTokens with targetToken.SentenceIndex;
            // presumably the two only agree for single-sentence documents — confirm.
            var targetContextTokens = document.AllTokens.Where(t =>
            {
                var distance = Math.Abs(document.AllTokens.IndexOf(t) - targetToken.SentenceIndex);
                return distance > 0 && distance <= contextSize && t.IsContent();
            }).ToList();

            Dictionary<string, Dictionary<string, double>> scores = new Dictionary<string, Dictionary<string, double>>();

            foreach (var candidate in substitutionCandidates)
            {
                var candidateText = candidate;

                // A comma-separated entry lists alternatives: score the one with the highest relative IC.
                if (candidateText.Contains(","))
                {
                    string selected = string.Empty;
                    double maxIC = double.MinValue;
                    foreach (var alternative in candidateText.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries))
                    {
                        var alternativeIC = InformationContent.GetRelativeInformationContent(alternative.Trim().ToLower());
                        if (alternativeIC > maxIC)
                        {
                            selected = alternative.Trim();
                            maxIC = alternativeIC;
                        }
                    }
                    candidateText = selected;
                }

                // A multi-word entry is represented by its last content word, or by its first token
                // when it contains no content word at all.
                if (candidateText.Trim().Contains(" "))
                {
                    var tokens = (new EngPOSTagger()).Annotate(candidateText.Trim()).ToList();
                    var contentTokens = tokens.Where(x => ((TokenAnnotation)x).IsContent()).ToList();

                    candidateText = contentTokens.Count > 0
                        ? ((TokenAnnotation)contentTokens.Last()).Text.Trim()
                        : ((TokenAnnotation)tokens.First()).Text.Trim();
                }

                // The previous code looked this value up twice (as "lemma" and as "word" IC) with the same
                // argument and then picked one of the two identical results; a single lookup is equivalent.
                var candidateIC = InformationContent.GetRelativeInformationContent(candidateText.ToLower());

                var candidateContextSimilarities = targetContextTokens
                    .Select(x => VectorSpace.Similarity(x.Lemma.ToLower(), candidateText.ToLower()))
                    .Where(x => x >= -1)
                    .ToList();
                var candidateContextSimilarity = candidateContextSimilarities.Count > 0 ? candidateContextSimilarities.Average() : 0;

                // Keyed by the ORIGINAL candidate string, so the caller gets back what it passed in.
                var candidateScores = new Dictionary<string, double>();
                scores.Add(candidate, candidateScores);

                var sim = VectorSpace.Similarity(targetToken.Text.ToLower().Trim(), candidateText.ToLower().Trim());
                if (sim < 1)
                {
                    candidateScores.Add("sim", sim);
                }
                candidateScores.Add("ic-diff", candidateIC);
                candidateScores.Add("context-sim", candidateContextSimilarity);
                candidateScores.Add("length", candidateText.Length);

                // bigram LM
                if (tokenIndex > 0)
                {
                    var lmScore = EnglishLanguageModel.Instance.GetBigramLMScore(document.AllTokens[tokenIndex - 1].Text.ToLower(), candidateText);
                    candidateScores.Add("lm-bigram-pre", lmScore.HasValue ? lmScore.Value : -100);
                }
                else
                {
                    candidateScores.Add("lm-bigram-pre", 0);
                }

                if (tokenIndex < document.AllTokens.Count - 1)
                {
                    var lmScore = EnglishLanguageModel.Instance.GetBigramLMScore(candidateText, document.AllTokens[tokenIndex + 1].Text.ToLower());
                    candidateScores.Add("lm-bigram-post", lmScore.HasValue ? lmScore.Value : -100);
                }
                else
                {
                    // Fixed: this branch used to add "lm-bigram-pre" a second time, which made
                    // Dictionary.Add throw whenever the target was the last token of the document.
                    candidateScores.Add("lm-bigram-post", 0);
                }

                // trigram LM
                if (tokenIndex > 1)
                {
                    var lmScore = EnglishLanguageModel.Instance.GetTrigramLMScore(document.AllTokens[tokenIndex - 2].Text.ToLower(), document.AllTokens[tokenIndex - 1].Text.ToLower(), candidateText);
                    candidateScores.Add("lm-trigram-pre", lmScore.HasValue ? lmScore.Value : -100);
                }
                else
                {
                    candidateScores.Add("lm-trigram-pre", 0);
                }

                if (tokenIndex < document.AllTokens.Count - 2)
                {
                    var lmScore = EnglishLanguageModel.Instance.GetTrigramLMScore(candidateText, document.AllTokens[tokenIndex + 1].Text.ToLower(), document.AllTokens[tokenIndex + 2].Text.ToLower());
                    candidateScores.Add("lm-trigram-post", lmScore.HasValue ? lmScore.Value : -100);
                }
                else
                {
                    candidateScores.Add("lm-trigram-post", 0);
                }
            }

            LastSubstitutionCandidates = new List<Tuple<TokenAnnotation, List<string>>>();
            LastSubstitutionCandidates.Add(new Tuple<TokenAnnotation, List<string>>(targetToken, scores.Select(x => x.Key).ToList()));

            // One rank dictionary per active metric; a candidate lacking a feature is left out of that metric.
            var allRanks = new List<Dictionary<string, int>>();
            foreach (var metric in metrics)
            {
                var featDict = scores.Where(x => x.Value.ContainsKey(metric)).ToDictionary(x => x.Key, x => x.Value[metric]);
                allRanks.Add(TrainingExample.RankExamplesByNumericFeature(featDict, metric == "length" || metric == "ic-diff"));
            }

            var allCandidates = scores.Select(x => x.Key).ToList();
            Dictionary<string, double> averageRankings = allCandidates.ToDictionary(
                x => x,
                x => allRanks.Where(y => y.ContainsKey(x)).Select(r => r[x]).Average());

            return averageRankings.OrderBy(r => r.Value).Select(x => x.Key).ToList();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Chooses the substitutions for target words. For every target word the admissible candidates are scored on
        /// several features (similarity with the target word, information content reduction, similarity with context
        /// words, bigram and trigram language model scores), ranked per feature, and the candidate with the best
        /// average rank is selected; ties are broken in favor of the higher "sim" score.
        /// As a side effect, <c>LastSubstitutionCandidates</c> is overwritten for every processed target word.
        /// </summary>
        /// <param name="document">The document whose text is to be simplified</param>
        /// <param name="substCandidates">All the pairs of target words and collected candidate replacements (candidate text and its similarity to the target)</param>
        /// <param name="contextSize">The size of the context of the target word to be considered for measuring the similarity between candidate words and target word context</param>
        /// <param name="noSubstitutionWords">Stopwords, never to be considered for simplification; compared against the lower-cased target text</param>
        /// <param name="similarityTreshold">The threshold for semantic similarity between target word and candidate replacement</param>
        /// <param name="icReplacementCandidateTreshold">Information content threshold the target lemma has to exceed for the target word to be replaced</param>
        /// <param name="word">When not null, only target tokens whose lower-cased text equals this value are processed</param>
        /// <returns>The list of substitutions (tuple of target token and candidate replacement word)</returns>
        public List<Tuple<TokenAnnotation, string>> GetSubstitutions(Document document, List<Tuple<TokenAnnotation, List<Tuple<string, double>>>> substCandidates, int contextSize, List<string> noSubstitutionWords, double similarityTreshold, double icReplacementCandidateTreshold, string word = null)
        {
            EngMorphology morph = new EngMorphology();
            List<Tuple<TokenAnnotation, string>> substitutions = new List<Tuple<TokenAnnotation, string>>();
            List<string> metrics = new List<string> {
                "sim", "ic-diff", "context-sim", "lm-bigram-pre", "lm-bigram-post", "lm-trigram-pre", "lm-trigram-post"
            };

            // True when either string contains the other.
            Func<string, string, bool> overlaps = (a, b) => a.Contains(b) || b.Contains(a);

            foreach (var sc in substCandidates)
            {
                var targetToken = sc.Item1;
                var targetTextLower = targetToken.Text.ToLower();
                if (noSubstitutionWords.Contains(targetTextLower) || (word != null && targetTextLower != word))
                {
                    continue;
                }

                Dictionary<string, Dictionary<string, double>> scores = new Dictionary<string, Dictionary<string, double>>();

                // Locate the sentence that holds the target and the sentence-local copy of the target token.
                var sentence = document.Sentences.Where(s => s.Tokens.Any(t => t.StartPosition == targetToken.StartPosition && t.Text == targetToken.Text)).Single();
                var targetTokenCopy = sentence.Tokens.Where(t => t.StartPosition == targetToken.StartPosition && t.Text == targetToken.Text).Single();

                // Position of the target within its sentence; independent of the candidate, so look it up once.
                var tokenIndex = sentence.Tokens.IndexOf(targetTokenCopy);

                var preceedingSentencePart = sentence.Text.Substring(0, targetTokenCopy.StartPositionSentence);
                var followingSentencePart = sentence.Text.Substring(targetTokenCopy.StartPositionSentence + targetTokenCopy.Text.Length);

                var targetLemmaIC = InformationContent.GetRelativeInformationContent(targetToken.Lemma.ToLower());
                var targetWordIC = InformationContent.GetRelativeInformationContent(targetToken.Text.ToLower());
                // Use the lemma IC when the word-form IC is exactly 1.
                var targetIC = targetWordIC == 1 ? targetLemmaIC : targetWordIC;

                // Content tokens at most contextSize positions away from the target (the target itself excluded).
                var targetContextTokens = sentence.Tokens.Where(t =>
                {
                    var distance = Math.Abs(sentence.Tokens.IndexOf(t) - targetTokenCopy.SentenceIndex);
                    return distance > 0 && distance <= contextSize && t.IsContent();
                }).ToList();

                var targetCtxtSimilarities = targetContextTokens.Select(x => VectorSpace.Similarity(x.Lemma.ToLower(), targetToken.Lemma.ToLower())).Where(x => x >= -1).ToList();
                var targetContextSimilarity = targetCtxtSimilarities.Count > 0 ? targetCtxtSimilarities.Average() : 0;

                if (sc.Item2 != null)
                {
                    foreach (var candidate in sc.Item2)
                    {
                        try
                        {
                            var candidateLemmaIC = InformationContent.GetRelativeInformationContent(candidate.Item1.ToLower());
                            string key = candidate.Item1 + "<->" + targetToken.POSTag;

                            // Use the cached form of the candidate for the target's POS tag when there is one.
                            var candidateInPOS = CandidateInPoSLookup.ContainsKey(key) ? CandidateInPoSLookup[key] : candidate.Item1;
                            var candidateWordIC = !string.IsNullOrEmpty(candidateInPOS) ? InformationContent.GetRelativeInformationContent(candidateInPOS.ToLower()) : 1;
                            var candidateIC = candidateWordIC == 1 ? candidateLemmaIC : candidateWordIC;

                            // First prerequisite: the target is informative enough to be replaced and the
                            // candidate has a strictly lower information content than the target.
                            bool reducesIC = !string.IsNullOrEmpty(candidateInPOS) && targetLemmaIC > icReplacementCandidateTreshold && candidateIC < targetIC;
                            if (!reducesIC)
                            {
                                continue;
                            }

                            // Re-tag the sentence with the candidate put in place of the target to obtain
                            // the candidate's POS tag and lemma in context.
                            var artificialSentence = preceedingSentencePart + candidateInPOS + followingSentencePart;
                            var artTokens = (new EngPOSTagger()).Annotate(artificialSentence).Select(x => (TokenAnnotation)x).ToList();
                            morph.AnnotateMorphology(artTokens);
                            var candidateToken = artTokens.Where(x => x.StartPositionSentence == targetTokenCopy.StartPositionSentence).Single();

                            var candidateContextSimilarities = targetContextTokens.Select(x => VectorSpace.Similarity(x.Lemma.ToLower(), candidateToken.Lemma.ToLower())).Where(x => x >= -1).ToList();
                            var candidateContextSimilarity = candidateContextSimilarities.Count > 0 ? candidateContextSimilarities.Average() : targetContextSimilarity;

                            // A candidate that overlaps textually with the target or with a context word is rejected.
                            bool sameWord = overlaps(candidate.Item1, targetToken.Text) ||
                                            overlaps(candidate.Item1, targetToken.Lemma) ||
                                            overlaps(candidateInPOS, targetToken.Text) ||
                                            overlaps(candidateInPOS, targetToken.Lemma);

                            bool sameAsContext = targetContextTokens.Any(ct =>
                                                                         overlaps(candidate.Item1, ct.Text) ||
                                                                         overlaps(candidate.Item1, ct.Lemma) ||
                                                                         overlaps(candidateInPOS, ct.Text) ||
                                                                         overlaps(candidateInPOS, ct.Lemma));

                            // POS-tag compatibility is a second prerequisite
                            bool admissible = candidate.Item2 >= similarityTreshold && (candidateToken.POSTag == targetToken.POSTag) && !sameWord && !sameAsContext;
                            if (!admissible || scores.ContainsKey(candidateInPOS))
                            {
                                continue;
                            }

                            // Build the complete feature set first and register it only at the end, so that
                            // a failure half-way never leaves a partially filled entry in scores (the ranking
                            // below reads every metric of every entry).
                            var candidateScores = new Dictionary<string, double>();
                            candidateScores.Add("sim", candidate.Item2);
                            candidateScores.Add("ic-diff", targetIC - candidateIC);
                            candidateScores.Add("context-sim", candidateContextSimilarity);
                            candidateScores.Add("length", candidateInPOS.Length);

                            // bigram LM
                            if (tokenIndex > 0)
                            {
                                var lmScore = EnglishLanguageModel.Instance.GetBigramLMScore(sentence.Tokens[tokenIndex - 1].Text.ToLower(), candidateInPOS);
                                candidateScores.Add("lm-bigram-pre", lmScore.HasValue ? lmScore.Value : -100);
                            }
                            else
                            {
                                candidateScores.Add("lm-bigram-pre", 0);
                            }

                            if (tokenIndex < sentence.Tokens.Count - 1)
                            {
                                var lmScore = EnglishLanguageModel.Instance.GetBigramLMScore(candidateInPOS, sentence.Tokens[tokenIndex + 1].Text.ToLower());
                                candidateScores.Add("lm-bigram-post", lmScore.HasValue ? lmScore.Value : -100);
                            }
                            else
                            {
                                candidateScores.Add("lm-bigram-post", 0);
                            }

                            // trigram LM
                            if (tokenIndex > 1)
                            {
                                var lmScore = EnglishLanguageModel.Instance.GetTrigramLMScore(sentence.Tokens[tokenIndex - 2].Text.ToLower(), sentence.Tokens[tokenIndex - 1].Text.ToLower(), candidateInPOS);
                                candidateScores.Add("lm-trigram-pre", lmScore.HasValue ? lmScore.Value : -100);
                            }
                            else
                            {
                                candidateScores.Add("lm-trigram-pre", 0);
                            }

                            if (tokenIndex < sentence.Tokens.Count - 2)
                            {
                                var lmScore = EnglishLanguageModel.Instance.GetTrigramLMScore(candidateInPOS, sentence.Tokens[tokenIndex + 1].Text.ToLower(), sentence.Tokens[tokenIndex + 2].Text.ToLower());
                                candidateScores.Add("lm-trigram-post", lmScore.HasValue ? lmScore.Value : -100);
                            }
                            else
                            {
                                candidateScores.Add("lm-trigram-post", 0);
                            }

                            scores.Add(candidateInPOS, candidateScores);
                        }
                        catch (Exception ex)
                        {
                            // Deliberate best effort: a candidate that cannot be analyzed is skipped and the
                            // remaining candidates are still considered. Leave a trace instead of failing silently.
                            System.Diagnostics.Debug.WriteLine("GetSubstitutions: skipping candidate '" + (candidate != null ? candidate.Item1 : "<null>") + "': " + ex.Message);
                        }
                    }
                }

                // NOTE(review): the list is recreated for every target word, so after the call it only holds
                // the entry of the last processed target — confirm whether it should accumulate instead.
                LastSubstitutionCandidates = new List<Tuple<TokenAnnotation, List<string>>>();
                LastSubstitutionCandidates.Add(new Tuple<TokenAnnotation, List<string>>(targetToken, scores.Select(x => x.Key).ToList()));

                if (scores.Count == 0)
                {
                    continue;
                }

                // One rank dictionary per metric, then the average rank of every candidate over all metrics.
                var allRanks = new List<Dictionary<string, int>>();
                foreach (var metric in metrics)
                {
                    var featDict = scores.ToDictionary(x => x.Key, x => x.Value[metric]);
                    allRanks.Add(TrainingExample.RankExamplesByNumericFeature(featDict, metric == "length"));
                }

                var allCandidates = scores.Select(x => x.Key).ToList();
                Dictionary<string, double> averageRankings = allCandidates.ToDictionary(x => x, x => allRanks.Select(r => r[x]).Average());

                // Candidates sharing the best (lowest) average rank compete on their similarity to the target.
                var finalRanking = averageRankings.OrderBy(r => r.Value).ToList();
                double topScore = finalRanking[0].Value;
                var equal = finalRanking.Where(fr => fr.Value == topScore).Select(fr => fr.Key).ToList();

                var finalChoice = equal.Where(eq => equal.Where(eq2 => eq2 != eq).All(eq2 => scores[eq]["sim"] >= scores[eq2]["sim"])).First();
                substitutions.Add(new Tuple<TokenAnnotation, string>(targetToken, finalChoice));
            }

            return substitutions;
        }