Ejemplo n.º 1
0
        private string Lexicalize(string input)
        {
            var output = new StringBuilder();

            string[] sentences = _sentenceDetector.SentenceDetect(input);
            foreach (string sentence in sentences)
            {
                string[] tokens = _tokenizer.Tokenize(sentence);
                string[] tags   = _posTagger.Tag(tokens);
                for (int currentTag = 0; currentTag < tags.Length; currentTag++)
                {
                    if (!IsStopWord.ContainsKey(tokens[currentTag]))
                    {
                        if (tags[currentTag].StartsWith("NN"))
                        {
                            if (tags[currentTag] == "NNS")
                            {
                                output.Append(_wn.Lemmatize(tokens[currentTag], "noun")).Append(" ");
                            }
                            else
                            {
                                output.Append(tokens[currentTag]).Append(" ");
                            }
                        }
                        else
                        if (tags[currentTag].StartsWith("VB"))
                        {
                            if (tags[currentTag] != "VBP")
                            {
                                output.Append(_wn.Lemmatize(tokens[currentTag], "verb")).Append(" ");
                            }
                        }
                        else
                        if (tags[currentTag].StartsWith("JJ"))
                        {
                            output.Append(tokens[currentTag]).Append(" ");
                        }
                    }
                }
            }
            return(output.ToString());
        }
Ejemplo n.º 2
0
        public void Extract(string text_segment)
        {
            if (!string.IsNullOrEmpty(text_segment))
            {
                #region Local Variables

                int           i = 0;
                int           j;
                int           k;
                int           d;
                int           l;
                int           chunkLength;
                int           chunksLength;
                string        curToken;
                List <SynSet> Senses, tmpSenses;
                SynSet        tmpSense;

                List <SentenceChunk>        Chunks         = new List <SentenceChunk>(); // This list of all chunks
                List <SentenceChunk>        tmpChunks      = new List <SentenceChunk>(); // This list of all chunks
                Dictionary <string, SynSet> CachedConcepts = new Dictionary <string, SynSet>();
                TextVectors = new List <TaggedWord>();                                   // The list that will hold all mappable terms with thier information
                List <string> MiscTerms = new List <string>();                           // The list of unmapped terms in the text
                string[]      tokens;
                string[]      sentences = _sentenceDetector.SentenceDetect(text_segment);

                #endregion

                #region Section 3.1.

                // Extract all chunks from the given text segment
                for (k = 0; k < sentences.Length; k++)
                {
                    tokens    = _tokenizer.Tokenize(sentences[k]);
                    tmpChunks = _chunker.GetChunks(tokens, _posTagger.Tag(tokens));
                    tmpChunks.RemoveAll(predicate => predicate.TaggedWords.Count == 0);
                    Chunks.AddRange(tmpChunks);
                }

                tmpChunks = null;
                tokens    = null;
                sentences = null;

                // Extract elements that will be used for Similarity Matrix Generation as the input of clustering
                chunksLength = Chunks.Count;
                while (i < chunksLength)
                {
                    // Look only inside NP chunks
                    if (Chunks[i].Tag == "NP")
                    {
                        #region Rectify NP Chunks
                        if (i + 1 < chunksLength)
                        {
                            if (Chunks[i + 1].Tag == "NP")
                            {
                                if (Chunks[i + 1].TaggedWords[0].Tag.StartsWith("NNP") || AllowedDTList.ContainsKey(Chunks[i + 1].TaggedWords[0].Word))
                                {
                                    int length = Chunks[i].TaggedWords.Count;
                                    foreach (TaggedWord w in Chunks[i + 1].TaggedWords)
                                    {
                                        w.Index = length;
                                        Chunks[i].TaggedWords.Add(w);
                                        length++;
                                    }

                                    Chunks.RemoveRange(i + 1, 1);
                                    chunksLength = chunksLength - 1;
                                }
                            }
                            else
                            if (Chunks[i + 1].Tag == "PP" && i + 2 < chunksLength)
                            {
                                if (Chunks[i + 2].TaggedWords[0].Tag.StartsWith("NNP") || AllowedDTList.ContainsKey(Chunks[i + 1].TaggedWords[0].Word))
                                {
                                    int length = Chunks[i].TaggedWords.Count;
                                    Chunks[i + 1].TaggedWords[0].Index = length;
                                    Chunks[i].TaggedWords.Add(Chunks[i + 1].TaggedWords[0]);
                                    length++;
                                    foreach (TaggedWord w in Chunks[i + 2].TaggedWords)
                                    {
                                        w.Index = length;
                                        length++;
                                        Chunks[i].TaggedWords.Add(w);
                                    }

                                    Chunks.RemoveRange(i + 1, 2);
                                    chunksLength = chunksLength - 2;
                                }
                            }
                        }
                        #endregion

                        #region Find N-Gram NNPs

                        // This part is very important:
                        // 1- Rectify any linguistic errors generated as side effect of the previous step (such as "Belly the")
                        // 2- Eliminate any syntactic errors such as Texas Rangers (sports) --> Texas Ranger (Police)
                        // since we don't alter the value of a NNP(s)

                        chunkLength = Chunks[i].TaggedWords.Count;
                        j           = 0;
                        // Loop through all chunk words
                        while (j < chunkLength)
                        {
                            if (Chunks[i].TaggedWords[j].Tag[0] == 'N')
                            {
                                // Needed for fast access to the last element in SemanticElements
                                d = TextVectors.Count() - 1;

                                // Check the probability of merging N-gram Named Entities (NNP(S)* || NNP(S)*|DT*|NNP(S)*)
                                if (Chunks[i].TaggedWords[j].Tag.StartsWith("NNP"))
                                {
                                    k = 0;

                                    // First scan to see if the pattern is satisfied
                                    for (l = j + 1; l < chunkLength; l++)
                                    {
                                        // Here to define any patterns the user may wish to apply
                                        if (
                                            Chunks[i].TaggedWords[l].Tag.StartsWith("NNP") || // allow N-Gram NNP
                                            AllowedDTList.ContainsKey(Chunks[i].TaggedWords[l].Word) || // allow adding stop words inside the NNP
                                            Chunks[i].TaggedWords[l].Tag == "CD"    // allow adding numbers inside NNP
                                            )
                                        {
                                            k++;
                                        }
                                        else
                                        {
                                            break;
                                        }
                                    }
                                    // k-value changing means a pattern has been found
                                    // if k is changed and the scanned pattern does not end with a stop word
                                    if (k > 0 && !AllowedDTList.ContainsKey(Chunks[i].TaggedWords[j + k].Word))
                                    {
                                        // Concatenate all the pattern parts ans store them in temp variable
                                        curToken = Chunks[i].TaggedWords[j].Word;
                                        for (l = j + 1; l <= j + k; l++)
                                        {
                                            curToken = curToken + " " + Chunks[i].TaggedWords[l].Word;
                                        }

                                        // Delete all the parts added in temp
                                        Chunks[i].TaggedWords.RemoveRange(j + 1, k);

                                        // rectify the sequence length after deletion
                                        chunkLength = chunkLength - k;


                                        // Check if the perv token is a capitalized JJ
                                        if (d > -1 && j > 0 && TextVectors[d].Tag == "JJ" && char.IsUpper(TextVectors[d].Word[0]))
                                        {
                                            // Replace current j with its previous j-1 word, and allocate special tag NNP*J
                                            Chunks[i].TaggedWords[j - 1].Tag  = Chunks[i].TaggedWords[j].Tag + "J";
                                            Chunks[i].TaggedWords[j - 1].Word = TextVectors[d].Word + " " + curToken;
                                            // Remove the previous word from all lists
                                            TextVectors.RemoveAt(d);
                                            Chunks[i].TaggedWords.RemoveRange(j, 1);
                                            chunkLength--;
                                            j--;
                                        }
                                        else
                                        {
                                            // Only update the current word
                                            Chunks[i].TaggedWords[j].Word = curToken;
                                        }

                                        TextVectors.Add(Chunks[i].TaggedWords[j]);
                                        // Skip the loop by k steps
                                        j = j + k;
                                    }
                                    else
                                    {
                                        // If there is no pattern match --> add singular NNP(S)
                                        // Before addition check JJ pattern
                                        if (d > -1 && j > 0 && TextVectors[d].Tag == "JJ" && char.IsUpper(TextVectors[d].Word[0]))
                                        {
                                            // Replace current j with its previous j-1 word, and allocate special tag NNP*J
                                            Chunks[i].TaggedWords[j - 1].Tag  = Chunks[i].TaggedWords[j].Tag + "J";
                                            Chunks[i].TaggedWords[j - 1].Word = TextVectors[d].Word + " " + Chunks[i].TaggedWords[j].Word;
                                            // Remove the previous word from all lists
                                            TextVectors.RemoveAt(d);
                                            Chunks[i].TaggedWords.RemoveRange(j, 1);
                                            chunkLength--;
                                            j--;
                                        }

                                        TextVectors.Add(Chunks[i].TaggedWords[j]);
                                        j++;
                                    }
                                }
                                else
                                {
                                    // If the current word is NN(S)
                                    if (Chunks[i].TaggedWords[j].Tag == "NNS")
                                    {
                                        Chunks[i].TaggedWords[j].Word = _wn.Lemmatize(Chunks[i].TaggedWords[j].Word, "noun");
                                    }

                                    // Find if the current token forms bigram WordNet concept with the previous token
                                    if (j > 0)
                                    {
                                        if (Chunks[i].TaggedWords[j - 1].Tag == "NN" || Chunks[i].TaggedWords[j - 1].Tag == "NNS" || Chunks[i].TaggedWords[j - 1].Tag == "JJ")
                                        {
                                            if (_wn.GetSynSets(Chunks[i].TaggedWords[j - 1].Word + "_" + Chunks[i].TaggedWords[j].Word, "noun").Count > 0)
                                            {
                                                Chunks[i].TaggedWords[j].Word  = Chunks[i].TaggedWords[j - 1].Word + "_" + Chunks[i].TaggedWords[j].Word;
                                                Chunks[i].TaggedWords[j].Index = Chunks[i].TaggedWords[j - 1].Index;
                                                Chunks[i].TaggedWords.RemoveRange(j - 1, 1);
                                                TextVectors.RemoveAt(d);
                                                j--;
                                                chunkLength--;
                                            }
                                        }
                                    }

                                    TextVectors.Add(Chunks[i].TaggedWords[j]);
                                    j++;
                                }
                            }
                            else
                            {
                                if (Chunks[i].TaggedWords[j].Tag[0] == 'J')
                                {
                                    // We add adjectives to increase the disambiguation accuracy
                                    TextVectors.Add(Chunks[i].TaggedWords[j]);
                                }
                                // Skip any chunk element that is not NNP(S),NN(S), or JJ(*)
                                j++;
                            }
                        }
                        #endregion

                        i++;
                    }
                    else
                    {
                        // Remove the current Chunk since it was checked during rectification phase of the previous step
                        // Keeping only NPs is for efficiency reason during the last step of the algorithm
                        Chunks.RemoveRange(i, 1);
                        chunksLength--;
                    }
                }

                #region Disambiguatation

                d = TextVectors.Count;
                // Normalize NNP* vectors before the actual disambiguatation
                // Performing normalization after disambiguatation may affects caching the concepts since the keys may change
                for (i = 0; i < d; i++)
                {
                    if (TextVectors[i].Tag.StartsWith("NNP"))
                    {
                        for (j = 0; j < d; j++)
                        {
                            if (TextVectors[j].Tag.StartsWith("NNP"))
                            {
                                if (TextVectors[i].Word.Contains(TextVectors[j].Word))
                                {
                                    TextVectors[j].Word = TextVectors[i].Word;
                                    TextVectors[j].Tag  = TextVectors[i].Tag;
                                }
                                else
                                if (TextVectors[j].Word.Contains(TextVectors[i].Word))
                                {
                                    TextVectors[i].Word = TextVectors[j].Word;
                                    TextVectors[i].Tag  = TextVectors[j].Tag;
                                }
                            }
                        }
                    }
                }



                for (i = 0; i < d; i++)
                {
                    // For limiting access to the list -- Efficiency
                    curToken = TextVectors[i].Word;
                    if (TextVectors[i].Tag == "NN" || TextVectors[i].Tag == "NNS")
                    {
                        if (CachedConcepts.ContainsKey(curToken))
                        {
                            TextVectors[i].Sense = CachedConcepts[curToken];
                        }
                        else
                        {
                            // Check availability in WordNet
                            Senses = _wn.GetSynSets(curToken, false, WordNetEngine.POS.Noun);
                            if (Senses.Count > 0)
                            {
                                tmpSense = Disambiguate(Senses, GenerateContextWindow(i, d));
                                CachedConcepts.Add(curToken, tmpSense);
                                TextVectors[i].Sense = CachedConcepts[curToken];
                            }
                        }
                    }
                    else
                    if (TextVectors[i].Tag.StartsWith("NNP"))
                    {
                        if (CachedConcepts.ContainsKey(curToken))
                        {
                            TextVectors[i].Sense = CachedConcepts[curToken];
                            continue;
                        }

                        Senses = _wn.GetSynSets(curToken.Replace(" ", "_"), false, WordNetEngine.POS.Noun);
                        if (Senses.Count > 0)
                        {
                            tmpSense = Disambiguate(Senses, GenerateContextWindow(i, d));
                            CachedConcepts.Add(curToken, tmpSense);
                            TextVectors[i].Sense = CachedConcepts[curToken];
                            continue;
                        }

                        if (PlugInsNumber > 0)
                        {
                            Senses.Clear();
                            for (l = 0; l < PlugInsNumber; l++)
                            {
                                KBDriverQueryArgs[1] = curToken;
                                tmpSenses            = KBDriversQueryPointers[l].Invoke(KBDrivers[l], KBDriverQueryArgs) as List <SynSet>;
                                if (tmpSenses != null)
                                {
                                    Senses.AddRange(tmpSenses);
                                }
                            }

                            if (Senses.Count > 0)
                            {
                                tmpSense = Disambiguate(Senses, GenerateContextWindow(i, d));
                                CachedConcepts.Add(curToken, tmpSense);
                                TextVectors[i].Sense = CachedConcepts[curToken];
                                continue;
                            }
                        }

                        if (TextVectors[i].Tag.EndsWith("J"))
                        {
                            TextVectors[i].Word = curToken.Substring(curToken.IndexOf(" ") + 1);
                            TextVectors[i].Tag  = TextVectors[i].Tag.Substring(0, TextVectors[i].Tag.Length - 1);
                            i--;
                            continue;
                        }
                    }
                }


                // Prepare the vectors for semantic similarity measurement
                // hence, any vector does not hold valid sense must be excluded from the list in temp list
                i = 0;
                while (i < d)
                {
                    if (TextVectors[i].Sense == null)
                    {
                        if (TextVectors[i].Tag.StartsWith("NNP") && !MiscTerms.Contains(TextVectors[i].Word))
                        {
                            MiscTerms.Add(TextVectors[i].Word);
                        }
                        TextVectors.RemoveAt(i);
                        d--;
                    }
                    else
                    {
                        i++;
                    }
                }
                #endregion
                // [Implicit-Dispose]
                tmpSense  = null;
                tmpSenses = null;
                Senses    = null;

                #endregion

                #region Section 3.2.

                // Row * Col - Diagonal / 2 (above or under the Diagonal)
                double[] S = new double[((d * d) - d) / 2];
                // Dummy counter
                k = 0;
                for (i = 0; i < d; i++)
                {
                    for (j = i + 1; j < d; j++)
                    {
                        S[k] = Math.Round(wupMeasure(TextVectors[i].Sense, TextVectors[j].Sense), 4);
                        k++;
                    }
                }

                // Perform clustering on S
                int[] res = ap.Run(S, d, 1, 0.9, 1000, 50);

                // Optimized Clustering information collection
                // We collect clustering information and at the same time filter out all terms that are not close to their exemplars
                Dictionary <int, List <int> > ClusRes = new Dictionary <int, List <int> >();
                // ===================================
                for (i = 0; i < res.Length; i++)
                {
                    if (!ClusRes.ContainsKey(res[i]))
                    {
                        ClusRes.Add(res[i], new List <int>());
                    }

                    if (i == res[i])
                    {
                        ClusRes[res[i]].Add(i);
                        continue;
                    }

                    if (Math.Round(wupMeasure(TextVectors[res[i]].Sense, TextVectors[i].Sense), 4) >= ClosenessToCentroid)
                    {
                        ClusRes[res[i]].Add(i);
                    }
                }

                Console.WriteLine("-> Clustering Information:\n");
                foreach (KeyValuePair <int, List <int> > kv in ClusRes)
                {
                    Console.Write("\t[" + TextVectors[kv.Key].Word + "] " + TextVectors[kv.Key].Sense.ID + " : ");
                    foreach (var item in kv.Value)
                    {
                        Console.Write(TextVectors[item].Word + ",");
                    }
                    Console.WriteLine();
                    Console.WriteLine();
                }

                // Manual averaging of exemplars (Sec. 3.2)
                Console.WriteLine("-> Remove unimportant clusters:");
                bool delFlag;
                while (true)
                {
                    delFlag = false;
                    Console.Write("\tEnter Seed:");
                    curToken = Console.ReadLine();
                    if (curToken == "$")
                    {
                        break;
                    }

                    foreach (var key in ClusRes.Keys)
                    {
                        if (TextVectors[key].Word == curToken)
                        {
                            delFlag = ClusRes.Remove(key);
                            break;
                        }
                    }
                    if (delFlag)
                    {
                        Console.WriteLine("\tCluster deleted");
                    }
                    else
                    {
                        Console.WriteLine("\tSeed is not found");
                    }
                    Console.WriteLine();
                }

                // ESA-Based averaging of exemplars
                // Insert here local server API

                #endregion

                #region Section 3.3.

                // Flatten ClusRes into List
                List <int> Seeds = ClusRes.Values
                                   .SelectMany(x => x) // Flatten
                                   .ToList();

                // Final seeds list must be sorted in case of using candidate phrase selection from a window
                //Seeds.Sort();

                List <string> CandidatePhrases    = new List <string>();
                List <string> CandidatePhraseSeed = new List <string>();

                SelectionWindowSize = Chunks.Count;
                for (i = 0; i < Chunks.Count; i++)
                {
                    if (Chunks[i].Tag == "NP")
                    {
                        d = Chunks[i].TaggedWords.Count;
                        for (l = 0; l < Seeds.Count; l++)
                        {
                            for (j = 0; j < d; j++)
                            {
                                if (Chunks[i].TaggedWords[j].Word == TextVectors[Seeds[l]].Word)
                                {
                                    if (TextVectors[Seeds[l]].Tag.StartsWith("NNP") && !CandidatePhrases.Contains(TextVectors[Seeds[l]].Word) && i < SelectionWindowSize)
                                    {
                                        CandidatePhrases.Add(TextVectors[Seeds[l]].Word);
                                        if (TextVectors[Seeds[l]].Sense.URI != null)
                                        {
                                            CandidatePhraseSeed.Add(TextVectors[Seeds[l]].Sense.URI);
                                        }
                                        else
                                        {
                                            CandidatePhraseSeed.Add("http://www.pdl.io/core_onto/" + TextVectors[Seeds[l]].Sense.ID);
                                        }
                                    }
                                    else
                                    if (TextVectors[Seeds[l]].Tag == "NN" || TextVectors[Seeds[l]].Tag == "NNS")
                                    {
                                        curToken = TextVectors[Seeds[l]].Word;
                                        if (j > 0 && Chunks[i].TaggedWords[j - 1].Tag == "JJ")
                                        {
                                            curToken = Chunks[i].TaggedWords[j - 1].Word + " " + curToken;
                                        }

                                        for (k = j + 1; k < d; k++)
                                        {
                                            if (Chunks[i].TaggedWords[k].Tag != "NN")
                                            {
                                                break;
                                            }
                                            else
                                            {
                                                curToken = curToken + " " + Chunks[i].TaggedWords[k].Word;
                                            }
                                        }

                                        if (curToken.Contains(" ") || curToken.Contains("_"))
                                        {
                                            if (!CandidatePhrases.Contains(curToken))
                                            {
                                                CandidatePhrases.Add(curToken);
                                                if (TextVectors[Seeds[l]].Sense.URI != null)
                                                {
                                                    CandidatePhraseSeed.Add(TextVectors[Seeds[l]].Sense.URI);
                                                }
                                                else
                                                {
                                                    CandidatePhraseSeed.Add("http://www.pdl.io/core_onto/" + TextVectors[Seeds[l]].Sense.ID);
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }

                #endregion

                // Print results
                Console.WriteLine("\n-> Candidate Keyphrases:\n");
                for (i = 0; i < CandidatePhrases.Count; i++)
                {
                    Console.WriteLine("\t" + CandidatePhrases[i].Replace("_", " ") + " , URI:" + CandidatePhraseSeed[i]);
                }

                Console.WriteLine("\n-> MISC Entities:\n");
                for (i = 0; i < MiscTerms.Count; i++)
                {
                    Console.WriteLine("\t" + MiscTerms[i]);
                }
            }
        }