Ejemplo n.º 1
0
        /// <summary>
        /// Demo entry point: runs a CoreNLP pipeline over <c>text</c> (declared outside this method)
        /// and prints one example of each kind of annotation to standard output.
        /// </summary>
        /// <remarks>
        /// The examples index directly into the results (token 10, sentences 0, 1, 3 and 4, the first
        /// relation, the first quote), so the input must be long enough for those indexes to exist.
        /// </remarks>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            // Pipeline configuration: the annotators to run, plus the neural algorithm for coref.
            Properties pipelineProps = new Properties();

            pipelineProps.SetProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,depparse,coref,kbp,quote");
            pipelineProps.SetProperty("coref.algorithm", "neural");

            // Build the pipeline, wrap the input text in a document and annotate it.
            StanfordCoreNLP nlp = new StanfordCoreNLP(pipelineProps);
            CoreDocument    doc = new CoreDocument(text);

            nlp.Annotate(doc);

            // --- Token at index 10 (i.e. the eleventh token of the document) ---
            CoreLabel eleventhToken = doc.Tokens()[10];

            // All example output goes through this single writer.
            System.IO.TextWriter stdout = System.Console.Out;

            stdout.WriteLine("Example: token");
            stdout.WriteLine(eleventhToken);
            stdout.WriteLine();

            // --- Text of the first sentence ---
            string firstSentenceText = doc.Sentences()[0].Text();

            stdout.WriteLine("Example: sentence");
            stdout.WriteLine(firstSentenceText);
            stdout.WriteLine();

            // --- Second sentence: reused by several of the examples below ---
            CoreSentence secondSentence = doc.Sentences()[1];

            // Part-of-speech tags of the second sentence.
            IList <string> partOfSpeechTags = secondSentence.PosTags();

            stdout.WriteLine("Example: pos tags");
            stdout.WriteLine(partOfSpeechTags);
            stdout.WriteLine();

            // Named-entity tags of the second sentence.
            IList <string> namedEntityTags = secondSentence.NerTags();

            stdout.WriteLine("Example: ner tags");
            stdout.WriteLine(namedEntityTags);
            stdout.WriteLine();

            // Constituency parse of the second sentence.
            Tree parseTree = secondSentence.ConstituencyParse();

            stdout.WriteLine("Example: constituency parse");
            stdout.WriteLine(parseTree);
            stdout.WriteLine();

            // Dependency parse of the second sentence.
            SemanticGraph dependencyGraph = secondSentence.DependencyParse();

            stdout.WriteLine("Example: dependency parse");
            stdout.WriteLine(dependencyGraph);
            stdout.WriteLine();

            // --- KBP relations of the fifth sentence; only the first one is printed ---
            IList <RelationTriple> fifthSentenceRelations = doc.Sentences()[4].Relations();

            stdout.WriteLine("Example: relation");
            stdout.WriteLine(fifthSentenceRelations[0]);
            stdout.WriteLine();

            // --- Entity mentions of the second sentence ---
            IList <CoreEntityMention> secondSentenceMentions = secondSentence.EntityMentions();

            stdout.WriteLine("Example: entity mentions");
            stdout.WriteLine(secondSentenceMentions);
            stdout.WriteLine();

            // --- Coreference: second entity mention of the fourth sentence and its canonical mention ---
            CoreEntityMention mention = doc.Sentences()[3].EntityMentions()[1];

            stdout.WriteLine("Example: original entity mention");
            stdout.WriteLine(mention);
            stdout.WriteLine("Example: canonical entity mention");
            stdout.WriteLine(mention.CanonicalEntityMention().Get());
            stdout.WriteLine();

            // --- Document-wide coreference chains ---
            IDictionary <int, CorefChain> chains = doc.CorefChains();

            stdout.WriteLine("Example: coref chains for document");
            stdout.WriteLine(chains);
            stdout.WriteLine();

            // --- Quotes: only the first quote of the document is used ---
            IList <CoreQuote> allQuotes  = doc.Quotes();
            CoreQuote         firstQuote = allQuotes[0];

            stdout.WriteLine("Example: quote");
            stdout.WriteLine(firstQuote);
            stdout.WriteLine();

            // Speaker() and CanonicalSpeaker() return an optional-style wrapper that is unwrapped with Get().
            stdout.WriteLine("Example: original speaker of quote");
            stdout.WriteLine(firstQuote.Speaker().Get());
            stdout.WriteLine();

            stdout.WriteLine("Example: canonical speaker of quote");
            stdout.WriteLine(firstQuote.CanonicalSpeaker().Get());
            stdout.WriteLine();
        }
        /// <summary>
        /// Heuristically decides whether two entity mentions denote the same entity, using only the
        /// entity type of <paramref name="emOne"/>, the tokens of both mentions and their surface text.
        /// </summary>
        /// <remarks>
        /// Name comparisons are culture-independent, so the result does not vary with the current
        /// thread culture. Only the entity type of <paramref name="emOne"/> is consulted; the type of
        /// <paramref name="emTwo"/> is never read.
        /// </remarks>
        /// <param name="emOne">First mention; its entity type selects the person/organization rules.</param>
        /// <param name="emTwo">Second mention.</param>
        /// <returns><c>true</c> when the heuristics consider both mentions the same entity; otherwise <c>false</c>.</returns>
        protected internal virtual bool SameEntityWithoutLinking(CoreEntityMention emOne, CoreEntityMention emTwo)
        {
            string type = emOne.EntityType();

            // Null-safe, ordinal type checks (one consistent form for both entity types).
            bool isPerson       = string.Equals(type, NerPerson, StringComparison.Ordinal);
            bool isOrganization = string.Equals(type, NerOrganization, StringComparison.Ordinal);

            var tokensOne = emOne.Tokens();
            var tokensTwo = emTwo.Tokens();

            // Multi-token person names sharing a last token (case-insensitive): decide on the first token.
            if (isPerson && tokensOne.Count >= 2 && tokensTwo.Count >= 2 &&
                string.Equals(tokensOne[tokensOne.Count - 1].Word(), tokensTwo[tokensTwo.Count - 1].Word(), StringComparison.OrdinalIgnoreCase))
            {
                // Invariant lower-casing keeps the result independent of the thread culture.
                string firstNameOne = tokensOne[0].Word().ToLowerInvariant();
                string firstNameTwo = tokensTwo[0].Word().ToLowerInvariant();
                if (FirstNameMatch(firstNameOne, firstNameTwo))
                {
                    return true;
                }
                // Two-token names with the same last token but non-matching first tokens are distinct.
                if (tokensOne.Count == 2 && tokensTwo.Count == 2)
                {
                    return false;
                }
                // Longer names fall through to the score-based rules below.
            }

            string textOne = emOne.Text();
            string textTwo = emTwo.Text();

            // The score is taken in both argument orders and the larger value is used.
            double matchScore = Math.Max(ApproximateEntityMatchScore(textOne, textTwo), ApproximateEntityMatchScore(textTwo, textOne));

            // Some simple cases
            if (matchScore == 1.0)
            {
                return true;
            }
            if (matchScore < 0.34)
            {
                return false;
            }

            if (isPerson && matchScore > 0.49)
            {
                // Both entities are more than one character
                if (Math.Min(textOne.Length, textTwo.Length) > 1)
                {
                    // Exactly one of the mentions is a single token and the other has several.
                    bool onlyOneIsSingle = tokensOne.Count == 1 && tokensTwo.Count > 1;
                    bool onlyTwoIsSingle = tokensTwo.Count == 1 && tokensOne.Count > 1;

                    // The single token equals the last token of the longer mention (last names match).
                    if ((onlyOneIsSingle && Sharpen.Runtime.EqualsIgnoreCase(tokensTwo[tokensTwo.Count - 1].Word(), tokensOne[0].Word())) ||
                        (onlyTwoIsSingle && Sharpen.Runtime.EqualsIgnoreCase(tokensOne[tokensOne.Count - 1].Word(), tokensTwo[0].Word())))
                    {
                        return true;
                    }
                    // The single token equals the first token of the longer mention (first names match).
                    if ((onlyOneIsSingle && Sharpen.Runtime.EqualsIgnoreCase(tokensTwo[0].Word(), tokensOne[0].Word())) ||
                        (onlyTwoIsSingle && Sharpen.Runtime.EqualsIgnoreCase(tokensOne[0].Word(), tokensTwo[0].Word())))
                    {
                        return true;
                    }
                }
                if (matchScore > 0.65)
                {
                    return true;
                }
            }

            // Organizations need a higher score than persons to be merged.
            if (isOrganization && matchScore > 0.79)
            {
                return true;
            }
            return false;
        }
        /// <summary>
        /// Collects the text of every DATE, TIME, LOCATION, ORGANIZATION and URL entity mention in
        /// <paramref name="coredoc"/> into the matching per-type list (DateList, TimeList, LocList,
        /// OrgList, URLList), keyed by the integer "id" field of <paramref name="document"/>.
        /// </summary>
        /// <remarks>
        /// When a key already has an entry, the new mention text is appended to it, separated by ", ".
        /// The per-type lists are declared outside this method.
        /// </remarks>
        /// <param name="coredoc">Annotated document; nothing happens when it is null or has no entity mention list.</param>
        /// <param name="document">Index document whose "id" field supplies the key; read only when a relevant mention is found.</param>
        private static void ExtractNERTags(CoreDocument coredoc, Lucene.Net.Documents.Document document)
        {
            if (coredoc == null)
            {
                return;
            }

            List nerList = coredoc.entityMentions();
            // Guard against a missing mention list instead of failing on size().
            if (nerList == null)
            {
                return;
            }

            for (int j = 0; j < nerList.size(); j++)
            {
                CoreEntityMention em = (CoreEntityMention)nerList.get(j);
                var entityType = em.entityType();

                bool isDate         = entityType == "DATE";
                bool isTime         = entityType == "TIME";
                bool isLocation     = entityType == "LOCATION";
                bool isOrganization = entityType == "ORGANIZATION";
                bool isUrl          = entityType == "URL";

                // Other entity types are not collected; skip them before touching the "id" field.
                if (!(isDate || isTime || isLocation || isOrganization || isUrl))
                {
                    continue;
                }

                var key         = document.GetField("id").GetInt32Value().Value;
                var mentionText = em.text();

                if (isDate)
                {
                    if (!DateList.ContainsKey(key))
                    {
                        DateList.Add(key, mentionText);
                    }
                    else
                    {
                        DateList.TryUpdate(key, DateList[key] + ", " + mentionText);
                    }
                }
                else if (isTime)
                {
                    if (!TimeList.ContainsKey(key))
                    {
                        TimeList.Add(key, mentionText);
                    }
                    else
                    {
                        TimeList.TryUpdate(key, TimeList[key] + ", " + mentionText);
                    }
                }
                else if (isLocation)
                {
                    if (!LocList.ContainsKey(key))
                    {
                        LocList.Add(key, mentionText);
                    }
                    else
                    {
                        LocList.TryUpdate(key, LocList[key] + ", " + mentionText);
                    }
                }
                else if (isOrganization)
                {
                    if (!OrgList.ContainsKey(key))
                    {
                        OrgList.Add(key, mentionText);
                    }
                    else
                    {
                        OrgList.TryUpdate(key, OrgList[key] + ", " + mentionText);
                    }
                }
                else
                {
                    // URL mention.
                    if (!URLList.ContainsKey(key))
                    {
                        URLList.Add(key, mentionText);
                    }
                    else
                    {
                        // Append to the existing URL entry (previously this read from OrgList by mistake).
                        URLList.TryUpdate(key, URLList[key] + ", " + mentionText);
                    }
                }
            }
        }