/// <summary>
        /// Runs <c>DoOneSentence</c> over a copy of the given tokens and writes the resulting NER tag and
        /// normalized NER annotation back onto the original tokens, position by position.
        /// </summary>
        /// <typeparam name="Token">Concrete token type; must be a <c>CoreLabel</c>.</typeparam>
        /// <param name="tokens">Tokens to annotate in place.</param>
        private void AnnotateTokens <Token>(IList <Token> tokens)
            where Token : CoreLabel
        {
            // Work on copies: QuantifiableEntityNormalizer may change the POS tags as well.
            IList <CoreLabel> words = new List <CoreLabel>();

            foreach (CoreLabel token in tokens)
            {
                CoreLabel word = new CoreLabel();
                word.SetWord(token.Word());
                word.SetNER(token.Ner());
                word.SetTag(token.Tag());
                // Copy fields potentially set by SUTime.
                NumberSequenceClassifier.TransferAnnotations(token, word);
                words.Add(word);
            }
            DoOneSentence(words);
            // TODO: If collapsed is set, tokens for entities are collapsed into one node; then
            // words.Count != tokens.Count and the positional copy below does not work.
            for (int i = 0; i < words.Count; i++)
            {
                string ner = words[i].Ner();
                tokens[i].SetNER(ner);
                tokens[i].Set(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation), words[i].Get(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation)));
            }
        }
        /// <summary>
        /// Creates a <c>CoreLabel</c> whose word and original text are both <paramref name="word"/>
        /// and whose NER tag is <paramref name="ner"/>.
        /// </summary>
        /// <param name="word">Text used for both the word and the original text.</param>
        /// <param name="ner">NER tag to assign.</param>
        /// <returns>The newly created label.</returns>
        private CoreLabel MkLabel(string word, string ner)
        {
            CoreLabel result = new CoreLabel();
            result.SetWord(word);
            result.SetOriginalText(word);
            result.SetNER(ner);
            return result;
        }
Example #3
0
        /// <summary>Parse a CoNLL formatted tree into a SemanticGraph object (along with a list of tokens).</summary>
        /// <remarks>
        /// Blank lines are skipped. Every other line is split on runs of whitespace into columns:
        /// column 0 is the token index, column 1 the word, column 2 the 1-based position of the governor
        /// (0 marks a root), column 3 the relation name, and, when present, column 4 the POS tag,
        /// column 5 the NER tag and column 6 the lemma.
        /// </remarks>
        /// <param name="conll">The CoNLL formatted tree.</param>
        /// <returns>
        /// A pair of a SemanticGraph and a token list, corresponding to the parse of the sentence
        /// and to tokens in the sentence.
        /// </returns>
        protected internal virtual Pair <SemanticGraph, IList <CoreLabel> > MkTree(string conll)
        {
            IList <CoreLabel> sentence = new List <CoreLabel>();
            SemanticGraph     tree     = new SemanticGraph();
            // Columns of every non-blank row, kept so edges can be added once all vertices exist.
            IList <string[]>  rows     = new List <string[]>();

            foreach (string rawLine in conll.Split('\n'))
            {
                string line = rawLine.Trim();
                if (line.Length == 0)
                {
                    continue;
                }
                // Split on whitespace runs with a real regex: string.Split("\\s+") treats the pattern
                // as a literal separator and would leave the whole row in fields[0].
                string[] fields = System.Text.RegularExpressions.Regex.Split(line, "\\s+");
                rows.Add(fields);
                int       index = System.Convert.ToInt32(fields[0]);
                string    word  = fields[1];
                CoreLabel label = IETestUtils.MkWord(word, index);
                sentence.Add(label);
                if (fields[2].Equals("0"))
                {
                    tree.AddRoot(new IndexedWord(label));
                }
                else
                {
                    tree.AddVertex(new IndexedWord(label));
                }
                if (fields.Length > 4)
                {
                    label.SetTag(fields[4]);
                }
                if (fields.Length > 5)
                {
                    label.SetNER(fields[5]);
                }
                if (fields.Length > 6)
                {
                    label.SetLemma(fields[6]);
                }
            }
            // Second pass: connect each non-root token to its governor.
            for (int i = 0; i < rows.Count; i++)
            {
                string[] fields = rows[i];
                int      parent = System.Convert.ToInt32(fields[2]);
                string   reln   = fields[3];
                if (parent > 0)
                {
                    // The governor is addressed by its 1-based position among the non-blank rows.
                    tree.AddEdge(new IndexedWord(sentence[parent - 1]), new IndexedWord(sentence[i]), new GrammaticalRelation(Language.UniversalEnglish, reln, null, null), 1.0, false);
                }
            }
            return(Pair.MakePair(tree, sentence));
        }
        /// <summary>
        /// Re-tags tokens as ORGANIZATION when they are currently tagged "O", are entirely upper case,
        /// have at least three characters, and <c>AcronymMatcher.IsAcronym</c> accepts them as an acronym
        /// of an ORGANIZATION mention found anywhere in the document.
        /// </summary>
        /// <param name="ann">The document annotation whose sentences are scanned and updated in place.</param>
        private void AddAcronyms(Annotation ann)
        {
            // Collect every mention from every sentence of the document.
            IList <ICoreMap> documentMentions = new List <ICoreMap>();

            foreach (ICoreMap sent in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                Sharpen.Collections.AddAll(documentMentions, sent.Get(typeof(CoreAnnotations.MentionsAnnotation)));
            }
            // Keep only the token lists of mentions tagged ORGANIZATION.
            IList <IList <CoreLabel> > orgTokenLists = new List <IList <CoreLabel> >();

            foreach (ICoreMap candidate in documentMentions)
            {
                if (!"ORGANIZATION".Equals(candidate.Get(nerCoreAnnotationClass)))
                {
                    continue;
                }
                orgTokenLists.Add(candidate.Get(typeof(CoreAnnotations.TokensAnnotation)));
            }
            // Skip very long documents
            if (orgTokenLists.Count > 100)
            {
                return;
            }
            foreach (ICoreMap sent in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                // Chunks created for this sentence; collected here but not stored anywhere else.
                IList <ICoreMap>  newMentions    = new List <ICoreMap>();
                IList <CoreLabel> sentenceTokens = sent.Get(typeof(CoreAnnotations.TokensAnnotation));
                int sentenceTokenBegin = sent.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                for (int pos = 0; pos < sentenceTokens.Count; ++pos)
                {
                    CoreLabel current = sentenceTokens[pos];
                    // Only tokens that are not already part of a mention are candidates.
                    if (!"O".Equals(current.Ner()))
                    {
                        continue;
                    }
                    // Candidates must be all upper case and at least three characters long.
                    string text = current.Word();
                    if (!text.ToUpper().Equals(text) || text.Length < 3)
                    {
                        continue;
                    }
                    foreach (IList <CoreLabel> orgTokens in orgTokenLists)
                    {
                        if (!AcronymMatcher.IsAcronym(text, orgTokens))
                        {
                            continue;
                        }
                        // The token is an acronym of this organization: tag it and build its chunk.
                        current.SetNER("ORGANIZATION");
                        ICoreMap acronymChunk = ChunkAnnotationUtils.GetAnnotatedChunk(sentenceTokens, pos, pos + 1, sentenceTokenBegin, null, null, null);
                        acronymChunk.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "ORGANIZATION");
                        newMentions.Add(acronymChunk);
                    }
                }
            }
        }
 /// <summary>
 /// Test fixture set-up: writes the regex NER pattern file once (guarded by a lock on the test type)
 /// and rebuilds the untagged and NER-tagged sentence lists from the <c>words</c>, <c>tags</c>
 /// and <c>ner</c> arrays.
 /// </summary>
 public virtual void SetUp()
 {
     lock (typeof(RegexNERSequenceClassifierTest))
     {
         if (tempFile == null)
         {
             tempFile = File.CreateTempFile("regexnertest.patterns", "txt");
             FileWriter fout = new FileWriter(tempFile);
             try
             {
                 BufferedWriter bout = new BufferedWriter(fout);
                 bout.Write("sausage\tfood\n");
                 bout.Write("(avocet|curlew)(s?)\tshorebird\n");
                 bout.Write("shoreline park\tpark\n");
                 bout.Flush();
             }
             finally
             {
                 // Close the underlying writer even when a write fails, so the file handle is not leaked.
                 fout.Close();
             }
         }
     }
     sentences    = new List <IList <CoreLabel> >();
     NERsentences = new List <IList <CoreLabel> >();
     // The three fixture arrays must describe the same number of sentences.
     NUnit.Framework.Assert.AreEqual(words.Length, tags.Length);
     NUnit.Framework.Assert.AreEqual(words.Length, ner.Length);
     for (int snum = 0; snum < words.Length; ++snum)
     {
         // Each fixture entry is one sentence with space-separated pieces.
         string[] wordPieces = words[snum].Split(' ');
         string[] tagPieces  = tags[snum].Split(' ');
         string[] nerPieces  = ner[snum].Split(' ');
         NUnit.Framework.Assert.AreEqual(wordPieces.Length, tagPieces.Length);
         NUnit.Framework.Assert.AreEqual(wordPieces.Length, nerPieces.Length, "Input " + snum + " " + words[snum] + " of different length than " + ner[snum]);
         IList <CoreLabel> sentence    = new List <CoreLabel>();
         IList <CoreLabel> NERsentence = new List <CoreLabel>();
         for (int wnum = 0; wnum < wordPieces.Length; ++wnum)
         {
             // Token without an NER tag.
             CoreLabel token = new CoreLabel();
             token.SetWord(wordPieces[wnum]);
             token.SetTag(tagPieces[wnum]);
             sentence.Add(token);
             // Same token, additionally carrying the NER tag from the fixture.
             CoreLabel NERtoken = new CoreLabel();
             NERtoken.SetWord(wordPieces[wnum]);
             NERtoken.SetTag(tagPieces[wnum]);
             NERtoken.SetNER(nerPieces[wnum]);
             NERsentence.Add(NERtoken);
         }
         sentences.Add(sentence);
         NERsentences.Add(NERsentence);
     }
 }
Example #6
0
        /// <summary>Create an Annotation object (with a single sentence) from the given specification.</summary>
        /// <remarks>
        /// <paramref name="words"/>, <paramref name="lemmas"/>, <paramref name="pos"/> and <paramref name="ner"/>
        /// are parallel lists with one entry per token. Character offsets are assigned as if the words were
        /// joined by a single separator character.
        /// </remarks>
        /// <exception cref="ArgumentException">If the parallel lists differ in length from <paramref name="words"/>.</exception>
        private static Annotation ParseSentence(Optional <string> docid, Optional <int> sentenceIndex, string gloss, Func <IList <CoreLabel>, SemanticGraph> tree, Func <IList <CoreLabel>, SemanticGraph> maltTree, IList <string> words, IList <string> lemmas, IList <string> pos, IList <string> ner, Optional <string> sentenceid)
        {
            // Validate that every per-token list lines up with the words.
            if (lemmas.Count != words.Count)
            {
                throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + lemmas.Count + " (sentence " + sentenceid.OrElse("???") + ")");
            }
            if (pos.Count != words.Count)
            {
                throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + pos.Count + " (sentence " + sentenceid.OrElse("???") + ")");
            }
            if (ner.Count != words.Count)
            {
                throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + ner.Count + " (sentence " + sentenceid.OrElse("???") + ")");
            }

            // Defaults applied when the optional identifiers are absent.
            string documentId    = docid.OrElse("???");
            int    sentenceNumber = sentenceIndex.OrElse(-1);

            // Build one CoreLabel per word.
            int tokenCount = words.Count;
            IList <CoreLabel> tokens = new List <CoreLabel>(tokenCount);
            int charOffset = 0;

            for (int idx = 0; idx < tokenCount; ++idx)
            {
                string surface = words[idx];
                int    endChar = charOffset + surface.Length;

                CoreLabel token = new CoreLabel(12);
                token.SetWord(surface);
                token.SetValue(surface);
                token.SetBeginPosition(charOffset);
                token.SetEndPosition(endChar);
                // Advance past this word plus one separator character.
                charOffset = endChar + 1;
                token.SetLemma(lemmas[idx]);
                token.SetTag(pos[idx]);
                token.SetNER(ner[idx]);
                token.Set(typeof(CoreAnnotations.DocIDAnnotation), documentId);
                token.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceNumber);
                token.Set(typeof(CoreAnnotations.IndexAnnotation), idx + 1);
                token.Set(typeof(CoreAnnotations.TokenBeginAnnotation), idx);
                token.Set(typeof(CoreAnnotations.TokenEndAnnotation), idx + 1);
                tokens.Add(token);
            }

            // Turn the escaped newline and tab sequences in the gloss into real characters.
            gloss = gloss.Replace("\\n", "\n").Replace("\\t", "\t");

            // Assemble the sentence; the same graph fills all three primary dependency slots.
            ICoreMap sentence = new ArrayCoreMap(16);
            sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
            SemanticGraph primaryGraph = tree.Apply(tokens);
            sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), primaryGraph);
            sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), primaryGraph);
            sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), primaryGraph);
            SemanticGraph alternativeGraph = maltTree.Apply(tokens);
            sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), alternativeGraph);
            sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), documentId);
            sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceNumber);
            sentence.Set(typeof(CoreAnnotations.TextAnnotation), gloss);
            sentence.Set(typeof(CoreAnnotations.TokenBeginAnnotation), 0);
            sentence.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokens.Count);

            // Wrap the single sentence in a document-level annotation.
            Annotation document = new Annotation(gloss);
            document.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
            document.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sentence));
            document.Set(typeof(CoreAnnotations.DocIDAnnotation), documentId);
            document.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceNumber);
            return document;
        }
Example #7
0
        /// <summary>
        /// Builds a dependency tree from a CoNLL formatted string and runs a
        /// <c>ClauseSplitterSearchProblem</c> search over it.
        /// </summary>
        /// <remarks>
        /// Blank lines are skipped. Every other line is split on runs of whitespace into columns:
        /// column 0 is the token index, column 1 the word, column 2 the 1-based position of the governor
        /// (0 marks a root), column 3 the relation name, and, when present, column 4 the POS tag,
        /// column 5 the NER tag and column 6 the lemma.
        /// </remarks>
        /// <param name="conll">The CoNLL formatted tree.</param>
        /// <returns>
        /// The clause collection. Nothing in this method adds to it and it is not handed to the search,
        /// so it is returned empty.
        /// </returns>
        protected internal virtual ICollection <string> Clauses(string conll)
        {
            IList <CoreLabel> sentence = new List <CoreLabel>();
            SemanticGraph     tree     = new SemanticGraph();
            // Columns of every non-blank row, kept so edges can be added once all vertices exist.
            IList <string[]>  rows     = new List <string[]>();

            foreach (string rawLine in conll.Split('\n'))
            {
                string line = rawLine.Trim();
                if (line.Length == 0)
                {
                    continue;
                }
                // Split on whitespace runs with a real regex: string.Split("\\s+") treats the pattern
                // as a literal separator and would leave the whole row in fields[0].
                string[] fields = System.Text.RegularExpressions.Regex.Split(line, "\\s+");
                rows.Add(fields);
                int       index = System.Convert.ToInt32(fields[0]);
                string    word  = fields[1];
                CoreLabel label = MkWord(word, index);
                sentence.Add(label);
                if (fields[2].Equals("0"))
                {
                    tree.AddRoot(new IndexedWord(label));
                }
                else
                {
                    tree.AddVertex(new IndexedWord(label));
                }
                if (fields.Length > 4)
                {
                    label.SetTag(fields[4]);
                }
                if (fields.Length > 5)
                {
                    label.SetNER(fields[5]);
                }
                if (fields.Length > 6)
                {
                    label.SetLemma(fields[6]);
                }
            }
            // Second pass: connect each non-root token to its governor.
            for (int i = 0; i < rows.Count; i++)
            {
                string[] fields = rows[i];
                int      parent = System.Convert.ToInt32(fields[2]);
                string   reln   = fields[3];
                if (parent > 0)
                {
                    // The governor is addressed by its 1-based position among the non-blank rows.
                    tree.AddEdge(new IndexedWord(sentence[parent - 1]), new IndexedWord(sentence[i]), new GrammaticalRelation(Language.English, reln, null, null), 1.0, false);
                }
            }
            // Run extractor
            ClauseSplitterSearchProblem problem = new ClauseSplitterSearchProblem(tree, true);
            ICollection <string>        clauses = new HashSet <string>();

            // NOTE(review): the first and fourth arguments are null; presumably these were callbacks
            // (one of them collecting results into `clauses`) that were lost in translation - confirm
            // against the Search signature and restore them.
            problem.Search(null, new LinearClassifier <ClauseSplitter.ClauseClassifierLabel, string>(new ClassicCounter <Pair <string, ClauseSplitter.ClauseClassifierLabel> >()), ClauseSplitterSearchProblem.HardSplits, null, 100000);
            return(clauses);
        }