/// <summary>
/// Re-tags acronym-like tokens as ORGANIZATION when they are an acronym of an
/// organization mention that already exists somewhere in the same document.
/// </summary>
/// <remarks>
/// A token is a candidate when its NER tag is "O", it is at least three characters
/// long, and upper-casing it leaves it unchanged. Documents with more than 100
/// organization mentions are skipped entirely.
/// </remarks>
/// <param name="ann">
/// Document annotation. Its sentences are read, and matching tokens have their NER
/// tag overwritten in place.
/// </param>
private void AddAcronyms(Annotation ann)
        {
            // Gather every mention from every sentence of the document.
            IList <ICoreMap> allMentionsSoFar = new List <ICoreMap>();

            foreach (ICoreMap sentence in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                Sharpen.Collections.AddAll(allMentionsSoFar, sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)));
            }

            // Keep only the token spans of mentions tagged as organizations.
            IList <IList <CoreLabel> > organizations = new List <IList <CoreLabel> >();

            foreach (ICoreMap mention in allMentionsSoFar)
            {
                if ("ORGANIZATION".Equals(mention.Get(nerCoreAnnotationClass)))
                {
                    organizations.Add(mention.Get(typeof(CoreAnnotations.TokensAnnotation)));
                }
            }

            // Skip very long documents: the scan below is tokens x organizations.
            if (organizations.Count > 100)
            {
                return;
            }

            // Iterate over tokens...
            foreach (ICoreMap sentence_1 in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                // NOTE(review): this list is filled but never read or attached to the sentence
                // inside this method - confirm whether the chunks were meant to be stored
                // (e.g. on the sentence's MentionsAnnotation).
                IList <ICoreMap>  sentenceMentions = new List <ICoreMap>();
                IList <CoreLabel> tokens           = sentence_1.Get(typeof(CoreAnnotations.TokensAnnotation));
                int totalTokensOffset = sentence_1.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                for (int i = 0; i < tokens.Count; ++i)
                {
                    // ... that look like they might be an acronym and are not already a mention
                    CoreLabel token = tokens[i];
                    string    word  = token.Word();
                    if ("O".Equals(token.Ner()) && word.ToUpper().Equals(word) && word.Length >= 3)
                    {
                        foreach (IList <CoreLabel> org in organizations)
                        {
                            // ... and actually are an acronym
                            if (AcronymMatcher.IsAcronym(word, org))
                            {
                                // ... and add them.
                                token.SetNER("ORGANIZATION");
                                ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, i, i + 1, totalTokensOffset, null, null, null);
                                chunk.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "ORGANIZATION");
                                sentenceMentions.Add(chunk);
                                // One match is enough: further organizations would only re-set the
                                // same tag and create duplicate chunks for this token.
                                break;
                            }
                        }
                    }
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Builds the counter of "B-"-prefixed features that describe a single mention.
        /// </summary>
        /// <remarks>
        /// The features record the NER tag of the head word, whether the normalized span is
        /// one of <paramref name="neStrings"/>, the word and tag of the tokens immediately
        /// before/after the mention and at its first/last position, and whether the mention
        /// contains, or is contained in, another mention from <paramref name="shares"/>.
        /// </remarks>
        /// <param name="p">Mention to extract features for.</param>
        /// <param name="shares">Mentions tested for containment against <paramref name="p"/>; <paramref name="p"/> itself is skipped.</param>
        /// <param name="neStrings">Strings checked for the lowercase-normalized span of <paramref name="p"/>.</param>
        /// <param name="dict">Not read by this method.</param>
        /// <param name="props">Not read by this method.</param>
        /// <returns>A counter incremented once for every feature that fired.</returns>
        public static ICounter <string> ExtractFeatures(Mention p, ICollection <Mention> shares, ICollection <string> neStrings, Dictionaries dict, Properties props)
        {
            ICounter <string> featureCounts  = new ClassicCounter <string>();
            string            normalizedSpan = p.LowercaseNormalizedSpanString();
            string            headNerTag     = p.headWord.Ner();
            int spanStart = p.startIndex;
            int spanEnd   = p.endIndex;
            IList <CoreLabel> sentenceTokens = p.sentenceWords;

            // Neighbouring tokens stay null when the mention touches a sentence boundary.
            CoreLabel before = null;
            if (spanStart != 0)
            {
                before = sentenceTokens[spanStart - 1];
            }
            CoreLabel after = null;
            if (spanEnd != sentenceTokens.Count)
            {
                after = sentenceTokens[spanEnd];
            }
            CoreLabel spanFirst = p.originalSpan[0];
            CoreLabel spanLast  = p.originalSpan[p.originalSpan.Count - 1];

            featureCounts.IncrementCount("B-NETYPE-" + headNerTag);

            if (neStrings.Contains(normalizedSpan))
            {
                featureCounts.IncrementCount("B-NE-STRING-EXIST");

                // "Full span": neither neighbour carries the same NER tag as the head word.
                // The right side is only inspected once the left side has passed.
                bool leftDiffers = before == null || !before.Ner().Equals(headNerTag);
                if (leftDiffers)
                {
                    bool rightDiffers = after == null || !after.Ner().Equals(headNerTag);
                    if (rightDiffers)
                    {
                        featureCounts.IncrementCount("B-NE-FULLSPAN");
                    }
                }
            }

            // Context words first, then context tags.
            if (before != null)
            {
                featureCounts.IncrementCount("B-PRECEDINGWORD-" + before.Word());
            }
            if (after != null)
            {
                featureCounts.IncrementCount("B-FOLLOWINGWORD-" + after.Word());
            }
            if (before != null)
            {
                featureCounts.IncrementCount("B-PRECEDINGPOS-" + before.Tag());
            }
            if (after != null)
            {
                featureCounts.IncrementCount("B-FOLLOWINGPOS-" + after.Tag());
            }

            featureCounts.IncrementCount("B-FIRSTWORD-" + spanFirst.Word());
            featureCounts.IncrementCount("B-FIRSTPOS-" + spanFirst.Tag());
            featureCounts.IncrementCount("B-LASTWORD-" + spanLast.Word());
            // NOTE(review): the tag of the last token is counted under the "B-LASTWORD-" prefix,
            // not a "B-LASTPOS-" one like its first-token counterpart - possibly a slip; left
            // unchanged because renaming a feature alters the output.
            featureCounts.IncrementCount("B-LASTWORD-" + spanLast.Tag());

            // Does this mention contain any other mention?
            bool containsAnother = false;
            foreach (Mention candidate in shares)
            {
                if (!(candidate == p) && candidate.InsideIn(p))
                {
                    containsAnother = true;
                    break;
                }
            }
            if (containsAnother)
            {
                featureCounts.IncrementCount("B-BIGGER-THAN-ANOTHER");
            }

            // Is this mention contained in any other mention?
            bool containedInAnother = false;
            foreach (Mention candidate in shares)
            {
                if (!(candidate == p) && p.InsideIn(candidate))
                {
                    containedInAnother = true;
                    break;
                }
            }
            if (containedInAnother)
            {
                featureCounts.IncrementCount("B-SMALLER-THAN-ANOTHER");
            }

            return featureCounts;
        }