/// <summary>
/// Re-tags acronym-like tokens as organizations.
/// Collects the token lists of every mention in <paramref name="ann"/> whose
/// <c>nerCoreAnnotationClass</c> value is "ORGANIZATION", then walks every
/// sentence token that is tagged "O", is at least three characters long and is
/// unchanged by upper-casing. The first organization for which
/// <c>AcronymMatcher.IsAcronym</c> accepts the token causes the token's NER tag
/// to be set to "ORGANIZATION".
/// </summary>
/// <param name="ann">Document annotation whose sentences, mentions and tokens are read; matching tokens are modified in place.</param>
private void AddAcronyms(Annotation ann)
{
    // Find all the organizations in a document.
    IList<ICoreMap> allMentionsSoFar = new List<ICoreMap>();
    foreach (ICoreMap sentence in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        Sharpen.Collections.AddAll(allMentionsSoFar, sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)));
    }
    IList<IList<CoreLabel>> organizations = new List<IList<CoreLabel>>();
    foreach (ICoreMap mention in allMentionsSoFar)
    {
        if ("ORGANIZATION".Equals(mention.Get(nerCoreAnnotationClass)))
        {
            organizations.Add(mention.Get(typeof(CoreAnnotations.TokensAnnotation)));
        }
    }

    // With no organizations there is nothing to match against; documents with
    // more than 100 organization mentions are skipped entirely.
    if (organizations.Count == 0 || organizations.Count > 100)
    {
        return;
    }

    // Iterate over tokens...
    foreach (ICoreMap sentence in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        // NOTE(review): this list is filled below but never read or attached to
        // the sentence inside this method - confirm whether the chunks were
        // meant to be added to the sentence's mentions.
        IList<ICoreMap> sentenceMentions = new List<ICoreMap>();
        IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        int totalTokensOffset = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        for (int i = 0; i < tokens.Count; ++i)
        {
            // ... that look like they might be an acronym and are not already a mention
            CoreLabel token = tokens[i];
            if (!"O".Equals(token.Ner()))
            {
                continue;
            }
            string word = token.Word();
            // Cheap length test first; the upper-case comparison is culture-independent
            // so the result does not vary with the current thread culture.
            if (word.Length < 3 || !word.ToUpperInvariant().Equals(word))
            {
                continue;
            }
            foreach (IList<CoreLabel> org in organizations)
            {
                // ... and actually are an acronym
                if (AcronymMatcher.IsAcronym(word, org))
                {
                    // ... and add them.
                    token.SetNER("ORGANIZATION");
                    ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, i, i + 1, totalTokensOffset, null, null, null);
                    chunk.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "ORGANIZATION");
                    sentenceMentions.Add(chunk);
                    // One match is enough: continuing would only re-tag the same
                    // token and create duplicate chunks for it.
                    break;
                }
            }
        }
    }
}
/// <summary>
/// Builds the feature counter for mention <paramref name="p"/>.
/// Features cover the head word's NER type, whether the normalized span is a
/// known NE string, the words and tags immediately before and after the span,
/// the first and last words of the original span, and whether the mention
/// contains or is contained in another mention from <paramref name="shares"/>.
/// </summary>
/// <param name="p">Mention to describe.</param>
/// <param name="shares">Mentions compared against <paramref name="p"/> via <c>InsideIn</c>; <paramref name="p"/> itself is skipped.</param>
/// <param name="neStrings">Collection checked for the mention's lowercase normalized span string.</param>
/// <param name="dict">Not read by this method.</param>
/// <param name="props">Not read by this method.</param>
/// <returns>A counter holding one increment per feature that fired.</returns>
public static ICounter<string> ExtractFeatures(Mention p, ICollection<Mention> shares, ICollection<string> neStrings, Dictionaries dict, Properties props)
{
    ICounter<string> counts = new ClassicCounter<string>();
    string normalizedSpan = p.LowercaseNormalizedSpanString();
    string headNer = p.headWord.Ner();
    int begin = p.startIndex;
    int end = p.endIndex;
    IList<CoreLabel> sentenceTokens = p.sentenceWords;

    // Neighbouring tokens; null when the span touches the sentence edge.
    CoreLabel before = null;
    if (begin != 0)
    {
        before = sentenceTokens[begin - 1];
    }
    CoreLabel after = null;
    if (end != sentenceTokens.Count)
    {
        after = sentenceTokens[end];
    }
    CoreLabel spanHead = p.originalSpan[0];
    CoreLabel spanTail = p.originalSpan[p.originalSpan.Count - 1];

    counts.IncrementCount("B-NETYPE-" + headNer);

    if (neStrings.Contains(normalizedSpan))
    {
        counts.IncrementCount("B-NE-STRING-EXIST");
        // The span is a "full span" when neither neighbour carries the same NER
        // tag as the head word. The right side is only inspected once the left
        // side qualifies.
        bool leftIsBoundary = before == null || !before.Ner().Equals(headNer);
        if (leftIsBoundary)
        {
            bool rightIsBoundary = after == null || !after.Ner().Equals(headNer);
            if (rightIsBoundary)
            {
                counts.IncrementCount("B-NE-FULLSPAN");
            }
        }
    }

    // Context words, then context tags.
    if (before != null)
    {
        counts.IncrementCount("B-PRECEDINGWORD-" + before.Word());
    }
    if (after != null)
    {
        counts.IncrementCount("B-FOLLOWINGWORD-" + after.Word());
    }
    if (before != null)
    {
        counts.IncrementCount("B-PRECEDINGPOS-" + before.Tag());
    }
    if (after != null)
    {
        counts.IncrementCount("B-FOLLOWINGPOS-" + after.Tag());
    }

    counts.IncrementCount("B-FIRSTWORD-" + spanHead.Word());
    counts.IncrementCount("B-FIRSTPOS-" + spanHead.Tag());
    counts.IncrementCount("B-LASTWORD-" + spanTail.Word());
    // NOTE(review): the tag of the last word is also emitted under the
    // "B-LASTWORD-" prefix (compare "B-FIRSTPOS-" above). Left unchanged because
    // the feature name is a runtime key - confirm before renaming.
    counts.IncrementCount("B-LASTWORD-" + spanTail.Tag());

    // Does this mention contain some other mention?
    foreach (Mention other in shares)
    {
        if (other == p || !other.InsideIn(p))
        {
            continue;
        }
        counts.IncrementCount("B-BIGGER-THAN-ANOTHER");
        break;
    }

    // Is this mention contained in some other mention?
    foreach (Mention other in shares)
    {
        if (other == p || !p.InsideIn(other))
        {
            continue;
        }
        counts.IncrementCount("B-SMALLER-THAN-ANOTHER");
        break;
    }

    return counts;
}