/// <summary>
/// Annotates copies of the given tokens via <c>DoOneSentence</c> and writes the resulting
/// NER tag and normalized NER value back onto the original tokens.
/// </summary>
/// <typeparam name="TToken">The concrete token type; must be a <c>CoreLabel</c>.</typeparam>
/// <param name="tokens">Tokens of one sentence; their NER fields are updated in place.</param>
private void AnnotateTokens<TToken>(IList<TToken> tokens) where TToken : CoreLabel
{
    // Make a copy of the tokens before annotating because QuantifiableEntityNormalizer may change the POS too
    IList<CoreLabel> words = new List<CoreLabel>();
    foreach (CoreLabel token in tokens)
    {
        CoreLabel word = new CoreLabel();
        word.SetWord(token.Word());
        word.SetNER(token.Ner());
        word.SetTag(token.Tag());
        // copy fields potentially set by SUTime
        NumberSequenceClassifier.TransferAnnotations(token, word);
        words.Add(word);
    }

    // Only the copies are handed to the annotator; the results are read back from them below.
    DoOneSentence(words);

    // TODO: If collapsed is set, tokens for entities are collapsed into one node then
    // (words.Count != tokens.Count) and the positional copy-back below does not work.
    for (int i = 0; i < words.Count; i++)
    {
        string ner = words[i].Ner();
        tokens[i].SetNER(ner);
        tokens[i].Set(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation), words[i].Get(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation)));
    }
}
/// <summary>
/// Builds a fresh <c>CoreLabel</c> whose word and original text are both
/// <paramref name="word"/> and whose NER tag is <paramref name="ner"/>.
/// </summary>
/// <param name="word">Surface form, used for both the word and the original text.</param>
/// <param name="ner">Named-entity tag to assign.</param>
/// <returns>The newly created label.</returns>
private CoreLabel MkLabel(string word, string ner)
{
    CoreLabel result = new CoreLabel();
    result.SetWord(word);
    result.SetOriginalText(word);
    result.SetNER(ner);
    return result;
}
/// <summary>Parse a CoNLL formatted tree into a SemanticGraph object (along with a list of tokens).</summary>
/// <remarks>
/// Each non-blank line is split on runs of whitespace into columns:
/// 0 = token index, 1 = word, 2 = parent index (0 marks the root), 3 = relation name,
/// and optionally 4 = POS tag, 5 = NER tag, 6 = lemma.
/// Parent indices are resolved positionally (parent <c>p</c> is the <c>p</c>-th non-blank line).
/// </remarks>
/// <param name="conll">The CoNLL formatted tree.</param>
/// <returns>
/// A pair of a SemanticGraph and a token list, corresponding to the parse of the sentence
/// and to tokens in the sentence.
/// </returns>
protected internal virtual Pair<SemanticGraph, IList<CoreLabel>> MkTree(string conll)
{
    IList<CoreLabel> sentence = new List<CoreLabel>();
    SemanticGraph tree = new SemanticGraph();
    // Parsed columns of every non-blank line, kept so the edge pass does not re-split the input.
    IList<string[]> rows = new List<string[]>();

    // Pass 1: create one vertex (or the root) per line.
    foreach (string line in conll.Split("\n"))
    {
        string trimmed = line.Trim();
        if (trimmed.Equals(string.Empty))
        {
            continue;
        }
        // "\\s+" is a regular expression; string.Split would treat it as a literal separator.
        string[] fields = System.Text.RegularExpressions.Regex.Split(trimmed, "\\s+");
        rows.Add(fields);
        int index = System.Convert.ToInt32(fields[0]);
        string word = fields[1];
        CoreLabel label = IETestUtils.MkWord(word, index);
        sentence.Add(label);
        if (fields[2].Equals("0"))
        {
            tree.AddRoot(new IndexedWord(label));
        }
        else
        {
            tree.AddVertex(new IndexedWord(label));
        }
        if (fields.Length > 4)
        {
            label.SetTag(fields[4]);
        }
        if (fields.Length > 5)
        {
            label.SetNER(fields[5]);
        }
        if (fields.Length > 6)
        {
            label.SetLemma(fields[6]);
        }
    }

    // Pass 2: all vertices exist now, so edges to later tokens can be added safely.
    for (int i = 0; i < rows.Count; i++)
    {
        string[] fields = rows[i];
        int parent = System.Convert.ToInt32(fields[2]);
        string reln = fields[3];
        if (parent > 0)
        {
            tree.AddEdge(new IndexedWord(sentence[parent - 1]), new IndexedWord(sentence[i]), new GrammaticalRelation(Language.UniversalEnglish, reln, null, null), 1.0, false);
        }
    }
    return Pair.MakePair(tree, sentence);
}
/// <summary>
/// Re-tags all-uppercase tokens of length three or more that currently carry the NER tag "O"
/// as "ORGANIZATION" when they are an acronym of an organization mention already present
/// in the document.
/// </summary>
/// <param name="ann">The document annotation; matching tokens have their NER tag overwritten in place.</param>
private void AddAcronyms(Annotation ann)
{
    // Gather every mention recorded on any sentence of the document.
    IList<ICoreMap> documentMentions = new List<ICoreMap>();
    foreach (ICoreMap sent in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        Sharpen.Collections.AddAll(documentMentions, sent.Get(typeof(CoreAnnotations.MentionsAnnotation)));
    }

    // Keep only the token lists of mentions tagged ORGANIZATION.
    IList<IList<CoreLabel>> orgTokenLists = new List<IList<CoreLabel>>();
    foreach (ICoreMap candidate in documentMentions)
    {
        if (!"ORGANIZATION".Equals(candidate.Get(nerCoreAnnotationClass)))
        {
            continue;
        }
        orgTokenLists.Add(candidate.Get(typeof(CoreAnnotations.TokensAnnotation)));
    }

    // Skip very long documents
    if (orgTokenLists.Count > 100)
    {
        return;
    }

    foreach (ICoreMap currentSentence in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        // NOTE(review): this list is filled below but never stored on the sentence or read again;
        // presumably it was meant to be merged into the sentence's mentions - confirm.
        IList<ICoreMap> sentenceMentions = new List<ICoreMap>();
        IList<CoreLabel> tokens = currentSentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        int totalTokensOffset = currentSentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        for (int pos = 0; pos < tokens.Count; ++pos)
        {
            CoreLabel token = tokens[pos];
            // Only tokens that are not already part of an entity are considered.
            if (!"O".Equals(token.Ner()))
            {
                continue;
            }
            // Acronym candidates are fully upper-case and at least three characters long.
            string surface = token.Word();
            if (!surface.ToUpper().Equals(surface) || surface.Length < 3)
            {
                continue;
            }
            foreach (IList<CoreLabel> orgTokens in orgTokenLists)
            {
                if (!AcronymMatcher.IsAcronym(surface, orgTokens))
                {
                    continue;
                }
                // The token is an acronym of this organization: tag it and build a one-token chunk.
                token.SetNER("ORGANIZATION");
                ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, pos, pos + 1, totalTokensOffset, null, null, null);
                chunk.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "ORGANIZATION");
                sentenceMentions.Add(chunk);
            }
        }
    }
}
/// <summary>
/// Test fixture set-up: writes the regex-NER pattern file once (guarded by a lock on the test type)
/// and rebuilds the untagged and NER-tagged sentence lists from the <c>words</c>, <c>tags</c>
/// and <c>ner</c> fixtures.
/// </summary>
public virtual void SetUp()
{
    // The pattern file is created only while tempFile is still null; the lock keeps
    // concurrent set-ups from creating it twice.
    lock (typeof(RegexNERSequenceClassifierTest))
    {
        if (tempFile == null)
        {
            tempFile = File.CreateTempFile("regexnertest.patterns", "txt");
            FileWriter fout = new FileWriter(tempFile);
            try
            {
                BufferedWriter bout = new BufferedWriter(fout);
                bout.Write("sausage\tfood\n");
                bout.Write("(avocet|curlew)(s?)\tshorebird\n");
                bout.Write("shoreline park\tpark\n");
                bout.Flush();
            }
            finally
            {
                // Release the file handle even when a write or the flush fails.
                fout.Close();
            }
        }
    }

    sentences = new List<IList<CoreLabel>>();
    NERsentences = new List<IList<CoreLabel>>();
    // The three fixture arrays must describe the same number of sentences.
    NUnit.Framework.Assert.AreEqual(words.Length, tags.Length);
    NUnit.Framework.Assert.AreEqual(words.Length, ner.Length);
    for (int snum = 0; snum < words.Length; ++snum)
    {
        // Each fixture entry is a space-separated sentence; the pieces must line up one to one.
        string[] wordPieces = words[snum].Split(" ");
        string[] tagPieces = tags[snum].Split(" ");
        string[] nerPieces = ner[snum].Split(" ");
        NUnit.Framework.Assert.AreEqual(wordPieces.Length, tagPieces.Length);
        NUnit.Framework.Assert.AreEqual(wordPieces.Length, nerPieces.Length, "Input " + snum + " " + words[snum] + " of different length than " + ner[snum]);
        IList<CoreLabel> sentence = new List<CoreLabel>();
        IList<CoreLabel> NERsentence = new List<CoreLabel>();
        for (int wnum = 0; wnum < wordPieces.Length; ++wnum)
        {
            // Plain token: word and POS tag only.
            CoreLabel token = new CoreLabel();
            token.SetWord(wordPieces[wnum]);
            token.SetTag(tagPieces[wnum]);
            sentence.Add(token);
            // Parallel token that additionally carries the NER tag from the fixture.
            CoreLabel NERtoken = new CoreLabel();
            NERtoken.SetWord(wordPieces[wnum]);
            NERtoken.SetTag(tagPieces[wnum]);
            NERtoken.SetNER(nerPieces[wnum]);
            NERsentence.Add(NERtoken);
        }
        sentences.Add(sentence);
        NERsentences.Add(NERsentence);
    }
}
/// <summary>Create an Annotation object (with a single sentence) from the given specification.</summary>
/// <param name="docid">Document id; "???" is stored when absent.</param>
/// <param name="sentenceIndex">Sentence index; -1 is stored when absent.</param>
/// <param name="gloss">Sentence text; the two-character sequences \n and \t are unescaped before use.</param>
/// <param name="tree">Builds the dependency graph stored under the basic, collapsed and CC-processed keys.</param>
/// <param name="maltTree">Builds the graph stored under the alternative-dependencies key.</param>
/// <param name="words">Token words.</param>
/// <param name="lemmas">Token lemmas, parallel to <paramref name="words"/>.</param>
/// <param name="pos">Token POS tags, parallel to <paramref name="words"/>.</param>
/// <param name="ner">Token NER tags, parallel to <paramref name="words"/>.</param>
/// <param name="sentenceid">Sentence id, used only in error messages.</param>
/// <returns>A document annotation holding the tokens and exactly one sentence.</returns>
/// <exception cref="ArgumentException">When lemmas, pos or ner differ in length from words.</exception>
private static Annotation ParseSentence(Optional<string> docid, Optional<int> sentenceIndex, string gloss, Func<IList<CoreLabel>, SemanticGraph> tree, Func<IList<CoreLabel>, SemanticGraph> maltTree, IList<string> words, IList<string> lemmas, IList<string> pos, IList<string> ner, Optional<string> sentenceid)
{
    // Every per-token list has to be parallel to the word list.
    if (lemmas.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + lemmas.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    if (pos.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + pos.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    if (ner.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + ner.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }

    // Defaults shared by the token, sentence and document levels.
    string documentId = docid.OrElse("???");
    int sentenceNumber = sentenceIndex.OrElse(-1);

    // Tokens: character offsets assume the words are joined by single spaces.
    IList<CoreLabel> tokens = new List<CoreLabel>(words.Count);
    int charOffset = 0;
    for (int idx = 0; idx < words.Count; ++idx)
    {
        string text = words[idx];
        CoreLabel tok = new CoreLabel(12);
        tok.SetWord(text);
        tok.SetValue(text);
        tok.SetBeginPosition(charOffset);
        tok.SetEndPosition(charOffset + text.Length);
        charOffset += text.Length + 1;
        tok.SetLemma(lemmas[idx]);
        tok.SetTag(pos[idx]);
        tok.SetNER(ner[idx]);
        tok.Set(typeof(CoreAnnotations.DocIDAnnotation), documentId);
        tok.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceNumber);
        // IndexAnnotation is 1-based, the token span [begin, end) is 0-based.
        tok.Set(typeof(CoreAnnotations.IndexAnnotation), idx + 1);
        tok.Set(typeof(CoreAnnotations.TokenBeginAnnotation), idx);
        tok.Set(typeof(CoreAnnotations.TokenEndAnnotation), idx + 1);
        tokens.Add(tok);
    }

    // Turn escaped newline / tab sequences in the gloss into the real characters.
    gloss = gloss.Replace("\\n", "\n").Replace("\\t", "\t");

    // Sentence: the same graph instance is registered under all three dependency flavours.
    ICoreMap sentence = new ArrayCoreMap(16);
    sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    SemanticGraph dependencies = tree.Apply(tokens);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), dependencies);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), dependencies);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), dependencies);
    SemanticGraph alternative = maltTree.Apply(tokens);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), alternative);
    sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), documentId);
    sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceNumber);
    sentence.Set(typeof(CoreAnnotations.TextAnnotation), gloss);
    sentence.Set(typeof(CoreAnnotations.TokenBeginAnnotation), 0);
    sentence.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokens.Count);

    // Document wrapping the single sentence.
    Annotation document = new Annotation(gloss);
    document.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    document.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sentence));
    document.Set(typeof(CoreAnnotations.DocIDAnnotation), documentId);
    document.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceNumber);
    return document;
}
/// <summary>
/// Parses a CoNLL formatted tree into a SemanticGraph and runs the clause splitter search over it.
/// </summary>
/// <remarks>
/// Each non-blank line is split on runs of whitespace into columns:
/// 0 = token index, 1 = word, 2 = parent index (0 marks the root), 3 = relation name,
/// and optionally 4 = POS tag, 5 = NER tag, 6 = lemma.
/// </remarks>
/// <param name="conll">The CoNLL formatted tree.</param>
/// <returns>The set of clause strings collected during the search (see the review note in the body).</returns>
protected internal virtual ICollection<string> Clauses(string conll)
{
    IList<CoreLabel> sentence = new List<CoreLabel>();
    SemanticGraph tree = new SemanticGraph();
    // Parsed columns of every non-blank line, kept so the edge pass does not re-split the input.
    IList<string[]> rows = new List<string[]>();

    // Pass 1: create one vertex (or the root) per line.
    foreach (string line in conll.Split("\n"))
    {
        string trimmed = line.Trim();
        if (trimmed.Equals(string.Empty))
        {
            continue;
        }
        // "\\s+" is a regular expression; string.Split would treat it as a literal separator.
        string[] fields = System.Text.RegularExpressions.Regex.Split(trimmed, "\\s+");
        rows.Add(fields);
        int index = System.Convert.ToInt32(fields[0]);
        string word = fields[1];
        CoreLabel label = MkWord(word, index);
        sentence.Add(label);
        if (fields[2].Equals("0"))
        {
            tree.AddRoot(new IndexedWord(label));
        }
        else
        {
            tree.AddVertex(new IndexedWord(label));
        }
        if (fields.Length > 4)
        {
            label.SetTag(fields[4]);
        }
        if (fields.Length > 5)
        {
            label.SetNER(fields[5]);
        }
        if (fields.Length > 6)
        {
            label.SetLemma(fields[6]);
        }
    }

    // Pass 2: all vertices exist now, so edges to later tokens can be added safely.
    for (int i = 0; i < rows.Count; i++)
    {
        string[] fields = rows[i];
        int parent = System.Convert.ToInt32(fields[2]);
        string reln = fields[3];
        if (parent > 0)
        {
            tree.AddEdge(new IndexedWord(sentence[parent - 1]), new IndexedWord(sentence[i]), new GrammaticalRelation(Language.English, reln, null, null), 1.0, false);
        }
    }

    // Run extractor
    ClauseSplitterSearchProblem problem = new ClauseSplitterSearchProblem(tree, true);
    ICollection<string> clauses = new HashSet<string>();
    // NOTE(review): nothing in this method ever adds to `clauses`, and both callback arguments
    // of Search are null, so the returned set is always empty as written. Presumably the
    // callbacks that filled it were lost in conversion - confirm against the Search signature.
    problem.Search(null, new LinearClassifier<ClauseSplitter.ClauseClassifierLabel, string>(new ClassicCounter<Pair<string, ClauseSplitter.ClauseClassifierLabel>>()), ClauseSplitterSearchProblem.HardSplits, null, 100000);
    return clauses;
}