/// <summary>
/// Verifies that CoreLabel.SetWord() clears the cached lemma when the word
/// actually changes, and that Equals()/GetHashCode() treat a null lemma the
/// same as an absent lemma key.
/// </summary>
public virtual void TestCoreLabelSetWordBehavior()
{
    CoreLabel label = new CoreLabel();
    label.Set(typeof(CoreAnnotations.TextAnnotation), "foo");
    label.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), "B");
    label.Set(typeof(CoreAnnotations.LemmaAnnotation), "fool");

    // Snapshot the fully-populated label; the two maps must start out equal.
    ArrayCoreMap snapshot = new ArrayCoreMap(label);
    NUnit.Framework.Assert.AreEqual(snapshot, label);

    // Re-setting the identical word must not disturb anything.
    label.SetWord("foo");
    NUnit.Framework.Assert.AreEqual(snapshot, label);

    // Changing the word drops the lemma, so equality breaks...
    label.SetWord("bar");
    NUnit.Framework.Assert.IsFalse(snapshot.Equals(label));

    // ...and restoring the word does not bring the lemma back.
    label.SetWord("foo");
    NUnit.Framework.Assert.IsFalse(snapshot.Equals(label));

    // Restoring the lemma by hand makes the label equal to the snapshot again.
    label.Set(typeof(CoreAnnotations.LemmaAnnotation), "fool");
    NUnit.Framework.Assert.AreEqual(snapshot, label);

    // Hash code tracks the same lifecycle as equality.
    int baselineHash = label.GetHashCode();
    NUnit.Framework.Assert.AreEqual(snapshot.GetHashCode(), baselineHash);
    label.SetWord("bar");
    NUnit.Framework.Assert.IsFalse(baselineHash == label.GetHashCode());
    label.SetWord("foo");
    NUnit.Framework.Assert.IsFalse(baselineHash == label.GetHashCode());

    // A lemma value of null and a missing lemma key must hash identically.
    NUnit.Framework.Assert.IsTrue(label.Lemma() == null);
    int noLemmaHash = label.GetHashCode();
    label.Remove(typeof(CoreAnnotations.LemmaAnnotation));
    NUnit.Framework.Assert.AreEqual(noLemmaHash, label.GetHashCode());
    label.SetLemma(null);
    NUnit.Framework.Assert.AreEqual(noLemmaHash, label.GetHashCode());
    label.SetLemma("fool");
    NUnit.Framework.Assert.AreEqual(baselineHash, label.GetHashCode());

    // Equality must likewise ignore null-lemma vs. no-lemma-key.
    label.SetWord("bar");
    label.SetWord("foo");
    ArrayCoreMap nulledSnapshot = new ArrayCoreMap(label);
    NUnit.Framework.Assert.AreEqual(nulledSnapshot, label);
    label.Remove(typeof(CoreAnnotations.LemmaAnnotation));
    NUnit.Framework.Assert.AreEqual(nulledSnapshot, label);
}
/// <summary>Parse a CoNLL formatted tree into a SemanticGraph object (along with a list of tokens).</summary>
/// <param name="conll">The CoNLL formatted tree.</param>
/// <returns>
/// A pair of a SemanticGraph and a token list, corresponding to the parse of the sentence
/// and to tokens in the sentence.
/// </returns>
protected internal virtual Pair<SemanticGraph, IList<CoreLabel>> MkTree(string conll)
{
    IList<CoreLabel> sentence = new List<CoreLabel>();
    SemanticGraph tree = new SemanticGraph();
    // Pass 1: create one CoreLabel per CoNLL row and register it in the graph
    // as a vertex (or as the root when the head column is "0").
    foreach (string line in conll.Split("\n"))
    {
        // Skip blank lines (e.g. sentence separators).
        if (line.Trim().Equals(string.Empty))
        {
            continue;
        }
        // NOTE(review): "\\s+" looks like a Java-style regex split; presumably this
        // Split is a Sharpen-style extension with regex semantics — plain .NET
        // string.Split would treat it as a literal separator. TODO confirm.
        string[] fields = line.Trim().Split("\\s+");
        // Column 0: 1-based token index; column 1: surface form.
        int index = System.Convert.ToInt32(fields[0]);
        string word = fields[1];
        CoreLabel label = IETestUtils.MkWord(word, index);
        sentence.Add(label);
        if (fields[2].Equals("0"))
        {
            // Head "0" marks this token as the root of the dependency tree.
            tree.AddRoot(new IndexedWord(label));
        }
        else
        {
            tree.AddVertex(new IndexedWord(label));
        }
        // Optional trailing columns: 4 = POS tag, 5 = NER tag, 6 = lemma.
        if (fields.Length > 4)
        {
            label.SetTag(fields[4]);
        }
        if (fields.Length > 5)
        {
            label.SetNER(fields[5]);
        }
        if (fields.Length > 6)
        {
            label.SetLemma(fields[6]);
        }
    }
    // Pass 2: now that every vertex exists, wire up the dependency edges.
    int i = 0;
    foreach (string line_1 in conll.Split("\n"))
    {
        if (line_1.Trim().Equals(string.Empty))
        {
            continue;
        }
        string[] fields = line_1.Trim().Split("\\s+");
        // Column 2: 1-based index of the governor; column 3: relation name.
        int parent = System.Convert.ToInt32(fields[2]);
        string reln = fields[3];
        if (parent > 0)
        {
            // parent == 0 is the root, which has no incoming edge.
            tree.AddEdge(new IndexedWord(sentence[parent - 1]), new IndexedWord(sentence[i]), new GrammaticalRelation(Language.UniversalEnglish, reln, null, null), 1.0, false);
        }
        i += 1;
    }
    return (Pair.MakePair(tree, sentence));
}
/// <summary>
/// Rewrites the leaves of a tree in place: optionally replaces each surface
/// form with its (PTB-escaped) lemma, and/or appends lemma + morphological
/// analysis to the leaf value using the MorphoFeatureSpecification markers.
/// Also bumps the static nTokens / nMorphAnalyses counters.
/// </summary>
/// <param name="tree">Tree whose yield consists of CoreLabels; mutated in place.</param>
/// <param name="lemmasAsLeaves">If true, the lemma becomes the leaf's word and value.</param>
/// <param name="addMorphoToLeaves">If true, lemma and morph analysis are appended to the leaf value.</param>
/// <exception cref="ArgumentException">If any yielded label is not a CoreLabel.</exception>
public static void MungeLeaves(Tree tree, bool lemmasAsLeaves, bool addMorphoToLeaves)
{
    IList<ILabel> labels = tree.Yield();
    foreach (ILabel label in labels)
    {
        ++nTokens;
        if (!(label is CoreLabel))
        {
            throw new ArgumentException("Only works with CoreLabels trees");
        }
        CoreLabel coreLabel = (CoreLabel)label;
        string lemma = coreLabel.Lemma();
        // PTB escaping since we're going to put this in the leaf.
        if (lemma == null)
        {
            // No lemma, so just use the surface form.
            lemma = coreLabel.Word();
        }
        else if (lemma.Equals("("))
        {
            lemma = "-LRB-";
        }
        else if (lemma.Equals(")"))
        {
            lemma = "-RRB-";
        }
        if (lemmasAsLeaves)
        {
            coreLabel.SetWord(lemma);
            coreLabel.SetValue(lemma);
            coreLabel.SetLemma(lemma);
        }
        if (addMorphoToLeaves)
        {
            string morphStr = coreLabel.OriginalText();
            if (morphStr == null || morphStr.Equals(string.Empty))
            {
                morphStr = MorphoFeatureSpecification.NoAnalysis;
            }
            else
            {
                ++nMorphAnalyses;
            }
            // Normalize punctuation analyses.
            if (morphStr.StartsWith("PONCT"))
            {
                morphStr = "PUNC";
            }
            // BUGFIX: the original used Java-style "%s%s%s%s%s" placeholders, which
            // .NET string.Format leaves untouched (the arguments were never
            // substituted). Use {0}-style composite-format indices instead.
            string newLeaf = string.Format("{0}{1}{2}{3}{4}", coreLabel.Value(), MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, morphStr);
            coreLabel.SetValue(newLeaf);
            coreLabel.SetWord(newLeaf);
        }
    }
}
/// <summary>Create an Annotation object (with a single sentence) from the given specification.</summary>
/// <param name="docid">Optional document id; "???" is substituted when absent.</param>
/// <param name="sentenceIndex">Optional sentence index; -1 is substituted when absent.</param>
/// <param name="gloss">The sentence text; literal "\n" and "\t" sequences are unescaped.</param>
/// <param name="tree">Builds the basic dependency graph from the token list.</param>
/// <param name="maltTree">Builds the alternative dependency graph from the token list.</param>
/// <param name="words">Surface forms, one per token.</param>
/// <param name="lemmas">Lemmas, parallel to <paramref name="words"/>.</param>
/// <param name="pos">POS tags, parallel to <paramref name="words"/>.</param>
/// <param name="ner">NER tags, parallel to <paramref name="words"/>.</param>
/// <param name="sentenceid">Optional sentence id, used only in error messages.</param>
/// <returns>An Annotation holding the tokens and a single annotated sentence.</returns>
/// <exception cref="ArgumentException">If any parallel list's length differs from <paramref name="words"/>.</exception>
private static Annotation ParseSentence(Optional<string> docid, Optional<int> sentenceIndex, string gloss, Func<IList<CoreLabel>, SemanticGraph> tree, Func<IList<CoreLabel>, SemanticGraph> maltTree, IList<string> words, IList<string> lemmas, IList<string> pos, IList<string> ner, Optional<string> sentenceid)
{
    // Every annotation list must line up with the token list.
    if (lemmas.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + lemmas.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    if (pos.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + pos.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    if (ner.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + ner.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }

    // Build one CoreLabel per word, assigning character offsets as if the
    // words were joined by single spaces.
    IList<CoreLabel> tokens = new List<CoreLabel>(words.Count);
    int charOffset = 0;
    for (int idx = 0; idx < words.Count; ++idx)
    {
        CoreLabel tok = new CoreLabel(12);
        tok.SetWord(words[idx]);
        tok.SetValue(words[idx]);
        tok.SetBeginPosition(charOffset);
        tok.SetEndPosition(charOffset + words[idx].Length);
        charOffset += words[idx].Length + 1;  // +1 for the separating space
        tok.SetLemma(lemmas[idx]);
        tok.SetTag(pos[idx]);
        tok.SetNER(ner[idx]);
        tok.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
        tok.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
        tok.Set(typeof(CoreAnnotations.IndexAnnotation), idx + 1);  // token indices are 1-based
        tok.Set(typeof(CoreAnnotations.TokenBeginAnnotation), idx);
        tok.Set(typeof(CoreAnnotations.TokenEndAnnotation), idx + 1);
        tokens.Add(tok);
    }

    // Unescape literal "\n" / "\t" sequences in the gloss.
    gloss = gloss.Replace("\\n", "\n").Replace("\\t", "\t");

    // Assemble the sentence-level CoreMap; the same basic graph is reused for
    // the collapsed and CC-processed dependency slots.
    ICoreMap sentence = new ArrayCoreMap(16);
    sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    SemanticGraph basicGraph = tree.Apply(tokens);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), basicGraph);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), basicGraph);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), basicGraph);
    SemanticGraph altGraph = maltTree.Apply(tokens);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), altGraph);
    sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
    sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
    sentence.Set(typeof(CoreAnnotations.TextAnnotation), gloss);
    sentence.Set(typeof(CoreAnnotations.TokenBeginAnnotation), 0);
    sentence.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokens.Count);

    // Wrap the single sentence into a document-level Annotation.
    Annotation document = new Annotation(gloss);
    document.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    document.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sentence));
    document.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
    document.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
    return document;
}
/// <summary>
/// Reads a file of (optionally tab-prefixed-id) lines whose text contains inline
/// label markers like &lt;LABEL&gt;...&lt;/LABEL&gt;, tokenizes each line into sentences,
/// and returns one CoreMap per sentence with tokens carrying the enclosing label.
/// </summary>
/// <param name="reader">Source of input lines; one document line per ReadLine() call.</param>
/// <param name="categoriesAllowed">Label names recognized as markers (joined into the regexes below).</param>
/// <param name="setClassForTheseLabels">Optional map from label name to the annotation class to set for that label.</param>
/// <param name="setGoldClass">If true, each token also gets the current label as its GoldAnswerAnnotation.</param>
/// <param name="sentIDprefix">Prefix prepended to each line's id when forming sentence DocIDs.</param>
/// <returns>One ICoreMap per tokenized sentence, with text, tokens, and a "id-sentNum" DocID.</returns>
/// <exception cref="System.IO.IOException"/>
public static IList<ICoreMap> ParseFile(BufferedReader reader, ICollection<string> categoriesAllowed, IDictionary<string, Type> setClassForTheseLabels, bool setGoldClass, string sentIDprefix)
{
    // Regexes matching a whole token that opens or closes an allowed label,
    // e.g. "<PER>" / "</PER>".
    Pattern startingLabelToken = Pattern.Compile("<(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
    Pattern endLabelToken = Pattern.Compile("</(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
    string backgroundSymbol = "O";
    IList<ICoreMap> sentences = new List<ICoreMap>();
    int lineNum = -1;
    string l = null;
    while ((l = reader.ReadLine()) != null)
    {
        lineNum++;
        // A line is either "id<TAB>text" or just "text" (then the line number is the id).
        string[] t = l.Split("\t", 2);
        string id = null;
        string text = null;
        if (t.Length == 2)
        {
            id = t[0];
            text = t[1];
        }
        else
        {
            if (t.Length == 1)
            {
                text = t[0];
                id = lineNum.ToString();
            }
        }
        // NOTE(review): if a line split into 0 parts, text stays null and the
        // StringReader below would throw — presumably that can't happen; TODO confirm.
        id = sentIDprefix + id;
        // Sentence-split and tokenize the text; escaping is disabled so the
        // "<LABEL>" marker tokens survive tokenization intact.
        DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
        PTBTokenizer.PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.PTBTokenizerFactory.NewCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
        dp.SetTokenizerFactory(tokenizerFactory);
        // Current label state machine: background until an opening marker is
        // seen, back to background at a closing marker. Note the label state
        // carries across sentence boundaries within the same line.
        string label = backgroundSymbol;
        int sentNum = -1;
        foreach (IList<IHasWord> sentence in dp)
        {
            sentNum++;
            string sentStr = string.Empty;
            IList<CoreLabel> sent = new List<CoreLabel>();
            foreach (IHasWord tokw in sentence)
            {
                string tok = tokw.Word();
                Matcher startingMatcher = startingLabelToken.Matcher(tok);
                Matcher endMatcher = endLabelToken.Matcher(tok);
                if (startingMatcher.Matches())
                {
                    //System.out.println("matched starting");
                    label = startingMatcher.Group(1);
                }
                else
                {
                    if (endMatcher.Matches())
                    {
                        //System.out.println("matched end");
                        label = backgroundSymbol;
                    }
                    else
                    {
                        // Ordinary token: build a CoreLabel carrying the current label.
                        CoreLabel c = new CoreLabel();
                        // The single-element toks list is a conversion artifact;
                        // the loop below runs exactly once per token.
                        IList<string> toks = new List<string>();
                        toks.Add(tok);
                        foreach (string toksplit in toks)
                        {
                            sentStr += " " + toksplit;
                            c.SetWord(toksplit);
                            c.SetLemma(toksplit);
                            c.SetValue(toksplit);
                            c.Set(typeof(CoreAnnotations.TextAnnotation), toksplit);
                            c.Set(typeof(CoreAnnotations.OriginalTextAnnotation), tok);
                            if (setGoldClass)
                            {
                                c.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
                            }
                            // NOTE(review): Contains(label) on IDictionary<string, Type> is
                            // presumably a Sharpen-style key-containment extension — TODO confirm.
                            if (setClassForTheseLabels != null && setClassForTheseLabels.Contains(label))
                            {
                                c.Set(setClassForTheseLabels[label], label);
                            }
                            sent.Add(c);
                        }
                    }
                }
            }
            // Package the sentence: reconstructed text, tokens, and "id-sentNum" DocID.
            ICoreMap sentcm = new ArrayCoreMap();
            sentcm.Set(typeof(CoreAnnotations.TextAnnotation), sentStr.Trim());
            sentcm.Set(typeof(CoreAnnotations.TokensAnnotation), sent);
            sentcm.Set(typeof(CoreAnnotations.DocIDAnnotation), id + "-" + sentNum);
            sentences.Add(sentcm);
        }
    }
    return (sentences);
}
/// <summary>
/// Parses a CoNLL-formatted sentence into a dependency graph (same two-pass
/// scheme as MkTree, but with Language.English relations) and runs the clause
/// splitter search over it, returning the collected clause strings.
/// </summary>
/// <param name="conll">The CoNLL formatted tree, one token per line.</param>
/// <returns>The set of extracted clause strings (see review note below).</returns>
protected internal virtual ICollection<string> Clauses(string conll)
{
    IList<CoreLabel> sentence = new List<CoreLabel>();
    SemanticGraph tree = new SemanticGraph();
    // Pass 1: one CoreLabel per row; head column "0" marks the root.
    foreach (string line in conll.Split("\n"))
    {
        if (line.Trim().Equals(string.Empty))
        {
            continue;
        }
        // NOTE(review): "\\s+" looks like a Java-style regex split; presumably this
        // Split is a Sharpen-style extension with regex semantics. TODO confirm.
        string[] fields = line.Trim().Split("\\s+");
        int index = System.Convert.ToInt32(fields[0]);
        string word = fields[1];
        CoreLabel label = MkWord(word, index);
        sentence.Add(label);
        if (fields[2].Equals("0"))
        {
            tree.AddRoot(new IndexedWord(label));
        }
        else
        {
            tree.AddVertex(new IndexedWord(label));
        }
        // Optional trailing columns: 4 = POS tag, 5 = NER tag, 6 = lemma.
        if (fields.Length > 4)
        {
            label.SetTag(fields[4]);
        }
        if (fields.Length > 5)
        {
            label.SetNER(fields[5]);
        }
        if (fields.Length > 6)
        {
            label.SetLemma(fields[6]);
        }
    }
    // Pass 2: add dependency edges between the already-created vertices.
    int i = 0;
    foreach (string line_1 in conll.Split("\n"))
    {
        if (line_1.Trim().Equals(string.Empty))
        {
            continue;
        }
        string[] fields = line_1.Trim().Split("\\s+");
        int parent = System.Convert.ToInt32(fields[2]);
        string reln = fields[3];
        if (parent > 0)
        {
            tree.AddEdge(new IndexedWord(sentence[parent - 1]), new IndexedWord(sentence[i]), new GrammaticalRelation(Language.English, reln, null, null), 1.0, false);
        }
        i += 1;
    }
    // Run extractor
    ClauseSplitterSearchProblem problem = new ClauseSplitterSearchProblem(tree, true);
    ICollection<string> clauses = new HashSet<string>();
    // NOTE(review): nothing ever adds to `clauses`, and Search's first argument is
    // null where a candidate-consumer callback would be expected — this looks like
    // a lost lambda from the Java original, so the method always returns an empty
    // set. TODO confirm against the upstream Java source before relying on this.
    problem.Search(null, new LinearClassifier<ClauseSplitter.ClauseClassifierLabel, string>(new ClassicCounter<Pair<string, ClauseSplitter.ClauseClassifierLabel>>()), ClauseSplitterSearchProblem.HardSplits, null, 100000);
    return (clauses);
}