// We don't use valueOf because we sometimes use trees such as
// (bar (foo (foo 1))), and the default valueOf uses a
// TreeNormalizer that removes nodes from such a tree
/// <summary>Parses the first tree found in a Penn-bracketed string.</summary>
/// <param name="s">Bracketed tree text handed to a <c>PennTreeReader</c>.</param>
/// <returns>Whatever <c>ReadTree()</c> yields for the first tree in <paramref name="s"/>.</returns>
/// <exception cref="System.Exception">Wraps any <c>IOException</c> raised while reading.</exception>
public static Tree TreeFromString(string s)
{
    try
    {
        // Only a LabeledScoredTreeFactory is supplied (no normalizer argument), see the note above.
        ITreeReader tr = new PennTreeReader(new StringReader(s), new LabeledScoredTreeFactory());
        return tr.ReadTree();
    }
    catch (IOException e)
    {
        // System.Exception has no constructor taking only an exception; pass the
        // message and keep the IOException as the inner exception so the cause survives.
        throw new Exception(e.Message, e);
    }
}
/// <summary>Collects subject/object extraction pairs from every tree found under a directory.</summary>
/// <remarks>
/// Intended for treebank data such as the WSJ section of the Penn Treebank. Files are obtained
/// from <c>IOUtils.IterFilesRecursive(directory, "mrg")</c>. A tree whose processing throws has
/// its stack trace printed and is skipped; it is not counted and contributes no training data.
/// </remarks>
/// <param name="name">The name of the directory; used only to label the Redwood track.</param>
/// <param name="directory">The directory whose files are processed.</param>
/// <returns>
/// One entry per successfully processed tree: the sentence paired with the collection of
/// subject/object span pairs returned by <c>SubjectObjectPairs</c> for it.
/// </returns>
/// <exception cref="System.IO.IOException"/>
private static IList<Pair<ICoreMap, ICollection<Pair<Span, Span>>>> ProcessDirectory(string name, File directory)
{
    string trackTitle = "Processing " + name;
    Redwood.Util.ForceTrack(trackTitle);

    IList<Pair<ICoreMap, ICollection<Pair<Span, Span>>>> collected =
        new List<Pair<ICoreMap, ICollection<Pair<Span, Span>>>>(1024);
    int treeCount = 0;

    foreach (File treebankFile in IOUtils.IterFilesRecursive(directory, "mrg"))
    {
        ITreeReader treeSource = new PennTreeReader(IOUtils.ReaderFromFile(treebankFile));
        for (Tree current = treeSource.ReadTree(); current != null; current = treeSource.ReadTree())
        {
            try
            {
                // Span bookkeeping has to be in place before anything reads the tree.
                current.IndexSpans();
                current.SetSpans();

                // NOTE(review): Map(null) looks like a lambda lost in the Java-to-C# translation - confirm.
                // (A filter dropping trace leaves and "-NONE-" tags was already commented out upstream.)
                IList<CoreLabel> leafTokens = current.GetLeaves().Stream().Map(null).Collect(Collectors.ToList());
                SemanticGraph dependencies = Parse(current);
                IDictionary<int, Span> traceTargets = FindTraceTargets(current);
                IDictionary<int, int> traceSources = FindTraceSources(current);

                // Wrap tokens and graph in a sentence object and run it through natlog.
                ICoreMap sentence = new _ArrayCoreMap_325(leafTokens, dependencies, 4);
                natlog.DoOneSentence(null, sentence);

                ICollection<Pair<Span, Span>> extractions =
                    SubjectObjectPairs(dependencies, leafTokens, traceTargets, traceSources);
                collected.Add(Pair.MakePair(sentence, extractions));

                // Progress report every hundred trees.
                ++treeCount;
                if (treeCount % 100 == 0)
                {
                    string paddedCount = new DecimalFormat("00000").Format(treeCount);
                    Redwood.Util.Log("[" + paddedCount + "] " + CountDatums(collected) + " known extractions");
                }
            }
            catch (Exception t)
            {
                Sharpen.Runtime.PrintStackTrace(t);
            }
        }
    }

    // Final summary, then close the track opened at the top.
    Redwood.Util.Log(string.Empty + treeCount + " trees processed yielding " + CountDatums(collected) + " known extractions");
    Redwood.Util.EndTrack(trackTitle);
    return collected;
}
/// <summary>Builds a semantic graph from a fixed, hard-coded Penn-bracketed tree.</summary>
/// <returns>
/// The graph produced by <c>SemanticGraphFactory.MakeFromTree</c> in <c>Mode.Basic</c> with
/// <c>GrammaticalStructure.Extras.Maximal</c>.
/// </returns>
/// <exception cref="System.Exception">Wraps an <c>IOException</c> if reading the literal fails.</exception>
private static SemanticGraph MakeGraph()
{
    Tree tree;
    try
    {
        tree = new PennTreeReader(new StringReader("(S1 (S (S (S (NP (DT The) (NN CD14) (NN LPS) (NN receptor)) (VP (VBZ is) (, ,) (ADVP (RB however)) (, ,) (ADVP (RB up)) (VP (VBN regulated) (PRN (-LRB- -LRB-) (FRAG (RB not) (ADJP (RB down) (VBN regulated))) (-RRB- -RRB-)) (PP (IN in) (NP (JJ tolerant) (NNS cells)))))) (, ,) (CC and) (S (NP (NN LPS)) (VP (MD can) (, ,) (PP (IN in) (NP (NN fact))) (, ,) (ADVP (RB still)) (VP (VB lead) (PP (TO to) (NP (NP (NN activation)) (PP (IN of) (NP (JJ tolerant) (NNS cells))))) (SBAR (IN as) (S (VP (VBN evidenced) (PP (IN by) (NP (NP (NN mobilization)) (PP (IN of) (NP (DT the) (NN transcription) (NN factor) (NP (NP (JJ nuclear) (NN factor) (NN kappa) (NN B)) (PRN (-LRB- -LRB-) (NP (NN NF-kappa) (NN B)) (-RRB- -RRB-)))))))))))))) (. .)))"),
            new LabeledScoredTreeFactory()).ReadTree();
    }
    catch (IOException e)
    {
        // The tree should parse correctly. System.Exception has no constructor taking only
        // an exception, so pass the message and keep the IOException as the inner exception.
        throw new Exception(e.Message, e);
    }
    return SemanticGraphFactory.MakeFromTree(tree, SemanticGraphFactory.Mode.Basic, GrammaticalStructure.Extras.Maximal);
}
/// <summary>
/// Reads three consecutive trees from a single string and checks each one's <c>ToString()</c>,
/// including that the escaped <c>\*</c> and <c>\/</c> tokens come back as <c>*</c> and <c>/</c>.
/// A fourth read must return null.
/// </summary>
public virtual void TestRead()
{
    PennTreeReader treeReader = new PennTreeReader(
        new StringReader("(1 (2 This)) (3 (4 is) (5 a)) (6 (\\* small) (7 \\/test))"));
    string[] expectedTrees = new string[]
    {
        "(1 (2 This))",
        "(3 (4 is) (5 a))",
        "(6 (* small) (7 /test))"
    };

    // Each expected rendering must match the next tree handed back by the reader, in order.
    foreach (string expectedTree in expectedTrees)
    {
        Tree parsed = treeReader.ReadTree();
        NUnit.Framework.Assert.IsTrue(parsed != null);
        NUnit.Framework.Assert.AreEqual(expectedTree, parsed.ToString());
    }

    // Input is exhausted after the third tree.
    Tree trailing = treeReader.ReadTree();
    NUnit.Framework.Assert.IsFalse(trailing != null);
}
/// <summary>Creates a tree reader over <paramref name="in"/> using the Arabic treebank tokenizer.</summary>
/// <remarks>
/// With <c>noNormalization</c> set, a plain <c>TreeNormalizer</c> is used; otherwise an
/// <c>ArabicTreeNormalizer</c> configured from this factory's retain/change flags. When
/// <c>filterX</c> is set, the reader is additionally wrapped in a <c>FilteringTreeReader</c>
/// using <c>ArabicTreeReaderFactory.XFilter</c>.
/// </remarks>
/// <param name="in">Source of treebank text; passed to both the reader and its tokenizer.</param>
/// <returns>The configured reader, possibly wrapped by the X filter.</returns>
public virtual ITreeReader NewTreeReader(Reader @in)
{
    ITreeReader baseReader = noNormalization
        ? new PennTreeReader(@in, new LabeledScoredTreeFactory(), new TreeNormalizer(), new ArabicTreebankTokenizer(@in))
        : new PennTreeReader(@in, new LabeledScoredTreeFactory(), new ArabicTreeNormalizer(retainNPTmp, retainPRD, changeNoLabels, retainNPSbj, retainPPClr), new ArabicTreebankTokenizer(@in));

    return filterX
        ? new FilteringTreeReader(baseReader, new ArabicTreeReaderFactory.XFilter())
        : baseReader;
}
/// <summary>Reads one serialized document from <paramref name="is"/>.</summary>
/// <remarks>
/// Input is consumed in this order: the coref chains (via <c>LoadCorefChains</c>), one line holding
/// the old-format coref graph (space-separated integers in groups of four, or blank), then for each
/// sentence a parse-tree line, three dependency graphs (via <c>LoadDependencyGraph</c>) and token
/// lines up to an empty line or end of input.
/// </remarks>
/// <param name="is">
/// Stream to read. It is wrapped in a <c>GZIPInputStream</c> when <c>compress</c> is set and the
/// stream is not one already.
/// </param>
/// <returns>The document paired with the (possibly wrapped) input stream.</returns>
/// <exception cref="System.IO.IOException"/>
public override Pair<Annotation, InputStream> Read(InputStream @is)
{
    if (compress && !(@is is GZIPInputStream))
    {
        @is = new GZIPInputStream(@is);
    }
    BufferedReader reader = new BufferedReader(new InputStreamReader(@is));
    Annotation doc = new Annotation(string.Empty);
    string line;

    // read the coref graph (new format)
    IDictionary<int, CorefChain> chains = LoadCorefChains(reader);
    if (chains != null)
    {
        doc.Set(typeof(CorefCoreAnnotations.CorefChainAnnotation), chains);
    }

    // read the coref graph (old format)
    line = reader.ReadLine();
    if (line == null)
    {
        // Input ended before the old-format coref line; report it as a format error
        // instead of failing on a null dereference in Trim().
        throw new RuntimeIOException("ERROR: Unexpected end of input while reading the serialized coref graph");
    }
    line = line.Trim();
    if (line.Length > 0)
    {
        string[] bits = line.Split(" ");
        if (bits.Length % 4 != 0)
        {
            throw new RuntimeIOException("ERROR: Incorrect format for the serialized coref graph: " + line);
        }
        IList<Pair<IntTuple, IntTuple>> corefGraph = new List<Pair<IntTuple, IntTuple>>();
        // Each group of four integers is one edge: two values for the source, two for the destination.
        for (int i = 0; i < bits.Length; i += 4)
        {
            IntTuple src = new IntTuple(2);
            IntTuple dst = new IntTuple(2);
            src.Set(0, System.Convert.ToInt32(bits[i]));
            src.Set(1, System.Convert.ToInt32(bits[i + 1]));
            dst.Set(0, System.Convert.ToInt32(bits[i + 2]));
            dst.Set(1, System.Convert.ToInt32(bits[i + 3]));
            corefGraph.Add(new Pair<IntTuple, IntTuple>(src, dst));
        }
        doc.Set(typeof(CorefCoreAnnotations.CorefGraphAnnotation), corefGraph);
    }

    // read individual sentences
    IList<ICoreMap> sentences = new List<ICoreMap>();
    while ((line = reader.ReadLine()) != null)
    {
        ICoreMap sentence = new Annotation(string.Empty);

        // first line is the parse tree. construct it with CoreLabels in Tree nodes
        Tree tree = new PennTreeReader(new StringReader(line), new LabeledScoredTreeFactory(CoreLabel.Factory())).ReadTree();
        sentence.Set(typeof(TreeCoreAnnotations.TreeAnnotation), tree);

        // read the dependency graphs
        AnnotationSerializer.IntermediateSemanticGraph intermCollapsedDeps = LoadDependencyGraph(reader);
        AnnotationSerializer.IntermediateSemanticGraph intermUncollapsedDeps = LoadDependencyGraph(reader);
        AnnotationSerializer.IntermediateSemanticGraph intermCcDeps = LoadDependencyGraph(reader);

        // the remaining lines until empty line are tokens
        IList<CoreLabel> tokens = new List<CoreLabel>();
        while ((line = reader.ReadLine()) != null)
        {
            if (line.Length == 0)
            {
                break;
            }
            CoreLabel token = LoadToken(line, haveExplicitAntecedent);
            tokens.Add(token);
        }
        sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);

        // convert the intermediate graph to an actual SemanticGraph
        SemanticGraph collapsedDeps = intermCollapsedDeps.ConvertIntermediateGraph(tokens);
        sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), collapsedDeps);
        SemanticGraph uncollapsedDeps = intermUncollapsedDeps.ConvertIntermediateGraph(tokens);
        sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), uncollapsedDeps);
        SemanticGraph ccDeps = intermCcDeps.ConvertIntermediateGraph(tokens);
        sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), ccDeps);

        sentences.Add(sentence);
    }
    doc.Set(typeof(CoreAnnotations.SentencesAnnotation), sentences);
    return Pair.MakePair(doc, @is);
}