/// <summary>
/// Checks that <c>PennTreeReader</c> hands back the bracketed trees in the input one at a
/// time, that the escaped tokens <c>\*</c> and <c>\/</c> are printed as <c>*</c> and
/// <c>/</c>, and that a read past the last tree returns <c>null</c>.
/// </summary>
public virtual void TestRead()
{
    string treeText = "(1 (2 This)) (3 (4 is) (5 a)) (6 (\\* small) (7 \\/test))";
    string[] expected = new string[]
    {
        "(1 (2 This))",
        "(3 (4 is) (5 a))",
        "(6 (* small) (7 /test))"
    };
    PennTreeReader treeReader = new PennTreeReader(new StringReader(treeText));

    foreach (string expectedTree in expected)
    {
        Tree tree = treeReader.ReadTree();
        // IsNotNull states the intent directly and produces a clearer failure message
        // than asserting on the boolean expression "tree != null".
        NUnit.Framework.Assert.IsNotNull(tree);
        NUnit.Framework.Assert.AreEqual(expectedTree, tree.ToString());
    }

    // Every expected tree has been consumed, so one more read must report end of input.
    NUnit.Framework.Assert.IsNull(treeReader.ReadTree());
}
/// <summary>
/// Reads a single tree from its bracketed string form using a <c>PennTreeReader</c> backed
/// by a <c>LabeledScoredTreeFactory</c>.
/// </summary>
/// <remarks>
/// We don't use valueOf because we sometimes use trees such as
/// (bar (foo (foo 1))), and the default valueOf uses a
/// TreeNormalizer that removes nodes from such a tree.
/// </remarks>
/// <param name="s">Text to read the tree from.</param>
/// <returns>
/// Whatever the reader's first <c>ReadTree()</c> call yields for <paramref name="s"/>.
/// NOTE(review): presumably <c>null</c> when <paramref name="s"/> holds no tree - confirm
/// against PennTreeReader.
/// </returns>
/// <exception cref="System.Exception">
/// Wraps any <see cref="System.IO.IOException"/> raised while reading; the original
/// exception is available as <c>InnerException</c>.
/// </exception>
public static Tree TreeFromString(string s)
{
    try
    {
        ITreeReader tr = new PennTreeReader(new StringReader(s), new LabeledScoredTreeFactory());
        return tr.ReadTree();
    }
    catch (IOException e)
    {
        // System.Exception has no constructor that takes only an exception, so pass the
        // message explicitly and attach the cause as the inner exception. The thrown type
        // is kept as Exception so existing catch blocks behave as before.
        throw new Exception(e.Message, e);
    }
}
/// <summary>
/// Processes the trees in every file that <c>IOUtils.IterFilesRecursive(directory, "mrg")</c>
/// yields (for example, the WSJ section of the Penn Treebank) and gathers the
/// subject/object span pairs found in each one.
/// </summary>
/// <param name="name">Label for the directory; it only appears in the tracking messages.</param>
/// <param name="directory">The directory whose tree files are processed.</param>
/// <returns>
/// A list with one entry per tree that was processed without an exception: the sentence
/// built from the tree, paired with the collection of subject/object span pairs that
/// constitute valid extractions for that sentence.
/// </returns>
/// <exception cref="System.IO.IOException"/>
private static IList<Pair<ICoreMap, ICollection<Pair<Span, Span>>>> ProcessDirectory(string name, File directory)
{
    string trackTitle = "Processing " + name;
    Redwood.Util.ForceTrack(trackTitle);

    IEnumerable<File> treebankFiles = IOUtils.IterFilesRecursive(directory, "mrg");
    IList<Pair<ICoreMap, ICollection<Pair<Span, Span>>>> dataset =
        new List<Pair<ICoreMap, ICollection<Pair<Span, Span>>>>(1024);
    int treeCount = 0;

    foreach (File treebankFile in treebankFiles)
    {
        ITreeReader treeSource = new PennTreeReader(IOUtils.ReaderFromFile(treebankFile));

        // Opening the reader and reading a tree both happen outside the try block, so a
        // failure there propagates to the caller; only per-tree processing is guarded.
        for (Tree current = treeSource.ReadTree(); current != null; current = treeSource.ReadTree())
        {
            try
            {
                // Span bookkeeping on the tree comes first, before anything reads from it.
                current.IndexSpans();
                current.SetSpans();

                // NOTE(review): the mapper passed to Map is null - this looks like a lambda
                // lost in conversion; confirm what the leaves should be mapped to.
                IList<CoreLabel> words = current.GetLeaves().Stream().Map(null).Collect(Collectors.ToList());
                SemanticGraph dependencies = Parse(current);
                IDictionary<int, Span> traceTargets = FindTraceTargets(current);
                IDictionary<int, int> traceSources = FindTraceSources(current);

                // Wrap the tokens and graph in a sentence and pass it through natlog.
                ICoreMap sentence = new _ArrayCoreMap_325(words, dependencies, 4);
                natlog.DoOneSentence(null, sentence);

                // Record this sentence together with its extracted span pairs.
                ICollection<Pair<Span, Span>> extractions =
                    SubjectObjectPairs(dependencies, words, traceTargets, traceSources);
                dataset.Add(Pair.MakePair(sentence, extractions));

                // Progress report after every hundredth successfully processed tree.
                ++treeCount;
                if (treeCount % 100 == 0)
                {
                    string paddedCount = new DecimalFormat("00000").Format(treeCount);
                    Redwood.Util.Log("[" + paddedCount + "] " + CountDatums(dataset) + " known extractions");
                }
            }
            catch (Exception failure)
            {
                // Best effort: report the failing tree and carry on with the next one.
                Sharpen.Runtime.PrintStackTrace(failure);
            }
        }
    }

    // Final summary, then close the track opened at the top.
    Redwood.Util.Log(treeCount + " trees processed yielding " + CountDatums(dataset) + " known extractions");
    Redwood.Util.EndTrack(trackTitle);
    return dataset;
}