/// <summary>
/// Read parse trees from a Reader.
/// </summary>
/// <remarks>
/// If the first token produced by <paramref name="st"/> starts with
/// <c>*x*x*x</c>, tokens are consumed from the tokenizer until four tokens
/// with that prefix have been read (or the tokenizer is exhausted). This
/// skips the odd file headers still present in the Brown corpus in Treebank 3.
/// </remarks>
/// <param name="input">The Reader</param>
/// <param name="tf">TreeFactory -- factory to create some kind of Tree</param>
/// <param name="tn">the method of normalizing trees</param>
/// <param name="st">Tokenizer that divides up Reader</param>
public PennTreeReader(TextReader input, ITreeFactory tf, TreeNormalizer tn, ITokenizer<string> st)
{
    // Prefix that marks a Brown corpus header token, and the number of such
    // tokens that make up one complete header.
    const string headerMarker = "*x*x*x";
    const int headerMarkerCount = 4;

    reader = input;
    treeFactory = tf;
    treeNormalizer = tn;
    tokenizer = st;

    // Peek (do not consume) so that a stream without a header is left untouched.
    string first = st.HasNext() ? st.Peek() : null;

    // Ordinal comparison: the marker is a fixed machine token, so the match
    // must not depend on the current culture.
    if (first == null || !first.StartsWith(headerMarker, System.StringComparison.Ordinal))
    {
        return;
    }

    int markersSeen = 0;
    while (markersSeen < headerMarkerCount && st.HasNext())
    {
        string token = st.Next();
        if (token != null && token.StartsWith(headerMarker, System.StringComparison.Ordinal))
        {
            markersSeen++;
        }
    }
}
/// <summary>
/// Reads a single tree in standard Penn Treebank format from the
/// input stream. The method supports additional parentheses around the
/// tree (an unnamed ROOT node) so long as they are balanced. If the token stream
/// ends before the current tree is complete, then the method will throw an
/// <c>IOException</c>.
///
/// Note that the method will skip malformed trees and attempt to
/// read additional trees from the input stream. It is possible, however,
/// that a malformed tree will corrupt the token stream. In this case,
/// an <c>IOException</c> will eventually be thrown.
/// </summary>
/// <returns>A single tree, or <c>null</c> at end of token stream.</returns>
/// <exception cref="IOException">
/// Thrown when reading the next tree fails with any exception; the original
/// exception is available as <c>InnerException</c>.
/// </exception>
public Tree ReadTree()
{
    Tree t = null;

    // Keep trying while tokens remain and no tree has been produced yet.
    while (tokenizer.HasNext() && t == null)
    {
        // Setup PDA: reset the parser state before each attempt.
        this.currentTree = null;
        this.stack = new List<Tree>();

        try
        {
            t = GetTreeFromInputStream();
        }
        catch (Exception ex)
        {
            // Keep the original failure as the inner exception so the real
            // cause and its stack trace are not lost.
            throw new IOException("End of token stream encountered before parsing could complete.", ex);
        }

        if (t == null)
        {
            continue;
        }

        // cdm 20100618: Empty trees such as (()) are deliberately NOT unwrapped
        // here; doing so was never the historical behavior.
        if (treeNormalizer != null && treeFactory != null)
        {
            t = treeNormalizer.NormalizeWholeTree(t, treeFactory);
        }

        t.IndexLeaves(true);
    }

    return t;
}
/// <summary>
/// Read parse trees from a Reader.
/// </summary>
/// <remarks>
/// If the first token produced by <paramref name="st"/> starts with
/// <c>*x*x*x</c>, tokens are consumed from the tokenizer until four tokens
/// with that prefix have been read (or the tokenizer is exhausted). This
/// skips the odd file headers still present in the Brown corpus in Treebank 3.
/// </remarks>
/// <param name="input">The Reader</param>
/// <param name="tf">TreeFactory -- factory to create some kind of Tree</param>
/// <param name="tn">the method of normalizing trees</param>
/// <param name="st">Tokenizer that divides up Reader</param>
public PennTreeReader(TextReader input, ITreeFactory tf, TreeNormalizer tn, ITokenizer<string> st)
{
    // Prefix that marks a Brown corpus header token, and the number of such
    // tokens that make up one complete header.
    const string headerMarker = "*x*x*x";
    const int headerMarkerCount = 4;

    reader = input;
    treeFactory = tf;
    treeNormalizer = tn;
    tokenizer = st;

    // Peek (do not consume) so that a stream without a header is left untouched.
    string first = st.HasNext() ? st.Peek() : null;

    // Ordinal comparison: the marker is a fixed machine token, so the match
    // must not depend on the current culture.
    if (first == null || !first.StartsWith(headerMarker, System.StringComparison.Ordinal))
    {
        return;
    }

    int markersSeen = 0;
    while (markersSeen < headerMarkerCount && st.HasNext())
    {
        string token = st.Next();
        if (token != null && token.StartsWith(headerMarker, System.StringComparison.Ordinal))
        {
            markersSeen++;
        }
    }
}