/// <summary>
        /// Read parse trees from a Reader.
        /// </summary>
        /// <remarks>
        /// If the first token of the tokenizer starts with <c>*x*x*x</c>, the constructor
        /// consumes tokens until four tokens starting with that marker have been read
        /// (or the tokenizer runs out), so that such a header is skipped.
        /// </remarks>
        /// <param name="input">The Reader</param>
        /// <param name="tf">TreeFactory -- factory to create some kind of Tree</param>
        /// <param name="tn">the method of normalizing trees</param>
        /// <param name="st">Tokenizer that divides up Reader; must not be null</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="st"/> is null.</exception>
        public PennTreeReader(TextReader input, ITreeFactory tf, TreeNormalizer tn, ITokenizer<string> st)
        {
            // The tokenizer is dereferenced right below, so fail with a descriptive
            // exception instead of a bare NullReferenceException.
            if (st == null)
            {
                throw new System.ArgumentNullException("st");
            }

            reader         = input;
            treeFactory    = tf;
            treeNormalizer = tn;
            tokenizer      = st;

            // check for whacked out headers still present in Brown corpus in Treebank 3
            const string headerMarker = "*x*x*x";

            // The marker is a fixed byte sequence from the corpus, not natural-language
            // text, so compare ordinally rather than with the current culture.
            string first = st.HasNext() ? st.Peek() : null;

            if (first != null && first.StartsWith(headerMarker, System.StringComparison.Ordinal))
            {
                // NOTE(review): presumably Peek() does not consume the token, so the
                // first Next() below returns the marker seen above -- confirm.
                int foundCount = 0;
                while (foundCount < 4 && st.HasNext())
                {
                    string token = st.Next();
                    if (token != null && token.StartsWith(headerMarker, System.StringComparison.Ordinal))
                    {
                        foundCount++;
                    }
                }
            }
        }
        /// <summary>
        /// Reads a single tree in standard Penn Treebank format from the
        /// input stream. The method supports additional parentheses around the
        /// tree (an unnamed ROOT node) so long as they are balanced. If the token stream
        /// ends before the current tree is complete, then the method will throw an
        /// <code>IOException</code>.
        ///
        /// Note that the method will skip malformed trees and attempt to
        /// read additional trees from the input stream. It is possible, however,
        /// that a malformed tree will corrupt the token stream. In this case,
        /// an <code>IOException</code> will eventually be thrown.
        /// </summary>
        /// <returns>A single tree, or <code>null</code> at end of token stream.</returns>
        /// <exception cref="IOException">
        /// Thrown when building a tree from the token stream fails; the underlying
        /// failure is available through <c>InnerException</c>.
        /// </exception>
        public Tree ReadTree()
        {
            Tree t = null;

            // Keep going until a tree has been produced or the tokens run out.
            while (tokenizer.HasNext() && t == null)
            {
                // Setup PDA: start every attempt with no current tree and an empty stack.
                this.currentTree = null;
                this.stack       = new List<Tree>();

                try
                {
                    t = GetTreeFromInputStream();
                }
                catch (Exception e)
                {
                    // Every failure is reported to callers as an IOException with this
                    // message; chain the cause so the real error is not lost.
                    throw new IOException("End of token stream encountered before parsing could complete.", e);
                }

                if (t == null)
                {
                    continue;
                }

                // cdm 20100618: Don't do this!  This was never the historical behavior!!!
                // Escape empty trees e.g. (())
                // while(t != null && (t.value() == null || t.value().equals("")) && t.numChildren() <= 1)
                //   t = t.firstChild();

                if (treeNormalizer != null && treeFactory != null)
                {
                    t = treeNormalizer.NormalizeWholeTree(t, treeFactory);
                }

                // NOTE(review): the normalizer's result replaces t; presumably it may
                // return null to drop a tree -- confirm. Guarding here lets the loop
                // move on to the next tree instead of throwing NullReferenceException.
                if (t != null)
                {
                    t.IndexLeaves(true);
                }
            }

            return t;
        }
Exemple #3
0
        /// <summary>
        /// Read parse trees from a Reader.
        /// </summary>
        /// <remarks>
        /// When the tokenizer's first token starts with <c>*x*x*x</c>, tokens are
        /// consumed during construction until four tokens starting with that marker
        /// have been read or no tokens remain.
        /// </remarks>
        /// <param name="input">The Reader</param>
        /// <param name="tf">TreeFactory -- factory to create some kind of Tree</param>
        /// <param name="tn">the method of normalizing trees</param>
        /// <param name="st">Tokenizer that divides up Reader; must not be null</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="st"/> is null.</exception>
        public PennTreeReader(TextReader input, ITreeFactory tf, TreeNormalizer tn, ITokenizer<string> st)
        {
            // st is used immediately below; reject null up front with a clear error
            // rather than letting a NullReferenceException surface.
            if (st == null)
            {
                throw new System.ArgumentNullException("st");
            }

            reader = input;
            treeFactory = tf;
            treeNormalizer = tn;
            tokenizer = st;

            // check for whacked out headers still present in Brown corpus in Treebank 3
            const string headerMarker = "*x*x*x";

            // Ordinal comparison: the marker is a literal corpus artifact, so the
            // result must not depend on the current culture.
            string first = st.HasNext() ? st.Peek() : null;
            if (first == null || !first.StartsWith(headerMarker, System.StringComparison.Ordinal))
            {
                // No header present: leave the token stream untouched.
                return;
            }

            // Consume tokens until the fourth marker token has been read.
            // NOTE(review): assumes Peek() leaves the first marker in the stream so
            // that it is counted by the loop -- confirm against the tokenizer.
            int foundCount = 0;
            while (foundCount < 4 && st.HasNext())
            {
                string token = st.Next();
                if (token != null && token.StartsWith(headerMarker, System.StringComparison.Ordinal))
                {
                    foundCount++;
                }
            }
        }