示例#1
0
        /// <summary>
        /// Writes a presentable version of the given PTB-tokenized text.
        /// </summary>
        /// <remarks>
        /// PTB tokenization splits up punctuation and does various other things
        /// that make simply joining the tokens with spaces look bad. So join the
        /// tokens with spaces and run the result through this method to produce
        /// nice-looking text. It's not perfect, but it works pretty well.
        /// </remarks>
        /// <param name="ptbText">Reader over the PTB-tokenized text; it is handed to a <c>PTB2TextLexer</c>.</param>
        /// <param name="w">Writer that receives every token the lexer returns, in order.</param>
        /// <returns>The number of tokens that were written to <paramref name="w"/>.</returns>
        /// <exception cref="System.IO.IOException">
        /// Declared by the original signature; presumably propagated from the lexer or the writer.
        /// </exception>
        public static int ptb2Text(TextReader ptbText, TextWriter w)
        {
            PTB2TextLexer tokenSource = new PTB2TextLexer(ptbText);
            int           written     = 0;

            // The loop stops as soon as the lexer returns null.
            string piece = tokenSource.next();
            while (piece != null)
            {
                written++;
                w.Write(piece);
                piece = tokenSource.next();
            }

            return written;
        }
        /// <summary>Writes a presentable version of the given PTB-tokenized text.</summary>
        /// <remarks>
        /// PTB tokenization splits up punctuation and does various other things
        /// that make simply joining the tokens with spaces look bad. So join the
        /// tokens with spaces and run the result through this method to produce
        /// nice-looking text. It's not perfect, but it works pretty well.
        /// </remarks>
        /// <param name="ptbText">Reader over the PTB-tokenized text; it is handed to a <c>PTB2TextLexer</c>.</param>
        /// <param name="w">Writer that receives every token the lexer returns, in order.</param>
        /// <returns>The number of tokens that were written to <paramref name="w"/>.</returns>
        /// <exception cref="System.IO.IOException"/>
        public static int Ptb2Text(Reader ptbText, TextWriter w)
        {
            PTB2TextLexer tokenizer = new PTB2TextLexer(ptbText);
            int           emitted   = 0;
            string        current;

            // Pull tokens until the lexer returns null, echoing each one to the writer.
            while ((current = tokenizer.Next()) != null)
            {
                emitted++;
                w.Write(current);
            }

            return emitted;
        }
示例#3
0
        /// <summary>
        /// Returns a presentable version of the given PTB-tokenized text.
        /// </summary>
        /// <remarks>
        /// PTB tokenization splits up punctuation and does various other things
        /// that make simply joining the tokens with spaces look bad. So join the
        /// tokens with spaces and run the result through this method to produce
        /// nice-looking text. It's not perfect, but it works pretty well.
        /// <para>
        /// If the lexer raises an <see cref="System.IO.IOException"/>, the exception
        /// is swallowed and the text collected up to that point is returned.
        /// </para>
        /// </remarks>
        /// <param name="ptbText">A String in PTB3-escaped form</param>
        /// <returns>An approximation to the original String</returns>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="ptbText"/> is <c>null</c>.
        /// </exception>
        public static String ptb2Text(String ptbText)
        {
            // Fail with a descriptive exception instead of a NullReferenceException
            // from the ptbText.Length access below.
            if (ptbText == null)
            {
                throw new ArgumentNullException(nameof(ptbText));
            }

            StringBuilder sb    = new StringBuilder(ptbText.Length); // probably an overestimate
            PTB2TextLexer lexer = new PTB2TextLexer(new StringReader(ptbText));

            try
            {
                string token;
                while ((token = lexer.next()) != null)
                {
                    sb.Append(token);
                }
            }
            catch (IOException)
            {
                // Deliberately best-effort: the input is an in-memory StringReader, so an
                // I/O failure is not expected here. If one does occur, fall through and
                // return whatever text has been collected so far.
            }

            return sb.ToString();
        }
        /// <summary>Returns a presentable version of the given PTB-tokenized text.</summary>
        /// <remarks>
        /// PTB tokenization splits up punctuation and does various other things
        /// that make simply joining the tokens with spaces look bad. So join the
        /// tokens with spaces and run the result through this method to produce
        /// nice-looking text. It's not perfect, but it works pretty well.
        /// <para>
        /// <b>Note:</b> If your tokens have maintained the OriginalTextAnnotation and
        /// the BeforeAnnotation and the AfterAnnotation, then rather than doing
        /// this you can actually precisely reconstruct the text they were made
        /// from!
        /// </para>
        /// </remarks>
        /// <param name="ptbText">A String in PTB3-escaped form</param>
        /// <returns>An approximation to the original String</returns>
        /// <exception cref="RuntimeIOException">
        /// Wraps any <see cref="System.IO.IOException"/> raised while pulling tokens from the lexer.
        /// </exception>
        public static string Ptb2Text(string ptbText)
        {
            // The input length is probably an overestimate of the output size.
            StringBuilder result    = new StringBuilder(ptbText.Length);
            PTB2TextLexer tokenizer = new PTB2TextLexer(new StringReader(ptbText));

            try
            {
                // Append tokens until the lexer returns null.
                string piece = tokenizer.Next();
                while (piece != null)
                {
                    result.Append(piece);
                    piece = tokenizer.Next();
                }
            }
            catch (IOException ioFailure)
            {
                throw new RuntimeIOException(ioFailure);
            }

            return result.ToString();
        }