/**
 * Writes a presentable version of the given PTB-tokenized text.
 * PTB tokenization splits up punctuation and does various other things
 * that make simply joining the tokens with spaces look bad. So join
 * the tokens with spaces and run the result through this method to produce
 * nice looking text. It's not perfect, but it works pretty well.
 *
 * Every token returned by the PTB2TextLexer is written to w in the order
 * it is produced; nothing is buffered here and w is neither flushed nor
 * closed by this method.
 *
 * @param ptbText Reader supplying the PTB-tokenized text
 * @param w Writer that receives each token the lexer produces
 * @return The number of tokens written to w
 * @throws IOException (per the original signature note) if lexing or writing fails
 */
public static int ptb2Text(TextReader ptbText, TextWriter w)
{
    PTB2TextLexer detokenizer = new PTB2TextLexer(ptbText);
    int written = 0;
    string piece;
    // The lexer signals end of input by returning null.
    while ((piece = detokenizer.next()) != null)
    {
        written++;
        w.Write(piece);
    }
    return written;
}
/// <summary>Writes a presentable version of the given PTB-tokenized text.</summary>
/// <remarks>
/// PTB tokenization splits up punctuation and does various other things
/// that make simply joining the tokens with spaces look bad. So join
/// the tokens with spaces and run the result through this method to produce
/// nice looking text. It's not perfect, but it works pretty well.
/// Tokens are written to <paramref name="w"/> in the order the lexer returns
/// them; this method neither flushes nor closes the writer.
/// </remarks>
/// <param name="ptbText">Reader supplying the PTB-tokenized text.</param>
/// <param name="w">Writer that receives each token the lexer produces.</param>
/// <returns>The number of tokens written to <paramref name="w"/>.</returns>
/// <exception cref="System.IO.IOException"/>
public static int Ptb2Text(Reader ptbText, TextWriter w)
{
    PTB2TextLexer detokenizer = new PTB2TextLexer(ptbText);
    int written = 0;
    while (true)
    {
        string piece = detokenizer.Next();
        if (piece == null)
        {
            // A null token marks the end of the input.
            break;
        }
        written++;
        w.Write(piece);
    }
    return written;
}
/**
 * Returns a presentable version of the given PTB-tokenized text.
 * PTB tokenization splits up punctuation and does various other things
 * that make simply joining the tokens with spaces look bad. So join
 * the tokens with spaces and run the result through this method to produce
 * nice looking text. It's not perfect, but it works pretty well.
 *
 * An IOException raised while reading tokens is swallowed; in that case the
 * text accumulated up to the failure is returned.
 *
 * @param ptbText A String in PTB3-escaped form
 * @return An approximation to the original String
 */
public static String ptb2Text(String ptbText)
{
    // Initial capacity is the input length -- probably an overestimate.
    StringBuilder result = new StringBuilder(ptbText.Length);
    PTB2TextLexer detokenizer = new PTB2TextLexer(new StringReader(ptbText));
    try
    {
        String piece = detokenizer.next();
        while (piece != null)
        {
            result.Append(piece);
            piece = detokenizer.next();
        }
    }
    catch (IOException)
    {
        // Intentionally ignored (the original stack-trace print was disabled);
        // fall through and return whatever has been collected so far.
    }
    return result.ToString();
}
/// <summary>Returns a presentable version of the given PTB-tokenized text.</summary>
/// <remarks>
/// PTB tokenization splits up punctuation and does various other things
/// that make simply joining the tokens with spaces look bad. So join
/// the tokens with spaces and run the result through this method to produce
/// nice looking text. It's not perfect, but it works pretty well.
/// <para>
/// <b>Note:</b> If your tokens have maintained the OriginalTextAnnotation and
/// the BeforeAnnotation and the AfterAnnotation, then rather than doing
/// this you can actually precisely reconstruct the text they were made
/// from!
/// </para>
/// </remarks>
/// <param name="ptbText">A String in PTB3-escaped form</param>
/// <returns>An approximation to the original String</returns>
/// <exception cref="RuntimeIOException">
/// Wraps any <see cref="System.IO.IOException"/> raised while reading tokens.
/// </exception>
public static string Ptb2Text(string ptbText)
{
    // Initial capacity is the input length -- probably an overestimate.
    StringBuilder result = new StringBuilder(ptbText.Length);
    PTB2TextLexer detokenizer = new PTB2TextLexer(new StringReader(ptbText));
    try
    {
        string piece;
        // The lexer signals end of input by returning null.
        while ((piece = detokenizer.Next()) != null)
        {
            result.Append(piece);
        }
    }
    catch (IOException e)
    {
        throw new RuntimeIOException(e);
    }
    return result.ToString();
}