/// <summary>
/// Tokenizes the text supplied by <paramref name="r"/> with a PTBTokenizer and
/// writes the resulting tokens to <paramref name="writer"/>.
/// </summary>
/// <param name="r">Source of the text to tokenize.</param>
/// <param name="writer">Destination for the token output.</param>
/// <param name="parseInsidePattern">
/// Optional tag pattern. When null, every token is written. When non-null,
/// nothing is written until a token matches the pattern in full; each matching
/// token then switches output on if its capture group 1 is empty and off
/// otherwise. Matching tokens themselves are never written.
/// </param>
/// <param name="options">Option string handed to the PTBTokenizer constructor.</param>
/// <param name="preserveLines">
/// When true, tokens on a line are separated by a single space and a line break
/// is written whenever a token's text equals PTBLexer.NEWLINE_TOKEN. When false,
/// each written token is followed by a line break.
/// </param>
/// <param name="dump">When true, writes the token's toString() form instead of its text.</param>
/// <param name="lowerCase">
/// When true, the token text is lower-cased (Locale.ENGLISH) and stored back on
/// the token before output.
/// </param>
/// <returns>
/// The number of tokens produced by the tokenizer, including tokens that were
/// not written.
/// </returns>
private static int tokReader(TextReader r, TextWriter writer, Regex parseInsidePattern, String options, bool preserveLines, bool dump, bool lowerCase) /*throws IOException*/
{
    int tokenCount = 0;
    bool atLineStart = true;

    // With no tag pattern everything is written; otherwise stay silent until
    // an opening tag has been seen.
    bool emitting = (parseInsidePattern == null);

    // The matcher is built once and reset per token rather than re-created.
    Matcher tagMatcher = (parseInsidePattern != null) ? parseInsidePattern.matcher("") : null;

    PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), options);
    while (tokenizer.hasNext())
    {
        CoreLabel token = tokenizer.next();
        // Every token obtained from the tokenizer counts, written or not.
        tokenCount++;

        // An older workaround stripped trailing newlines from the text here to
        // dodge a lexer bug; it is disabled and the text is used as-is.
        String rawText = (String)token.get(typeof(CoreAnnotations.TextAnnotation));
        String outputText = rawText;
        if (lowerCase)
        {
            outputText = rawText.ToLower(Locale.ENGLISH);
            token.set(typeof(CoreAnnotations.TextAnnotation), outputText);
        }

        // Tag detection always runs against the original (non-lower-cased) text.
        if (tagMatcher != null && tagMatcher.reset(rawText).matches())
        {
            // Empty group 1 (no end-element slash) turns output on; otherwise off.
            emitting = tagMatcher.group(1).isEmpty();
            continue;
        }

        if (!emitting)
        {
            continue;
        }

        if (dump)
        {
            // Only after the tag check is the text swapped for the exhaustive form.
            outputText = token.toString();
        }

        if (!preserveLines)
        {
            // One token per line.
            writer.Write(outputText);
            writer.newLine();
            continue;
        }

        if (PTBLexer.NEWLINE_TOKEN.equals(rawText))
        {
            atLineStart = true;
            writer.newLine();
            continue;
        }

        // Separate tokens with a space, except for the first token on a line.
        if (atLineStart)
        {
            atLineStart = false;
        }
        else
        {
            writer.Write(' ');
        }
        writer.Write(outputText);
    }

    return tokenCount;
}