/// <summary> /// Reads a single tree in standard Penn Treebank format from the /// input stream. /// </summary> /// <remarks> /// Reads a single tree in standard Penn Treebank format from the /// input stream. The method supports additional parentheses around the /// tree (an unnamed ROOT node) so long as they are balanced. If the token stream /// ends before the current tree is complete, then the method will throw an /// <code>IOException</code>. /// <p> /// Note that the method will skip malformed trees and attempt to /// read additional trees from the input stream. It is possible, however, /// that a malformed tree will corrupt the token stream. In this case, /// an <code>IOException</code> will eventually be thrown. /// </remarks> /// <returns>A single tree, or <code>null</code> at end of token stream.</returns> /// <exception cref="System.IO.IOException"/> public virtual Tree ReadTree() { Tree t = null; while (tokenizer.MoveNext() && t == null) { //Setup PDA this.currentTree = null; this.stack = new List <Tree>(); try { t = GetTreeFromInputStream(); } catch (NoSuchElementException) { throw new IOException("End of token stream encountered before parsing could complete."); } if (t != null) { // cdm 20100618: Don't do this! This was never the historical behavior!!! // Escape empty trees e.g. (()) // while(t != null && (t.value() == null || t.value().equals("")) && t.numChildren() <= 1) // t = t.firstChild(); if (treeNormalizer != null && treeFactory != null) { t = treeNormalizer.NormalizeWholeTree(t, treeFactory); } if (t != null) { t.IndexLeaves(true); } } } return(t); }
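// Usage sketch for ReadTree(): keep calling it until it returns null at end of the token stream.
// This is an illustration, not documented usage; it assumes the port exposes LabeledScoredTreeFactory
// (the factory name from the Java original) and some ITokenizer<string> over Penn Treebank tokens --
// PennTreebankTokenizer below is a hypothetical name for such a tokenizer. A null TreeNormalizer is
// accepted by ReadTree(), which skips normalization in that case.
Reader r = new StringReader("((S (NP (DT The) (NN cat)) (VP (VBZ sleeps))))");
ITokenizer<string> tokens = new PennTreebankTokenizer(r);                        // hypothetical tokenizer type
PennTreeReader treeReader = new PennTreeReader(r, new LabeledScoredTreeFactory(), null, tokens);
for (Tree t = treeReader.ReadTree(); t != null; t = treeReader.ReadTree())
{
    System.Console.Out.WriteLine(t);                                             // one tree per call; extra outer parentheses are tolerated
}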
/// <summary>A fast, rule-based tokenizer for Modern Standard French.</summary> /// <remarks> /// A fast, rule-based tokenizer for Modern Standard French. /// Performs punctuation splitting and light tokenization by default. /// <p> /// Currently, this tokenizer does not do line splitting. It assumes that the input /// file is delimited by the system line separator. The output will be equivalently /// delimited. /// </remarks> /// <param name="args"/> public static void Main(string[] args) { Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs()); if (options.Contains("help")) { log.Info(Usage()); return; } // Lexer options ITokenizerFactory <CoreLabel> tf = options.Contains("ftb") ? FrenchTokenizer.FtbFactory() : FrenchTokenizer.Factory(); string orthoOptions = options.GetProperty("options", string.Empty); // When called from this main method, split on newline. No options for // more granular sentence splitting. orthoOptions = orthoOptions.IsEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs"; tf.SetOptions(orthoOptions); // Other options string encoding = options.GetProperty("encoding", "UTF-8"); bool toLower = PropertiesUtils.GetBool(options, "lowerCase", false); // Read the file from stdin int nLines = 0; int nTokens = 0; long startTime = Runtime.NanoTime(); try { ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding)); bool printSpace = false; while (tokenizer.MoveNext()) { ++nTokens; string word = tokenizer.Current.Word(); if (word.Equals(FrenchLexer.NewlineToken)) { ++nLines; printSpace = false; System.Console.Out.WriteLine(); } else { if (printSpace) { System.Console.Out.Write(" "); } string outputToken = toLower ? word.ToLower(Locale.French) : word; System.Console.Out.Write(outputToken); printSpace = true; } } } catch (UnsupportedEncodingException e) { log.Error(e); } long elapsedTime = Runtime.NanoTime() - startTime; double linesPerSec = (double)nLines / (elapsedTime / 1e9); System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec); }
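// Programmatic counterpart to the Main method above, as a minimal sketch rather than documented usage:
// the Factory(), SetOptions, GetTokenizer, MoveNext and Current.Word() calls all appear in the code
// above; only the sample sentence is invented.
ITokenizerFactory<CoreLabel> tf = FrenchTokenizer.Factory();
tf.SetOptions("tokenizeNLs");                                      // split on newlines, as the command-line path does
ITokenizer<CoreLabel> tok = tf.GetTokenizer(new StringReader("Le chat dort sur le canapé."));
while (tok.MoveNext())
{
    System.Console.Out.WriteLine(tok.Current.Word());              // one token per line
}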
/// <summary>A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).</summary> /// <remarks> /// A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding). /// Performs punctuation splitting and light tokenization by default. /// Orthographic normalization options are available, and can be enabled with /// command line options. /// <p> /// Currently, this tokenizer does not do line splitting. It normalizes non-printing /// line separators across platforms and prints the system default line separator /// to the output. /// <p> /// The following normalization options are provided: /// <ul> /// <li> /// <c>useUTF8Ellipsis</c> /// : Replaces sequences of three or more full stops with \u2026</li> /// <li> /// <c>normArDigits</c> /// : Convert Arabic digits to ASCII equivalents</li> /// <li> /// <c>normArPunc</c> /// : Convert Arabic punctuation to ASCII equivalents</li> /// <li> /// <c>normAlif</c> /// : Change all alif forms to bare alif</li> /// <li> /// <c>normYa</c> /// : Map ya to alif maqsura</li> /// <li> /// <c>removeDiacritics</c> /// : Strip all diacritics</li> /// <li> /// <c>removeTatweel</c> /// : Strip tatweel elongation character</li> /// <li> /// <c>removeQuranChars</c> /// : Remove diacritics that appear in the Quran</li> /// <li> /// <c>removeProMarker</c> /// : Remove the ATB null pronoun marker</li> /// <li> /// <c>removeSegMarker</c> /// : Remove the ATB clitic segmentation marker</li> /// <li> /// <c>removeMorphMarker</c> /// : Remove the ATB morpheme boundary markers</li> /// <li> /// <c>removeLengthening</c> /// : Replace all sequences of three or more identical (non-period) characters with one copy</li> /// <li> /// <c>atbEscaping</c> /// : Replace left/right parentheses with ATB escape characters</li> /// </ul> /// </remarks> /// <param name="args"/> public static void Main(string[] args) { if (args.Length > 0 && args[0].Contains("help")) { System.Console.Error.Printf("Usage: java %s [OPTIONS] < file%n", typeof(ArabicTokenizer).FullName); System.Console.Error.Printf("%nOptions:%n"); log.Info(" -help : Print this message. See javadocs for all normalization options."); log.Info(" -atb : Tokenization for the parsing experiments in Green and Manning (2010)"); System.Environment.Exit(-1); } // Process normalization options Properties tokenizerOptions = StringUtils.ArgsToProperties(args); ITokenizerFactory <CoreLabel> tf = tokenizerOptions.Contains("atb") ? ArabicTokenizer.AtbFactory() : ArabicTokenizer.Factory(); foreach (string option in tokenizerOptions.StringPropertyNames()) { tf.SetOptions(option); } // Replace line separators with a token so that we can // count lines tf.SetOptions("tokenizeNLs"); // Read the file int nLines = 0; int nTokens = 0; try { string encoding = "UTF-8"; ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding)); bool printSpace = false; while (tokenizer.MoveNext()) { ++nTokens; string word = tokenizer.Current.Word(); if (word.Equals(ArabicLexer.NewlineToken)) { ++nLines; printSpace = false; System.Console.Out.WriteLine(); } else { if (printSpace) { System.Console.Out.Write(" "); } System.Console.Out.Write(word); printSpace = true; } } } catch (UnsupportedEncodingException e) { Sharpen.Runtime.PrintStackTrace(e); } System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens)%n", nLines, nTokens); }
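// Minimal sketch of enabling some of the normalization options listed above without going through
// command-line properties. As in Main, each option name is passed to SetOptions one at a time;
// the input string is illustrative only.
ITokenizerFactory<CoreLabel> tf = ArabicTokenizer.Factory();
tf.SetOptions("removeDiacritics");
tf.SetOptions("normArDigits");
tf.SetOptions("removeTatweel");
ITokenizer<CoreLabel> tok = tf.GetTokenizer(new StringReader("السلام عليكم"));
StringBuilder sb = new StringBuilder();
while (tok.MoveNext())
{
    sb.Append(tok.Current.Word()).Append(" ");                    // space-delimited tokens, one line of output
}
System.Console.Out.WriteLine(sb.ToString().Trim());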
private YAnchor GetAnchorValueDependent(ITokenizer tokenizer) { switch (tokenizer.Current.Value.Kind) { // check "name : anchor" case TokenKind.Indent when tokenizer.Current.Next?.Next?.Next?.Value.Kind == TokenKind.Anchor: tokenizer.MoveNext(); tokenizer.MoveNext(); tokenizer.MoveNext(); var name = tokenizer.Current.Value.Value; tokenizer.MoveNext(); var anchor = new YAnchor(this.GetNodeValue(tokenizer)); tokenizer.Anchors[name] = anchor; return(anchor); default: return(null); } }
private YScalar GetScalarValueDependent(ITokenizer tokenizer) { switch (tokenizer.Current.Value.Kind) { case TokenKind.StringDouble: case TokenKind.StringSingle: case TokenKind.StringFolding: case TokenKind.StringLiteral: { var kind = tokenizer.Current.Value.Kind; var value = tokenizer.Current.Value.Value; if (tokenizer.Current.Value.Kind == TokenKind.StringDouble) { value = YScalar.UnescapeString(value); } tokenizer.MoveNext(); var style = kind == TokenKind.StringFolding || kind == TokenKind.StringLiteral ? YNodeStyle.Block : YNodeStyle.Flow; return(new YScalar(value, style)); } case TokenKind.StringPlain: { var value = tokenizer.Current.Value.Value; tokenizer.MoveNext(); if (string.IsNullOrEmpty(value)) { return(new YScalar(null)); } return(value.Equals("null", StringComparison.OrdinalIgnoreCase) ? new YScalar(null) : new YScalar(value)); } default: return(null); } }
private INode ParseMultiplyDivide() { var left = ParseUnary(); while (true) { Func <double, double, double> operation = _tokenizer.Token switch { Token.Multiply => ((a, b) => a * b), Token.Divide => ((a, b) => a / b), _ => null }; if (operation == null) { return(left); } _tokenizer.MoveNext(); var right = ParseUnary(); left = new BinaryOperation(left, right, operation); } }
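// One rung up the same precedence ladder: a plausible ParseAddSubtract written to mirror
// ParseMultiplyDivide above. Token.Add and Token.Subtract are assumptions about the Token enum
// (only Multiply and Divide are visible here); everything else reuses the pattern and types already shown.
private INode ParseAddSubtract()
{
    var left = ParseMultiplyDivide();                        // bind * and / more tightly than + and -
    while (true)
    {
        Func<double, double, double> operation = _tokenizer.Token switch
        {
            Token.Add => ((a, b) => a + b),                   // assumed enum member
            Token.Subtract => ((a, b) => a - b),              // assumed enum member
            _ => null
        };
        if (operation == null)
        {
            return left;                                      // no more additive operators at this level
        }
        _tokenizer.MoveNext();
        var right = ParseMultiplyDivide();
        left = new BinaryOperation(left, right, operation);
    }
}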
/// <summary>Read parse trees from a Reader.</summary> /// <param name="in">Reader</param> /// <param name="tf">TreeFactory -- factory to create some kind of Tree</param> /// <param name="tn">the method of normalizing trees</param> /// <param name="st">Tokenizer that divides up Reader</param> public PennTreeReader(Reader @in, ITreeFactory tf, TreeNormalizer tn, ITokenizer <string> st) { // misuse a list as a stack, since we want to avoid the synchronized and old Stack, but don't need the power and JDK 1.6 dependency of a Deque reader = @in; treeFactory = tf; treeNormalizer = tn; tokenizer = st; // check for whacked out headers still present in Brown corpus in Treebank 3 string first = (st.MoveNext() ? st.Peek() : null); if (first != null && first.StartsWith("*x*x*x")) { int foundCount = 0; while (foundCount < 4 && st.MoveNext()) { first = st.Current; if (first != null && first.StartsWith("*x*x*x")) { foundCount++; } } } }
private YKeyValuePair ParseMappingKey(ITokenizer tokenizer) { switch (tokenizer.Current.Value.Kind) { case TokenKind.MappingKey: { tokenizer.MoveNext(); var key = this.GetNodeKey(tokenizer); if (tokenizer.Current.Value.Kind != TokenKind.MappingValue) { return(new YKeyValuePair(key, new YScalar(null))); } tokenizer.MoveNext(); var keyValuePair = new YKeyValuePair(key); var value = this.GetNodeValue(tokenizer); keyValuePair.Value = value; return(keyValuePair); } default: { var key = this.GetNodeKey(tokenizer); tokenizer.MoveNext(); var keyValuePair = new YKeyValuePair(key); var value = this.GetNodeValue(tokenizer); keyValuePair.Value = value; return(keyValuePair); } } }
public ParserTree ParseStringToTree(string input) { tokenizer = new BTokenizer(input); while (tokenizer.MoveNext()) { string currentToken = tokenizer.Current; _internalStack.DoPush(new TerminalNode(currentToken)); while (Reduce()) { } } ParserTree tree = new ParserTree(_grammar, _internalStack); return tree; }
private YDocument GetDocumentValueDependent(ITokenizer tokenizer) { if (tokenizer.Current.Value.Kind != TokenKind.Document) { return(null); } tokenizer.MoveNext(); var items = new List <YNode>(); while (tokenizer.Current.Value.Kind != TokenKind.Document && tokenizer.Current.Value.Kind != TokenKind.Eof) { items.Add(this.GetNodeValue(tokenizer)); } return(new YDocument(YNodeStyle.Block, items.ToArray())); }
private YAlias GetAliasValueDependent(ITokenizer tokenizer) { if (tokenizer.Current.Value.Kind != TokenKind.Alias) { return(null); } var anchorName = tokenizer.Current.Value.Value; if (!tokenizer.Anchors.ContainsKey(anchorName)) { throw ParseException.Tokenizer(tokenizer, $"Anchor not found: {anchorName}"); } var anchorValue = tokenizer.Anchors[anchorName]; tokenizer.MoveNext(); return(new YAlias(anchorName, anchorValue)); }
private YSequence GetSequenceValueDependent(ITokenizer tokenizer) { switch (tokenizer.Current.Value.Kind) { case TokenKind.Indent when tokenizer.Current.Next?.Value.Kind == TokenKind.SequenceValue: { var sequenceNode = new YSequence(YNodeStyle.Block); var items = new List <YNode>(); tokenizer.MoveNext(); while (tokenizer.Current.Value.Kind != TokenKind.Unindent && tokenizer.Current.Value.Kind != TokenKind.Eof) { if (tokenizer.Current.Value.Kind != TokenKind.SequenceValue) { throw ParseException.UnexpectedToken(tokenizer, TokenKind.SequenceValue); } tokenizer.MoveNext(); items.Add(this.GetNodeValue(tokenizer)); } if (tokenizer.Current.Value.Kind == TokenKind.Unindent) { tokenizer.MoveNext(); } sequenceNode.Add(items.ToArray()); return(sequenceNode); } case TokenKind.SequenceBegin: { var sequenceNode = new YSequence(YNodeStyle.Flow); var items = new List <YNode>(); tokenizer.MoveNext(); do { if (tokenizer.Current.Value.Kind == TokenKind.SequenceEnd) { break; } items.Add(this.GetNodeValue(tokenizer)); } while (tokenizer.Current.Value.Kind == TokenKind.ItemDelimiter && tokenizer.MoveNext()); if (tokenizer.Current.Value.Kind != TokenKind.SequenceEnd) { throw ParseException.UnexpectedToken(tokenizer, TokenKind.SequenceEnd); } tokenizer.MoveNext(); sequenceNode.Add(items.ToArray()); return(sequenceNode); } default: return(null); } }
/// <summary> /// usage: java ChineseDocumentToSentenceProcessor [-segmentIBM] /// -file filename [-encoding encoding] /// <p> /// The -segmentIBM option is for IBM GALE-specific splitting of an /// XML element into sentences. /// </summary> /// <exception cref="System.Exception"/> public static void Main(string[] args) { //String encoding = "GB18030"; Properties props = StringUtils.ArgsToProperties(args); // log.info("Here are the properties:"); // props.list(System.err); bool alwaysAddS = props.Contains("alwaysAddS"); Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor cp; if (!props.Contains("file")) { log.Info("usage: java ChineseDocumentToSentenceProcessor [-segmentIBM] -file filename [-encoding encoding]"); return; } cp = new Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor(); if (props.Contains("encoding")) { log.Info("WARNING: for now the default encoding is " + cp.encoding + ". It's not changeable for now"); } string input = IOUtils.SlurpFileNoExceptions(props.GetProperty("file"), cp.encoding); // String input = StringUtils.slurpGBURLNoExceptions(new URL(props.getProperty("file"))); if (props.Contains("segmentIBM")) { ITokenizer <Word> tok = WhitespaceTokenizer.NewWordWhitespaceTokenizer(new StringReader(input), true); string parseInside = props.GetProperty("parseInside"); if (parseInside == null) { parseInside = string.Empty; } Pattern p1; Pattern p2; Pattern p3; Pattern p4; PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, cp.encoding), true); StringBuilder buff = new StringBuilder(); StringBuilder sgmlbuff = new StringBuilder(); string lastSgml = string.Empty; p1 = Pattern.Compile("<.*>"); p2 = Pattern.Compile("\uFEFF?<[\\p{Alpha}]+"); p3 = Pattern.Compile("[A-Za-z0-9=\"]+>"); p4 = Pattern.Compile("<(?:" + parseInside + ")[ >]"); bool inSGML = false; int splitItems = 0; int numAdded = 0; while (tok.MoveNext()) { string s = tok.Current.Word(); // pw.println("The token is |" + s + "|"); if (p2.Matcher(s).Matches()) { inSGML = true; sgmlbuff.Append(s).Append(" "); } else { if (p1.Matcher(s).Matches() || inSGML && p3.Matcher(s).Matches() || "\n".Equals(s)) { inSGML = false; if (buff.ToString().Trim().Length > 0) { // pw.println("Dumping sentences"); // pw.println("Buff is " + buff); bool processIt = false; if (parseInside.Equals(string.Empty)) { processIt = true; } else { if (p4.Matcher(lastSgml).Find()) { processIt = true; } } if (processIt) { IList <string> sents = Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor.FromPlainText(buff.ToString(), true); // pw.println("Sents is " + sents); // pw.println(); if (alwaysAddS || sents.Count > 1) { int i = 1; foreach (string str in sents) { pw.Print("<s id=\"" + i + "\">"); pw.Print(str); pw.Println("</s>"); i++; } if (sents.Count > 1) { splitItems++; numAdded += sents.Count - 1; } } else { if (sents.Count == 1) { pw.Print(sents[0]); } } } else { pw.Print(buff); } buff = new StringBuilder(); } sgmlbuff.Append(s); // pw.println("sgmlbuff is " + sgmlbuff); pw.Print(sgmlbuff); lastSgml = sgmlbuff.ToString(); sgmlbuff = new StringBuilder(); } else { if (inSGML) { sgmlbuff.Append(s).Append(" "); } else { buff.Append(s).Append(" "); } } } } // pw.println("Buff is now |" + buff + "|"); // end while (tok.hasNext()) { // empty remaining buffers pw.Flush(); pw.Close(); log.Info("Split " + splitItems + " segments, adding " + numAdded + " sentences."); } else { IList <string> sent = Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor.FromHTML(input); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Error, cp.encoding), true); foreach (string a in sent) { pw.Println(a); } } }
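// The two static entry points used above, shown in isolation as a sketch. The sample text is
// invented; the boolean argument to FromPlainText mirrors the call in the segmentIBM branch, whose
// exact meaning is not documented here, so treat it as an assumption rather than a recommendation.
string raw = "今天天气很好。我们去公园吧。";
IList<string> sentences = Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor.FromPlainText(raw, true);
foreach (string s in sentences)
{
    System.Console.Out.WriteLine(s);                         // one sentence per line
}
// FromHTML(input) is the analogous entry point when the input is an HTML document.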
private YMapping GetMappingValueDependent(ITokenizer tokenizer) { switch (tokenizer.Current.Value.Kind) { case TokenKind.Indent when tokenizer.Current.Next?.Value.Kind == TokenKind.MappingKey: { var mappingNode = new YMapping(tokenizer.Current.Value.IndentLevel); var items = new YKeyValueList(); tokenizer.MoveNext(); while (tokenizer.Current.Value.Kind == TokenKind.MappingKey) { var keyValueNode = this.ParseMappingKey(tokenizer); items.AddNode(keyValueNode); } while (tokenizer.Current.Value.Kind == TokenKind.Unindent) { tokenizer.MoveNext(); } mappingNode.Add(items.ToNodes()); return(mappingNode); } case TokenKind.MappingValue: { var mappingNode = new YMapping(tokenizer.Current.Value.IndentLevel); tokenizer.MoveNext(); var value = this.GetNodeValue(tokenizer); mappingNode.Add(value); return(mappingNode); } case TokenKind.Indent when tokenizer.Current.Next?.Next?.Value.Kind == TokenKind.MappingValue: { var mappingNode = new YMapping(tokenizer.Current.Value.IndentLevel); var items = new YKeyValueList(); tokenizer.MoveNext(); // Add the items to the list do { var keyValueNode = this.ParseMappingKey(tokenizer); items.AddNode(keyValueNode); } while (tokenizer.Current.Value.Kind != TokenKind.Unindent && tokenizer.Current.Value.Kind != TokenKind.Eof && tokenizer.Current.Value.Kind != TokenKind.Indent && tokenizer.Current.Value.IndentLevel >= mappingNode.IndentLevel); // Remove unneeded unindent tokens while (tokenizer.Current.Value.Kind == TokenKind.Unindent) { tokenizer.MoveNext(); } // Check the nesting level if (tokenizer.Current.Value.IndentLevel != 0 && tokenizer.Current.Value.Kind == TokenKind.Indent) { while (tokenizer.Current.Value.IndentLevel == mappingNode.IndentLevel && tokenizer.Current.Value.Kind != TokenKind.Eof) { if (tokenizer.Current.Value.Kind == TokenKind.Indent) { tokenizer.MoveNext(); } var keyValueNode = this.ParseMappingKey(tokenizer); items.AddNode(keyValueNode); } } mappingNode.Add(items.ToNodes()); return(mappingNode); } case TokenKind.MappingBegin: { var mappingNode = new YMapping(tokenizer.Current.Value.IndentLevel); var items = new YKeyValueList(); tokenizer.MoveNext(); do { if (tokenizer.Current.Value.Kind == TokenKind.MappingEnd) { break; } var keyValueNode = this.ParseMappingKey(tokenizer); items.AddNode(keyValueNode); } while (tokenizer.Current.Value.Kind == TokenKind.ItemDelimiter && tokenizer.MoveNext()); if (tokenizer.Current.Value.Kind != TokenKind.MappingEnd) { throw ParseException.UnexpectedToken(tokenizer, TokenKind.MappingEnd); } tokenizer.MoveNext(); mappingNode.Add(items.ToNodes()); return(mappingNode); } default: return(null); } }
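// Cross-reference (illustrative only) between YAML surface forms and the parser branches defined in
// this file; it is derived from the switch cases above, not from separate documentation, so treat the
// pairing as a reading aid rather than a spec.
//
//   ---                      -> GetDocumentValueDependent  (TokenKind.Document)
//   key: value               -> GetMappingValueDependent   (block style, Indent lookahead)
//   { a: 1, b: 2 }           -> GetMappingValueDependent   (flow style, MappingBegin .. MappingEnd)
//   - item                   -> GetSequenceValueDependent  (block style, Indent + SequenceValue)
//   [1, 2, 3]                -> GetSequenceValueDependent  (flow style, SequenceBegin .. SequenceEnd)
//   name: &a value           -> GetAnchorValueDependent    (stores "a" in tokenizer.Anchors)
//   other: *a                -> GetAliasValueDependent     (resolves "a" or throws ParseException)
//   'x', "x", plain, |, >    -> GetScalarValueDependent    (double-quoted values are unescaped)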
// todo: give options for document splitting. A line or the whole file or sentence splitting as now public virtual IEnumerator <IList <In> > GetIterator(Reader r) { ITokenizer <In> tokenizer = tokenizerFactory.GetTokenizer(r); // PTBTokenizer.newPTBTokenizer(r, false, true); IList <In> words = new List <In>(); In previous = null; StringBuilder prepend = new StringBuilder(); /* * This changes SGML tags into whitespace -- it should maybe be moved elsewhere */ while (tokenizer.MoveNext()) { In w = tokenizer.Current; string word = w.Get(typeof(CoreAnnotations.TextAnnotation)); Matcher m = sgml.Matcher(word); if (m.Matches()) { string before = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.BeforeAnnotation))); string after = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.AfterAnnotation))); prepend.Append(before).Append(word); if (previous != null) { string previousTokenAfter = StringUtils.GetNotNullString(previous.Get(typeof(CoreAnnotations.AfterAnnotation))); previous.Set(typeof(CoreAnnotations.AfterAnnotation), previousTokenAfter + word + after); } } else { // previous.appendAfter(w.word() + w.after()); string before = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.BeforeAnnotation))); if (prepend.Length > 0) { prepend.Append(before); w.Set(typeof(CoreAnnotations.BeforeAnnotation), prepend.ToString()); prepend = new StringBuilder(); } words.Add(w); previous = w; } } IList <IList <In> > sentences = wts.Process(words); string after_1 = string.Empty; In last = null; foreach (IList <In> sentence in sentences) { int pos = 0; foreach (In w in sentence) { w.Set(typeof(CoreAnnotations.PositionAnnotation), pos.ToString()); after_1 = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.AfterAnnotation))); w.Remove(typeof(CoreAnnotations.AfterAnnotation)); last = w; } } if (last != null) { last.Set(typeof(CoreAnnotations.AfterAnnotation), after_1); } return(sentences.GetEnumerator()); }
/// <summary>A fast, rule-based tokenizer for Spanish based on AnCora.</summary> /// <remarks> /// A fast, rule-based tokenizer for Spanish based on AnCora. /// Performs punctuation splitting and light tokenization by default. /// <p> /// Currently, this tokenizer does not do line splitting. It assumes that the input /// file is delimited by the system line separator. The output will be equivalently /// delimited. /// </p> /// </remarks> /// <param name="args"/> public static void Main(string[] args) { Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs()); if (options.Contains("help")) { log.Info(Usage()); return; } // Lexer options ITokenizerFactory <CoreLabel> tf = SpanishTokenizer.CoreLabelFactory(); string orthoOptions = options.Contains("ancora") ? AncoraOptions : string.Empty; if (options.Contains("options")) { orthoOptions = orthoOptions.IsEmpty() ? options.GetProperty("options") : orthoOptions + ',' + options.GetProperty("options"); } bool tokens = PropertiesUtils.GetBool(options, "tokens", false); if (!tokens) { orthoOptions = orthoOptions.IsEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs"; } tf.SetOptions(orthoOptions); // Other options string encoding = options.GetProperty("encoding", "UTF-8"); bool toLower = PropertiesUtils.GetBool(options, "lowerCase", false); Locale es = new Locale("es"); bool onePerLine = PropertiesUtils.GetBool(options, "onePerLine", false); // Read the file from stdin int nLines = 0; int nTokens = 0; long startTime = Runtime.NanoTime(); try { ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new BufferedReader(new InputStreamReader(Runtime.@in, encoding))); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.Console.Out, encoding)); bool printSpace = false; while (tokenizer.MoveNext()) { ++nTokens; string word = tokenizer.Current.Word(); if (word.Equals(SpanishLexer.NewlineToken)) { ++nLines; if (!onePerLine) { writer.NewLine(); printSpace = false; } } else { string outputToken = toLower ? word.ToLower(es) : word; if (onePerLine) { writer.Write(outputToken); writer.NewLine(); } else { if (printSpace) { writer.Write(" "); } writer.Write(outputToken); printSpace = true; } } } } catch (UnsupportedEncodingException e) { throw new RuntimeIOException("Bad character encoding", e); } catch (IOException e) { throw new RuntimeIOException(e); } long elapsedTime = Runtime.NanoTime() - startTime; double linesPerSec = (double)nLines / (elapsedTime / 1e9); System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec); }
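// Sketch of the AnCora configuration outside of Main. AncoraOptions is written as
// SpanishTokenizer.AncoraOptions on the assumption that it is a public constant of that class
// (the code above uses it unqualified); the sample sentence is invented.
ITokenizerFactory<CoreLabel> tf = SpanishTokenizer.CoreLabelFactory();
tf.SetOptions(SpanishTokenizer.AncoraOptions + ",tokenizeNLs");
ITokenizer<CoreLabel> tok = tf.GetTokenizer(new StringReader("Dámelo, por favor."));
while (tok.MoveNext())
{
    System.Console.Out.WriteLine(tok.Current.Word());              // one token per line
}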