/// <summary>
/// Sets up iteration over plain-text sentences of the outer
/// <see cref="DocumentPreprocessor"/>: decides which tokens end a sentence,
/// builds a tokenizer over the preprocessor's input reader, and prepares the
/// optional word/tag splitting function.
/// </summary>
public PlainTextIterator(DocumentPreprocessor _enclosing)
{
    this._enclosing = _enclosing;
    // = null;
    // Establish how to find sentence boundaries
    bool eolIsSignificant = false;
    this.sentDelims = Generics.NewHashSet();
    if (this._enclosing.sentenceDelimiter == null)
    {
        // No explicit delimiter configured: boundaries are the sentence-final
        // punctuation tokens plus the tokens allowed to follow them.
        if (this._enclosing.sentenceFinalPuncWords != null)
        {
            Sharpen.Collections.AddAll(this.sentDelims, Arrays.AsList(this._enclosing.sentenceFinalPuncWords));
        }
        this.delimFollowers = Generics.NewHashSet(Arrays.AsList(this._enclosing.sentenceFinalFollowers));
    }
    else
    {
        // Explicit delimiter configured: it is the only boundary token.
        this.sentDelims.Add(this._enclosing.sentenceDelimiter);
        this.delimFollowers = Generics.NewHashSet();
        // A pure-whitespace delimiter (e.g. "\n") makes end-of-line significant.
        eolIsSignificant = DocumentPreprocessor.wsPattern.Matcher(this._enclosing.sentenceDelimiter).Matches();
        if (eolIsSignificant)
        {
            // For Stanford English Tokenizer
            this.sentDelims.Add(PTBTokenizer.GetNewlineToken());
        }
    }
    // Setup the tokenizer
    if (this._enclosing.tokenizerFactory == null)
    {
        // Whitespace tokenization; newlines are emitted as tokens only when
        // the newline token was registered above as a sentence delimiter.
        eolIsSignificant = this.sentDelims.Contains(WhitespaceLexer.Newline);
        this.tokenizer = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(this._enclosing.inputReader, eolIsSignificant);
    }
    else
    {
        if (eolIsSignificant)
        {
            // Ask the factory's tokenizer to keep newlines as tokens.
            this.tokenizer = this._enclosing.tokenizerFactory.GetTokenizer(this._enclosing.inputReader, "tokenizeNLs");
        }
        else
        {
            this.tokenizer = this._enclosing.tokenizerFactory.GetTokenizer(this._enclosing.inputReader);
        }
    }
    // If tokens are tagged, then we must split them
    // Note that if the token contains two or more instances of the delimiter, then the last
    // instance is regarded as the split point.
    if (this._enclosing.tagDelimiter == null)
    {
        this.splitTag = null;
    }
    else
    {
        this.splitTag = new _IFunction_281(this);
    }
}
/// <summary>
/// Builds a word-segmenting tokenizer over the given reader.
/// </summary>
/// <param name="r">Source of characters to tokenize.</param>
/// <param name="extraOptions">
/// Property-style option string; only <c>tokenizeNLs</c> is consulted here,
/// overriding this factory's default when present.
/// </param>
public virtual ITokenizer<IHasWord> GetTokenizer(Reader r, string extraOptions)
{
    // Start from the factory default and let extraOptions override it.
    bool newlinesAreTokens = this.tokenizeNLs;
    if (extraOptions != null)
    {
        Properties parsed = StringUtils.StringToProperties(extraOptions);
        newlinesAreTokens = PropertiesUtils.GetBool(parsed, "tokenizeNLs", this.tokenizeNLs);
    }
    var whitespaceTokenizer = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(r, newlinesAreTokens);
    return new WordSegmentingTokenizer(segmenter, whitespaceTokenizer);
}
/// <summary>
/// Command-line entry point: reads a file (or stdin) and prints its
/// whitespace-delimited tokens one per line.
/// </summary>
/// <remarks>
/// Mainly a testing aid, but also useful standalone to turn a corpus into a
/// one-token-per-line file. Pass <c>-cr</c> as the first argument to have
/// newlines reported as <c>***CR***</c> lines. The last argument, when it is
/// not <c>-cr</c>, names the UTF-8 input file; otherwise stdin is read.
/// Usage:
/// <c>java edu.stanford.nlp.process.WhitespaceTokenizer filename</c>
/// </remarks>
/// <param name="args">Command line arguments</param>
/// <exception cref="System.IO.IOException">If can't open files, etc.</exception>
public static void Main(string[] args)
{
    bool markNewlines = args.Length > 0 && args[0].Equals("-cr");
    bool lastArgIsFile = args.Length > 0 && !args[args.Length - 1].Equals("-cr");
    Reader input = lastArgIsFile
        ? new InputStreamReader(new FileInputStream(args[args.Length - 1]), "UTF-8")
        : new InputStreamReader(Runtime.@in, "UTF-8");
    WhitespaceTokenizer<Word> tokens = new WhitespaceTokenizer<Word>(new WordTokenFactory(), input, markNewlines);
    PrintWriter output = new PrintWriter(new OutputStreamWriter(System.Console.Out, "UTF-8"), true);
    while (tokens.MoveNext())
    {
        Word token = tokens.Current;
        if (token.Value().Equals(WhitespaceLexer.Newline))
        {
            // Newlines are only seen here when -cr was given.
            output.Println("***CR***");
        }
        else
        {
            output.Println(token);
        }
    }
}
/// <summary>
/// Exercises whitespace tokenization twice: first with newline tokens
/// suppressed, then with newlines kept as tokens.
/// </summary>
public virtual void TestWordTokenizer()
{
    foreach (bool keepNewlines in new[] { false, true })
    {
        var expected = keepNewlines ? ResultsEol : ResultsNoEol;
        RunTest(WhitespaceTokenizer.Factory(keepNewlines), Test, expected);
    }
}
/// <summary>
/// Creates a segmenting tokenizer whose raw token stream is a whitespace
/// tokenization of the reader (newlines are not emitted as tokens).
/// </summary>
/// <param name="segmenter">Segmenter applied to the whitespace tokens.</param>
/// <param name="r">Source of characters to tokenize.</param>
public WordSegmentingTokenizer(IWordSegmenter segmenter, Reader r)
    : this(segmenter, WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(r))
{
}
/// <summary>
/// usage: java ChineseDocumentToSentenceProcessor [-segmentIBM]
/// -file filename [-encoding encoding]
/// <p>
/// The -segmentIBM option is for IBM GALE-specific splitting of an
/// XML element into sentences.
/// </summary>
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    //String encoding = "GB18030";
    Properties props = StringUtils.ArgsToProperties(args);
    // log.info("Here are the properties:");
    // props.list(System.err);
    // When set, segments are wrapped in <s> tags even if they hold one sentence.
    bool alwaysAddS = props.Contains("alwaysAddS");
    Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor cp;
    if (!props.Contains("file"))
    {
        log.Info("usage: java ChineseDocumentToSentenceProcessor [-segmentIBM] -file filename [-encoding encoding]");
        return;
    }
    cp = new Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor();
    if (props.Contains("encoding"))
    {
        // -encoding is accepted but ignored; the processor's default is used.
        log.Info("WARNING: for now the default encoding is " + cp.encoding + ". It's not changeable for now");
    }
    string input = IOUtils.SlurpFileNoExceptions(props.GetProperty("file"), cp.encoding);
    // String input = StringUtils.slurpGBURLNoExceptions(new URL(props.getProperty("file")));
    if (props.Contains("segmentIBM"))
    {
        // IBM GALE mode: walk whitespace tokens (newlines kept as tokens),
        // separating SGML/XML markup from text and sentence-splitting text runs.
        ITokenizer<Word> tok = WhitespaceTokenizer.NewWordWhitespaceTokenizer(new StringReader(input), true);
        string parseInside = props.GetProperty("parseInside");
        if (parseInside == null)
        {
            parseInside = string.Empty;
        }
        Pattern p1;
        Pattern p2;
        Pattern p3;
        Pattern p4;
        PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, cp.encoding), true);
        StringBuilder buff = new StringBuilder();      // accumulated text awaiting sentence splitting
        StringBuilder sgmlbuff = new StringBuilder();  // accumulated markup tokens
        string lastSgml = string.Empty;                // most recently flushed markup (gates the p4 check)
        p1 = Pattern.Compile("<.*>");                  // a complete tag in a single token
        p2 = Pattern.Compile("\uFEFF?<[\\p{Alpha}]+"); // a tag opening (optionally BOM-prefixed)
        p3 = Pattern.Compile("[A-Za-z0-9=\"]+>");      // a tag ending while inside markup
        p4 = Pattern.Compile("<(?:" + parseInside + ")[ >]"); // elements whose text is processed
        bool inSGML = false;
        int splitItems = 0; // segments that produced more than one sentence
        int numAdded = 0;   // extra sentences created by splitting
        while (tok.MoveNext())
        {
            string s = tok.Current.Word();
            // pw.println("The token is |" + s + "|");
            if (p2.Matcher(s).Matches())
            {
                // A tag is opening: buffer markup until it closes.
                inSGML = true;
                sgmlbuff.Append(s).Append(" ");
            }
            else
            {
                if (p1.Matcher(s).Matches() || inSGML && p3.Matcher(s).Matches() || "\n".Equals(s))
                {
                    // Markup closed (or a newline arrived): flush the pending text run.
                    inSGML = false;
                    if (buff.ToString().Trim().Length > 0)
                    {
                        // pw.println("Dumping sentences");
                        // pw.println("Buff is " + buff);
                        // Process the text only when no parseInside filter is set,
                        // or the preceding markup matched the filter.
                        bool processIt = false;
                        if (parseInside.Equals(string.Empty))
                        {
                            processIt = true;
                        }
                        else
                        {
                            if (p4.Matcher(lastSgml).Find())
                            {
                                processIt = true;
                            }
                        }
                        if (processIt)
                        {
                            IList<string> sents = Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor.FromPlainText(buff.ToString(), true);
                            // pw.println("Sents is " + sents);
                            // pw.println();
                            if (alwaysAddS || sents.Count > 1)
                            {
                                // Emit each sentence wrapped in numbered <s> tags.
                                int i = 1;
                                foreach (string str in sents)
                                {
                                    pw.Print("<s id=\"" + i + "\">");
                                    pw.Print(str);
                                    pw.Println("</s>");
                                    i++;
                                }
                                if (sents.Count > 1)
                                {
                                    splitItems++;
                                    numAdded += sents.Count - 1;
                                }
                            }
                            else
                            {
                                if (sents.Count == 1)
                                {
                                    pw.Print(sents[0]);
                                }
                            }
                        }
                        else
                        {
                            // Outside the requested elements: pass text through untouched.
                            pw.Print(buff);
                        }
                        buff = new StringBuilder();
                    }
                    // Flush the markup itself and remember it for the next p4 check.
                    sgmlbuff.Append(s);
                    // pw.println("sgmlbuff is " + sgmlbuff);
                    pw.Print(sgmlbuff);
                    lastSgml = sgmlbuff.ToString();
                    sgmlbuff = new StringBuilder();
                }
                else
                {
                    // Ordinary token: route it to the markup or text buffer.
                    if (inSGML)
                    {
                        sgmlbuff.Append(s).Append(" ");
                    }
                    else
                    {
                        buff.Append(s).Append(" ");
                    }
                }
            }
        }
        // pw.println("Buff is now |" + buff + "|");
        // end while (tok.hasNext()) {
        // empty remaining buffers
        pw.Flush();
        pw.Close();
        log.Info("Split " + splitItems + " segments, adding " + numAdded + " sentences.");
    }
    else
    {
        // Default mode: strip HTML and print one sentence per line to stderr.
        IList<string> sent = Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor.FromHTML(input);
        PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Error, cp.encoding), true);
        foreach (string a in sent)
        {
            pw.Println(a);
        }
    }
}