public PlainTextIterator(DocumentPreprocessor _enclosing)
{
    this._enclosing = _enclosing;
    // Establish how to find sentence boundaries
    bool eolIsSignificant = false;
    this.sentDelims = Generics.NewHashSet();
    if (this._enclosing.sentenceDelimiter == null)
    {
        if (this._enclosing.sentenceFinalPuncWords != null)
        {
            Sharpen.Collections.AddAll(this.sentDelims, Arrays.AsList(this._enclosing.sentenceFinalPuncWords));
        }
        this.delimFollowers = Generics.NewHashSet(Arrays.AsList(this._enclosing.sentenceFinalFollowers));
    }
    else
    {
        this.sentDelims.Add(this._enclosing.sentenceDelimiter);
        this.delimFollowers = Generics.NewHashSet();
        eolIsSignificant = DocumentPreprocessor.wsPattern.Matcher(this._enclosing.sentenceDelimiter).Matches();
        if (eolIsSignificant)
        {
            // For the Stanford English Tokenizer
            this.sentDelims.Add(PTBTokenizer.GetNewlineToken());
        }
    }
    // Set up the tokenizer
    if (this._enclosing.tokenizerFactory == null)
    {
        eolIsSignificant = this.sentDelims.Contains(WhitespaceLexer.Newline);
        this.tokenizer = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(this._enclosing.inputReader, eolIsSignificant);
    }
    else
    {
        if (eolIsSignificant)
        {
            this.tokenizer = this._enclosing.tokenizerFactory.GetTokenizer(this._enclosing.inputReader, "tokenizeNLs");
        }
        else
        {
            this.tokenizer = this._enclosing.tokenizerFactory.GetTokenizer(this._enclosing.inputReader);
        }
    }
    // If tokens are tagged, then we must split them.
    // Note that if the token contains two or more instances of the delimiter,
    // the last instance is regarded as the split point.
    if (this._enclosing.tagDelimiter == null)
    {
        this.splitTag = null;
    }
    else
    {
        this.splitTag = new _IFunction_281(this);
    }
}
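// Hedged usage sketch, not part of the original source: it shows how the fields
// the constructor above reads (sentenceDelimiter, tagDelimiter) are typically
// populated on the enclosing DocumentPreprocessor before iteration, using the
// same setters exercised in the Main method below. The single-string constructor
// and the file name "sample.txt" are assumptions for illustration.
private static void DemoPlainTextIteration()
{
    DocumentPreprocessor dp = new DocumentPreprocessor("sample.txt");
    dp.SetSentenceDelimiter("\n");   // newline-delimited input: EOL becomes significant
    dp.SetTagDelimiter("_");         // split tagged tokens such as "dog_NN" at the last "_"
    foreach (IList<IHasWord> sentence in dp)
    {
        foreach (IHasWord w in sentence)
        {
            System.Console.Out.Write(w.Word());
            System.Console.Out.Write(' ');
        }
        System.Console.Out.WriteLine();
    }
}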
/// <summary>Test program for demonstrating the Stemmer.</summary>
/// <remarks>
/// Test program for demonstrating the Stemmer. It reads text from a
/// list of files, stems each word, and writes the result to standard
/// output. Note that the word to be stemmed is expected to be in lower case:
/// forcing lower case must be done outside the Stemmer class.
/// Usage: Stemmer file-name file-name ...
/// </remarks>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    Edu.Stanford.Nlp.Process.Stemmer s = new Edu.Stanford.Nlp.Process.Stemmer();
    if (args[0].Equals("-file"))
    {
        IEnumerator<Word> it = PTBTokenizer.NewPTBTokenizer(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
        while (it.MoveNext())
        {
            Word token = it.Current;
            System.Console.Out.Write(s.Stem(token.Word()));
            System.Console.Out.Write(' ');
        }
    }
    else
    {
        foreach (string arg in args)
        {
            System.Console.Out.Write(s.Stem(arg));
            System.Console.Out.Write(' ');
        }
    }
    System.Console.Out.WriteLine();
}
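// Hedged example, not in the original source: programmatic use of the Stemmer
// exercised by the Main method above. Per the remarks, lower-casing is the
// caller's responsibility; the sample words are illustrative only.
private static void DemoStemmer()
{
    Edu.Stanford.Nlp.Process.Stemmer stemmer = new Edu.Stanford.Nlp.Process.Stemmer();
    string[] words = { "Running", "Flies", "Happily" };
    foreach (string w in words)
    {
        // Stem expects lower-case input, so force it here before stemming.
        System.Console.Out.WriteLine(w + " -> " + stemmer.Stem(w.ToLower()));
    }
}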
/// <summary>A simple, deterministic sentence-splitter.</summary>
/// <remarks>
/// A simple, deterministic sentence-splitter. This method only supports the English
/// tokenizer, so for other languages you should run the tokenizer first and then
/// run this sentence splitter with the "-whitespaceTokenization" option.
/// </remarks>
/// <param name="args">Command-line arguments</param>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());
    if (options.Contains("help"))
    {
        log.Info(Usage());
        return;
    }
    // Command-line flags
    string encoding = options.GetProperty("encoding", "utf-8");
    bool printSentenceLengths = PropertiesUtils.GetBool(options, "printSentenceLengths", false);
    string xmlElementDelimiter = options.GetProperty("xml", null);
    DocumentPreprocessor.DocType docType = xmlElementDelimiter == null ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;
    string sentenceDelimiter = options.Contains("noTokenization") ? Runtime.GetProperty("line.separator") : null;
    string sDelim = options.GetProperty("sentenceDelimiter");
    if (sDelim != null)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(sDelim, "newline"))
        {
            sentenceDelimiter = "\n";
        }
        else
        {
            sentenceDelimiter = sDelim;
        }
    }
    string tagDelimiter = options.GetProperty("tag", null);
    string[] sentenceDelims = null;
    // Set up the TokenizerFactory: at most one tokenizer flag may be given
    int numFactoryFlags = 0;
    bool suppressEscaping = options.Contains("suppressEscaping");
    if (suppressEscaping)
    {
        numFactoryFlags += 1;
    }
    bool customTokenizer = options.Contains("tokenizerOptions");
    if (customTokenizer)
    {
        numFactoryFlags += 1;
    }
    bool printOriginalText = options.Contains("printOriginalText");
    if (printOriginalText)
    {
        numFactoryFlags += 1;
    }
    bool whitespaceTokenization = options.Contains("whitespaceTokenization");
    if (whitespaceTokenization)
    {
        numFactoryFlags += 1;
    }
    if (numFactoryFlags > 1)
    {
        log.Info("Only one tokenizer flag allowed at a time: ");
        log.Info("  -suppressEscaping, -tokenizerOptions, -printOriginalText, -whitespaceTokenization");
        return;
    }
    ITokenizerFactory<IHasWord> tf = null;
    if (suppressEscaping)
    {
        tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "ptb3Escaping=false");
    }
    else if (customTokenizer)
    {
        tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), options.GetProperty("tokenizerOptions"));
    }
    else if (printOriginalText)
    {
        tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "invertible=true");
    }
    else if (whitespaceTokenization)
    {
        IList<string> whitespaceDelims = new List<string>(Arrays.AsList(DocumentPreprocessor.DefaultSentenceDelims));
        whitespaceDelims.Add(WhitespaceLexer.Newline);
        sentenceDelims = Sharpen.Collections.ToArray(whitespaceDelims, new string[whitespaceDelims.Count]);
    }
    else
    {
        tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
    }
    string fileList = options.GetProperty(string.Empty, null);
    string[] files = fileList == null ? new string[1] : fileList.Split("\\s+");
    int numSents = 0;
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, encoding), true);
    foreach (string file in files)
    {
        DocumentPreprocessor docPreprocessor;
        if (file == null || file.IsEmpty())
        {
            // No file given: read from standard input
            docPreprocessor = new DocumentPreprocessor(new InputStreamReader(Runtime.@in, encoding));
        }
        else
        {
            docPreprocessor = new DocumentPreprocessor(file, docType, encoding);
        }
        if (docType == DocumentPreprocessor.DocType.Xml)
        {
            docPreprocessor.SetElementDelimiter(xmlElementDelimiter);
        }
        docPreprocessor.SetTokenizerFactory(tf);
        if (sentenceDelimiter != null)
        {
            docPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
        }
        if (tagDelimiter != null)
        {
            docPreprocessor.SetTagDelimiter(tagDelimiter);
        }
        if (sentenceDelims != null)
        {
            docPreprocessor.SetSentenceFinalPuncWords(sentenceDelims);
        }
        foreach (IList<IHasWord> sentence in docPreprocessor)
        {
            numSents++;
            if (printSentenceLengths)
            {
                System.Console.Error.Printf("Length: %d%n", sentence.Count);
            }
            bool printSpace = false;
            foreach (IHasWord word in sentence)
            {
                if (printOriginalText)
                {
                    CoreLabel cl = (CoreLabel)word;
                    if (!printSpace)
                    {
                        pw.Print(cl.Get(typeof(CoreAnnotations.BeforeAnnotation)));
                        printSpace = true;
                    }
                    pw.Print(cl.Get(typeof(CoreAnnotations.OriginalTextAnnotation)));
                    pw.Print(cl.Get(typeof(CoreAnnotations.AfterAnnotation)));
                }
                else
                {
                    if (printSpace)
                    {
                        pw.Print(" ");
                    }
                    printSpace = true;
                    pw.Print(word.Word());
                }
            }
            pw.Println();
        }
    }
    pw.Close();
    System.Console.Error.Printf("Read in %d sentences.%n", numSents);
}
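// Hedged invocation sketch, not part of the original source. Given the flags
// parsed in Main above, typical command lines for this entry point look like
// the following (file names and the XML element name are illustrative):
//
//   DocumentPreprocessor input.txt
//   DocumentPreprocessor -xml p -encoding utf-8 input.xml        (split text inside <p> elements)
//   DocumentPreprocessor -whitespaceTokenization pre-tokenized.txt
//   DocumentPreprocessor -tag _ tagged.txt                       (split "word_TAG" tokens at the last "_")
//
// As enforced above, at most one of -suppressEscaping, -tokenizerOptions,
// -printOriginalText, and -whitespaceTokenization may be supplied per run;
// with no file arguments, input is read from standard input.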
private void PrimeNext()
{
    if (this._enclosing.inputReader == null)
    {
        // We have already run out of input and closed the input reader, so just return.
        return;
    }
    this.nextSent = Generics.NewArrayList(this.nextSentCarryover);
    this.nextSentCarryover.Clear();
    bool seenBoundary = false;
    if (!this.tokenizer.MoveNext())
    {
        IOUtils.CloseIgnoringExceptions(this._enclosing.inputReader);
        this._enclosing.inputReader = null;
        // Setting nextSent = null outright would be wrong here:
        // there may be something in it from the nextSentCarryover.
        if (this.nextSent.IsEmpty())
        {
            this.nextSent = null;
        }
        return;
    }
    do
    {
        IHasWord token = this.tokenizer.Current;
        if (this.splitTag != null)
        {
            string[] toks = this.splitTag.Apply(token.Word());
            token.SetWord(toks[0]);
            if (token is ILabel)
            {
                ((ILabel)token).SetValue(toks[0]);
            }
            if (toks.Length == 2 && token is IHasTag)
            {
                // wsg2011: Some of the underlying tokenizers return old
                // JavaNLP labels. We could convert to CoreLabel here, but
                // we choose a conservative implementation....
                ((IHasTag)token).SetTag(toks[1]);
            }
        }
        if (this.sentDelims.Contains(token.Word()))
        {
            seenBoundary = true;
        }
        else if (seenBoundary && !this.delimFollowers.Contains(token.Word()))
        {
            this.nextSentCarryover.Add(token);
            break;
        }
        if (!(DocumentPreprocessor.wsPattern.Matcher(token.Word()).Matches() || token.Word().Equals(PTBTokenizer.GetNewlineToken())))
        {
            this.nextSent.Add(token);
        }
        // If there are no words that can follow a sentence delimiter,
        // then there are two cases. In one case we already have a
        // sentence, in which case there is no reason to look at the
        // next token, since that just causes buffering without any
        // chance of the current sentence being extended, since
        // delimFollowers = {}. In the other case, we have an empty
        // sentence, which at this point means the sentence delimiter
        // was a whitespace token such as \n. We might as well keep
        // going as if we had never seen anything.
        if (seenBoundary && this.delimFollowers.IsEmpty())
        {
            if (!this.nextSent.IsEmpty() || this._enclosing.keepEmptySentences)
            {
                break;
            }
            else
            {
                seenBoundary = false;
            }
        }
    } while (this.tokenizer.MoveNext());
    if (this.nextSent.IsEmpty() && this.nextSentCarryover.IsEmpty() && !this._enclosing.keepEmptySentences)
    {
        IOUtils.CloseIgnoringExceptions(this._enclosing.inputReader);
        this._enclosing.inputReader = null;
        this.nextSent = null;
    }
    else if (this._enclosing.escaper != null)
    {
        this.nextSent = this._enclosing.escaper.Apply(this.nextSent);
    }
}
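// Worked trace, not in the original source, illustrating the loop in PrimeNext()
// above under the assumption sentDelims = {".", "!", "?"} and
// delimFollowers = {")", "''"} (typical English settings):
//
//   Input tokens:  He  left  .  ''  She  stayed  .
//
//   - "He" and "left" are ordinary tokens and are appended to nextSent.
//   - "." is in sentDelims, so seenBoundary becomes true; since it is not
//     whitespace, it is also appended to nextSent.
//   - "''" is in delimFollowers, so it still belongs to the current sentence
//     and is appended after the boundary.
//   - "She" is neither a delimiter nor a follower while seenBoundary is true,
//     so it is stashed in nextSentCarryover and the loop breaks; the next call
//     to PrimeNext() seeds the following sentence with it.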