/// <summary>
/// Constructs an iterator over sentences found inside the XML elements of the
/// enclosing preprocessor's input stream.
/// </summary>
/// <param name="_enclosing">The DocumentPreprocessor whose reader and element delimiter are used.</param>
public XMLIterator(DocumentPreprocessor _enclosing)
{
    this._enclosing = _enclosing;
    // Wrap the raw reader so we only see text between the element delimiters.
    this.xmlItr = new XMLBeginEndIterator<string>(_enclosing.inputReader, _enclosing.elementDelimiter);
    this.originalDocReader = _enclosing.inputReader;
    // Eagerly load the first sentence so the iterator protocol works from the start.
    this.PrimeNext();
}
/// <summary>
/// Constructs a sentence iterator over the enclosing preprocessor's plain-text input.
/// Decides (1) which tokens end a sentence, (2) which tokenizer to use, and
/// (3) whether tagged tokens must later be split into word/tag pairs.
/// </summary>
/// <param name="_enclosing">The DocumentPreprocessor whose settings and reader are used.</param>
public PlainTextIterator(DocumentPreprocessor _enclosing)
{
    this._enclosing = _enclosing;
    // Establish how to find sentence boundaries
    bool eolIsSignificant = false;
    this.sentDelims = Generics.NewHashSet();
    if (this._enclosing.sentenceDelimiter == null)
    {
        // No explicit delimiter: sentences end at sentence-final punctuation tokens,
        // optionally extended by "follower" tokens (e.g. closing quotes/brackets).
        if (this._enclosing.sentenceFinalPuncWords != null)
        {
            Sharpen.Collections.AddAll(this.sentDelims, Arrays.AsList(this._enclosing.sentenceFinalPuncWords));
        }
        this.delimFollowers = Generics.NewHashSet(Arrays.AsList(this._enclosing.sentenceFinalFollowers));
    }
    else
    {
        // Explicit delimiter: it is the sole sentence boundary; no followers apply.
        this.sentDelims.Add(this._enclosing.sentenceDelimiter);
        this.delimFollowers = Generics.NewHashSet();
        // If the delimiter itself is whitespace (e.g. "\n"), newlines must survive tokenization.
        eolIsSignificant = DocumentPreprocessor.wsPattern.Matcher(this._enclosing.sentenceDelimiter).Matches();
        if (eolIsSignificant)
        {
            // For Stanford English Tokenizer
            this.sentDelims.Add(PTBTokenizer.GetNewlineToken());
        }
    }
    // Setup the tokenizer
    if (this._enclosing.tokenizerFactory == null)
    {
        // A null factory selects plain whitespace tokenization; recompute
        // newline significance from the delimiter set built above.
        eolIsSignificant = this.sentDelims.Contains(WhitespaceLexer.Newline);
        this.tokenizer = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(this._enclosing.inputReader, eolIsSignificant);
    }
    else
    {
        if (eolIsSignificant)
        {
            this.tokenizer = this._enclosing.tokenizerFactory.GetTokenizer(this._enclosing.inputReader, "tokenizeNLs");
        }
        else
        {
            this.tokenizer = this._enclosing.tokenizerFactory.GetTokenizer(this._enclosing.inputReader);
        }
    }
    // If tokens are tagged, then we must split them
    // Note that if the token contains two or more instances of the delimiter, then the last
    // instance is regarded as the split point.
    if (this._enclosing.tagDelimiter == null)
    {
        this.splitTag = null;
    }
    else
    {
        // _IFunction_281 splits "word<tag-delimiter>tag" at the LAST delimiter occurrence.
        this.splitTag = new _IFunction_281(this);
    }
}
/// <summary>
/// Runs the preprocessor in XML mode over <paramref name="input"/>, extracting the
/// text inside <paramref name="element"/> tags, and checks the sentences produced.
/// </summary>
/// <param name="input">Raw XML document text.</param>
/// <param name="element">Name of the element whose content is tokenized.</param>
/// <param name="expectedResults">Expected sentence strings, in order.</param>
private static void CompareXMLResults(string input, string element, params string[] expectedResults)
{
    DocumentPreprocessor document = new DocumentPreprocessor(new BufferedReader(new StringReader(input)), DocumentPreprocessor.DocType.Xml);
    document.SetElementDelimiter(element);
    List<string> actual = new List<string>();
    foreach (IList<IHasWord> sentence in document)
    {
        actual.Add(SentenceUtils.ListToString(sentence));
    }
    NUnit.Framework.Assert.AreEqual(expectedResults.Length, actual.Count);
    for (int i = 0; i < actual.Count; ++i)
    {
        NUnit.Framework.Assert.AreEqual(expectedResults[i], actual[i]);
    }
}
/// <summary>
/// Runs the preprocessor over <paramref name="input"/> with optional custom
/// sentence-final punctuation and optional whitespace tokenization, then compares
/// the sentences produced against <paramref name="expected"/>.
/// </summary>
/// <param name="input">Raw text to split into sentences.</param>
/// <param name="expected">Expected sentence strings, in order.</param>
/// <param name="sentenceFinalPuncWords">Custom sentence-ending tokens, or null for the defaults.</param>
/// <param name="whitespaceTokenize">If true, tokenize on whitespace with newline as the sentence delimiter.</param>
private static void RunTest(string input, string[] expected, string[] sentenceFinalPuncWords, bool whitespaceTokenize)
{
    DocumentPreprocessor document = new DocumentPreprocessor(new BufferedReader(new StringReader(input)));
    if (sentenceFinalPuncWords != null)
    {
        document.SetSentenceFinalPuncWords(sentenceFinalPuncWords);
    }
    if (whitespaceTokenize)
    {
        // A null tokenizer factory selects whitespace tokenization.
        document.SetTokenizerFactory(null);
        document.SetSentenceDelimiter("\n");
    }
    IList<string> actual = new List<string>();
    foreach (IList<IHasWord> sentence in document)
    {
        actual.Add(SentenceUtils.ListToString(sentence));
    }
    NUnit.Framework.Assert.AreEqual("Should be " + expected.Length + " sentences but got " + actual.Count + ": " + actual, expected.Length, actual.Count);
    for (int i = 0; i < actual.Count; ++i)
    {
        NUnit.Framework.Assert.AreEqual("Failed on sentence " + i, expected[i], actual[i]);
    }
}
/// <summary>
/// Checks whitespace tokenization with a newline sentence delimiter, and that the
/// iterator is well-behaved: MoveNext() does not consume extra text when repeated,
/// keeps returning false after exhaustion, and reading Current past the end throws
/// NoSuchElementException.
/// </summary>
public virtual void TestPlainTextIterator()
{
    string test = "This is a one line test . \n";
    string[] expectedResults = new string[] { "This", "is", "a", "one", "line", "test", "." };
    DocumentPreprocessor document = new DocumentPreprocessor(new BufferedReader(new StringReader(test)));
    // Null factory => whitespace tokenization; newline ends a sentence.
    document.SetTokenizerFactory(null);
    document.SetSentenceDelimiter("\n");
    IEnumerator<IList<IHasWord>> iterator = document.GetEnumerator();
    // we test twice because this call should not eat any text
    NUnit.Framework.Assert.IsTrue(iterator.MoveNext());
    NUnit.Framework.Assert.IsTrue(iterator.MoveNext());
    IList<IHasWord> words = iterator.Current;
    NUnit.Framework.Assert.AreEqual(expectedResults.Length, words.Count);
    for (int i = 0; i < expectedResults.Length; ++i)
    {
        NUnit.Framework.Assert.AreEqual(expectedResults[i], words[i].Word());
    }
    // we test twice to make sure we don't blow up on multiple calls
    NUnit.Framework.Assert.IsFalse(iterator.MoveNext());
    NUnit.Framework.Assert.IsFalse(iterator.MoveNext());
    try
    {
        // BUG FIX: a bare property access ("iterator.Current;") is not a legal C#
        // statement (CS0201); assign to a local so the past-the-end read compiles
        // while keeping the original intent of forcing the exception.
        IList<IHasWord> unused = iterator.Current;
        throw new AssertionError("iterator.next() should have blown up");
    }
    catch (NoSuchElementException)
    {
        // yay, this is what we want
    }
    // just in case
    NUnit.Framework.Assert.IsFalse(iterator.MoveNext());
}
/// <summary>A simple, deterministic sentence-splitter.</summary>
/// <remarks>
/// A simple, deterministic sentence-splitter. This method only supports the English
/// tokenizer, so for other languages you should run the tokenizer first and then
/// run this sentence splitter with the "-whitespaceTokenization" option.
/// </remarks>
/// <param name="args">Command-line arguments</param>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());
    if (options.Contains("help"))
    {
        log.Info(Usage());
        return;
    }
    // Command-line flags
    string encoding = options.GetProperty("encoding", "utf-8");
    bool printSentenceLengths = PropertiesUtils.GetBool(options, "printSentenceLengths", false);
    string xmlElementDelimiter = options.GetProperty("xml", null);
    // Presence of -xml switches the whole run into XML mode.
    DocumentPreprocessor.DocType docType = xmlElementDelimiter == null ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;
    // -noTokenization implies one sentence per input line (platform line separator).
    string sentenceDelimiter = options.Contains("noTokenization") ? Runtime.GetProperty("line.separator") : null;
    string sDelim = options.GetProperty("sentenceDelimiter");
    if (sDelim != null)
    {
        // "newline" is accepted (case-insensitively) as an alias for "\n".
        if (Sharpen.Runtime.EqualsIgnoreCase(sDelim, "newline"))
        {
            sentenceDelimiter = "\n";
        }
        else
        {
            sentenceDelimiter = sDelim;
        }
    }
    string tagDelimiter = options.GetProperty("tag", null);
    string[] sentenceDelims = null;
    // Setup the TokenizerFactory: the four tokenizer flags are mutually exclusive,
    // so count how many the user supplied before honoring any of them.
    int numFactoryFlags = 0;
    bool suppressEscaping = options.Contains("suppressEscaping");
    if (suppressEscaping)
    {
        numFactoryFlags += 1;
    }
    bool customTokenizer = options.Contains("tokenizerOptions");
    if (customTokenizer)
    {
        numFactoryFlags += 1;
    }
    bool printOriginalText = options.Contains("printOriginalText");
    if (printOriginalText)
    {
        numFactoryFlags += 1;
    }
    bool whitespaceTokenization = options.Contains("whitespaceTokenization");
    if (whitespaceTokenization)
    {
        numFactoryFlags += 1;
    }
    if (numFactoryFlags > 1)
    {
        log.Info("Only one tokenizer flag allowed at a time: ");
        log.Info(" -suppressEscaping, -tokenizerOptions, -printOriginalText, -whitespaceTokenization");
        return;
    }
    ITokenizerFactory<IHasWord> tf = null;
    if (suppressEscaping)
    {
        tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "ptb3Escaping=false");
    }
    else
    {
        if (customTokenizer)
        {
            tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), options.GetProperty("tokenizerOptions"));
        }
        else
        {
            if (printOriginalText)
            {
                // invertible=true keeps Before/OriginalText/After annotations so the
                // exact input text can be reproduced in the output loop below.
                tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "invertible=true");
            }
            else
            {
                if (whitespaceTokenization)
                {
                    // Whitespace mode: tf stays null; sentences end at the default
                    // final punctuation plus newline.
                    IList<string> whitespaceDelims = new List<string>(Arrays.AsList(DocumentPreprocessor.DefaultSentenceDelims));
                    whitespaceDelims.Add(WhitespaceLexer.Newline);
                    sentenceDelims = Sharpen.Collections.ToArray(whitespaceDelims, new string[whitespaceDelims.Count]);
                }
                else
                {
                    // Default: standard PTB tokenizer.
                    tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
                }
            }
        }
    }
    // Remaining (non-flag) arguments are the input files; none means read stdin
    // (the single-null-element array triggers the stdin branch below).
    string fileList = options.GetProperty(string.Empty, null);
    string[] files = fileList == null ? new string[1] : fileList.Split("\\s+");
    int numSents = 0;
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, encoding), true);
    foreach (string file in files)
    {
        DocumentPreprocessor docPreprocessor;
        if (file == null || file.IsEmpty())
        {
            docPreprocessor = new DocumentPreprocessor(new InputStreamReader(Runtime.@in, encoding));
        }
        else
        {
            docPreprocessor = new DocumentPreprocessor(file, docType, encoding);
        }
        if (docType == DocumentPreprocessor.DocType.Xml)
        {
            docPreprocessor.SetElementDelimiter(xmlElementDelimiter);
        }
        docPreprocessor.SetTokenizerFactory(tf);
        if (sentenceDelimiter != null)
        {
            docPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
        }
        if (tagDelimiter != null)
        {
            docPreprocessor.SetTagDelimiter(tagDelimiter);
        }
        if (sentenceDelims != null)
        {
            docPreprocessor.SetSentenceFinalPuncWords(sentenceDelims);
        }
        foreach (IList<IHasWord> sentence in docPreprocessor)
        {
            numSents++;
            if (printSentenceLengths)
            {
                System.Console.Error.Printf("Length: %d%n", sentence.Count);
            }
            bool printSpace = false;
            foreach (IHasWord word in sentence)
            {
                if (printOriginalText)
                {
                    // Invertible tokenization: emit Before + OriginalText + After
                    // to reconstruct the exact original text (Before only once,
                    // at the start of the sentence).
                    CoreLabel cl = (CoreLabel)word;
                    if (!printSpace)
                    {
                        pw.Print(cl.Get(typeof(CoreAnnotations.BeforeAnnotation)));
                        printSpace = true;
                    }
                    pw.Print(cl.Get(typeof(CoreAnnotations.OriginalTextAnnotation)));
                    pw.Print(cl.Get(typeof(CoreAnnotations.AfterAnnotation)));
                }
                else
                {
                    // Plain mode: single space between tokens, one sentence per line.
                    if (printSpace)
                    {
                        pw.Print(" ");
                    }
                    printSpace = true;
                    pw.Print(word.Word());
                }
            }
            pw.Println();
        }
    }
    pw.Close();
    System.Console.Error.Printf("Read in %d sentences.%n", numSents);
}