// Example #1 (0 votes)
 /// <summary>
 /// Builds an iterator over the XML-delimited chunks of the enclosing
 /// DocumentPreprocessor's input reader and primes the first element.
 /// </summary>
 /// <param name="_enclosing">The DocumentPreprocessor whose inputReader and elementDelimiter drive iteration.</param>
 public XMLIterator(DocumentPreprocessor _enclosing)
 {
     this._enclosing = _enclosing;
     // NOTE(review): the two "= null;" stubs below are Sharpen artifacts — the Java field
     // initializers were dropped; the fields simply start at their default (null) values.
     // Wrap the raw reader in an iterator that yields the text between element delimiters.
     this.xmlItr            = new XMLBeginEndIterator <string>(this._enclosing.inputReader, this._enclosing.elementDelimiter);
     // Keep a handle on the underlying reader (presumably so it can be restored/closed later — TODO confirm).
     this.originalDocReader = this._enclosing.inputReader;
     // Eagerly fetch the first sentence so MoveNext/Current work immediately.
     this.PrimeNext();
 }
// Example #2 (0 votes)
            /// <summary>
            /// Configures sentence-boundary detection and tokenization for plain-text input,
            /// based on the settings of the enclosing DocumentPreprocessor.
            /// </summary>
            /// <param name="_enclosing">The DocumentPreprocessor whose delimiters, tokenizer factory, and reader are used.</param>
            public PlainTextIterator(DocumentPreprocessor _enclosing)
            {
                this._enclosing = _enclosing;
                // NOTE(review): "= null;" stub is a Sharpen artifact — the dropped field starts at its default (null).
                // Establish how to find sentence boundaries
                bool eolIsSignificant = false;

                this.sentDelims = Generics.NewHashSet();
                if (this._enclosing.sentenceDelimiter == null)
                {
                    // No explicit delimiter: split on sentence-final punctuation words plus their legal followers.
                    if (this._enclosing.sentenceFinalPuncWords != null)
                    {
                        Sharpen.Collections.AddAll(this.sentDelims, Arrays.AsList(this._enclosing.sentenceFinalPuncWords));
                    }
                    this.delimFollowers = Generics.NewHashSet(Arrays.AsList(this._enclosing.sentenceFinalFollowers));
                }
                else
                {
                    // Explicit delimiter: it alone marks sentence boundaries; no follower set.
                    this.sentDelims.Add(this._enclosing.sentenceDelimiter);
                    this.delimFollowers = Generics.NewHashSet();
                    // If the delimiter is pure whitespace (e.g. "\n"), newline tokens must be preserved.
                    eolIsSignificant    = DocumentPreprocessor.wsPattern.Matcher(this._enclosing.sentenceDelimiter).Matches();
                    if (eolIsSignificant)
                    {
                        // For Stanford English Tokenizer
                        this.sentDelims.Add(PTBTokenizer.GetNewlineToken());
                    }
                }
                // Setup the tokenizer
                if (this._enclosing.tokenizerFactory == null)
                {
                    // No factory: fall back to whitespace tokenization. Re-derive newline significance
                    // from the delimiter set using the whitespace lexer's own newline token.
                    eolIsSignificant = this.sentDelims.Contains(WhitespaceLexer.Newline);
                    this.tokenizer   = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(this._enclosing.inputReader, eolIsSignificant);
                }
                else
                {
                    if (eolIsSignificant)
                    {
                        // "tokenizeNLs" asks the factory-built tokenizer to emit newline tokens.
                        this.tokenizer = this._enclosing.tokenizerFactory.GetTokenizer(this._enclosing.inputReader, "tokenizeNLs");
                    }
                    else
                    {
                        this.tokenizer = this._enclosing.tokenizerFactory.GetTokenizer(this._enclosing.inputReader);
                    }
                }
                // If tokens are tagged, then we must split them
                // Note that if the token contains two or more instances of the delimiter, then the last
                // instance is regarded as the split point.
                if (this._enclosing.tagDelimiter == null)
                {
                    this.splitTag = null;
                }
                else
                {
                    // _IFunction_281 is the Sharpen-generated anonymous class that performs the word/tag split.
                    this.splitTag = new _IFunction_281(this);
                }
            }
        /// <summary>
        /// Runs DocumentPreprocessor in XML mode over <paramref name="input"/>, extracting text
        /// inside <paramref name="element"/> tags, and asserts the resulting sentences match
        /// <paramref name="expectedResults"/> exactly (count and content).
        /// </summary>
        private static void CompareXMLResults(string input, string element, params string[] expectedResults)
        {
            DocumentPreprocessor processor = new DocumentPreprocessor(new BufferedReader(new StringReader(input)), DocumentPreprocessor.DocType.Xml);
            processor.SetElementDelimiter(element);

            // Flatten each extracted sentence to a plain string for comparison.
            List <string> actual = new List <string>();
            foreach (IList <IHasWord> sentence in processor)
            {
                actual.Add(SentenceUtils.ListToString(sentence));
            }

            NUnit.Framework.Assert.AreEqual(expectedResults.Length, actual.Count);
            for (int idx = 0; idx < actual.Count; ++idx)
            {
                NUnit.Framework.Assert.AreEqual(expectedResults[idx], actual[idx]);
            }
        }
        /// <summary>
        /// Runs DocumentPreprocessor over <paramref name="input"/> with optional custom
        /// sentence-final punctuation and optional whitespace tokenization, asserting the
        /// produced sentences match <paramref name="expected"/>.
        /// </summary>
        private static void RunTest(string input, string[] expected, string[] sentenceFinalPuncWords, bool whitespaceTokenize)
        {
            DocumentPreprocessor processor = new DocumentPreprocessor(new BufferedReader(new StringReader(input)));
            if (sentenceFinalPuncWords != null)
            {
                processor.SetSentenceFinalPuncWords(sentenceFinalPuncWords);
            }
            if (whitespaceTokenize)
            {
                // A null factory forces whitespace tokenization; newlines then delimit sentences.
                processor.SetTokenizerFactory(null);
                processor.SetSentenceDelimiter("\n");
            }

            IList <string> actual = new List <string>();
            foreach (IList <IHasWord> sentence in processor)
            {
                actual.Add(SentenceUtils.ListToString(sentence));
            }

            NUnit.Framework.Assert.AreEqual("Should be " + expected.Length + " sentences but got " + actual.Count + ": " + actual, expected.Length, actual.Count);
            for (int idx = 0; idx < actual.Count; ++idx)
            {
                NUnit.Framework.Assert.AreEqual("Failed on sentence " + idx, expected[idx], actual[idx]);
            }
        }
        /// <summary>
        /// Tests plain-text iteration with whitespace tokenization and "\n" as the sentence
        /// delimiter, including repeated MoveNext calls at both ends of the stream.
        /// </summary>
        public virtual void TestPlainTextIterator()
        {
            string test = "This is a one line test . \n";

            string[]             expectedResults = new string[] { "This", "is", "a", "one", "line", "test", "." };
            DocumentPreprocessor document        = new DocumentPreprocessor(new BufferedReader(new StringReader(test)));

            document.SetTokenizerFactory(null);
            document.SetSentenceDelimiter("\n");
            IEnumerator <IList <IHasWord> > iterator = document.GetEnumerator();

            // we test twice because this call should not eat any text
            // NOTE(review): this mirrors Java's hasNext(); a standard .NET IEnumerator would
            // advance on each MoveNext — presumably this enumerator is hasNext-like. TODO confirm.
            NUnit.Framework.Assert.IsTrue(iterator.MoveNext());
            NUnit.Framework.Assert.IsTrue(iterator.MoveNext());
            IList <IHasWord> words = iterator.Current;

            NUnit.Framework.Assert.AreEqual(expectedResults.Length, words.Count);
            for (int i = 0; i < expectedResults.Length; ++i)
            {
                NUnit.Framework.Assert.AreEqual(expectedResults[i], words[i].Word());
            }
            // we test twice to make sure we don't blow up on multiple calls
            NUnit.Framework.Assert.IsFalse(iterator.MoveNext());
            NUnit.Framework.Assert.IsFalse(iterator.MoveNext());
            try
            {
                // Fix: a bare `iterator.Current;` is not a legal C# statement (CS0201).
                // Assign to a local so the property getter still executes and can throw.
                IList <IHasWord> unused = iterator.Current;
                throw new AssertionError("iterator.next() should have blown up");
            }
            catch (NoSuchElementException)
            {
            }
            // yay, this is what we want
            // just in case
            NUnit.Framework.Assert.IsFalse(iterator.MoveNext());
        }
// Example #6 (0 votes)
        /// <summary>A simple, deterministic sentence-splitter.</summary>
        /// <remarks>
        /// A simple, deterministic sentence-splitter. This method only supports the English
        /// tokenizer, so for other languages you should run the tokenizer first and then
        /// run this sentence splitter with the "-whitespaceTokenization" option.
        /// </remarks>
        /// <param name="args">Command-line arguments</param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());

            if (options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }
            // Command-line flags
            string encoding             = options.GetProperty("encoding", "utf-8");
            bool   printSentenceLengths = PropertiesUtils.GetBool(options, "printSentenceLengths", false);
            string xmlElementDelimiter  = options.GetProperty("xml", null);

            // -xml implies XML document handling; otherwise input is treated as plain text.
            DocumentPreprocessor.DocType docType = xmlElementDelimiter == null ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;
            // -noTokenization: one sentence per line, delimited by the platform line separator.
            string sentenceDelimiter             = options.Contains("noTokenization") ? Runtime.GetProperty("line.separator") : null;
            string sDelim = options.GetProperty("sentenceDelimiter");

            if (sDelim != null)
            {
                // "newline" is accepted as a symbolic name; anything else is used literally.
                if (Sharpen.Runtime.EqualsIgnoreCase(sDelim, "newline"))
                {
                    sentenceDelimiter = "\n";
                }
                else
                {
                    sentenceDelimiter = sDelim;
                }
            }
            string tagDelimiter = options.GetProperty("tag", null);

            string[] sentenceDelims = null;
            // Setup the TokenizerFactory
            // The four tokenizer flags are mutually exclusive; count how many were supplied.
            int  numFactoryFlags  = 0;
            bool suppressEscaping = options.Contains("suppressEscaping");

            if (suppressEscaping)
            {
                numFactoryFlags += 1;
            }
            bool customTokenizer = options.Contains("tokenizerOptions");

            if (customTokenizer)
            {
                numFactoryFlags += 1;
            }
            bool printOriginalText = options.Contains("printOriginalText");

            if (printOriginalText)
            {
                numFactoryFlags += 1;
            }
            bool whitespaceTokenization = options.Contains("whitespaceTokenization");

            if (whitespaceTokenization)
            {
                numFactoryFlags += 1;
            }
            if (numFactoryFlags > 1)
            {
                log.Info("Only one tokenizer flag allowed at a time: ");
                log.Info("  -suppressEscaping, -tokenizerOptions, -printOriginalText, -whitespaceTokenization");
                return;
            }
            ITokenizerFactory <IHasWord> tf = null;

            if (suppressEscaping)
            {
                tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "ptb3Escaping=false");
            }
            else
            {
                if (customTokenizer)
                {
                    tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), options.GetProperty("tokenizerOptions"));
                }
                else
                {
                    if (printOriginalText)
                    {
                        // invertible=true keeps Before/After/OriginalText annotations for exact reconstruction.
                        tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "invertible=true");
                    }
                    else
                    {
                        if (whitespaceTokenization)
                        {
                            // Leave tf null (whitespace tokenization) and extend the default
                            // sentence delimiters with the whitespace lexer's newline token.
                            IList <string> whitespaceDelims = new List <string>(Arrays.AsList(DocumentPreprocessor.DefaultSentenceDelims));
                            whitespaceDelims.Add(WhitespaceLexer.Newline);
                            sentenceDelims = Sharpen.Collections.ToArray(whitespaceDelims, new string[whitespaceDelims.Count]);
                        }
                        else
                        {
                            // Default: standard PTB tokenizer with no extra options.
                            tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
                        }
                    }
                }
            }
            // Positional arguments (input files) — presumably stored under the empty-string
            // key by ArgsToProperties; TODO(review): confirm against StringUtils.
            string fileList = options.GetProperty(string.Empty, null);

            // With no files, new string[1] yields a single null entry, which routes to stdin below.
            string[]    files    = fileList == null ? new string[1] : fileList.Split("\\s+");
            int         numSents = 0;
            PrintWriter pw       = new PrintWriter(new OutputStreamWriter(System.Console.Out, encoding), true);

            foreach (string file in files)
            {
                DocumentPreprocessor docPreprocessor;
                if (file == null || file.IsEmpty())
                {
                    // Read from standard input (Runtime.@in is the Sharpen analogue of System.in).
                    docPreprocessor = new DocumentPreprocessor(new InputStreamReader(Runtime.@in, encoding));
                }
                else
                {
                    docPreprocessor = new DocumentPreprocessor(file, docType, encoding);
                }
                if (docType == DocumentPreprocessor.DocType.Xml)
                {
                    docPreprocessor.SetElementDelimiter(xmlElementDelimiter);
                }
                docPreprocessor.SetTokenizerFactory(tf);
                if (sentenceDelimiter != null)
                {
                    docPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
                }
                if (tagDelimiter != null)
                {
                    docPreprocessor.SetTagDelimiter(tagDelimiter);
                }
                if (sentenceDelims != null)
                {
                    docPreprocessor.SetSentenceFinalPuncWords(sentenceDelims);
                }
                foreach (IList <IHasWord> sentence in docPreprocessor)
                {
                    numSents++;
                    if (printSentenceLengths)
                    {
                        System.Console.Error.Printf("Length: %d%n", sentence.Count);
                    }
                    bool printSpace = false;
                    foreach (IHasWord word in sentence)
                    {
                        if (printOriginalText)
                        {
                            // Reconstruct the original text exactly from the invertible annotations.
                            CoreLabel cl = (CoreLabel)word;
                            if (!printSpace)
                            {
                                pw.Print(cl.Get(typeof(CoreAnnotations.BeforeAnnotation)));
                                printSpace = true;
                            }
                            pw.Print(cl.Get(typeof(CoreAnnotations.OriginalTextAnnotation)));
                            pw.Print(cl.Get(typeof(CoreAnnotations.AfterAnnotation)));
                        }
                        else
                        {
                            // Plain output: space-separated tokens.
                            if (printSpace)
                            {
                                pw.Print(" ");
                            }
                            printSpace = true;
                            pw.Print(word.Word());
                        }
                    }
                    pw.Println();
                }
            }
            // NOTE(review): pw is not closed in a finally block, so an exception above leaks the writer.
            pw.Close();
            System.Console.Error.Printf("Read in %d sentences.%n", numSents);
        }