Example #1
            public PlainTextIterator(DocumentPreprocessor _enclosing)
            {
                this._enclosing = _enclosing;
                // Establish how to find sentence boundaries
                bool eolIsSignificant = false;

                this.sentDelims = Generics.NewHashSet();
                if (this._enclosing.sentenceDelimiter == null)
                {
                    if (this._enclosing.sentenceFinalPuncWords != null)
                    {
                        Sharpen.Collections.AddAll(this.sentDelims, Arrays.AsList(this._enclosing.sentenceFinalPuncWords));
                    }
                    this.delimFollowers = Generics.NewHashSet(Arrays.AsList(this._enclosing.sentenceFinalFollowers));
                }
                else
                {
                    this.sentDelims.Add(this._enclosing.sentenceDelimiter);
                    this.delimFollowers = Generics.NewHashSet();
                    eolIsSignificant    = DocumentPreprocessor.wsPattern.Matcher(this._enclosing.sentenceDelimiter).Matches();
                    if (eolIsSignificant)
                    {
                        // For Stanford English Tokenizer
                        this.sentDelims.Add(PTBTokenizer.GetNewlineToken());
                    }
                }
                // Setup the tokenizer
                if (this._enclosing.tokenizerFactory == null)
                {
                    eolIsSignificant = this.sentDelims.Contains(WhitespaceLexer.Newline);
                    this.tokenizer   = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(this._enclosing.inputReader, eolIsSignificant);
                }
                else
                {
                    if (eolIsSignificant)
                    {
                        this.tokenizer = this._enclosing.tokenizerFactory.GetTokenizer(this._enclosing.inputReader, "tokenizeNLs");
                    }
                    else
                    {
                        this.tokenizer = this._enclosing.tokenizerFactory.GetTokenizer(this._enclosing.inputReader);
                    }
                }
                // If tokens are tagged, then we must split them
                // Note that if the token contains two or more instances of the delimiter, then the last
                // instance is regarded as the split point.
                if (this._enclosing.tagDelimiter == null)
                {
                    this.splitTag = null;
                }
                else
                {
                    // Sharpen-generated anonymous IFunction: splits a "word<tagDelimiter>tag"
                    // token on the last occurrence of tagDelimiter (see the sketch below).
                    this.splitTag = new _IFunction_281(this);
                }
            }
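
The _IFunction_281 body is generated elsewhere by the Sharpen converter, but the comment above fixes its contract: a tagged token such as "1/2/CD" is split on the last occurrence of the tag delimiter, so the word is "1/2" and the tag is "CD". Below is a minimal, standalone sketch of that behavior; the helper name SplitOnLastDelimiter is made up for illustration and is not part of the library.

static class TagSplitSketch
{
    // Hypothetical helper, not part of Stanford.NLP: split a tagged token on the
    // LAST occurrence of the delimiter, mirroring the splitTag contract above.
    private static string[] SplitOnLastDelimiter(string token, string delimiter)
    {
        int idx = token.LastIndexOf(delimiter, System.StringComparison.Ordinal);
        if (idx < 0)
        {
            return new[] { token };  // untagged token: leave it unchanged
        }
        return new[] { token.Substring(0, idx), token.Substring(idx + delimiter.Length) };
    }

    public static void Main()
    {
        // "1/2/CD" contains the delimiter twice; only the last one is the split point.
        string[] parts = SplitOnLastDelimiter("1/2/CD", "/");
        System.Console.Out.WriteLine(parts[0]);  // prints: 1/2
        System.Console.Out.WriteLine(parts[1]);  // prints: CD
    }
}
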
 /// <summary>Test program for demonstrating the Stemmer.</summary>
 /// <remarks>
 /// Test program for demonstrating the Stemmer.  It reads text from
 /// a list of files, stems each word, and writes the result to standard
 /// output. Note that each word to be stemmed is expected to be in lower case:
 /// forcing lower case must be done outside the Stemmer class.
 /// Usage: Stemmer file-name file-name ...
 /// </remarks>
 /// <exception cref="System.IO.IOException"/>
 public static void Main(string[] args)
 {
     Edu.Stanford.Nlp.Process.Stemmer s = new Edu.Stanford.Nlp.Process.Stemmer();
     if (args[0].Equals("-file"))
     {
         IEnumerator <Word> it = PTBTokenizer.NewPTBTokenizer(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
         while (it.MoveNext())
         {
             Word token = it.Current;
             System.Console.Out.Write(s.Stem(token.Word()));
             System.Console.Out.Write(' ');
         }
     }
     else
     {
         foreach (string arg in args)
         {
             System.Console.Out.Write(s.Stem(arg));
             System.Console.Out.Write(' ');
         }
     }
     System.Console.Out.WriteLine();
 }
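
Besides the command-line usage shown above, the stemmer can also be called directly. A minimal sketch, assuming the Edu.Stanford.Nlp.Process namespace used above; the sample words and expected output are illustrative only.

using Edu.Stanford.Nlp.Process;

static class StemmerSketch
{
    public static void Main()
    {
        // As the remarks note, input is expected to already be lower case.
        Stemmer stemmer = new Stemmer();
        foreach (string word in new[] { "running", "flies", "easily" })
        {
            System.Console.Out.Write(stemmer.Stem(word));
            System.Console.Out.Write(' ');
        }
        System.Console.Out.WriteLine();
        // Porter-style stems, roughly: "run fli easili"
    }
}
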
Example #3
        /// <summary>A simple, deterministic sentence-splitter.</summary>
        /// <remarks>
        /// A simple, deterministic sentence-splitter. This method only supports the English
        /// tokenizer, so for other languages you should run the tokenizer first and then
        /// run this sentence splitter with the "-whitespaceTokenization" option.
        /// </remarks>
        /// <param name="args">Command-line arguments</param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());

            if (options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }
            // Command-line flags
            string encoding             = options.GetProperty("encoding", "utf-8");
            bool   printSentenceLengths = PropertiesUtils.GetBool(options, "printSentenceLengths", false);
            string xmlElementDelimiter  = options.GetProperty("xml", null);

            DocumentPreprocessor.DocType docType = xmlElementDelimiter == null ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;
            string sentenceDelimiter             = options.Contains("noTokenization") ? Runtime.GetProperty("line.separator") : null;
            string sDelim = options.GetProperty("sentenceDelimiter");

            if (sDelim != null)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(sDelim, "newline"))
                {
                    sentenceDelimiter = "\n";
                }
                else
                {
                    sentenceDelimiter = sDelim;
                }
            }
            string tagDelimiter = options.GetProperty("tag", null);

            string[] sentenceDelims = null;
            // Setup the TokenizerFactory
            int  numFactoryFlags  = 0;
            bool suppressEscaping = options.Contains("suppressEscaping");

            if (suppressEscaping)
            {
                numFactoryFlags += 1;
            }
            bool customTokenizer = options.Contains("tokenizerOptions");

            if (customTokenizer)
            {
                numFactoryFlags += 1;
            }
            bool printOriginalText = options.Contains("printOriginalText");

            if (printOriginalText)
            {
                numFactoryFlags += 1;
            }
            bool whitespaceTokenization = options.Contains("whitespaceTokenization");

            if (whitespaceTokenization)
            {
                numFactoryFlags += 1;
            }
            if (numFactoryFlags > 1)
            {
                log.Info("Only one tokenizer flag allowed at a time: ");
                log.Info("  -suppressEscaping, -tokenizerOptions, -printOriginalText, -whitespaceTokenization");
                return;
            }
            ITokenizerFactory <IHasWord> tf = null;

            if (suppressEscaping)
            {
                tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "ptb3Escaping=false");
            }
            else if (customTokenizer)
            {
                tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), options.GetProperty("tokenizerOptions"));
            }
            else if (printOriginalText)
            {
                tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "invertible=true");
            }
            else if (whitespaceTokenization)
            {
                // tf stays null here: DocumentPreprocessor then falls back to its
                // whitespace tokenizer (see the PlainTextIterator constructor above).
                IList <string> whitespaceDelims = new List <string>(Arrays.AsList(DocumentPreprocessor.DefaultSentenceDelims));
                whitespaceDelims.Add(WhitespaceLexer.Newline);
                sentenceDelims = Sharpen.Collections.ToArray(whitespaceDelims, new string[whitespaceDelims.Count]);
            }
            else
            {
                tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
            }
            string fileList = options.GetProperty(string.Empty, null);

            string[]    files    = fileList == null ? new string[1] : fileList.Split("\\s+");
            int         numSents = 0;
            PrintWriter pw       = new PrintWriter(new OutputStreamWriter(System.Console.Out, encoding), true);

            foreach (string file in files)
            {
                DocumentPreprocessor docPreprocessor;
                if (file == null || file.IsEmpty())
                {
                    docPreprocessor = new DocumentPreprocessor(new InputStreamReader(Runtime.@in, encoding));
                }
                else
                {
                    docPreprocessor = new DocumentPreprocessor(file, docType, encoding);
                }
                if (docType == DocumentPreprocessor.DocType.Xml)
                {
                    docPreprocessor.SetElementDelimiter(xmlElementDelimiter);
                }
                docPreprocessor.SetTokenizerFactory(tf);
                if (sentenceDelimiter != null)
                {
                    docPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
                }
                if (tagDelimiter != null)
                {
                    docPreprocessor.SetTagDelimiter(tagDelimiter);
                }
                if (sentenceDelims != null)
                {
                    docPreprocessor.SetSentenceFinalPuncWords(sentenceDelims);
                }
                foreach (IList <IHasWord> sentence in docPreprocessor)
                {
                    numSents++;
                    if (printSentenceLengths)
                    {
                        System.Console.Error.Printf("Length: %d%n", sentence.Count);
                    }
                    bool printSpace = false;
                    foreach (IHasWord word in sentence)
                    {
                        if (printOriginalText)
                        {
                            CoreLabel cl = (CoreLabel)word;
                            if (!printSpace)
                            {
                                pw.Print(cl.Get(typeof(CoreAnnotations.BeforeAnnotation)));
                                printSpace = true;
                            }
                            pw.Print(cl.Get(typeof(CoreAnnotations.OriginalTextAnnotation)));
                            pw.Print(cl.Get(typeof(CoreAnnotations.AfterAnnotation)));
                        }
                        else
                        {
                            if (printSpace)
                            {
                                pw.Print(" ");
                            }
                            printSpace = true;
                            pw.Print(word.Word());
                        }
                    }
                    pw.Println();
                }
            }
            pw.Close();
            System.Console.Error.Printf("Read in %d sentences.%n", numSents);
        }
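
The same pipeline can be driven programmatically with the calls already used above. A minimal sketch follows; the file name input.txt is a placeholder, and the namespaces Edu.Stanford.Nlp.Ling and Edu.Stanford.Nlp.Process are assumed from the .NET port.

using System.Collections.Generic;
using Edu.Stanford.Nlp.Ling;
using Edu.Stanford.Nlp.Process;

static class SentenceSplitSketch
{
    public static void Main()
    {
        // Plain-text input, utf-8, default PTB tokenization, mirroring the defaults above.
        DocumentPreprocessor dp = new DocumentPreprocessor("input.txt", DocumentPreprocessor.DocType.Plain, "utf-8");
        dp.SetTokenizerFactory(PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty));
        int numSents = 0;
        foreach (IList<IHasWord> sentence in dp)
        {
            numSents++;
            foreach (IHasWord word in sentence)
            {
                System.Console.Out.Write(word.Word());
                System.Console.Out.Write(' ');
            }
            System.Console.Out.WriteLine();
        }
        System.Console.Error.WriteLine("Read in " + numSents + " sentences.");
    }
}
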
Example #4
            private void PrimeNext()
            {
                if (this._enclosing.inputReader == null)
                {
                    // we've already been out of stuff and have closed the input reader; so just return
                    return;
                }
                this.nextSent = Generics.NewArrayList(this.nextSentCarryover);
                this.nextSentCarryover.Clear();
                bool seenBoundary = false;

                if (!this.tokenizer.MoveNext())
                {
                    IOUtils.CloseIgnoringExceptions(this._enclosing.inputReader);
                    this._enclosing.inputReader = null;
                    // nextSent = null; // WRONG: There may be something in it from the nextSentCarryover
                    if (this.nextSent.IsEmpty())
                    {
                        this.nextSent = null;
                    }
                    return;
                }
                do
                {
                    IHasWord token = this.tokenizer.Current;
                    if (this.splitTag != null)
                    {
                        string[] toks = this.splitTag.Apply(token.Word());
                        token.SetWord(toks[0]);
                        if (token is ILabel)
                        {
                            ((ILabel)token).SetValue(toks[0]);
                        }
                        if (toks.Length == 2 && token is IHasTag)
                        {
                            //wsg2011: Some of the underlying tokenizers return old
                            //JavaNLP labels.  We could convert to CoreLabel here, but
                            //we choose a conservative implementation....
                            ((IHasTag)token).SetTag(toks[1]);
                        }
                    }
                    if (this.sentDelims.Contains(token.Word()))
                    {
                        seenBoundary = true;
                    }
                    else
                    {
                        if (seenBoundary && !this.delimFollowers.Contains(token.Word()))
                        {
                            this.nextSentCarryover.Add(token);
                            break;
                        }
                    }
                    if (!(DocumentPreprocessor.wsPattern.Matcher(token.Word()).Matches() || token.Word().Equals(PTBTokenizer.GetNewlineToken())))
                    {
                        this.nextSent.Add(token);
                    }
                    // If there are no words that can follow a sentence delimiter,
                    // then there are two cases.  In one case, we already have a
                    // sentence, in which case there is no reason to look at the
                    // next token, since that just causes buffering without any
                    // chance of the current sentence being extended, since
                    // delimFollowers = {}.  In the other case, we have an empty
                    // sentence, which at this point means the sentence delimiter
                    // was a whitespace token such as \n.  We might as well keep
                    // going as if we had never seen anything.
                    if (seenBoundary && this.delimFollowers.IsEmpty())
                    {
                        if (!this.nextSent.IsEmpty() || this._enclosing.keepEmptySentences)
                        {
                            break;
                        }
                        else
                        {
                            seenBoundary = false;
                        }
                    }
                }while (this.tokenizer.MoveNext());
                if (this.nextSent.IsEmpty() && this.nextSentCarryover.IsEmpty() && !this._enclosing.keepEmptySentences)
                {
                    IOUtils.CloseIgnoringExceptions(this._enclosing.inputReader);
                    this._enclosing.inputReader = null;
                    this.nextSent = null;
                }
                else
                {
                    if (this._enclosing.escaper != null)
                    {
                        this.nextSent = this._enclosing.escaper.Apply(this.nextSent);
                    }
                }
            }
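
To make the boundary bookkeeping above concrete, here is a tiny self-contained sketch, independent of the library, of how sentence-final delimiters and the "followers" that may trail them (closing quotes, parentheses) group a token stream. The token strings and delimiter sets are made up for illustration.

using System;
using System.Collections.Generic;

static class BoundarySketch
{
    public static void Main()
    {
        // Simplified restatement of the loop above: '.', '!' and '?' end a sentence,
        // while a following closing quote or parenthesis still belongs to it.
        var sentDelims = new HashSet<string> { ".", "!", "?" };
        var delimFollowers = new HashSet<string> { ")", "''" };

        var tokens = new[] { "He", "said", "``", "go", ".", "''", "Then", "he", "left", "." };
        var sentence = new List<string>();
        bool seenBoundary = false;

        foreach (string tok in tokens)
        {
            if (sentDelims.Contains(tok))
            {
                seenBoundary = true;
            }
            else if (seenBoundary && !delimFollowers.Contains(tok))
            {
                // This token starts the next sentence: emit the current one and carry the token over.
                Console.WriteLine(string.Join(" ", sentence));
                sentence.Clear();
                seenBoundary = false;
            }
            sentence.Add(tok);
        }
        if (sentence.Count > 0)
        {
            Console.WriteLine(string.Join(" ", sentence));
        }
        // Output:
        //   He said `` go . ''
        //   Then he left .
    }
}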