Example #1
0
            /// <summary>
            /// Builds an iterator over plain-text sentences, configuring the sentence
            /// delimiters and the underlying tokenizer from the enclosing
            /// DocumentPreprocessor's current settings.
            /// </summary>
            /// <param name="_enclosing">The DocumentPreprocessor whose configuration
            /// (sentence delimiter, punctuation word lists, tokenizer factory, input
            /// reader, tag delimiter) drives this iterator.</param>
            public PlainTextIterator(DocumentPreprocessor _enclosing)
            {
                this._enclosing = _enclosing;
                // = null;
                // Establish how to find sentence boundaries
                bool eolIsSignificant = false;

                this.sentDelims = Generics.NewHashSet();
                if (this._enclosing.sentenceDelimiter == null)
                {
                    // No explicit delimiter: fall back to the configured sentence-final
                    // punctuation words (if any) plus their allowed follower tokens.
                    if (this._enclosing.sentenceFinalPuncWords != null)
                    {
                        Sharpen.Collections.AddAll(this.sentDelims, Arrays.AsList(this._enclosing.sentenceFinalPuncWords));
                    }
                    this.delimFollowers = Generics.NewHashSet(Arrays.AsList(this._enclosing.sentenceFinalFollowers));
                }
                else
                {
                    // A single explicit delimiter was given; it alone ends sentences.
                    this.sentDelims.Add(this._enclosing.sentenceDelimiter);
                    this.delimFollowers = Generics.NewHashSet();
                    // If the delimiter is itself whitespace, newline tokens must
                    // survive tokenization so they can act as boundaries.
                    eolIsSignificant    = DocumentPreprocessor.wsPattern.Matcher(this._enclosing.sentenceDelimiter).Matches();
                    if (eolIsSignificant)
                    {
                        // For Stanford English Tokenizer
                        this.sentDelims.Add(PTBTokenizer.GetNewlineToken());
                    }
                }
                // Setup the tokenizer
                if (this._enclosing.tokenizerFactory == null)
                {
                    // No factory configured: use a plain whitespace tokenizer, keeping
                    // newlines only when they are among the sentence delimiters.
                    eolIsSignificant = this.sentDelims.Contains(WhitespaceLexer.Newline);
                    this.tokenizer   = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(this._enclosing.inputReader, eolIsSignificant);
                }
                else
                {
                    if (eolIsSignificant)
                    {
                        // Ask the factory for a tokenizer that also emits newline tokens.
                        this.tokenizer = this._enclosing.tokenizerFactory.GetTokenizer(this._enclosing.inputReader, "tokenizeNLs");
                    }
                    else
                    {
                        this.tokenizer = this._enclosing.tokenizerFactory.GetTokenizer(this._enclosing.inputReader);
                    }
                }
                // If tokens are tagged, then we must split them
                // Note that if the token contains two or more instances of the delimiter, then the last
                // instance is regarded as the split point.
                if (this._enclosing.tagDelimiter == null)
                {
                    this.splitTag = null;
                }
                else
                {
                    this.splitTag = new _IFunction_281(this);
                }
            }
Example #2
0
            /// <summary>
            /// Builds a word-segmenting tokenizer over the given reader.
            /// </summary>
            /// <param name="r">Source of characters to tokenize.</param>
            /// <param name="extraOptions">Optional option string; a "tokenizeNLs"
            /// property here overrides the factory-level setting.</param>
            /// <returns>A WordSegmentingTokenizer wrapping a whitespace tokenizer.</returns>
            public virtual ITokenizer <IHasWord> GetTokenizer(Reader r, string extraOptions)
            {
                // Start from the factory default; let extraOptions override it.
                bool keepNewlines = (extraOptions == null)
                    ? this.tokenizeNLs
                    : PropertiesUtils.GetBool(StringUtils.StringToProperties(extraOptions), "tokenizeNLs", this.tokenizeNLs);
                var wordTokenizer = WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(r, keepNewlines);
                return new WordSegmentingTokenizer(segmenter, wordTokenizer);
            }
Example #3
0
        /// <summary>Reads a file from the argument and prints its tokens one per line.</summary>
        /// <remarks>
        /// Reads a file from the argument and prints its tokens one per line.
        /// This is mainly as a testing aid, but it can also be quite useful
        /// standalone to turn a corpus into a one token per line file of tokens.
        /// Usage:
        /// <c>java edu.stanford.nlp.process.WhitespaceTokenizer filename</c>
        /// </remarks>
        /// <param name="args">Command line arguments</param>
        /// <exception cref="System.IO.IOException">If can't open files, etc.</exception>
        public static void Main(string[] args)
        {
            // A leading "-cr" flag means newline tokens are kept and echoed specially.
            bool keepNewlines = args.Length > 0 && args[0].Equals("-cr");
            // Read the last argument as a UTF-8 file, unless it is the "-cr" flag
            // (or there are no arguments), in which case read standard input.
            Reader reader;
            if (args.Length > 0 && !args[args.Length - 1].Equals("-cr"))
            {
                reader = new InputStreamReader(new FileInputStream(args[args.Length - 1]), "UTF-8");
            }
            else
            {
                reader = new InputStreamReader(Runtime.@in, "UTF-8");
            }
            WhitespaceTokenizer <Word> tokenizer = new WhitespaceTokenizer <Word>(new WordTokenFactory(), reader, keepNewlines);
            PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, "UTF-8"), true);
            while (tokenizer.MoveNext())
            {
                Word token = tokenizer.Current;
                if (token.Value().Equals(WhitespaceLexer.Newline))
                {
                    // Mark retained newline tokens visibly in the output.
                    pw.Println("***CR***");
                }
                else
                {
                    pw.Println(token);
                }
            }
        }
 /// <summary>
 /// Checks whitespace tokenization with newline tokens both dropped and kept.
 /// </summary>
 public virtual void TestWordTokenizer()
 {
     bool keepEol = false;
     RunTest(WhitespaceTokenizer.Factory(keepEol), Test, ResultsNoEol);
     keepEol = true;
     RunTest(WhitespaceTokenizer.Factory(keepEol), Test, ResultsEol);
 }
Example #5
0
 /// <summary>
 /// Convenience constructor: wraps the reader in a CoreLabel whitespace
 /// tokenizer (with default newline handling) and delegates to the main
 /// constructor.
 /// </summary>
 /// <param name="segmenter">The word segmenter applied to each token.</param>
 /// <param name="r">Source of characters to tokenize.</param>
 public WordSegmentingTokenizer(IWordSegmenter segmenter, Reader r)
     : this(segmenter, WhitespaceTokenizer.NewCoreLabelWhitespaceTokenizer(r))
 {
 }
Example #6
0
        /// <summary>
        /// usage: java ChineseDocumentToSentenceProcessor [-segmentIBM]
        /// -file filename [-encoding encoding]
        /// <p>
        /// The -segmentIBM option is for IBM GALE-specific splitting of an
        /// XML element into sentences.
        /// </summary>
        /// <param name="args">Command-line flags; recognized properties are
        /// -file (required), -encoding (currently ignored with a warning),
        /// -segmentIBM, -parseInside, and -alwaysAddS.</param>
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            //String encoding = "GB18030";
            Properties props = StringUtils.ArgsToProperties(args);
            // log.info("Here are the properties:");
            // props.list(System.err);
            // When set, emit <s id="..."> wrappers even for single-sentence segments.
            bool alwaysAddS = props.Contains("alwaysAddS");

            Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor cp;
            if (!props.Contains("file"))
            {
                log.Info("usage: java ChineseDocumentToSentenceProcessor [-segmentIBM] -file filename [-encoding encoding]");
                return;
            }
            cp = new Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor();
            if (props.Contains("encoding"))
            {
                // The processor's default encoding cannot currently be overridden.
                log.Info("WARNING: for now the default encoding is " + cp.encoding + ". It's not changeable for now");
            }
            string input = IOUtils.SlurpFileNoExceptions(props.GetProperty("file"), cp.encoding);

            // String input = StringUtils.slurpGBURLNoExceptions(new URL(props.getProperty("file")));
            if (props.Contains("segmentIBM"))
            {
                // IBM GALE mode: stream whitespace tokens (newlines kept as tokens)
                // through a small state machine that separates SGML/XML markup from
                // text content, sentence-splitting the text between tags.
                ITokenizer <Word> tok         = WhitespaceTokenizer.NewWordWhitespaceTokenizer(new StringReader(input), true);
                string            parseInside = props.GetProperty("parseInside");
                if (parseInside == null)
                {
                    parseInside = string.Empty;
                }
                // p1: token that is a complete tag; p2: start of an opening tag
                // (optionally preceded by a BOM); p3: token that closes a tag;
                // p4: opening tag of an element named in -parseInside.
                Pattern       p1;
                Pattern       p2;
                Pattern       p3;
                Pattern       p4;
                PrintWriter   pw       = new PrintWriter(new OutputStreamWriter(System.Console.Out, cp.encoding), true);
                StringBuilder buff     = new StringBuilder();
                StringBuilder sgmlbuff = new StringBuilder();
                string        lastSgml = string.Empty;
                p1 = Pattern.Compile("<.*>");
                p2 = Pattern.Compile("\uFEFF?<[\\p{Alpha}]+");
                p3 = Pattern.Compile("[A-Za-z0-9=\"]+>");
                p4 = Pattern.Compile("<(?:" + parseInside + ")[ >]");
                bool inSGML     = false;
                int  splitItems = 0;
                int  numAdded   = 0;
                while (tok.MoveNext())
                {
                    string s = tok.Current.Word();
                    // pw.println("The token is |" + s + "|");
                    if (p2.Matcher(s).Matches())
                    {
                        // Token begins a tag: start accumulating markup tokens.
                        inSGML = true;
                        sgmlbuff.Append(s).Append(" ");
                    }
                    else
                    {
                        if (p1.Matcher(s).Matches() || inSGML && p3.Matcher(s).Matches() || "\n".Equals(s))
                        {
                            // Token completes a tag (or is a newline): first flush any
                            // buffered text content, then emit the finished markup.
                            inSGML = false;
                            if (buff.ToString().Trim().Length > 0)
                            {
                                // pw.println("Dumping sentences");
                                // pw.println("Buff is " + buff);
                                // Only sentence-split text that occurred inside a
                                // -parseInside element (or everywhere if unrestricted).
                                bool processIt = false;
                                if (parseInside.Equals(string.Empty))
                                {
                                    processIt = true;
                                }
                                else
                                {
                                    if (p4.Matcher(lastSgml).Find())
                                    {
                                        processIt = true;
                                    }
                                }
                                if (processIt)
                                {
                                    IList <string> sents = Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor.FromPlainText(buff.ToString(), true);
                                    // pw.println("Sents is " + sents);
                                    // pw.println();
                                    if (alwaysAddS || sents.Count > 1)
                                    {
                                        // Wrap each sentence in numbered <s> tags and keep
                                        // statistics on how many segments were split.
                                        int i = 1;
                                        foreach (string str in sents)
                                        {
                                            pw.Print("<s id=\"" + i + "\">");
                                            pw.Print(str);
                                            pw.Println("</s>");
                                            i++;
                                        }
                                        if (sents.Count > 1)
                                        {
                                            splitItems++;
                                            numAdded += sents.Count - 1;
                                        }
                                    }
                                    else
                                    {
                                        // Single sentence and no -alwaysAddS: emit as-is.
                                        if (sents.Count == 1)
                                        {
                                            pw.Print(sents[0]);
                                        }
                                    }
                                }
                                else
                                {
                                    // Outside a -parseInside element: pass text through unchanged.
                                    pw.Print(buff);
                                }
                                buff = new StringBuilder();
                            }
                            sgmlbuff.Append(s);
                            // pw.println("sgmlbuff is " + sgmlbuff);
                            pw.Print(sgmlbuff);
                            // Remember the last emitted markup so p4 can test it later.
                            lastSgml = sgmlbuff.ToString();
                            sgmlbuff = new StringBuilder();
                        }
                        else
                        {
                            if (inSGML)
                            {
                                sgmlbuff.Append(s).Append(" ");
                            }
                            else
                            {
                                buff.Append(s).Append(" ");
                            }
                        }
                    }
                }
                // pw.println("Buff is now |" + buff + "|");
                // end while (tok.hasNext()) {
                // empty remaining buffers
                pw.Flush();
                pw.Close();
                log.Info("Split " + splitItems + " segments, adding " + numAdded + " sentences.");
            }
            else
            {
                // Default mode: treat the input as HTML and print one sentence per
                // line (to standard error).
                IList <string> sent = Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor.FromHTML(input);
                PrintWriter    pw   = new PrintWriter(new OutputStreamWriter(System.Console.Error, cp.encoding), true);
                foreach (string a in sent)
                {
                    pw.Println(a);
                }
            }
        }