Example #1
0
        // Tokenizes the input from r with a PTBTokenizer and writes the token text to writer.
        //
        //   r                  - source of the text to tokenize
        //   writer             - destination for the token text
        //   parseInsidePattern - optional; when null every token is written. When non-null,
        //                        writing starts switched off, and each token that fully matches
        //                        the pattern is itself not written but toggles writing:
        //                        on if capture group 1 is empty, off otherwise.
        //   options            - passed through unchanged to the PTBTokenizer constructor
        //   preserveLines      - true: tokens are space-separated and a line break is written
        //                        only for PTBLexer.NEWLINE_TOKEN; false: one token per line
        //   dump               - true: write obj.toString() for each token instead of its text
        //   lowerCase          - true: lower-case the token text (Locale.ENGLISH) and store the
        //                        lower-cased text back into the token's TextAnnotation
        //
        // Returns the number of tokens read from the tokenizer. Every token is counted,
        // including tokens that matched the pattern and tokens seen while writing was off.
        private static int tokReader(TextReader r, TextWriter writer, Regex parseInsidePattern, String options, bool preserveLines, bool dump, bool lowerCase) /*throws IOException*/
        {
            // A single matcher is built up front and re-targeted per token (performance hack).
            Matcher tagMatcher = null;
            if (parseInsidePattern != null)
            {
                tagMatcher = parseInsidePattern.matcher("");
            }

            bool emitting    = (tagMatcher == null); // no pattern => write from the very first token
            bool atLineStart = true;
            int  tokenCount  = 0;

            PTBTokenizer <CoreLabel> tokens = new PTBTokenizer <CoreLabel>(r, new CoreLabelTokenFactory(), options);
            while (tokens.hasNext())
            {
                CoreLabel label = tokens.next();
                tokenCount++;

                String rawText = (String)label.get(typeof(CoreAnnotations.TextAnnotation));
                String outText = rawText;
                if (lowerCase)
                {
                    outText = rawText.ToLower(Locale.ENGLISH);
                    label.set(typeof(CoreAnnotations.TextAnnotation), outText);
                }

                // Pattern matching is always done against the original (non-lower-cased) text.
                if (tagMatcher != null && tagMatcher.reset(rawText).matches())
                {
                    // Empty group 1 switches writing on; a non-empty group 1 switches it off.
                    emitting = tagMatcher.group(1).isEmpty();
                    continue;
                }
                if (!emitting)
                {
                    continue;
                }

                if (dump)
                {
                    // Only after the tag check is the output replaced by the exhaustive form.
                    outText = label.toString();
                }

                if (!preserveLines)
                {
                    // One token per output line.
                    writer.Write(outText);
                    writer.newLine();
                    continue;
                }

                if (PTBLexer.NEWLINE_TOKEN.equals(rawText))
                {
                    // The newline token itself is not written; it just ends the current line.
                    atLineStart = true;
                    writer.newLine();
                    continue;
                }

                // Separate tokens on the same line with a single space, none before the first.
                if (atLineStart)
                {
                    atLineStart = false;
                }
                else
                {
                    writer.Write(' ');
                }
                writer.Write(outText);
            }
            return(tokenCount);
        }