/// <summary>
/// Builds a <c>CoreLabel</c> for one token, recording its original text, its
/// (possibly rewritten) word/value, and its character offsets.
/// </summary>
/// <param name="tokenText">Raw text of the token; always stored as the original text.</param>
/// <param name="doNormalization">
/// When true (and the <c>normalizeSpace</c> field is also true), ordinary spaces in a
/// non-separator token are replaced by non-breaking spaces (U+00A0).
/// </param>
/// <param name="charOffsetBegin">Value stored under <c>CharacterOffsetBeginAnnotation</c>.</param>
/// <param name="charOffsetEnd">Value stored under <c>CharacterOffsetEndAnnotation</c>.</param>
/// <returns>The newly created token.</returns>
private CoreLabel MakeXmlToken(string tokenText, bool doNormalization, int charOffsetBegin, int charOffsetEnd)
        {
            CoreLabel result = new CoreLabel();
            result.SetOriginalText(tokenText);

            // Decide which surface form is stored as the word/value of the token.
            string surface = tokenText;
            if (separatorPattern.Matcher(tokenText).Matches())
            {
                // Text that is entirely a separator is mapped to the CoreNLP newline token.
                surface = AbstractTokenizer.NewlineToken;
            }
            else if (doNormalization && normalizeSpace)
            {
                // Change space to non-breaking space.
                surface = tokenText.Replace(' ', '\u00A0');
            }

            result.SetWord(surface);
            result.SetValue(surface);
            result.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charOffsetBegin);
            result.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charOffsetEnd);

            if (Verbose)
            {
                log.Info("Adding token " + result.ToShorterString());
            }
            return result;
        }
        /// <summary>
        /// Tokenizes everything available from <paramref name="r"/> with a <c>PTBTokenizer</c>
        /// and writes the selected tokens to <paramref name="writer"/>.
        /// </summary>
        /// <param name="r">Source of the text to tokenize.</param>
        /// <param name="writer">Destination of the token output.</param>
        /// <param name="parseInsidePattern">
        /// If null, every token is a candidate for output. Otherwise output starts switched off and
        /// each token fully matching this pattern toggles it: on when group 1 of the match is empty,
        /// off when it is not. Matching tokens are never written themselves.
        /// </param>
        /// <param name="filterPattern">If non-null, tokens whose original text fully matches it are not written.</param>
        /// <param name="options">Option string handed to the tokenizer constructor.</param>
        /// <param name="preserveLines">
        /// If true, tokens are written space-separated and a token equal to <c>NewlineToken</c>
        /// produces a line break instead of being written.
        /// </param>
        /// <param name="oneLinePerElement">
        /// If true, a line break is written whenever <paramref name="parseInsidePattern"/> switches
        /// output off; when <paramref name="preserveLines"/> is false it also selects space-separated
        /// output instead of one token per line.
        /// </param>
        /// <param name="dump">If true, the token's <c>ToShorterString()</c> form is written instead of its text.</param>
        /// <param name="lowerCase">If true, the token text is lower-cased (English locale) and stored back on the token.</param>
        /// <returns>The number of tokens produced by the tokenizer, whether or not they were written.</returns>
        /// <exception cref="System.IO.IOException"/>
        private static int TokReader(Reader r, BufferedWriter writer, Pattern parseInsidePattern, Pattern filterPattern, string options, bool preserveLines, bool oneLinePerElement, bool dump, bool lowerCase)
        {
            int  tokenCount  = 0;
            bool atLineStart = true;
            // Start off emitting, unless we first have to find a start element.
            bool emitting = (parseInsidePattern == null);
            // The matcher is created once and reset per token as a performance hack.
            Matcher elementMatcher = (parseInsidePattern != null) ? parseInsidePattern.Matcher(string.Empty) : null;

            Edu.Stanford.Nlp.Process.PTBTokenizer <CoreLabel> tokenizer = new Edu.Stanford.Nlp.Process.PTBTokenizer <CoreLabel>(r, new CoreLabelTokenFactory(), options);

            while (tokenizer.MoveNext())
            {
                CoreLabel label = tokenizer.Current;
                // Every token counts, including the ones that end up not being written.
                tokenCount++;

                string rawText = label.Get(typeof(CoreAnnotations.TextAnnotation));
                string outText = rawText;
                if (lowerCase)
                {
                    outText = rawText.ToLower(Locale.English);
                    label.Set(typeof(CoreAnnotations.TextAnnotation), outText);
                }

                // Element tags only toggle the emitting state; they are never written.
                if (elementMatcher != null && elementMatcher.Reset(rawText).Matches())
                {
                    // Emit after a tag without an end-element slash, stop after one with it.
                    emitting = elementMatcher.Group(1).IsEmpty();
                    if (!emitting)
                    {
                        // Reached only when a stop tag was matched.
                        atLineStart = true;
                        if (oneLinePerElement)
                        {
                            writer.NewLine();
                        }
                    }
                    continue;
                }

                if (!emitting)
                {
                    continue;
                }

                if (dump)
                {
                    // Tags have been checked by now, so switch to the exhaustive representation.
                    outText = label.ToShorterString();
                }

                if (filterPattern != null && filterPattern.Matcher(rawText).Matches())
                {
                    // Filtered tokens are skipped.
                    continue;
                }

                if (!preserveLines && !oneLinePerElement)
                {
                    // Plain mode: one token per line.
                    writer.Write(outText);
                    writer.NewLine();
                    continue;
                }

                if (preserveLines && NewlineToken.Equals(rawText))
                {
                    // A newline token ends the current output line.
                    atLineStart = true;
                    writer.NewLine();
                    continue;
                }

                // Space-separated mode: no separator in front of the first token of a line.
                if (atLineStart)
                {
                    atLineStart = false;
                }
                else
                {
                    writer.Write(' ');
                }
                writer.Write(outText);
            }
            return tokenCount;
        }
        /// <summary>
        /// Segments the text of <paramref name="annotation"/> into words and stores the resulting
        /// tokens in a new list set under <c>CoreAnnotations.TokensAnnotation</c>.
        /// </summary>
        /// <remarks>
        /// The segmenter is run on the text string, not on the character list. Its words are then
        /// aligned with the characters stored under <c>SegmenterCoreAnnotations.CharactersAnnotation</c>
        /// to obtain begin/end character offsets. Words whose first character has an
        /// <c>XMLCharAnnotation</c> other than "0" are accumulated in a buffer and emitted as a single
        /// XML token once a character annotated "0" or "beginning" is reached, or at the end of input.
        /// </remarks>
        /// <param name="annotation">
        /// Map providing <c>TextAnnotation</c> and <c>CharactersAnnotation</c>; it receives the
        /// <c>TokensAnnotation</c> list.
        /// </param>
        private void RunSegmentation(ICoreMap annotation)
        {
            // NOTE(review): the following scratch illustration is kept from the original author;
            // presumably it sketches how word indices line up with sentChars indices.
            //0 2
            // A BC D E
            // 1 10 1 1
            // 0 12 3 4
            // 0, 0+1 ,
            string text = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
            // the original text String
            IList <CoreLabel> sentChars = annotation.Get(typeof(SegmenterCoreAnnotations.CharactersAnnotation));

            // the way it was divided by splitCharacters
            if (Verbose)
            {
                log.Info("sentChars (length " + sentChars.Count + ") is " + SentenceUtils.ListToString(sentChars, StringUtils.EmptyStringArray));
            }
            IList <CoreLabel> tokens = new List <CoreLabel>();

            annotation.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
            // Run the segmenter! On the whole String. It knows not about the splitting into chars.
            // Can we change this to have it run directly on the already existing list of tokens. That would help, no?
            IList <string> words;

            if (!tokenizeNewline)
            {
                // Newlines are not tokens in this mode: drop them all and segment in one go.
                text  = text.ReplaceAll("[\r\n]", string.Empty);
                words = segmenter.SegmentString(text);
            }
            else
            {
                // remove leading and trailing newlines
                text = text.ReplaceAll("^[\\r\\n]+", string.Empty);
                text = text.ReplaceAll("[\\r\\n]+$", string.Empty);
                // if using the sentence split on two newlines option, replace single newlines
                // single newlines should be ignored for segmenting
                if (sentenceSplitOnTwoNewlines)
                {
                    text = text.ReplaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
                    // do a second pass to handle corner case of consecutive isolated newlines
                    // x \n x \n x
                    text = text.ReplaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
                }
                // Run the segmenter on each line so that we don't get tokens that cross line boundaries
                // Neat trick to keep delimiters from: http://stackoverflow.com/a/2206432
                // The pattern is built by concatenation: the Java-style "%1$s" specifier used before is
                // not understood by .NET string.Format, which left the separator out of the pattern.
                // NOTE(review): assumes Split interprets its argument as a regular expression
                // (Java String.split semantics) - confirm against the runtime shim in use.
                string splitPattern = "((?<=" + separator + ")|(?=" + separator + "))";
                string[] lines = text.Split(splitPattern);
                words = new List <string>();
                foreach (string line in lines)
                {
                    if (separatorPattern.Matcher(line).Matches())
                    {
                        // Don't segment newline tokens, keep them as-is
                        words.Add(line);
                    }
                    else
                    {
                        Sharpen.Collections.AddAll(words, segmenter.SegmentString(line));
                    }
                }
            }
            if (Verbose)
            {
                log.Info(text + "\n--->\n" + words + " (length " + words.Count + ')');
            }
            // Go through everything again and make the final tokens list; for loop is over segmented words
            int pos = 0;
            // This is used to index sentChars, the output from splitCharacters
            StringBuilder xmlBuffer = new StringBuilder();
            // Character offset at which the buffered XML token starts; -1 while nothing is buffered.
            int           xmlBegin  = -1;

            foreach (string w in words)
            {
                CoreLabel fl = sentChars[pos];
                string    xmlCharAnnotation = fl.Get(typeof(SegmenterCoreAnnotations.XMLCharAnnotation));
                if (Verbose)
                {
                    log.Info("Working on word " + w + ", sentChar " + fl.ToShorterString() + " (sentChars index " + pos + ')');
                }
                if ("0".Equals(xmlCharAnnotation) || "beginning".Equals(xmlCharAnnotation))
                {
                    // Beginnings of plain text and other XML tags are good places to end an XML tag
                    if (xmlBuffer.Length > 0)
                    {
                        // Form the XML token; it ends at the character just before the current one
                        string    xmlTag = xmlBuffer.ToString();
                        CoreLabel fl1    = sentChars[pos - 1];
                        int       end    = fl1.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                        tokens.Add(MakeXmlToken(xmlTag, true, xmlBegin, end));
                        // Clean up and prepare for the next XML tag
                        xmlBegin  = -1;
                        xmlBuffer = new StringBuilder();
                    }
                }
                if (!"0".Equals(xmlCharAnnotation))
                {
                    // found an XML character; fl changes inside this loop!
                    while (fl.Get(typeof(SegmenterCoreAnnotations.XMLCharAnnotation)).Equals("whitespace"))
                    {
                        // Print whitespaces into the XML buffer and move on until the next non-whitespace character is found
                        // and we're in sync with segmenter output again
                        xmlBuffer.Append(' ');
                        pos += 1;
                        fl   = sentChars[pos];
                    }
                    xmlBuffer.Append(w);
                    pos = AdvancePos(sentChars, pos, w);
                    if (xmlBegin < 0)
                    {
                        // First word of this XML token: remember where it starts
                        xmlBegin = fl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                    }
                    continue;
                }
                // remember that fl may be more than one char long (non-BMP chars like emoji), so use advancePos()
                fl.Set(typeof(CoreAnnotations.ChineseSegAnnotation), "1");
                if (w.IsEmpty())
                {
                    // [cdm 2016:] surely this shouldn't happen!
                    if (Verbose)
                    {
                        log.Warn("Encountered an empty word. Shouldn't happen?");
                    }
                    continue;
                }
                int begin = fl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                pos = AdvancePos(sentChars, pos, w);
                if (pos - 1 >= sentChars.Count)
                {
                    // The word ran past the end of the character list: report it and emit no token for it
                    log.Error("Error: on word " + w + " at position " + (pos - w.Length) + " trying to get at position " + (pos - 1));
                    log.Error("last element of sentChars is " + sentChars[sentChars.Count - 1]);
                }
                else
                {
                    // The token ends at the last character consumed by this word
                    fl = sentChars[pos - 1];
                    int end = fl.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                    tokens.Add(MakeXmlToken(w, false, begin, end));
                }
            }
            // end for (go through everything again)
            if (xmlBuffer.Length > 0)
            {
                // Form the last XML token, if any
                string    xmlTag = xmlBuffer.ToString();
                CoreLabel fl1    = sentChars[pos - 1];
                int       end    = fl1.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                tokens.Add(MakeXmlToken(xmlTag, true, xmlBegin, end));
            }
            if (Verbose)
            {
                foreach (CoreLabel token in tokens)
                {
                    log.Info(token.ToShorterString());
                }
            }
        }