C# (CSharp) CoreLabel.ToShorterString Examples

Programming Language: C# (CSharp)

Class/Type: CoreLabel

Method/Function: ToShorterString

Examples at hotexamples.com: 3

C# (CSharp) CoreLabel.ToShorterString - 3 examples found. These are the top-rated real-world C# (CSharp) examples of CoreLabel.ToShorterString, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

Set(30)

Word(30)

Get(30)

SetWord(27)

SetValue(27)

Lemma(15)

SetTag(13)

Tag(12)

ContainsKey(11)

GetString(10)

Index(9)

SetIndex(9)

Value(9)

Factory(9)

Remove(8)

SetNER(7)

BeginPosition(6)

SetLemma(6)

SetOriginalText(5)

ToString(4)

get(4)

SetBeginPosition(4)

OriginalText(4)

SetEndPosition(4)

KeySet(3)

ToShorterString(3)

value(2)

Ner(2)

IsNewline(2)

EndPosition(2)

toString(1)

set(1)

lemma(1)

index(1)

SetCategory(1)

endPosition(1)

beginPosition(1)

GetHashCode(1)

Size(1)

LabelFactory(1)

Category(1)

word(1)

Example #1

Show file

File: ChineseSegmenterAnnotator.cs Project: awesomedotnetcore/Stanford.CoreNLP.NET

        /// <summary>
        /// Builds a <see cref="CoreLabel"/> for a single token, recording its text and character offsets.
        /// </summary>
        /// <param name="tokenText">Text of the token; stored unmodified as the label's original text.</param>
        /// <param name="doNormalization">
        /// When true (and the <c>normalizeSpace</c> field is also true), spaces in a non-separator
        /// token are replaced with non-breaking spaces (U+00A0) in the word and value.
        /// </param>
        /// <param name="charOffsetBegin">Value stored under <c>CharacterOffsetBeginAnnotation</c>.</param>
        /// <param name="charOffsetEnd">Value stored under <c>CharacterOffsetEndAnnotation</c>.</param>
        /// <returns>The newly created token.</returns>
        private CoreLabel MakeXmlToken(string tokenText, bool doNormalization, int charOffsetBegin, int charOffsetEnd)
        {
            CoreLabel result = new CoreLabel();

            // The original text is recorded before any rewriting of the surface form below.
            result.SetOriginalText(tokenText);

            string surface;
            if (separatorPattern.Matcher(tokenText).Matches())
            {
                // Separator text is mapped to the CoreNLP newline token.
                surface = AbstractTokenizer.NewlineToken;
            }
            else if (doNormalization && normalizeSpace)
            {
                // Change space to non-breaking space.
                surface = tokenText.Replace(' ', '\u00A0');
            }
            else
            {
                surface = tokenText;
            }

            result.SetWord(surface);
            result.SetValue(surface);
            result.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charOffsetBegin);
            result.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charOffsetEnd);

            if (Verbose)
            {
                log.Info("Adding token " + result.ToShorterString());
            }

            return result;
        }

Example #2

Show file

File: PTBTokenizer.cs Project: awesomedotnetcore/Stanford.CoreNLP.NET

        /// <summary>
        /// Tokenizes everything available from <paramref name="r"/> and writes the selected tokens to
        /// <paramref name="writer"/>.
        /// </summary>
        /// <param name="r">Source of the text to tokenize.</param>
        /// <param name="writer">Destination for the token output.</param>
        /// <param name="parseInsidePattern">
        /// Optional pattern that toggles printing: when a token matches, printing is turned on if
        /// group 1 of the match is empty and turned off otherwise. When null, printing is always on.
        /// </param>
        /// <param name="filterPattern">Optional pattern; tokens whose original text matches it are not written.</param>
        /// <param name="options">Option string handed to the tokenizer constructor.</param>
        /// <param name="preserveLines">When true, newline tokens start a new output line and other tokens are space-separated.</param>
        /// <param name="oneLinePerElement">
        /// When true (and <paramref name="preserveLines"/> is false), tokens are space-separated and a
        /// line break is written each time printing is switched off.
        /// </param>
        /// <param name="dump">When true, the written form of a token is its <c>ToShorterString()</c> rendering.</param>
        /// <param name="lowerCase">When true, the token text is lower-cased (and stored back on the token).</param>
        /// <returns>The number of tokens read from the tokenizer, whether or not they were written.</returns>
        /// <exception cref="System.IO.IOException"/>
        private static int TokReader(Reader r, BufferedWriter writer, Pattern parseInsidePattern, Pattern filterPattern, string options, bool preserveLines, bool oneLinePerElement, bool dump, bool lowerCase)
        {
            int tokenCount = 0;
            bool atLineStart = true;

            // Start off printing, unless we are waiting for a start entity.
            bool printing = (parseInsidePattern == null);

            // The matcher is created once and reset per token, as a performance hack.
            Matcher insideMatcher = (parseInsidePattern != null) ? parseInsidePattern.Matcher(string.Empty) : null;

            Edu.Stanford.Nlp.Process.PTBTokenizer <CoreLabel> tokenizer = new Edu.Stanford.Nlp.Process.PTBTokenizer <CoreLabel>(r, new CoreLabelTokenFactory(), options);
            while (tokenizer.MoveNext())
            {
                CoreLabel label = tokenizer.Current;
                string origStr = label.Get(typeof(CoreAnnotations.TextAnnotation));
                string str = origStr;
                if (lowerCase)
                {
                    str = origStr.ToLower(Locale.English);
                    label.Set(typeof(CoreAnnotations.TextAnnotation), str);
                }

                if (insideMatcher != null && insideMatcher.Reset(origStr).Matches())
                {
                    // Turn printing on if there is no end-element slash, off if there is.
                    printing = insideMatcher.Group(1).IsEmpty();
                    if (!printing)
                    {
                        // Only reached when a stop element was matched.
                        atLineStart = true;
                        if (oneLinePerElement)
                        {
                            writer.NewLine();
                        }
                    }
                }
                else if (printing)
                {
                    if (dump)
                    {
                        // After having checked for tags, switch to the exhaustive rendering.
                        str = label.ToShorterString();
                    }

                    bool filteredOut = filterPattern != null && filterPattern.Matcher(origStr).Matches();
                    if (!filteredOut)
                    {
                        if (preserveLines)
                        {
                            if (NewlineToken.Equals(origStr))
                            {
                                atLineStart = true;
                                writer.NewLine();
                            }
                            else
                            {
                                if (atLineStart)
                                {
                                    atLineStart = false;
                                }
                                else
                                {
                                    writer.Write(' ');
                                }
                                writer.Write(str);
                            }
                        }
                        else if (oneLinePerElement)
                        {
                            if (atLineStart)
                            {
                                atLineStart = false;
                            }
                            else
                            {
                                writer.Write(' ');
                            }
                            writer.Write(str);
                        }
                        else
                        {
                            // Default layout: one token per line.
                            writer.Write(str);
                            writer.NewLine();
                        }
                    }
                }

                // Every token is counted, including ones that were skipped or filtered.
                tokenCount++;
            }

            return tokenCount;
        }

Example #3

Show file

File: ChineseSegmenterAnnotator.cs Project: awesomedotnetcore/Stanford.CoreNLP.NET

        /// <summary>
        /// Segments the text of <paramref name="annotation"/> into words and stores the resulting
        /// token list on the annotation under <c>CoreAnnotations.TokensAnnotation</c>.
        /// </summary>
        /// <remarks>
        /// The segmenter runs on the text string, while character offsets are taken from the
        /// per-character labels read from <c>SegmenterCoreAnnotations.CharactersAnnotation</c>.
        /// The main loop walks the segmented words and keeps <c>pos</c> (an index into those
        /// character labels) in step with them. Consecutive words whose first character carries a
        /// non-"0" <c>XMLCharAnnotation</c> are accumulated into a single XML token.
        /// </remarks>
        /// <param name="annotation">
        /// Map that supplies <c>TextAnnotation</c> and <c>CharactersAnnotation</c>, and receives
        /// <c>TokensAnnotation</c>.
        /// </param>
        private void RunSegmentation(ICoreMap annotation)
        {
            // NOTE(review): the five comment lines below look like a leftover worked example; their
            // meaning cannot be determined from this method alone.
            //0 2
            // A BC D E
            // 1 10 1 1
            // 0 12 3 4
            // 0, 0+1 ,
            string text = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
            // the original text String
            IList <CoreLabel> sentChars = annotation.Get(typeof(SegmenterCoreAnnotations.CharactersAnnotation));

            // the way it was divided by splitCharacters
            if (Verbose)
            {
                log.Info("sentChars (length " + sentChars.Count + ") is " + SentenceUtils.ListToString(sentChars, StringUtils.EmptyStringArray));
            }
            IList <CoreLabel> tokens = new List <CoreLabel>();

            // The (still empty) list is attached up front; it is filled in place by the loop below.
            annotation.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
            // Run the segmenter! On the whole String. It knows not about the splitting into chars.
            // Can we change this to have it run directly on the already existing list of tokens. That would help, no?
            IList <string> words;

            if (!tokenizeNewline)
            {
                // Newlines are not tokens in this mode: strip them all and segment in one go.
                text  = text.ReplaceAll("[\r\n]", string.Empty);
                words = segmenter.SegmentString(text);
            }
            else
            {
                // remove leading and trailing newlines
                text = text.ReplaceAll("^[\\r\\n]+", string.Empty);
                text = text.ReplaceAll("[\\r\\n]+$", string.Empty);
                // if using the sentence split on two newlines option, replace single newlines
                // single newlines should be ignored for segmenting
                if (sentenceSplitOnTwoNewlines)
                {
                    text = text.ReplaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
                    // do a second pass to handle corner case of consecutive isolated newlines
                    // x \n x \n x
                    text = text.ReplaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
                }
                // Run the segmenter on each line so that we don't get tokens that cross line boundaries
                // Neat trick to keep delimiters from: http://stackoverflow.com/a/2206432
                // NOTE(review): the format string uses Java-style "%1$s" placeholders, whereas .NET's
                // string.Format expects "{0}"; likewise the split argument is written as a regex.
                // Confirm that Format/Split resolve to Java-compatible helpers here — TODO verify.
                string[] lines = text.Split(string.Format("((?<=%1$s)|(?=%1$s))", separator));
                words = new List <string>();
                foreach (string line in lines)
                {
                    if (separatorPattern.Matcher(line).Matches())
                    {
                        // Don't segment newline tokens, keep them as-is
                        words.Add(line);
                    }
                    else
                    {
                        Sharpen.Collections.AddAll(words, segmenter.SegmentString(line));
                    }
                }
            }
            if (Verbose)
            {
                // NOTE(review): `words` is concatenated directly; with a plain .NET list this would
                // presumably log the type name rather than the contents — confirm.
                log.Info(text + "\n--->\n" + words + " (length " + words.Count + ')');
            }
            // Go through everything again and make the final tokens list; for loop is over segmented words
            int pos = 0;
            // This is used to index sentChars, the output from splitCharacters
            // xmlBuffer accumulates the text of the XML token in progress; xmlBegin holds its begin
            // offset, with -1 meaning "not set yet".
            StringBuilder xmlBuffer = new StringBuilder();
            int           xmlBegin  = -1;

            foreach (string w in words)
            {
                // NOTE(review): sentChars is indexed without a bounds check here; this presumably
                // relies on the segmenter output staying in sync with sentChars — confirm.
                CoreLabel fl = sentChars[pos];
                string    xmlCharAnnotation = fl.Get(typeof(SegmenterCoreAnnotations.XMLCharAnnotation));
                if (Verbose)
                {
                    log.Info("Working on word " + w + ", sentChar " + fl.ToShorterString() + " (sentChars index " + pos + ')');
                }
                if ("0".Equals(xmlCharAnnotation) || "beginning".Equals(xmlCharAnnotation))
                {
                    // Beginnings of plain text and other XML tags are good places to end an XML tag
                    if (xmlBuffer.Length > 0)
                    {
                        // Form the XML token
                        string    xmlTag = xmlBuffer.ToString();
                        // The end offset comes from the character just before the current position.
                        CoreLabel fl1    = sentChars[pos - 1];
                        int       end    = fl1.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                        tokens.Add(MakeXmlToken(xmlTag, true, xmlBegin, end));
                        // Clean up and prepare for the next XML tag
                        xmlBegin  = -1;
                        xmlBuffer = new StringBuilder();
                    }
                }
                if (!"0".Equals(xmlCharAnnotation))
                {
                    // found an XML character; fl changes inside this loop!
                    while (fl.Get(typeof(SegmenterCoreAnnotations.XMLCharAnnotation)).Equals("whitespace"))
                    {
                        // Print whitespaces into the XML buffer and move on until the next non-whitespace character is found
                        // and we're in sync with segmenter output again
                        xmlBuffer.Append(' ');
                        pos += 1;
                        fl   = sentChars[pos];
                    }
                    xmlBuffer.Append(w);
                    pos = AdvancePos(sentChars, pos, w);
                    if (xmlBegin < 0)
                    {
                        // First word of this XML token: its begin offset is taken from fl, i.e. the
                        // first non-whitespace character reached above.
                        xmlBegin = fl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                    }
                    // The word has been absorbed into the XML buffer; no token is emitted for it here.
                    continue;
                }
                // remember that fl may be more than one char long (non-BMP chars like emoji), so use advancePos()
                fl.Set(typeof(CoreAnnotations.ChineseSegAnnotation), "1");
                if (w.IsEmpty())
                {
                    if (Verbose)
                    {
                        log.Warn("Encountered an empty word. Shouldn't happen?");
                    }
                    // Empty words produce no token and leave pos unchanged.
                    continue;
                }
                // [cdm 2016:] surely this shouldn't happen!
                int begin = fl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                pos = AdvancePos(sentChars, pos, w);
                // After advancing, pos - 1 is used as the index of the label that supplies the end offset.
                if (pos - 1 >= sentChars.Count)
                {
                    // Out of range: log the problem and skip this token instead of indexing past the list.
                    log.Error("Error: on word " + w + " at position " + (pos - w.Length) + " trying to get at position " + (pos - 1));
                    log.Error("last element of sentChars is " + sentChars[sentChars.Count - 1]);
                }
                else
                {
                    fl = sentChars[pos - 1];
                    int end = fl.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                    tokens.Add(MakeXmlToken(w, false, begin, end));
                }
            }
            // end for (go through everything again)
            if (xmlBuffer.Length > 0)
            {
                // Form the last XML token, if any
                string    xmlTag = xmlBuffer.ToString();
                CoreLabel fl1    = sentChars[pos - 1];
                int       end    = fl1.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                tokens.Add(MakeXmlToken(xmlTag, true, xmlBegin, end));
            }
            if (Verbose)
            {
                // Log the final token list, one token per log line.
                foreach (CoreLabel token in tokens)
                {
                    log.Info(token.ToShorterString());
                }
            }
        }