C# (CSharp) ISpan.AddToken Examples

Programming Language: C# (CSharp)

Class/Type: ISpan

Method/Function: AddToken

Examples at hotexamples.com: 3

C# (CSharp) ISpan.AddToken - 3 examples found. These are the top rated real world C# (CSharp) examples of ISpan.AddToken extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Log(30)

Finish(30)

End(30)

PutAttribute(20)

Dispose(15)

AddLink(14)

CaptureException(10)

GetBaggageItem(9)

AddEvent(8)

Detach(6)

PutAttributes(6)

AddAnnotation(6)

PutHttpStatusCodeAttribute(6)

AddMessageEvent(5)

CollapseTextSpans(5)

GetTag(4)

LogError(4)

AddToken(3)

PutHttpResponseHeadersAttribute(3)

PutHttpRawUrlAttribute(3)

GetEnumerator(3)

PutHttpPathAttribute(3)

PutHttpMethodAttribute(3)

PutHttpHostAttribute(3)

DidNotReceive(3)

LogMessage(3)

PutHttpResponseSizeAttribute(2)

PutHttpRequestSizeAttribute(2)

PutErrorAttribute(2)

AnnotateSpan(2)

CaptureError(2)

GetType(2)

GetSpanContext(2)

AddCustomAttribute(2)

Begin(1)

HorizontalLine(1)

Get(1)

ApplyDistributedTracePayload(1)

ErrorOccurred(1)

Dump(1)

Example #1

Show file

File: SpaceTokenizer.cs Project: yasmineChelly-95/catalyst

        /// <summary>
        /// Tokenizes <paramref name="span"/> by cutting its text at every space character (' ').
        /// Each non-blank piece between two cuts is registered through <c>span.AddToken</c>, using
        /// inclusive begin/end positions shifted by <c>span.Begin</c>.
        /// </summary>
        /// <param name="span">The span whose text is scanned and which receives the tokens.</param>
        public void Parse(ISpan span)
        {
            var remaining = span.ValueAsSpan;
            int spanStart = span.Begin;
            int consumed  = 0; // number of characters of the span text already scanned

            for (int spaceAt = remaining.IndexOf(' '); spaceAt >= 0; spaceAt = remaining.IndexOf(' '))
            {
                var piece = remaining.Slice(0, spaceAt);
                if (!piece.IsNullOrWhiteSpace())
                {
                    int tokenBegin = spanStart + consumed;
                    span.AddToken(tokenBegin, tokenBegin + spaceAt - 1);
                }

                // Step over the piece and the space that terminated it.
                int step = spaceAt + 1;
                remaining = remaining.Slice(step);
                consumed += step;
            }

            // Nothing left after the last space: no trailing token to emit.
            if (consumed >= span.Length)
            {
                return;
            }

            // Whatever follows the last space (or the whole text when it has no space) is the final token.
            var tail = span.ValueAsSpan.Slice(consumed, span.Length - consumed);
            if (!tail.IsNullOrWhiteSpace())
            {
                span.AddToken(spanStart + consumed, spanStart + span.Length - 1);
            }
        }

Example #2

Show file

        /// <summary>
        /// Reads CoNLL-style (tab-separated) training files and converts them into documents whose tokens
        /// carry a part-of-speech tag, a dependency head and a dependency arc type.
        /// </summary>
        /// <remarks>
        /// A line starting with <c># newdoc</c> opens a new document and a line starting with <c># sent_id</c>
        /// opens a new sentence span. In OntoNotes mode both markers are synthesized: one <c># newdoc</c> per
        /// file and one <c># sent_id</c> for every blank line. Token ids and head indices are parsed with the
        /// invariant culture so that ids such as <c>8.1</c> are interpreted the same way on every machine.
        /// </remarks>
        /// <param name="trainDocuments">Paths of the files to read; <c>null</c> yields an empty result.</param>
        /// <param name="arcNames">Receives every dependency arc type found in the corpus (mutated in place).</param>
        /// <param name="language">Corpus language; <c>Language.English</c> enables the contraction rewrites.</param>
        /// <param name="isOntoNotes">
        /// When <c>true</c>, part-of-speech tags are mapped through <c>PartOfSpeechHelpers.EnglishPennToUniversal</c>
        /// and head / arc type are read from columns 5 / 6 instead of 6 / 7.
        /// </param>
        /// <returns>One document per <c># newdoc</c> group, in file order.</returns>
        /// <exception cref="InvalidDataException">A token line appears before any <c># sent_id</c> marker.</exception>
        private static List<IDocument> ReadCorpus(List<string> trainDocuments, HashSet<string> arcNames, Language language, bool isOntoNotes = false)
        {
            if (trainDocuments is null)
            {
                return new List<IDocument>();
            }

            var invariant = System.Globalization.CultureInfo.InvariantCulture;
            var allLines  = new List<string>();

            foreach (var f in trainDocuments)
            {
                if (isOntoNotes)
                {
                    // Force a document split per file and a sentence split per blank line.
                    allLines.Add("# newdoc");
                    allLines.Add("# sent_id");
                    allLines.AddRange(File.ReadAllLines(f).Select(l => string.IsNullOrWhiteSpace(l) ? "# sent_id" : l));
                }
                else
                {
                    allLines.AddRange(File.ReadAllLines(f).Where(l => !string.IsNullOrWhiteSpace(l)));
                }
            }

            // Group the flat line list into one list per document.
            var docLines = new List<List<string>>();

            foreach (var line in allLines)
            {
                if (line.StartsWith("# newdoc", StringComparison.Ordinal))
                {
                    docLines.Add(new List<string>());
                    continue;
                }

                // Lines that precede the first "# newdoc" marker go into an implicit first document.
                if (docLines.Count == 0)
                {
                    docLines.Add(new List<string>());
                }
                docLines[docLines.Count - 1].Add(line);
            }

            var documents = new List<IDocument>();

            foreach (var docline in docLines)
            {
                var doc = new Document();

                // No filter currently marks a document as invalid; the flag is kept as a hook for such filters.
                bool invalidDoc = false;

                ISpan span = null;
                var   sb   = new StringBuilder();

                foreach (var l in docline)
                {
                    if (l.StartsWith("# sent_id", StringComparison.Ordinal))
                    {
                        // New sentence: open an empty span at the current end of the text; it grows per token.
                        span = doc.AddSpan(sb.Length, sb.Length);
                        continue;
                    }

                    if (l.StartsWith("#", StringComparison.Ordinal))
                    {
                        continue; // any other comment line
                    }

                    var parts = l.Split('\t');

                    if (parts[0].Contains("-"))
                    {
                        continue; // Pseudo-token (id range), such as cannot -> followed by can + not
                    }

                    double index;
                    if (!double.TryParse(parts[0],
                                         System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                                         invariant,
                                         out index))
                    {
                        continue;
                    }

                    // Only ids without a first decimal digit (e.g. "8", not "8.1") are regular tokens.
                    if ((int)(index * 10) != ((int)index) * 10)
                    {
                        continue;
                    }

                    string txt   = parts[1];
                    string lemma = parts[2];
                    string pos   = parts[3];

                    PartOfSpeech POS = isOntoNotes
                        ? PartOfSpeechHelpers.EnglishPennToUniversal[pos]
                        : (PartOfSpeech)Enum.Parse(typeof(PartOfSpeech), pos);

                    if (language == Language.English)
                    {
                        txt = NormalizeEnglishTokenText(txt, lemma, pos, POS, l);
                    }

                    if (span is null)
                    {
                        throw new InvalidDataException("Token line found before any \"# sent_id\" marker: " + l);
                    }

                    // Tokens are laid out in the rebuilt text separated by one space; begin/end are inclusive.
                    int begin = sb.Length;
                    int end   = begin + txt.Length - 1;
                    sb.Append(txt).Append(' ');
                    span.End = sb.Length - 1;

                    var token = span.AddToken(begin, end);
                    token.POS = POS;

                    // Heads are 1-based in the file and stored 0-based; only the part of the relation
                    // before the first ':' is kept as arc type.
                    int    head    = int.Parse(parts[isOntoNotes ? 5 : 6], invariant) - 1;
                    string arcType = parts[isOntoNotes ? 6 : 7].ToLowerInvariant().Split(':')[0];

                    token.Head           = head;
                    token.DependencyType = arcType;

                    arcNames.Add(arcType); // HashSet.Add is a no-op for names already present
                }

                doc.Value = sb.ToString();
                doc.TrimTokens();

                if (!invalidDoc)
                {
                    documents.Add(doc);
                }
                else
                {
                    Logger.LogInformation("skipping document:\n" + doc.TokenizedValue + "\n");
                }
            }

            return documents;
        }

        /// <summary>
        /// Rewrites an English token so that it matches how this project tokenizes contractions
        /// (e.g. <c>'s</c> as a verb becomes <c>is</c>, <c>n't</c> becomes <c>not</c>).
        /// Tokens starting with an apostrophe that no rule covers are appended to
        /// <c>missing_contractions.txt</c> and returned unchanged.
        /// </summary>
        /// <param name="txt">Surface form of the token.</param>
        /// <param name="lemma">Lemma column of the token.</param>
        /// <param name="pos">Raw part-of-speech column of the token.</param>
        /// <param name="partOfSpeech">Parsed part-of-speech tag.</param>
        /// <param name="rawLine">The complete corpus line, used only for the missing-contraction log.</param>
        /// <returns>The normalized token text.</returns>
        private static string NormalizeEnglishTokenText(string txt, string lemma, string pos, PartOfSpeech partOfSpeech, string rawLine)
        {
            //Should add more exceptions here on how we handle tokenization differently than the original Conll data
            string lower     = txt.ToLowerInvariant();
            bool   lemmaIsBe = lemma.ToLowerInvariant() == "be";
            bool   isVerb    = partOfSpeech == PartOfSpeech.VERB;
            bool   isAux     = partOfSpeech == PartOfSpeech.AUX;
            bool   isPart    = partOfSpeech == PartOfSpeech.PART;
            bool   isPron    = partOfSpeech == PartOfSpeech.PRON;
            bool   isPunct   = partOfSpeech == PartOfSpeech.PUNCT;

            // The order of the checks below is significant: the first matching rule wins.
            if ((lower == "'s" || lower == "s") && (lemmaIsBe || isVerb || isAux))
            {
                return "is";
            }

            if ((lower == "'m" || lower == "m") && (lemmaIsBe || isVerb || isAux))
            {
                return "am";
            }

            if ((lower == "'re" || lower == "re") && (lemmaIsBe || isVerb))
            {
                return "are";
            }

            if ((lower == "ll" || lower == "'ll") && (isVerb || isAux))
            {
                return "will";
            }

            if (lower == "'d" && isAux)
            {
                return "would";
            }

            if (lower == "'d" && isVerb)
            {
                return "had";
            }

            if (lower == "n't")
            {
                return "not";
            }

            if (lower == "'ve")
            {
                return "have";
            }

            if (txt.Length > 1 && txt.StartsWith("/", StringComparison.Ordinal) && pos == ".")
            {
                return txt.Substring(1);
            }

            if (txt == "'" && lemma == "'s" && (isPart || isPron))
            {
                return txt; // accepted as is
            }

            if (txt.StartsWith("'", StringComparison.Ordinal) &&
                !(txt == "'s" && isPart) &&
                !(txt == "'" && isPart) &&
                !(txt == "'s" && isPron) &&
                !(txt == "'" && isPunct))
            {
                // Unknown contraction: record the line (without its id column) for later inspection.
                File.AppendAllLines("missing_contractions.txt", new string[] { rawLine.Split(new char[] { '\t' }, 2).Last() });
                return txt;
            }

            // NOTE(review): this prefixes "http://" when the text ALREADY contains "://"; the condition may be
            // inverted. Behavior is kept as found until that is confirmed.
            if (lemma == "#hlink#" && txt.Contains("://"))
            {
                return "http://" + txt;
            }

            return txt;
        }

Example #3

Show file

        /// <summary>
        /// Tokenizes <paramref name="span"/> in two phases. First the text is scanned between whitespace
        /// separators and every whitespace-delimited candidate is broken down into split points (special
        /// cases, URLs/e-mails, emojis, single characters, punctuation, prefixes, suffixes, infixes or a
        /// plain token). Then the collected split points are sorted, de-duplicated, trimmed of surrounding
        /// whitespace and registered on the span through <c>span.AddToken</c>.
        /// </summary>
        /// <param name="span">The span whose text is tokenized and which receives the tokens.</param>
        /// <exception cref="InvalidOperationException">
        /// More split points than characters were produced, or a split point has begin greater than end.
        /// </exception>
        public void Parse(ISpan span)
        {
            //TODO: store if a splitpoint is special case, do not try to fetch hash if not!
            var separators = CharacterClasses.WhitespaceCharacters;
            var textSpan   = span.ValueAsSpan;

            // Pre-scan: remember whether any emoji occurs so the per-candidate emoji checks can be skipped otherwise.
            // Note that the last character is never used as a start position (i stops at Length - 2).
            bool hasEmoji = false;

            for (int i = 0; i < textSpan.Length - 1; i++)
            {
                if (textSpan.Slice(i).IsEmoji(out _))
                {
                    hasEmoji = true; break;
                }
            }

            // Initial capacity is a guess of one split point per four characters.
            var splitPoints = new List <SplitPoint>(textSpan.Length / 4);

            // offset: start of the text still to be processed.
            // sufix_offset: total length of the suffixes split off the current candidate; it is added to
            // offset at the start of the next outer iteration so those suffixes are not scanned again.
            int offset = 0, sufix_offset = 0;

            while (true)
            {
                if (splitPoints.Count > textSpan.Length)
                {
                    throw new InvalidOperationException(); //If we found more splitting points than actual characters on the span, we hit a bug in the tokenizer
                }

                offset      += sufix_offset;
                sufix_offset = 0;
                if (offset > textSpan.Length)
                {
                    break;
                }
                // Presumably returns the absolute index of the next separator at or after offset, or a
                // negative value when there is none (the uses below rely on that) - TODO confirm.
                var splitPoint = textSpan.IndexOfAny(separators, offset);
                ReadOnlySpan <char> candidate;

                if (splitPoint == offset)
                {
                    //Happens on sequential separators
                    offset++; continue;
                }

                if (splitPoint < 0)
                {
                    // No separator left: the candidate is the rest of the text and ends at the end of the text.
                    candidate  = textSpan.Slice(offset);
                    splitPoint = offset + candidate.Length;
                    if (candidate.Length == 0)
                    {
                        break;
                    }
                }
                else
                {
                    candidate = textSpan.Slice(offset, splitPoint - offset);
                }

                //Special case to split also at emojis
                if (hasEmoji)
                {
                    for (int i = 0; i < (candidate.Length - 1); i++)
                    {
                        if (candidate.Slice(i).IsEmoji(out var emojiLength))
                        {
                            if (i == 0)
                            {
                                // Candidate starts with an emoji: keep only the emoji.
                                splitPoint = offset + emojiLength - 1;
                                candidate  = candidate.Slice(0, emojiLength);
                            }
                            else
                            {
                                // Emoji further in: keep only the text in front of it.
                                // NOTE(review): here splitPoint is offset + i - 1 although candidate has length i,
                                // whereas the other paths keep splitPoint == offset + candidate.Length and emit
                                // (offset, splitPoint - 1). This looks like it drops the last character of the
                                // text before the emoji - confirm with a test.
                                splitPoint = offset + i - 1;
                                candidate  = candidate.Slice(0, i);
                            }
                            break;
                        }
                    }
                }

                // Consume the candidate piece by piece. Every branch either finishes it (candidate becomes
                // empty and offset moves past splitPoint) or shortens it and loops again.
                while (!candidate.IsEmpty)
                {
                    int hash = candidate.CaseSensitiveHash32();
                    if (SpecialCases.ContainsKey(hash))
                    {
                        // The whole candidate is a registered special case: keep it as one split point.
                        splitPoints.Add(new SplitPoint(offset, splitPoint - 1, SplitPointReason.Exception));
                        candidate = new ReadOnlySpan <char>();
                        offset    = splitPoint + 1;
                        continue;
                    }
                    else if (candidate.IsLikeURLorEmail())
                    {
                        splitPoints.Add(new SplitPoint(offset, splitPoint - 1, SplitPointReason.EmailOrUrl));
                        candidate = new ReadOnlySpan <char>();
                        offset    = splitPoint + 1;
                        continue;
                    }
                    else if (hasEmoji && candidate.IsEmoji(out var emojiLength))
                    {
                        // Leading emoji: emit it and continue with what follows it.
                        splitPoints.Add(new SplitPoint(offset, offset + emojiLength - 1, SplitPointReason.Emoji));
                        candidate = candidate.Slice(emojiLength);
                        offset   += emojiLength;
                        continue;
                    }
                    else
                    {
                        if (candidate.Length == 1)
                        {
                            splitPoints.Add(new SplitPoint(offset, offset, SplitPointReason.SingleChar));
                            candidate = new ReadOnlySpan <char>();
                            offset    = splitPoint + 1;
                            continue;
                        }

                        // Candidates made only of letters/digits skip the affix handling and fall
                        // through to the "Normal" split point after this block.
                        if (!candidate.IsAllLetterOrDigit())
                        {
                            if (candidate.IsSentencePunctuation() || candidate.IsHyphen() || candidate.IsSymbol())
                            {
                                splitPoints.Add(new SplitPoint(offset, splitPoint - 1, SplitPointReason.Punctuation));
                                candidate = new ReadOnlySpan <char>();
                                offset    = splitPoint + 1;
                                continue;
                            }

                            // Prefix: emit the single character at prefixLocation and continue after it.
                            int prefixLocation = FindPrefix(candidate);
                            if (prefixLocation >= 0)
                            {
                                splitPoints.Add(new SplitPoint(offset + prefixLocation, offset + prefixLocation, SplitPointReason.Prefix));
                                candidate = candidate.Slice(prefixLocation + 1);
                                offset   += prefixLocation + 1;
                                continue;
                            }

                            var(sufixIndex, sufixLength) = FindSufix(candidate);

                            // Suffix: emit it, cut it off the candidate and remember its length in
                            // sufix_offset so the outer loop skips it once the candidate is finished.
                            if (sufixIndex > -1)
                            {
                                splitPoints.Add(new SplitPoint(offset + sufixIndex, offset + sufixIndex + sufixLength - 1, SplitPointReason.Sufix));
                                candidate     = candidate.Slice(0, sufixIndex);
                                splitPoint    = offset + sufixIndex;
                                sufix_offset += sufixLength;
                                continue;
                            }

                            var infixLocation = FindInfix(candidate);
                            if (infixLocation.Count > 0)
                            {
                                // in_offset: absolute start of the part of the candidate not yet emitted.
                                int in_offset = offset;

                                foreach (var(index, length) in infixLocation)
                                {
                                    // Emit the text between the previous infix (or the candidate start) and this infix.
                                    if ((offset + index - 1) >= in_offset)
                                    {
                                        splitPoints.Add(new SplitPoint(in_offset, offset + index - 1, SplitPointReason.Infix));
                                    }

                                    //Test if the remaining is not an exception first
                                    if ((in_offset - offset + index) <= candidate.Length)
                                    {
                                        var rest     = candidate.Slice(in_offset - offset + index);
                                        int hashRest = rest.CaseSensitiveHash32();

                                        if (SpecialCases.ContainsKey(hashRest))
                                        {
                                            // Stop splitting here; the rest goes through the loop again
                                            // and is picked up by the special-case branch.
                                            in_offset = offset + index;
                                            break;
                                        }
                                    }
                                    in_offset = offset + index + length;
                                    splitPoints.Add(new SplitPoint(offset + index, offset + index + length - 1, SplitPointReason.Infix));
                                }

                                candidate = candidate.Slice(in_offset - offset);

                                offset = in_offset;
                                continue;
                            }
                        }
                    }

                    // Nothing special found: the remaining candidate is one plain token.
                    splitPoints.Add(new SplitPoint(offset, splitPoint - 1, SplitPointReason.Normal));
                    candidate = new ReadOnlySpan <char>();
                    offset    = splitPoint + 1;
                }
            }

            // Second phase: turn the split points into tokens. Split-point positions are relative to the
            // span text, so spanBegin is added when calling AddToken.
            int spanBegin = span.Begin;
            // Previous begin/end, used to skip consecutive duplicates in the sorted sequence.
            int pB = int.MinValue, pE = int.MinValue;

            span.ReserveTokens(splitPoints.Count);
            foreach (var sp in splitPoints.OrderBy(s => s.Begin).ThenBy(s => s.End))
            {
                int b = sp.Begin;
                int e = sp.End;

                if (pB == b && pE == e)
                {
                    continue;
                }
                pB = b; pE = e;

                if (b > e)
                {
                    Logger.LogError("Error processing text: '{DOC}', found token with begin={b} and end={e}", span.Value, b, e);
                    throw new InvalidOperationException();
                }

                // Trim whitespace on both sides without letting b and e cross.
                while (char.IsWhiteSpace(textSpan[b]) && b < e)
                {
                    b++;
                }

                while (char.IsWhiteSpace(textSpan[e]) && e > b)
                {
                    e--;
                }

                int hash = textSpan.Slice(b, e - b + 1).CaseSensitiveHash32();

                // NOTE(review): this guard runs after the Slice above, and b > e already threw earlier while
                // the trimming loops keep b <= e, so it looks unreachable - confirm before relying on it.
                if (e < b)
                {
                    Logger.LogError("Error processing text: '{DOC}', found token with begin={b} and end={e}", span.Value, b, e);
                    continue;
                }

                if (SpecialCases.TryGetValue(hash, out TokenizationException exp))
                {
                    if (exp.Replacements is null)
                    {
                        // Special case without replacements: a single ordinary token.
                        var tk = span.AddToken(spanBegin + b, spanBegin + e);
                    }
                    else
                    {
                        //TODO: Tokens begins and ends are being artificially placed here, check in the future how to better handle this
                        int begin2 = spanBegin + b;
                        for (int i = 0; i < exp.Replacements.Length; i++)
                        {
                            //Adds replacement tokens sequentially, consuming one char from the original document at a time, and
                            //using the remaining chars in the last replacement token
                            var tk = span.AddToken(begin2, ((i == exp.Replacements.Length - 1) ? (spanBegin + e) : begin2));
                            tk.Replacement = exp.Replacements[i];
                            begin2++;
                        }
                    }
                }
                else
                {
                    var tk = span.AddToken(spanBegin + b, spanBegin + e);
                    // URL/e-mail tokens are tagged as an entity unless that capture is disabled.
                    if (sp.Reason == SplitPointReason.EmailOrUrl && !DisableEmailOrURLCapture)
                    {
                        tk.AddEntityType(new EntityType("EmailOrURL", EntityTag.Single));
                    }
                }
            }
        }