/// <summary>
            /// Runs the tokenizer selected by <paramref name="lang"/> over the characters of
            /// <paramref name="src"/> and reports every span it finds through
            /// <paramref name="addTokenAndType"/>, which is expected to add the token (and its type,
            /// if types are required) to _tokensList/_typesList.
            /// Nothing happens when <paramref name="src"/> has no characters.
            /// </summary>
            private void Tokenize(ref DvText src, Language lang, GetSpansSimpleCallback addTokenAndType)
            {
                // When types are collected, the two lists must have the same number of entries.
                Contracts.Assert(_typesList == null || _tokensList.Count == _typesList.Count);

                if (src.HasChars)
                {
                    // Work directly on the backing string; [start, limit) is the window that belongs to src.
                    int start;
                    int limit;
                    string buffer = src.GetRawUnderlyingBufferInfo(out start, out limit);

                    var tokenizer = Tokenizers[(int)lang];
                    int count = limit - start;
                    tokenizer.GetSpansSimple(buffer, start, count, addTokenAndType);

                    // The callback must have kept both lists in step.
                    Contracts.Assert(_typesList == null || _tokensList.Count == _typesList.Count);
                }
            }
Example #2
0
        /// <summary>
        /// Writes <paramref name="src"/> into <paramref name="sb"/> following the TextSaver escaping
        /// and string quoting rules:
        /// an empty value becomes two double-quote characters, an NA value leaves the builder empty,
        /// and any other value is wrapped in double quotes when it starts with a space or contains
        /// a double quote, <paramref name="sep"/> or a colon. Double quotes inside the value are doubled.
        /// </summary>
        /// <param name="src">The text value to write.</param>
        /// <param name="sb">Destination builder. Allocated when null, otherwise cleared before use.</param>
        /// <param name="sep">The column separator character that forces quoting when present in the text.</param>
        internal static void MapText(ref DvText src, ref StringBuilder sb, char sep)
        {
            if (sb != null)
            {
                sb.Clear();
            }
            else
            {
                sb = new StringBuilder();
            }

            if (src.IsEmpty)
            {
                sb.Append("\"\"");
                return;
            }

            if (src.IsNA)
            {
                // NA is written as nothing at all.
                return;
            }

            int first;
            int limit;
            string buffer = src.GetRawUnderlyingBufferInfo(out first, out limit);

            // Strings that start with space need to be quoted.
            Contracts.Assert(first < limit);
            bool inQuotes = buffer[first] == ' ';
            if (inQuotes)
            {
                sb.Append('"');
            }

            // 'pending' marks the start of the characters that have been scanned but not yet copied.
            int pending = first;
            int pos = first;
            while (pos < limit)
            {
                char current = buffer[pos];
                bool isQuoteChar = current == '"';

                if (isQuoteChar || current == sep || current == ':')
                {
                    if (!inQuotes)
                    {
                        // Nothing can have been flushed yet, so the opening quote lands at the very start.
                        Contracts.Assert(pending == first);
                        sb.Append('"');
                        inQuotes = true;
                    }

                    if (isQuoteChar)
                    {
                        // Flush the run preceding the quote, then emit the quote doubled.
                        if (pending < pos)
                        {
                            sb.Append(buffer, pending, pos - pending);
                        }
                        sb.Append("\"\"");
                        pending = pos + 1;
                    }
                }

                pos++;
            }

            Contracts.Assert(pos == limit);

            // Copy whatever remains after the last flushed position.
            if (pending < pos)
            {
                sb.Append(buffer, pending, pos - pending);
            }

            if (inQuotes)
            {
                sb.Append('"');
            }
        }
Example #3
0
        /// <summary>
        /// Normalizes <paramref name="src"/> into <paramref name="dst"/> according to this instance's
        /// settings: punctuation is removed unless <c>_keepPunctuations</c> is set, characters for which
        /// <see cref="char.IsNumber(char)"/> is true are removed unless <c>_keepNumbers</c> is set,
        /// combining diacritics are removed and combined diacritics are replaced through
        /// <c>CombinedDiacriticsMap</c> unless <c>_keepDiacritics</c> is set, and case is changed as
        /// requested by <c>_case</c>.
        /// </summary>
        /// <param name="src">The text to normalize.</param>
        /// <param name="dst">
        /// Receives the result. When <paramref name="src"/> has no characters, or no character needed
        /// to be removed or replaced, this is <paramref name="src"/> itself and no new string is built.
        /// </param>
        /// <param name="buffer">
        /// Scratch builder, must not be null. It is cleared and overwritten whenever
        /// <paramref name="src"/> has characters.
        /// </param>
        private void NormalizeSrc(ref DvText src, ref DvText dst, StringBuilder buffer)
        {
            Host.AssertValue(buffer);

            if (!src.HasChars)
            {
                dst = src;
                return;
            }

            buffer.Clear();

            int    ichMin;
            int    ichLim;
            string text = src.GetRawUnderlyingBufferInfo(out ichMin, out ichLim);
            int    i    = ichMin;

            // Start of the run of unchanged characters that has not been copied to the buffer yet.
            int    min  = ichMin;

            while (i < ichLim)
            {
                char ch = text[i];

                // Characters that are removed outright: punctuation, numbers and combining diacritics,
                // each only when the corresponding "keep" option is off.
                bool drop =
                    (!_keepPunctuations && char.IsPunctuation(ch)) ||
                    (!_keepNumbers && char.IsNumber(ch)) ||
                    (!_keepDiacritics && IsCombiningDiacritic(ch));
                if (drop)
                {
                    // Append everything before ch and ignore ch.
                    buffer.Append(text, min, i - min);
                    min = i + 1;
                    i++;
                    continue;
                }

                if (!_keepDiacritics)
                {
                    // Single lookup instead of ContainsKey followed by the indexer.
                    char replacement;
                    if (CombinedDiacriticsMap.TryGetValue(ch, out replacement))
                    {
                        ch = replacement;
                    }
                }

                if (_case == CaseNormalizationMode.Lower)
                {
                    ch = CharUtils.ToLowerInvariant(ch);
                }
                else if (_case == CaseNormalizationMode.Upper)
                {
                    ch = CharUtils.ToUpperInvariant(ch);
                }

                if (ch != text[i])
                {
                    // Flush the unchanged run, then the replacement character.
                    buffer.Append(text, min, i - min).Append(ch);
                    min = i + 1;
                }

                i++;
            }

            Host.Assert(i == ichLim);
            int len = i - min;

            if (ichMin == min)
            {
                // min only moves when a character is dropped or replaced, so nothing changed:
                // hand back the source without allocating.
                Host.Assert(src.Length == len);
                dst = src;
            }
            else
            {
                // Copy the trailing unchanged run and materialize the normalized text.
                buffer.Append(text, min, len);
                dst = new DvText(buffer.ToString());
            }
        }