/// <summary>
/// Tokenizes the characters of <paramref name="src"/> with the tokenizer registered for
/// <paramref name="lang"/>, passing <paramref name="addTokenAndType"/> as the callback that
/// adds the computed tokens (and types, if required) to _tokensList/_typesList.
/// Does nothing when <paramref name="src"/> has no characters.
/// </summary>
private void Tokenize(ref DvText src, Language lang, GetSpansSimpleCallback addTokenAndType)
{
    // When types are being tracked, the token and type lists must stay in lock-step.
    Contracts.Assert(_typesList == null || _tokensList.Count == _typesList.Count);

    if (src.HasChars)
    {
        int start;
        int end;
        string buffer = src.GetRawUnderlyingBufferInfo(out start, out end);

        // The tokenizer takes (offset, length) rather than (min, lim).
        Tokenizers[(int)lang].GetSpansSimple(buffer, start, end - start, addTokenAndType);

        Contracts.Assert(_typesList == null || _tokensList.Count == _typesList.Count);
    }
}
/// <summary>
/// Writes <paramref name="src"/> into <paramref name="sb"/> using the TextSaver escaping and
/// string quoting rules. The builder is created if null and cleared otherwise.
/// <list type="bullet">
/// <item>Empty text is written as a pair of double quotes.</item>
/// <item>NA text leaves the builder empty.</item>
/// <item>Text is wrapped in double quotes when it starts with a space or contains a double quote,
/// <paramref name="sep"/>, or a colon; embedded double quotes are doubled.</item>
/// </list>
/// </summary>
internal static void MapText(ref DvText src, ref StringBuilder sb, char sep)
{
    if (sb != null)
    {
        sb.Clear();
    }
    else
    {
        sb = new StringBuilder();
    }

    if (src.IsEmpty)
    {
        sb.Append("\"\"");
        return;
    }

    if (src.IsNA)
    {
        return;
    }

    int start;
    int end;
    string buffer = src.GetRawUnderlyingBufferInfo(out start, out end);
    Contracts.Assert(start < end);

    // Strings that start with space need to be quoted.
    bool inQuotes = buffer[start] == ' ';
    if (inQuotes)
    {
        sb.Append('"');
    }

    // pending: index of the first character not yet copied into sb.
    int pending = start;
    int pos = start;
    while (pos < end)
    {
        char c = buffer[pos];
        bool isQuote = c == '"';
        if (isQuote || c == sep || c == ':')
        {
            if (!inQuotes)
            {
                // Nothing has been flushed yet, so the opening quote lands at the very front.
                Contracts.Assert(pending == start);
                sb.Append('"');
                inQuotes = true;
            }

            if (isQuote)
            {
                // Flush the run preceding the quote, then emit the quote doubled.
                if (pending < pos)
                {
                    sb.Append(buffer, pending, pos - pending);
                }

                sb.Append("\"\"");
                pending = pos + 1;
            }
        }

        pos++;
    }

    Contracts.Assert(pos == end);

    // Flush whatever trails the last escaped quote (or the whole text if none was seen).
    if (pending < pos)
    {
        sb.Append(buffer, pending, pos - pending);
    }

    if (inQuotes)
    {
        sb.Append('"');
    }
}
/// <summary>
/// Normalizes the characters of <paramref name="src"/> into <paramref name="dst"/> according to
/// the instance settings:
/// <list type="bullet">
/// <item>punctuation is dropped unless _keepPunctuations is set;</item>
/// <item>numbers are dropped unless _keepNumbers is set;</item>
/// <item>unless _keepDiacritics is set, combining diacritics are dropped and characters found in
/// CombinedDiacriticsMap are replaced by their mapped value;</item>
/// <item>case is converted according to _case.</item>
/// </list>
/// When no character is changed or dropped, <paramref name="dst"/> is set to <paramref name="src"/>
/// itself; otherwise it is a new DvText built from <paramref name="buffer"/>.
/// </summary>
/// <param name="src">The text to normalize.</param>
/// <param name="dst">Receives the normalized text.</param>
/// <param name="buffer">Scratch builder; must be non-null. It is cleared before use.</param>
private void NormalizeSrc(ref DvText src, ref DvText dst, StringBuilder buffer)
{
    Host.AssertValue(buffer);

    if (!src.HasChars)
    {
        dst = src;
        return;
    }

    buffer.Clear();

    int ichMin;
    int ichLim;
    string text = src.GetRawUnderlyingBufferInfo(out ichMin, out ichLim);

    // min: start of the run of unmodified characters not yet copied into buffer.
    int i = ichMin;
    int min = ichMin;
    while (i < ichLim)
    {
        char ch = text[i];

        bool drop =
            (!_keepPunctuations && char.IsPunctuation(ch)) ||
            (!_keepNumbers && char.IsNumber(ch)) ||
            (!_keepDiacritics && IsCombiningDiacritic(ch));
        if (drop)
        {
            // Append everything before ch and ignore ch.
            buffer.Append(text, min, i - min);
            min = i + 1;
            i++;
            continue;
        }

        if (!_keepDiacritics)
        {
            // Single lookup instead of ContainsKey followed by the indexer.
            char mapped;
            if (CombinedDiacriticsMap.TryGetValue(ch, out mapped))
            {
                ch = mapped;
            }
        }

        if (_case == CaseNormalizationMode.Lower)
        {
            ch = CharUtils.ToLowerInvariant(ch);
        }
        else if (_case == CaseNormalizationMode.Upper)
        {
            ch = CharUtils.ToUpperInvariant(ch);
        }

        if (ch != text[i])
        {
            // The character was rewritten: flush the untouched run, then the replacement.
            buffer.Append(text, min, i - min).Append(ch);
            min = i + 1;
        }

        i++;
    }

    Host.Assert(i == ichLim);
    int len = i - min;
    if (ichMin == min)
    {
        // min never moved, so nothing was dropped or rewritten: reuse the source as-is.
        Host.Assert(src.Length == len);
        dst = src;
    }
    else
    {
        // Flush the trailing untouched run and materialize the result.
        buffer.Append(text, min, len);
        dst = new DvText(buffer.ToString());
    }
}