/// <summary>
/// Produces a copy of <paramref name="a"/> whose characters have been passed through
/// <c>ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder</c>.
/// </summary>
/// <param name="a">The text to convert.</param>
/// <returns>
/// <paramref name="a"/> itself when it is empty; otherwise a new value backed by a freshly
/// built string.
/// </returns>
public static TX Lower(TX a)
{
    if (!a.IsEmpty)
    {
        // Non-empty input: accumulate the converted characters, then expose the
        // resulting string as memory.
        var lowered = new StringBuilder();
        ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(a.Span, lowered);
        string text = lowered.ToString();
        return text.AsMemory();
    }

    // Empty input is handed back untouched; no builder is allocated.
    return a;
}
/// <summary>
/// Creates the value getter for output column <paramref name="iinfo"/>. The getter reads the
/// source vector of text tokens and writes to the destination only those non-empty tokens whose
/// lower-cased form is not found in the stop-word pool of the language in effect for the row.
/// </summary>
/// <param name="ch">Channel; may be null.</param>
/// <param name="input">Row the source (and optional language) values are read from.</param>
/// <param name="iinfo">Index into <c>Infos</c> / <c>_exes</c>; must be in range.</param>
/// <param name="disposer">Always set to null; this getter owns nothing to dispose.</param>
/// <returns>A <c>ValueGetter</c> over a <c>VBuffer</c> of text.</returns>
protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer)
{
    Host.AssertValueOrNull(ch);
    Host.AssertValue(input);
    Host.Assert(0 <= iinfo && iinfo < Infos.Length);
    Host.Assert(Infos[iinfo].TypeSrc.IsVector & Infos[iinfo].TypeSrc.ItemType.IsText);

    disposer = null;

    var colInfo = _exes[iinfo];
    Language defaultLanguage = colInfo.Lang;

    // State shared across invocations of the returned getter.
    var langText = default(ReadOnlyMemory<char>);
    var langGetter = colInfo.LangsColIndex < 0
        ? null
        : input.GetGetter<ReadOnlyMemory<char>>(colInfo.LangsColIndex);
    var srcGetter = GetSrcGetter<VBuffer<ReadOnlyMemory<char>>>(input, iinfo);
    var srcBuffer = default(VBuffer<ReadOnlyMemory<char>>);
    var lowered = new StringBuilder();
    var kept = new List<ReadOnlyMemory<char>>();

    ValueGetter<VBuffer<ReadOnlyMemory<char>>> getter =
        (ref VBuffer<ReadOnlyMemory<char>> dst) =>
        {
            // Start from the column's configured language on every call, then let
            // UpdateLanguage adjust it using the optional language getter.
            var rowLanguage = defaultLanguage;
            UpdateLanguage(ref rowLanguage, langGetter, ref langText);

            srcGetter(ref srcBuffer);
            kept.Clear();

            var tokens = srcBuffer.GetValues();
            for (int idx = 0; idx < tokens.Length; idx++)
            {
                var token = tokens[idx];
                if (!token.IsEmpty)
                {
                    lowered.Clear();
                    ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(token.Span, lowered);

                    // REVIEW nihejazi: Consider using a trie for string matching (Aho-Corasick, etc.)
                    // A null lookup result means the token is not in the pool, so it is kept.
                    if (StopWords[(int)rowLanguage].Get(lowered) == null)
                    {
                        kept.Add(token);
                    }
                }
            }

            VBufferUtils.Copy(kept, ref dst, kept.Count);
        };

    return getter;
}
/// <summary>
/// Fills a new <c>NormStr.Pool</c> with lower-cased stop words taken from the first available
/// source, in this order of precedence:
/// (1) the comma-separated <c>Stopwords</c> string,
/// (2) the <c>Stopword</c> string array,
/// (3) a text column read through a loader built from the data file arguments.
/// </summary>
/// <param name="env">Host environment; passed on to the data-file loader.</param>
/// <param name="ch">Channel used for warnings and user-argument checks.</param>
/// <param name="loaderArgs">Arguments describing where the stop words come from.</param>
/// <param name="stopWordsMap">Receives the populated pool.</param>
private void LoadStopWords(IHostEnvironment env, IChannel ch, ArgumentsBase loaderArgs, out NormStr.Pool stopWordsMap)
{
    Contracts.AssertValue(env);
    env.AssertValue(ch);
    ch.AssertValue(loaderArgs);

    // Warn when an explicit list is given together with any data-file argument:
    // the explicit list takes precedence below.
    if (!string.IsNullOrEmpty(loaderArgs.Stopwords) || Utils.Size(loaderArgs.Stopword) > 0)
    {
        if (!string.IsNullOrWhiteSpace(loaderArgs.DataFile)
            || loaderArgs.Loader != null
            || !string.IsNullOrWhiteSpace(loaderArgs.StopwordsColumn))
        {
            ch.Warning("Explicit stopwords list specified. Data file arguments will be ignored");
        }
    }

    stopWordsMap = new NormStr.Pool();
    var lowered = new StringBuilder();

    var remaining = ReadOnlyMemoryUtils.TrimSpaces(loaderArgs.Stopwords.AsMemory());

    if (!remaining.IsEmpty)
    {
        // Source 1: comma-separated list. Each piece is trimmed; empty pieces are
        // skipped and reported once.
        bool warnedEmpty = false;
        bool more;
        do
        {
            ReadOnlyMemory<char> piece;
            more = ReadOnlyMemoryUtils.SplitOne(remaining, ',', out piece, out remaining);
            piece = ReadOnlyMemoryUtils.TrimSpaces(piece);

            if (piece.IsEmpty)
            {
                if (!warnedEmpty)
                {
                    ch.Warning("Empty strings ignored in 'stopwords' specification");
                    warnedEmpty = true;
                }
            }
            else
            {
                lowered.Clear();
                ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(piece.Span, lowered);
                stopWordsMap.Add(lowered);
            }
        }
        while (more);

        ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.Stopwords), "stopwords is empty");
    }
    else if (Utils.Size(loaderArgs.Stopword) > 0)
    {
        // Source 2: array of words. Only the space character is trimmed here.
        // NOTE(review): unlike the other two sources, this branch does not check that the
        // pool ended up non-empty -- confirm whether that is intentional.
        bool warnedEmpty = false;
        foreach (string entry in loaderArgs.Stopword)
        {
            var trimmed = entry.AsSpan().Trim(' ');

            if (trimmed.IsEmpty)
            {
                if (!warnedEmpty)
                {
                    ch.Warning("Empty strings ignored in 'stopword' specification");
                    warnedEmpty = true;
                }
            }
            else
            {
                lowered.Clear();
                ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(trimmed, lowered);
                stopWordsMap.Add(lowered);
            }
        }
    }
    else
    {
        // Source 3: a text column from a loader. LoadStopwords may update the column
        // name, since it is passed by ref.
        string srcCol = loaderArgs.StopwordsColumn;
        var loader = LoadStopwords(env, ch, loaderArgs.DataFile, loaderArgs.Loader, ref srcCol);

        int colSrc;
        bool found = loader.Schema.TryGetColumnIndex(srcCol, out colSrc);
        if (!found)
        {
            throw ch.ExceptUserArg(nameof(Arguments.StopwordsColumn), "Unknown column '{0}'", srcCol);
        }

        var typeSrc = loader.Schema[colSrc].Type;
        ch.CheckUserArg(typeSrc.IsText, nameof(Arguments.StopwordsColumn), "Must be a scalar text column");

        // Accumulate the stopwords. Row values are used as-is (no trimming in this branch).
        using (var cursor = loader.GetRowCursor(col => col == colSrc))
        {
            var rowValue = default(ReadOnlyMemory<char>);
            bool warnedEmpty = false;
            var getter = cursor.GetGetter<ReadOnlyMemory<char>>(colSrc);

            while (cursor.MoveNext())
            {
                getter(ref rowValue);

                if (rowValue.IsEmpty)
                {
                    if (!warnedEmpty)
                    {
                        ch.Warning("Empty rows ignored in data file");
                        warnedEmpty = true;
                    }
                }
                else
                {
                    lowered.Clear();
                    ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(rowValue.Span, lowered);
                    stopWordsMap.Add(lowered);
                }
            }
        }

        ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.DataFile), "dataFile is empty");
    }
}