public static TX Lower(TX a)
        {
            if (a.IsEmpty)
            {
                return(a);
            }
            var sb = new StringBuilder();

            ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(a.Span, sb);
            return(sb.ToString().AsMemory());
        }
Beispiel #2
0
        protected override Delegate GetGetterCore(IChannel ch, IRow input, int iinfo, out Action disposer)
        {
            Host.AssertValueOrNull(ch);
            Host.AssertValue(input);
            Host.Assert(0 <= iinfo && iinfo < Infos.Length);
            Host.Assert(Infos[iinfo].TypeSrc.IsVector & Infos[iinfo].TypeSrc.ItemType.IsText);
            disposer = null;

            var      ex            = _exes[iinfo];
            Language stopWordslang = ex.Lang;
            var      lang          = default(ReadOnlyMemory <char>);
            var      getLang       = ex.LangsColIndex >= 0 ? input.GetGetter <ReadOnlyMemory <char> >(ex.LangsColIndex) : null;

            var getSrc = GetSrcGetter <VBuffer <ReadOnlyMemory <char> > >(input, iinfo);
            var src    = default(VBuffer <ReadOnlyMemory <char> >);
            var buffer = new StringBuilder();
            var list   = new List <ReadOnlyMemory <char> >();

            ValueGetter <VBuffer <ReadOnlyMemory <char> > > del =
                (ref VBuffer <ReadOnlyMemory <char> > dst) =>
            {
                var langToUse = stopWordslang;
                UpdateLanguage(ref langToUse, getLang, ref lang);

                getSrc(ref src);
                list.Clear();

                var srcValues = src.GetValues();
                for (int i = 0; i < srcValues.Length; i++)
                {
                    if (srcValues[i].IsEmpty)
                    {
                        continue;
                    }
                    buffer.Clear();
                    ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(srcValues[i].Span, buffer);

                    // REVIEW nihejazi: Consider using a trie for string matching (Aho-Corasick, etc.)
                    if (StopWords[(int)langToUse].Get(buffer) == null)
                    {
                        list.Add(srcValues[i]);
                    }
                }

                VBufferUtils.Copy(list, ref dst, list.Count);
            };

            return(del);
        }
Beispiel #3
0
        private void LoadStopWords(IHostEnvironment env, IChannel ch, ArgumentsBase loaderArgs, out NormStr.Pool stopWordsMap)
        {
            Contracts.AssertValue(env);
            env.AssertValue(ch);
            ch.AssertValue(loaderArgs);

            if ((!string.IsNullOrEmpty(loaderArgs.Stopwords) || Utils.Size(loaderArgs.Stopword) > 0) &&
                (!string.IsNullOrWhiteSpace(loaderArgs.DataFile) || loaderArgs.Loader != null ||
                 !string.IsNullOrWhiteSpace(loaderArgs.StopwordsColumn)))
            {
                ch.Warning("Explicit stopwords list specified. Data file arguments will be ignored");
            }

            var src = default(ReadOnlyMemory <char>);

            stopWordsMap = new NormStr.Pool();
            var buffer = new StringBuilder();

            var stopwords = loaderArgs.Stopwords.AsMemory();

            stopwords = ReadOnlyMemoryUtils.TrimSpaces(stopwords);
            if (!stopwords.IsEmpty)
            {
                bool warnEmpty = true;
                for (bool more = true; more;)
                {
                    ReadOnlyMemory <char> stopword;
                    more     = ReadOnlyMemoryUtils.SplitOne(stopwords, ',', out stopword, out stopwords);
                    stopword = ReadOnlyMemoryUtils.TrimSpaces(stopword);
                    if (!stopword.IsEmpty)
                    {
                        buffer.Clear();
                        ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(stopword.Span, buffer);
                        stopWordsMap.Add(buffer);
                    }
                    else if (warnEmpty)
                    {
                        ch.Warning("Empty strings ignored in 'stopwords' specification");
                        warnEmpty = false;
                    }
                }
                ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.Stopwords), "stopwords is empty");
            }
            else if (Utils.Size(loaderArgs.Stopword) > 0)
            {
                bool warnEmpty = true;
                foreach (string word in loaderArgs.Stopword)
                {
                    var stopword = word.AsSpan();
                    stopword = stopword.Trim(' ');
                    if (!stopword.IsEmpty)
                    {
                        buffer.Clear();
                        ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(stopword, buffer);
                        stopWordsMap.Add(buffer);
                    }
                    else if (warnEmpty)
                    {
                        ch.Warning("Empty strings ignored in 'stopword' specification");
                        warnEmpty = false;
                    }
                }
            }
            else
            {
                string srcCol = loaderArgs.StopwordsColumn;
                var    loader = LoadStopwords(env, ch, loaderArgs.DataFile, loaderArgs.Loader, ref srcCol);
                int    colSrc;
                if (!loader.Schema.TryGetColumnIndex(srcCol, out colSrc))
                {
                    throw ch.ExceptUserArg(nameof(Arguments.StopwordsColumn), "Unknown column '{0}'", srcCol);
                }
                var typeSrc = loader.Schema[colSrc].Type;
                ch.CheckUserArg(typeSrc.IsText, nameof(Arguments.StopwordsColumn), "Must be a scalar text column");

                // Accumulate the stopwords.
                using (var cursor = loader.GetRowCursor(col => col == colSrc))
                {
                    bool warnEmpty = true;
                    var  getter    = cursor.GetGetter <ReadOnlyMemory <char> >(colSrc);
                    while (cursor.MoveNext())
                    {
                        getter(ref src);
                        if (!src.IsEmpty)
                        {
                            buffer.Clear();
                            ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(src.Span, buffer);
                            stopWordsMap.Add(buffer);
                        }
                        else if (warnEmpty)
                        {
                            ch.Warning("Empty rows ignored in data file");
                            warnEmpty = false;
                        }
                    }
                }
                ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.DataFile), "dataFile is empty");
            }
        }