internal void SaveStringOrNull(string str)
 {
     // Non-null strings are added to the Strings pool and referenced by their id.
     if (str != null)
     {
         Writer.Write(Strings.Add(str).Id);
         return;
     }

     // A null string is written as the value -1 instead of a pool id.
     Writer.Write(-1);
 }
 public void AddWordVector(IChannel ch, string word, float[] wordVector)
 {
     // Every vector must have exactly Dimension components.
     ch.Assert(wordVector.Length == Dimension);

     // Only the first vector seen for a given word is kept; later ones are ignored.
     bool isNewWord = _pool.Get(word) == null;
     if (!isNewWord)
     {
         return;
     }

     _pool.Add(word);
     WordVectors.AddRange(wordVector);
 }
Beispiel #3
0
        /// <summary>
        /// Deserialization constructor: after the base class has consumed its part of the model,
        /// rebuilds the stopword pool from the "Stopwords" sub-model and stores it in _stopWordsMap.
        /// </summary>
        /// <param name="host">Host forwarded to the base constructor.</param>
        /// <param name="ctx">Model load context the transform is being read from.</param>
        /// <param name="input">Input data view forwarded to the base constructor.</param>
        private CustomStopWordsRemoverTransform(IHost host, ModelLoadContext ctx, IDataView input)
            : base(host, ctx, input, TestIsTextVector)
        {
            Host.AssertValue(ctx);

            using (var ch = Host.Start("Deserialization"))
            {
                // *** Binary format ***
                // <base>
                ch.AssertNonEmpty(Infos);

                const string dir       = "Stopwords";
                NormStr.Pool stopwrods = null;
                // The callback below fills 'stopwrods'; 'res' is the value returned by TryProcessSubModel
                // (presumably false when the sub-model is absent -- confirm against ModelLoadContext).
                bool         res       = ctx.TryProcessSubModel(dir,
                                                                c =>
                {
                    Host.CheckValue(c, nameof(ctx));
                    c.CheckAtModel(GetStopwrodsManagerVersionInfo());

                    // NOTE(review): the version check above is done on the sub-model context 'c', but the
                    // reads below go through the outer 'ctx'. This may be intentional for compatibility with
                    // the matching save routine (not visible here) -- confirm both sides before changing.

                    // *** Binary format ***
                    // int: number of stopwords
                    // int[]: stopwords string ids
                    int cstr = ctx.Reader.ReadInt32();
                    // An empty stopword list is rejected as a decode error.
                    Host.CheckDecode(cstr > 0);

                    stopwrods = new NormStr.Pool();
                    for (int istr = 0; istr < cstr; istr++)
                    {
                        var nstr = stopwrods.Add(ctx.LoadString());
                        // Each loaded string must receive the id matching its position in the stream.
                        Host.CheckDecode(nstr.Id == istr);
                    }

                    // All stopwords are distinct.
                    Host.CheckDecode(stopwrods.Count == cstr);
                    // The deserialized pool should not have the empty string.
                    Host.CheckDecode(stopwrods.Get("") == null);
                });
                // The stopword sub-model is mandatory: fail decoding when it was not processed.
                if (!res)
                {
                    throw Host.ExceptDecode();
                }

                _stopWordsMap = stopwrods;
                ch.Done();
            }
            Metadata.Seal();
        }
Beispiel #4
0
        /// <summary>
        /// Fills the StopWords slot for <paramref name="lang"/> from its resource stream
        /// when that slot is still null; does nothing otherwise.
        /// </summary>
        private static void AddResourceIfNotPresent(Language lang)
        {
            int slot = (int)lang;
            Contracts.Assert(0 <= slot && slot < Utils.Size(StopWords));

            // Nothing to do when the pool for this language has already been published.
            if (StopWords[slot] != null)
            {
                return;
            }

            Stream resource = GetResourceFileStreamOrNull(lang);
            Contracts.Assert(resource != null);

            var pool = new NormStr.Pool();
            using (StreamReader lines = new StreamReader(resource))
            {
                // One stopword per line; null, empty and whitespace-only lines are skipped.
                for (string line = lines.ReadLine(); line != null; line = lines.ReadLine())
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    pool.Add(line);
                }
            }

            // Store the new pool only if the slot is still null at this point.
            Interlocked.CompareExchange(ref StopWords[slot], pool, null);
        }
Beispiel #5
0
        /// <summary>
        /// Builds the stopword pool from the first usable source in <paramref name="loaderArgs"/>,
        /// in this order of precedence:
        /// (1) the comma-separated <c>Stopwords</c> string,
        /// (2) the <c>Stopword</c> array,
        /// (3) a text column read through the data file / loader arguments.
        /// Every accepted entry is passed through <c>AddLowerCaseToStringBuilder</c> before it is added.
        /// </summary>
        /// <param name="env">Host environment, forwarded to the data loader.</param>
        /// <param name="ch">Channel used for warnings and user-argument checks.</param>
        /// <param name="loaderArgs">Arguments describing where the stopwords come from.</param>
        /// <param name="stopWordsMap">Receives the newly created pool of stopwords.</param>
        /// <remarks>
        /// Each of the three sources fails a user-argument check when it yields no usable stopword.
        /// </remarks>
        private void LoadStopWords(IHostEnvironment env, IChannel ch, ArgumentsBase loaderArgs, out NormStr.Pool stopWordsMap)
        {
            Contracts.AssertValue(env);
            env.AssertValue(ch);
            ch.AssertValue(loaderArgs);

            // An explicit list takes precedence, so tell the user when data-file arguments are also given.
            if ((!string.IsNullOrEmpty(loaderArgs.Stopwords) || Utils.Size(loaderArgs.Stopword) > 0) &&
                (!string.IsNullOrWhiteSpace(loaderArgs.DataFile) || loaderArgs.Loader != null ||
                 !string.IsNullOrWhiteSpace(loaderArgs.StopwordsColumn)))
            {
                ch.Warning("Explicit stopwords list specified. Data file arguments will be ignored");
            }

            stopWordsMap = new NormStr.Pool();
            var buffer = new StringBuilder();

            var stopwords = loaderArgs.Stopwords.AsMemory();

            stopwords = ReadOnlyMemoryUtils.TrimSpaces(stopwords);
            if (!stopwords.IsEmpty)
            {
                // Source 1: comma-separated list in a single string.
                bool warnEmpty = true;
                for (bool more = true; more;)
                {
                    ReadOnlyMemory<char> stopword;
                    more = ReadOnlyMemoryUtils.SplitOne(stopwords, ',', out stopword, out stopwords);
                    stopword = ReadOnlyMemoryUtils.TrimSpaces(stopword);
                    if (!stopword.IsEmpty)
                    {
                        buffer.Clear();
                        ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(stopword.Span, buffer);
                        stopWordsMap.Add(buffer);
                    }
                    else if (warnEmpty)
                    {
                        // Warn only once, however many empty entries there are.
                        ch.Warning("Empty strings ignored in 'stopwords' specification");
                        warnEmpty = false;
                    }
                }
                ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.Stopwords), "stopwords is empty");
            }
            else if (Utils.Size(loaderArgs.Stopword) > 0)
            {
                // Source 2: array of individual stopwords.
                bool warnEmpty = true;
                foreach (string word in loaderArgs.Stopword)
                {
                    var stopword = word.AsSpan();
                    stopword = stopword.Trim(' ');
                    if (!stopword.IsEmpty)
                    {
                        buffer.Clear();
                        ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(stopword, buffer);
                        stopWordsMap.Add(buffer);
                    }
                    else if (warnEmpty)
                    {
                        ch.Warning("Empty strings ignored in 'stopword' specification");
                        warnEmpty = false;
                    }
                }

                // An array holding only blank entries would otherwise leave the pool empty;
                // reject it here, as the other two sources already do.
                ch.CheckUserArg(stopWordsMap.Count > 0, nameof(loaderArgs.Stopword), "stopword is empty");
            }
            else
            {
                // Source 3: a text column of a data file / loader.
                string srcCol = loaderArgs.StopwordsColumn;
                var loader = LoadStopwords(env, ch, loaderArgs.DataFile, loaderArgs.Loader, ref srcCol);
                int colSrc;
                if (!loader.Schema.TryGetColumnIndex(srcCol, out colSrc))
                {
                    throw ch.ExceptUserArg(nameof(Arguments.StopwordsColumn), "Unknown column '{0}'", srcCol);
                }
                var typeSrc = loader.Schema[colSrc].Type;
                ch.CheckUserArg(typeSrc.IsText, nameof(Arguments.StopwordsColumn), "Must be a scalar text column");

                // Accumulate the stopwords.
                using (var cursor = loader.GetRowCursor(col => col == colSrc))
                {
                    bool warnEmpty = true;
                    var src = default(ReadOnlyMemory<char>);
                    var getter = cursor.GetGetter<ReadOnlyMemory<char>>(colSrc);
                    while (cursor.MoveNext())
                    {
                        getter(ref src);
                        if (!src.IsEmpty)
                        {
                            buffer.Clear();
                            ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(src.Span, buffer);
                            stopWordsMap.Add(buffer);
                        }
                        else if (warnEmpty)
                        {
                            ch.Warning("Empty rows ignored in data file");
                            warnEmpty = false;
                        }
                    }
                }
                ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.DataFile), "dataFile is empty");
            }
        }
 /// <summary>
 /// Adds <paramref name="memory"/> to <paramref name="pool"/> and returns the value produced by the pool.
 /// </summary>
 public static NormStr AddToPool(ReadOnlyMemory<char> memory, NormStr.Pool pool)
 {
     Contracts.CheckValue(pool, nameof(pool));

     var pooled = pool.Add(memory);
     return pooled;
 }
Beispiel #7
0
 /// <summary>
 /// Adds <paramref name="str"/> to the pool and maps the resulting entry
 /// to a KeyWordKind built from <paramref name="tid"/>.
 /// </summary>
 public void AddKeyWord(string str, TokKind tid)
 {
     Contracts.AssertNonEmpty(str);

     var pooledName = _pool.Add(str);
     var kind = new KeyWordKind(tid, false);
     _mpnstrtidWord.Add(pooledName, kind);
 }
Beispiel #8
0
 // REVIEW: Consider adding a method to NormStr.Pool that accepts a DvText, rather than having DvText call into the pool.
 /// <summary>
 /// Adds the characters from _ichMin up to IchLim of the outer buffer to <paramref name="pool"/>.
 /// The value must not be NA.
 /// </summary>
 public NormStr AddToPool(NormStr.Pool pool)
 {
     Contracts.Check(!IsNA);
     Contracts.CheckValue(pool, nameof(pool));

     var pooled = pool.Add(_outerBuffer, _ichMin, IchLim);
     return pooled;
 }