/// <summary>
/// Writes a reference to <paramref name="str"/> to <c>Writer</c>. A non-null string is
/// added to <c>Strings</c> and the id of the resulting entry is written; a null string
/// is written as -1.
/// </summary>
/// <param name="str">The string to save, or null.</param>
internal void SaveStringOrNull(string str)
{
    if (str == null)
    {
        // -1 is written in place of an id when there is no string.
        Writer.Write(-1);
        return;
    }

    var entry = Strings.Add(str);
    Writer.Write(entry.Id);
}
/// <summary>
/// Registers <paramref name="word"/> together with its vector, unless the word is
/// already present in the pool, in which case nothing is changed.
/// </summary>
/// <param name="ch">Channel used to assert that the vector has length <c>Dimension</c>.</param>
/// <param name="word">The word to register.</param>
/// <param name="wordVector">The values appended to <c>WordVectors</c> for a new word.</param>
public void AddWordVector(IChannel ch, string word, float[] wordVector)
{
    ch.Assert(wordVector.Length == Dimension);

    bool alreadyKnown = _pool.Get(word) != null;
    if (alreadyKnown)
    {
        return;
    }

    // The word and its vector values are appended together.
    _pool.Add(word);
    WordVectors.AddRange(wordVector);
}
/// <summary>
/// Deserialization constructor: after the base class has loaded its part from
/// <paramref name="ctx"/>, reads the stopword pool from the "Stopwords" sub-model.
/// Throws a decode exception when the sub-model cannot be processed or its content
/// fails validation.
/// </summary>
/// <param name="host">Host forwarded to the base constructor.</param>
/// <param name="ctx">Model load context to read from.</param>
/// <param name="input">Input data view forwarded to the base constructor.</param>
private CustomStopWordsRemoverTransform(IHost host, ModelLoadContext ctx, IDataView input)
    : base(host, ctx, input, TestIsTextVector)
{
    Host.AssertValue(ctx);

    using (var ch = Host.Start("Deserialization"))
    {
        // *** Binary format ***
        // <base>
        ch.AssertNonEmpty(Infos);

        const string subModelDir = "Stopwords";
        NormStr.Pool loadedPool = null;

        bool processed = ctx.TryProcessSubModel(subModelDir,
            c =>
            {
                Host.CheckValue(c, nameof(ctx));
                c.CheckAtModel(GetStopwrodsManagerVersionInfo());

                // *** Binary format ***
                // int: number of stopwords
                // int[]: stopwords string ids
                // NOTE(review): the count and the strings are read through the outer ctx,
                // not through the sub-model context c. Kept as is; confirm against the
                // matching save code before changing which context is read.
                int count = ctx.Reader.ReadInt32();
                Host.CheckDecode(count > 0);

                loadedPool = new NormStr.Pool();
                int expectedId = 0;
                while (expectedId < count)
                {
                    // Ids must come back in insertion order.
                    var added = loadedPool.Add(ctx.LoadString());
                    Host.CheckDecode(added.Id == expectedId);
                    expectedId++;
                }

                // All stopwords are distinct.
                Host.CheckDecode(loadedPool.Count == count);
                // The deserialized pool should not have the empty string.
                Host.CheckDecode(loadedPool.Get("") == null);
            });

        if (!processed)
        {
            throw Host.ExceptDecode();
        }

        _stopWordsMap = loadedPool;
        ch.Done();
    }

    Metadata.Seal();
}
/// <summary>
/// Makes sure the <c>StopWords</c> slot for <paramref name="lang"/> is populated.
/// When the slot is null, the language's resource stream is read line by line and
/// every line that is not null, empty or whitespace is added to a new pool.
/// </summary>
/// <param name="lang">Language whose stopword list should be available.</param>
private static void AddResourceIfNotPresent(Language lang)
{
    int slot = (int)lang;
    Contracts.Assert(0 <= slot && slot < Utils.Size(StopWords));

    if (StopWords[slot] != null)
    {
        return;
    }

    Stream resource = GetResourceFileStreamOrNull(lang);
    Contracts.Assert(resource != null);

    var pool = new NormStr.Pool();
    using (var reader = new StreamReader(resource))
    {
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            pool.Add(line);
        }
    }

    // Stores the new pool only if the slot is still null at this point.
    Interlocked.CompareExchange(ref StopWords[slot], pool, null);
}
/// <summary>
/// Builds the stopword pool described by <paramref name="loaderArgs"/>.
/// Sources are tried in this order, and only the first applicable one is used:
/// 1. <c>Stopwords</c>: a comma-separated string (each item is space-trimmed);
/// 2. <c>Stopword</c>: an array of strings (each item is space-trimmed);
/// 3. a data file / loader, reading the column named by <c>StopwordsColumn</c>.
/// Every non-empty entry is passed through <c>AddLowerCaseToStringBuilder</c> before
/// being added to the pool. Empty entries are skipped with a single warning per source.
/// </summary>
/// <param name="env">Host environment; passed to the data-file loader.</param>
/// <param name="ch">Channel used for warnings and user-argument checks.</param>
/// <param name="loaderArgs">Arguments describing where the stopwords come from.</param>
/// <param name="stopWordsMap">Receives the newly created pool of stopwords.</param>
/// <remarks>
/// A user-argument exception is raised through <paramref name="ch"/> when the chosen
/// source yields no stopwords, or when the stopwords column is unknown or not text.
/// </remarks>
private void LoadStopWords(IHostEnvironment env, IChannel ch, ArgumentsBase loaderArgs, out NormStr.Pool stopWordsMap)
{
    Contracts.AssertValue(env);
    env.AssertValue(ch);
    ch.AssertValue(loaderArgs);

    if ((!string.IsNullOrEmpty(loaderArgs.Stopwords) || Utils.Size(loaderArgs.Stopword) > 0) &&
        (!string.IsNullOrWhiteSpace(loaderArgs.DataFile) || loaderArgs.Loader != null ||
            !string.IsNullOrWhiteSpace(loaderArgs.StopwordsColumn)))
    {
        ch.Warning("Explicit stopwords list specified. Data file arguments will be ignored");
    }

    var src = default(ReadOnlyMemory<char>);
    stopWordsMap = new NormStr.Pool();
    var buffer = new StringBuilder();

    var stopwords = loaderArgs.Stopwords.AsMemory();
    stopwords = ReadOnlyMemoryUtils.TrimSpaces(stopwords);
    if (!stopwords.IsEmpty)
    {
        // Source 1: comma-separated list.
        bool warnEmpty = true;
        for (bool more = true; more;)
        {
            ReadOnlyMemory<char> stopword;
            more = ReadOnlyMemoryUtils.SplitOne(stopwords, ',', out stopword, out stopwords);
            stopword = ReadOnlyMemoryUtils.TrimSpaces(stopword);
            if (!stopword.IsEmpty)
            {
                buffer.Clear();
                ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(stopword.Span, buffer);
                stopWordsMap.Add(buffer);
            }
            else if (warnEmpty)
            {
                ch.Warning("Empty strings ignored in 'stopwords' specification");
                warnEmpty = false;
            }
        }
        ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.Stopwords), "stopwords is empty");
    }
    else if (Utils.Size(loaderArgs.Stopword) > 0)
    {
        // Source 2: explicit array of stopwords.
        bool warnEmpty = true;
        foreach (string word in loaderArgs.Stopword)
        {
            var stopword = word.AsSpan();
            stopword = stopword.Trim(' ');
            if (!stopword.IsEmpty)
            {
                buffer.Clear();
                ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(stopword, buffer);
                stopWordsMap.Add(buffer);
            }
            else if (warnEmpty)
            {
                ch.Warning("Empty strings ignored in 'stopword' specification");
                warnEmpty = false;
            }
        }

        // Reject an array made only of blank entries, as the other two sources already
        // do; otherwise an empty pool is silently returned to the caller.
        ch.CheckUserArg(stopWordsMap.Count > 0, nameof(ArgumentsBase.Stopword), "stopword is empty");
    }
    else
    {
        // Source 3: data file / loader.
        string srcCol = loaderArgs.StopwordsColumn;
        var loader = LoadStopwords(env, ch, loaderArgs.DataFile, loaderArgs.Loader, ref srcCol);
        int colSrc;
        if (!loader.Schema.TryGetColumnIndex(srcCol, out colSrc))
        {
            throw ch.ExceptUserArg(nameof(Arguments.StopwordsColumn), "Unknown column '{0}'", srcCol);
        }

        var typeSrc = loader.Schema[colSrc].Type;
        ch.CheckUserArg(typeSrc.IsText, nameof(Arguments.StopwordsColumn), "Must be a scalar text column");

        // Accumulate the stopwords.
        using (var cursor = loader.GetRowCursor(col => col == colSrc))
        {
            bool warnEmpty = true;
            var getter = cursor.GetGetter<ReadOnlyMemory<char>>(colSrc);
            while (cursor.MoveNext())
            {
                getter(ref src);
                if (!src.IsEmpty)
                {
                    buffer.Clear();
                    ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(src.Span, buffer);
                    stopWordsMap.Add(buffer);
                }
                else if (warnEmpty)
                {
                    ch.Warning("Empty rows ignored in data file");
                    warnEmpty = false;
                }
            }
        }
        ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.DataFile), "dataFile is empty");
    }
}
/// <summary>
/// Adds the characters in <paramref name="memory"/> to <paramref name="pool"/> and
/// returns the value produced by the pool's <c>Add</c>.
/// </summary>
/// <param name="memory">The characters to add.</param>
/// <param name="pool">The target pool; checked for null via <c>Contracts.CheckValue</c>.</param>
/// <returns>The result of <c>pool.Add(memory)</c>.</returns>
public static NormStr AddToPool(ReadOnlyMemory<char> memory, NormStr.Pool pool)
{
    Contracts.CheckValue(pool, nameof(pool));

    NormStr added = pool.Add(memory);
    return added;
}
/// <summary>
/// Registers <paramref name="str"/> as a keyword: the string is added to the pool and
/// the pooled entry is mapped to a <c>KeyWordKind</c> built from <paramref name="tid"/>
/// with its second constructor argument set to false.
/// </summary>
/// <param name="str">Keyword text; asserted to be non-empty.</param>
/// <param name="tid">Token kind associated with the keyword.</param>
public void AddKeyWord(string str, TokKind tid)
{
    Contracts.AssertNonEmpty(str);

    var pooled = _pool.Add(str);
    var kind = new KeyWordKind(tid, false);
    _mpnstrtidWord.Add(pooled, kind);
}
// REVIEW: Add a method to NormStr.Pool that deals with DvText instead of the other way around.
/// <summary>
/// Adds this value's character range (<c>_outerBuffer</c> from <c>_ichMin</c> to
/// <c>IchLim</c>) to <paramref name="pool"/>. Fails the <c>Contracts.Check</c> when
/// this value is NA.
/// </summary>
/// <param name="pool">The target pool; checked for null via <c>Contracts.CheckValue</c>.</param>
/// <returns>The result of the pool's <c>Add</c>.</returns>
public NormStr AddToPool(NormStr.Pool pool)
{
    Contracts.Check(!IsNA);
    Contracts.CheckValue(pool, nameof(pool));

    NormStr added = pool.Add(_outerBuffer, _ichMin, IchLim);
    return added;
}