/// <summary>
/// Builds a lexer over caller-supplied state: the name pool and the key word table.
/// </summary>
/// <param name="pool">The name pool to store; checked with <c>Contracts.AssertValue</c>.</param>
/// <param name="kwt">The key word table to store; checked with <c>Contracts.AssertValue</c>.</param>
public Lexer(NormStr.Pool pool, KeyWordTable kwt)
{
    // Validate both arguments before capturing either of them.
    Contracts.AssertValue(pool);
    Contracts.AssertValue(kwt);

    _kwt = kwt;
    _pool = pool;
}
/// <summary>
/// Private constructor. Creates a new string pool and a key word table over it, calls
/// <c>InitKeyWordTable</c>, and finally creates the lexer from the same pool and table.
/// </summary>
private LambdaParser()
{
    var pool = new NormStr.Pool();
    var keyWords = new KeyWordTable(pool);

    // Publish both to the fields before InitKeyWordTable runs, exactly as before.
    _pool = pool;
    _kwt = keyWords;
    InitKeyWordTable();

    // The lexer is created last, after the key word table initialization call.
    _lex = new Lexer(pool, keyWords);
}
/// <summary>
/// Create a <see cref="ModelSaveContext"/> that saves into a repository entry, for implementors
/// of <see cref="ICanSaveModel"/>.
/// </summary>
/// <param name="rep">The repository writer; must not be null. Its exception context is used for the remaining checks.</param>
/// <param name="dir">The directory for the entry; may be null.</param>
/// <param name="name">The entry name; must be non-empty.</param>
internal ModelSaveContext(RepositoryWriter rep, string dir, string name)
{
    Contracts.CheckValue(rep, nameof(rep));
    _ectx = rep.ExceptionContext;
    _ectx.CheckValueOrNull(dir);
    _ectx.CheckNonEmpty(name, nameof(name));

    Repository = rep;
    Directory = dir;
    Strings = new NormStr.Pool();

    _ent = rep.CreateEntry(dir, name);
    try
    {
        // leaveOpen: the writer is created so that it does not close the entry's stream.
        Writer = new BinaryWriter(_ent.Stream, Encoding.UTF8, leaveOpen: true);
        try
        {
            ModelHeader.BeginWrite(Writer, out FpMin, out Header);
        }
        catch
        {
            // Beginning the header failed: release the writer, then let the outer handler run.
            Writer.Dispose();
            throw;
        }
    }
    catch
    {
        // Any failure after the entry was created must release the entry.
        _ent.Dispose();
        throw;
    }
}
/// <summary>
/// Creates a key word table over the given string pool, with empty word and punctuator maps.
/// </summary>
/// <param name="pool">The string pool to store; checked with <c>Contracts.AssertValue</c>.</param>
public KeyWordTable(NormStr.Pool pool)
{
    Contracts.AssertValue(pool);

    // Both maps are keyed by NormStr and start out empty.
    _mpnstrtidPunc = new Dictionary<NormStr, TokKind>();
    _mpnstrtidWord = new Dictionary<NormStr, KeyWordKind>();
    _pool = pool;
}
/// <summary>
/// Looks up this value's character range in <paramref name="pool"/> via <c>pool.Get</c>.
/// </summary>
/// <param name="pool">The pool to query; must not be null.</param>
/// <returns>
/// <c>null</c> when this value is NA; otherwise whatever <c>pool.Get</c> returns for the range
/// [<c>_ichMin</c>, <c>IchLim</c>) of the underlying buffer.
/// </returns>
public NormStr FindInPool(NormStr.Pool pool)
{
    Contracts.CheckValue(pool, nameof(pool));

    // An NA value is never looked up.
    return IsNA ? null : pool.Get(_outerBuffer, _ichMin, IchLim);
}
/// <summary>
/// Create a <see cref="ModelSaveContext"/> that saves to a single stream, for implementors of
/// <see cref="ICanSaveInBinaryFormat"/>.
/// </summary>
/// <param name="writer">The writer to save through; must not be null.</param>
/// <param name="ectx">Optional exception context used for validation; may be null.</param>
internal ModelSaveContext(BinaryWriter writer, IExceptionContext ectx = null)
{
    Contracts.AssertValueOrNull(ectx);
    _ectx = ectx;

    // NOTE(review): ectx may be null here, so CheckValue presumably tolerates a null receiver
    // (e.g. as an extension method) - confirm.
    ectx.CheckValue(writer, nameof(writer));

    // Single-stream mode: there is no repository, directory or entry.
    _ent = null;
    Directory = null;
    Repository = null;

    Strings = new NormStr.Pool();
    Writer = writer;
    ModelHeader.BeginWrite(Writer, out FpMin, out Header);
}
/// <summary>
/// Deserialization constructor. After the base class has loaded its part from
/// <paramref name="ctx"/>, reads the stopword pool from the "Stopwords" sub-model.
/// </summary>
/// <param name="host">The host passed to the base constructor.</param>
/// <param name="ctx">The load context of this transform; the stopwords live in a sub-model of it.</param>
/// <param name="input">The input data view passed to the base constructor.</param>
/// <remarks>
/// Throws the host's decode exception when the sub-model is missing or its content is
/// inconsistent (non-positive count, duplicate strings, or an empty string in the pool).
/// </remarks>
private CustomStopWordsRemoverTransform(IHost host, ModelLoadContext ctx, IDataView input)
    : base(host, ctx, input, TestIsTextVector)
{
    Host.AssertValue(ctx);

    using (var ch = Host.Start("Deserialization"))
    {
        // *** Binary format ***
        // <base>
        ch.AssertNonEmpty(Infos);

        const string dir = "Stopwords";
        NormStr.Pool stopwrods = null;
        bool res = ctx.TryProcessSubModel(dir,
            c =>
            {
                Host.CheckValue(c, nameof(ctx));
                c.CheckAtModel(GetStopwrodsManagerVersionInfo());

                // The payload belongs to the sub-model, so it must be read through the
                // sub-model context 'c' (the one just validated and version-checked),
                // not through the captured outer 'ctx'.

                // *** Binary format ***
                // int: number of stopwords
                // int[]: stopwords string ids
                int cstr = c.Reader.ReadInt32();
                Host.CheckDecode(cstr > 0);

                stopwrods = new NormStr.Pool();
                for (int istr = 0; istr < cstr; istr++)
                {
                    var nstr = stopwrods.Add(c.LoadString());
                    // Ids are expected to be handed out in insertion order; a mismatch
                    // means a string was already present in the pool.
                    Host.CheckDecode(nstr.Id == istr);
                }

                // All stopwords are distinct.
                Host.CheckDecode(stopwrods.Count == cstr);
                // The deserialized pool should not have the empty string.
                Host.CheckDecode(stopwrods.Get("") == null);
            });

        // A missing sub-model is a decode failure.
        if (!res)
        {
            throw Host.ExceptDecode();
        }

        _stopWordsMap = stopwrods;
        ch.Done();
    }

    Metadata.Seal();
}
/// <summary>
/// Fills the <c>StopWords</c> slot for <paramref name="lang"/> from its resource stream if the
/// slot is still null. Blank and whitespace-only lines of the resource are skipped.
/// </summary>
/// <param name="lang">The language whose stop word list should be present; used as an index into <c>StopWords</c>.</param>
private static void AddResourceIfNotPresent(Language lang)
{
    int index = (int)lang;
    Contracts.Assert(0 <= index && index < Utils.Size(StopWords));

    // Already loaded: nothing to do.
    if (StopWords[index] != null)
    {
        return;
    }

    Stream resource = GetResourceFileStreamOrNull(lang);
    Contracts.Assert(resource != null);

    var words = new NormStr.Pool();
    using (StreamReader reader = new StreamReader(resource))
    {
        // One stop word per line.
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            words.Add(line);
        }
    }

    // Store the pool only if the slot is still null; otherwise the existing pool is kept.
    Interlocked.CompareExchange(ref StopWords[index], words, null);
}
/// <summary>
/// Creates an empty model: records the dimension and allocates a fresh float array for the
/// word vectors and a fresh string pool.
/// </summary>
/// <param name="dimension">The value stored in <c>Dimension</c>.</param>
public Model(int dimension)
{
    _pool = new NormStr.Pool();
    WordVectors = new BigArray<float>();
    Dimension = dimension;
}
/// <summary>
/// Builds the stopword pool from the loader arguments. Sources are tried in this order:
/// the comma-separated <c>Stopwords</c> string, the <c>Stopword</c> array, and finally a
/// text column of a data file / loader. Every stopword is lower-cased before being added.
/// </summary>
/// <param name="env">The host environment, used to load the data file when needed.</param>
/// <param name="ch">The channel used for warnings and user-argument errors.</param>
/// <param name="loaderArgs">The arguments describing where the stopwords come from.</param>
/// <param name="stopWordsMap">Receives the newly created pool of stopwords.</param>
/// <remarks>
/// A user-argument error is raised through <paramref name="ch"/> when the chosen source yields
/// no stopwords, when the stopwords column is unknown, or when it is not a text column.
/// </remarks>
private void LoadStopWords(IHostEnvironment env, IChannel ch, ArgumentsBase loaderArgs, out NormStr.Pool stopWordsMap)
{
    Contracts.AssertValue(env);
    env.AssertValue(ch);
    ch.AssertValue(loaderArgs);

    // An explicit list wins over any data file settings; tell the user the latter are unused.
    if ((!string.IsNullOrEmpty(loaderArgs.Stopwords) || Utils.Size(loaderArgs.Stopword) > 0) &&
        (!string.IsNullOrWhiteSpace(loaderArgs.DataFile) || loaderArgs.Loader != null ||
            !string.IsNullOrWhiteSpace(loaderArgs.StopwordsColumn)))
    {
        ch.Warning("Explicit stopwords list specified. Data file arguments will be ignored");
    }

    var src = default(ReadOnlyMemory<char>);
    stopWordsMap = new NormStr.Pool();
    var buffer = new StringBuilder();

    var stopwords = loaderArgs.Stopwords.AsMemory();
    stopwords = ReadOnlyMemoryUtils.TrimSpaces(stopwords);
    if (!stopwords.IsEmpty)
    {
        // Source 1: comma-separated list in a single string.
        bool warnEmpty = true;
        for (bool more = true; more;)
        {
            ReadOnlyMemory<char> stopword;
            more = ReadOnlyMemoryUtils.SplitOne(stopwords, ',', out stopword, out stopwords);
            stopword = ReadOnlyMemoryUtils.TrimSpaces(stopword);
            if (!stopword.IsEmpty)
            {
                buffer.Clear();
                ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(stopword.Span, buffer);
                stopWordsMap.Add(buffer);
            }
            else if (warnEmpty)
            {
                // Warn only once about empty items.
                ch.Warning("Empty strings ignored in 'stopwords' specification");
                warnEmpty = false;
            }
        }

        ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.Stopwords), "stopwords is empty");
    }
    else if (Utils.Size(loaderArgs.Stopword) > 0)
    {
        // Source 2: array of individual stopwords.
        bool warnEmpty = true;
        foreach (string word in loaderArgs.Stopword)
        {
            var stopword = word.AsSpan();
            stopword = stopword.Trim(' ');
            if (!stopword.IsEmpty)
            {
                buffer.Clear();
                ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(stopword, buffer);
                stopWordsMap.Add(buffer);
            }
            else if (warnEmpty)
            {
                // Warn only once about empty items.
                ch.Warning("Empty strings ignored in 'stopword' specification");
                warnEmpty = false;
            }
        }

        // Mirror the other two sources: a list made only of blank entries must not
        // silently produce an empty stopword pool.
        ch.CheckUserArg(stopWordsMap.Count > 0, nameof(loaderArgs.Stopword), "stopword is empty");
    }
    else
    {
        // Source 3: a text column read from a data file / loader.
        string srcCol = loaderArgs.StopwordsColumn;
        var loader = LoadStopwords(env, ch, loaderArgs.DataFile, loaderArgs.Loader, ref srcCol);
        int colSrc;
        if (!loader.Schema.TryGetColumnIndex(srcCol, out colSrc))
        {
            throw ch.ExceptUserArg(nameof(Arguments.StopwordsColumn), "Unknown column '{0}'", srcCol);
        }

        var typeSrc = loader.Schema[colSrc].Type;
        ch.CheckUserArg(typeSrc.IsText, nameof(Arguments.StopwordsColumn), "Must be a scalar text column");

        // Accumulate the stopwords.
        using (var cursor = loader.GetRowCursor(col => col == colSrc))
        {
            bool warnEmpty = true;
            var getter = cursor.GetGetter<ReadOnlyMemory<char>>(colSrc);
            while (cursor.MoveNext())
            {
                getter(ref src);
                if (!src.IsEmpty)
                {
                    buffer.Clear();
                    ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(src.Span, buffer);
                    stopWordsMap.Add(buffer);
                }
                else if (warnEmpty)
                {
                    // Warn only once about empty rows.
                    ch.Warning("Empty rows ignored in data file");
                    warnEmpty = false;
                }
            }
        }

        ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.DataFile), "dataFile is empty");
    }
}
/// <summary>
/// Looks up <paramref name="memory"/> in <paramref name="pool"/> via <c>pool.Get</c>.
/// </summary>
/// <param name="memory">The characters to look up.</param>
/// <param name="pool">The pool to query; must not be null.</param>
/// <returns>Whatever <c>pool.Get</c> returns for <paramref name="memory"/>.</returns>
public static NormStr FindInPool(ReadOnlyMemory<char> memory, NormStr.Pool pool)
{
    Contracts.CheckValue(pool, nameof(pool));

    NormStr found = pool.Get(memory);
    return found;
}
/// <summary>
/// The current writer position should be the end of the model blob. Records the model size, writes the string table,
/// completes and writes the header, and writes the tail.
/// </summary>
/// <param name="writer">The writer positioned at the end of the model blob; must not be null.</param>
/// <param name="fpMin">The stream position where this model starts; must be non-negative.</param>
/// <param name="header">The header to complete; its string table fields must still be zero on entry.</param>
/// <param name="pool">Optional string pool; when non-null and non-empty its strings are written.</param>
/// <param name="loaderAssemblyName">Optional loader assembly name, forwarded to <c>WriteLoaderAssemblyName</c>.</param>
public static void EndWrite(BinaryWriter writer, long fpMin, ref ModelHeader header, NormStr.Pool pool = null, string loaderAssemblyName = null)
{
    Contracts.CheckValue(writer, nameof(writer));
    Contracts.CheckParam(fpMin >= 0, nameof(fpMin));
    Contracts.CheckValueOrNull(pool);

    // Record the model size.
    EndModelCore(writer, fpMin, ref header);

    // The string table must not have been written yet.
    Contracts.Check(header.FpStringTable == 0);
    Contracts.Check(header.CbStringTable == 0);
    Contracts.Check(header.FpStringChars == 0);
    Contracts.Check(header.CbStringChars == 0);

    // Write the strings.
    if (pool != null && pool.Count > 0)
    {
        header.FpStringTable = writer.FpCur() - fpMin;

        // First pass: one 64-bit value per string, holding the running byte offset of the
        // END of that string within the character block.
        long offset = 0;
        int cv = 0;
        // REVIEW: Implement an indexer on pool!
        foreach (var ns in pool)
        {
            Contracts.Assert(ns.Id == cv);
            // Widen before multiplying so the byte count is computed in 64 bits.
            offset += (long)ns.Value.Length * sizeof(char);
            writer.Write(offset);
            cv++;
        }
        Contracts.Assert(cv == pool.Count);

        // Widen before multiplying: Count * 8 must not be evaluated as a 32-bit int.
        header.CbStringTable = (long)pool.Count * sizeof(long);
        header.FpStringChars = writer.FpCur() - fpMin;
        Contracts.Assert(header.FpStringChars == header.FpStringTable + header.CbStringTable);

        // Second pass: the characters of every string, two bytes each, back to back.
        foreach (var ns in pool)
        {
            foreach (var ch in ns.Value.Span)
            {
                writer.Write((short)ch);
            }
        }

        header.CbStringChars = writer.FpCur() - header.FpStringChars - fpMin;
        Contracts.Assert(offset == header.CbStringChars);
    }

    WriteLoaderAssemblyName(writer, fpMin, ref header, loaderAssemblyName);

    WriteHeaderAndTailCore(writer, fpMin, ref header);
}
// REVIEW: Add a method to NormStr.Pool that deals with DvText, instead of the other way around.
/// <summary>
/// Adds this value's character range to <paramref name="pool"/> via <c>pool.Add</c>.
/// </summary>
/// <param name="pool">The pool to add to; must not be null.</param>
/// <returns>Whatever <c>pool.Add</c> returns for the range [<c>_ichMin</c>, <c>IchLim</c>) of the underlying buffer.</returns>
public NormStr AddToPool(NormStr.Pool pool)
{
    // NA values cannot be added; this is checked before the pool argument.
    Contracts.Check(!IsNA);
    Contracts.CheckValue(pool, nameof(pool));

    NormStr added = pool.Add(_outerBuffer, _ichMin, IchLim);
    return added;
}