예제 #1
0
 /// <summary>
 /// Builds a lexer on top of a caller-supplied name pool and key word table.
 /// Both arguments are required and are validated with assertions only.
 /// </summary>
 /// <param name="pool">The normalized-string pool the lexer keeps a reference to.</param>
 /// <param name="kwt">The key word table the lexer keeps a reference to.</param>
 public Lexer(NormStr.Pool pool, KeyWordTable kwt)
 {
     Contracts.AssertValue(pool);
     _pool = pool;

     Contracts.AssertValue(kwt);
     _kwt = kwt;
 }
예제 #2
0
 /// <summary>
 /// Private constructor: creates a fresh string pool, a key word table bound to that pool,
 /// populates the table via <c>InitKeyWordTable</c>, and finally creates the lexer over both.
 /// </summary>
 private LambdaParser()
 {
     var pool = new NormStr.Pool();
     var keyWords = new KeyWordTable(pool);

     _pool = pool;
     _kwt = keyWords;

     // The fields must be assigned before this call and before the lexer is created.
     InitKeyWordTable();
     _lex = new Lexer(_pool, _kwt);
 }
예제 #3
0
        /// <summary>
        /// Create a <see cref="ModelSaveContext"/> supporting saving to a repository, for implementors of <see cref="ICanSaveModel"/>.
        /// Creates a repository entry for <paramref name="dir"/>/<paramref name="name"/>, opens a UTF-8
        /// <see cref="BinaryWriter"/> over the entry's stream and begins writing the model header.
        /// </summary>
        /// <param name="rep">The repository writer to save into. Must not be null.</param>
        /// <param name="dir">The directory of the entry within the repository. May be null.</param>
        /// <param name="name">The name of the entry within the repository. Must be non-empty.</param>
        internal ModelSaveContext(RepositoryWriter rep, string dir, string name)
        {
            Contracts.CheckValue(rep, nameof(rep));
            Repository = rep;
            // From here on, validation goes through the repository's exception context.
            _ectx      = rep.ExceptionContext;

            _ectx.CheckValueOrNull(dir);
            _ectx.CheckNonEmpty(name, nameof(name));

            Directory = dir;
            Strings   = new NormStr.Pool();

            _ent = rep.CreateEntry(dir, name);
            // Nested try/catch: if anything fails after the entry is created, the writer (when it was
            // successfully created) is disposed first, then the entry, and the exception is rethrown.
            try
            {
                // leaveOpen: disposing the writer does not close the entry's stream.
                Writer = new BinaryWriter(_ent.Stream, Encoding.UTF8, leaveOpen: true);
                try
                {
                    ModelHeader.BeginWrite(Writer, out FpMin, out Header);
                }
                catch
                {
                    Writer.Dispose();
                    throw;
                }
            }
            catch
            {
                _ent.Dispose();
                throw;
            }
        }
예제 #4
0
        /// <summary>
        /// Creates a key word table bound to the given string pool, with empty
        /// word and punctuator maps.
        /// </summary>
        /// <param name="pool">The normalized-string pool the table keeps a reference to.</param>
        public KeyWordTable(NormStr.Pool pool)
        {
            Contracts.AssertValue(pool);

            _mpnstrtidPunc = new Dictionary<NormStr, TokKind>();
            _mpnstrtidWord = new Dictionary<NormStr, KeyWordKind>();
            _pool = pool;
        }
예제 #5
0
 /// <summary>
 /// Queries <paramref name="pool"/> for this value's character range via <c>pool.Get</c>.
 /// </summary>
 /// <param name="pool">The pool to query. Must not be null.</param>
 /// <returns>
 /// <c>null</c> when this value is NA; otherwise the result of <c>pool.Get</c> for the
 /// range [<c>_ichMin</c>, <c>IchLim</c>) of the underlying buffer.
 /// </returns>
 public NormStr FindInPool(NormStr.Pool pool)
 {
     Contracts.CheckValue(pool, nameof(pool));
     return IsNA ? null : pool.Get(_outerBuffer, _ichMin, IchLim);
 }
예제 #6
0
        /// <summary>
        /// Create a <see cref="ModelSaveContext"/> that saves to a single stream through the supplied
        /// writer, for implementors of <see cref="ICanSaveInBinaryFormat"/>. The model header is begun immediately.
        /// </summary>
        /// <param name="writer">The writer to save through. Must not be null.</param>
        /// <param name="ectx">Optional exception context used for validation; may be null.</param>
        internal ModelSaveContext(BinaryWriter writer, IExceptionContext ectx = null)
        {
            Contracts.AssertValueOrNull(ectx);
            _ectx = ectx;
            _ectx.CheckValue(writer, nameof(writer));

            // This context has no repository, directory or entry behind it.
            _ent = null;
            Directory = null;
            Repository = null;

            Writer = writer;
            Strings = new NormStr.Pool();
            ModelHeader.BeginWrite(Writer, out FpMin, out Header);
        }
예제 #7
0
        /// <summary>
        /// Deserialization constructor. After the base class has loaded its part, reads the
        /// "Stopwords" sub-model into a fresh <see cref="NormStr.Pool"/> and stores it as the stop word map.
        /// Throws a decode exception when the sub-model is missing or its contents are inconsistent.
        /// </summary>
        private CustomStopWordsRemoverTransform(IHost host, ModelLoadContext ctx, IDataView input)
            : base(host, ctx, input, TestIsTextVector)
        {
            Host.AssertValue(ctx);

            using (var channel = Host.Start("Deserialization"))
            {
                // *** Binary format ***
                // <base>
                channel.AssertNonEmpty(Infos);

                NormStr.Pool loadedWords = null;
                bool subModelFound = ctx.TryProcessSubModel("Stopwords",
                    c =>
                    {
                        Host.CheckValue(c, nameof(ctx));
                        c.CheckAtModel(GetStopwrodsManagerVersionInfo());

                        // *** Binary format ***
                        // int: number of stopwords
                        // int[]: stopwords string ids
                        // NOTE(review): the reads below go through the outer 'ctx', not the sub-model
                        // context 'c'. Presumably this mirrors the save side; confirm before changing.
                        int wordCount = ctx.Reader.ReadInt32();
                        Host.CheckDecode(wordCount > 0);

                        loadedWords = new NormStr.Pool();
                        for (int expectedId = 0; expectedId < wordCount; expectedId++)
                        {
                            // Ids must be handed out sequentially, in load order.
                            var added = loadedWords.Add(ctx.LoadString());
                            Host.CheckDecode(added.Id == expectedId);
                        }

                        // All stopwords are distinct.
                        Host.CheckDecode(loadedWords.Count == wordCount);
                        // The deserialized pool should not have the empty string.
                        Host.CheckDecode(loadedWords.Get("") == null);
                    });

                if (!subModelFound)
                {
                    throw Host.ExceptDecode();
                }

                _stopWordsMap = loadedWords;
                channel.Done();
            }

            Metadata.Seal();
        }
예제 #8
0
        /// <summary>
        /// Ensures that the <c>StopWords</c> slot for <paramref name="lang"/> is populated.
        /// When the slot is still null, reads the language's resource stream line by line, adds every
        /// line that is not null/whitespace to a new <see cref="NormStr.Pool"/>, and stores the pool in the slot.
        /// </summary>
        /// <param name="lang">The language whose stop words are needed; its integer value indexes <c>StopWords</c>.</param>
        private static void AddResourceIfNotPresent(Language lang)
        {
            int slot = (int)lang;
            // Short-circuit '&&' is the idiomatic operator for a boolean range check.
            Contracts.Assert(0 <= slot && slot < Utils.Size(StopWords));

            if (StopWords[slot] != null)
            {
                return;
            }

            Stream stopWordsStream = GetResourceFileStreamOrNull(lang);
            Contracts.Assert(stopWordsStream != null);
            var stopWordsList = new NormStr.Pool();
            using (StreamReader reader = new StreamReader(stopWordsStream))
            {
                string stopWord;
                while ((stopWord = reader.ReadLine()) != null)
                {
                    if (!string.IsNullOrWhiteSpace(stopWord))
                    {
                        stopWordsList.Add(stopWord);
                    }
                }
            }

            // Store the new pool only if the slot is still null; a pool already stored there is kept.
            Interlocked.CompareExchange(ref StopWords[slot], stopWordsList, null);
        }
 /// <summary>
 /// Creates a model that records the given dimension and starts with an empty
 /// vector store and an empty string pool.
 /// </summary>
 /// <param name="dimension">The value stored in <c>Dimension</c>.</param>
 public Model(int dimension)
 {
     _pool = new NormStr.Pool();
     WordVectors = new BigArray<float>();
     Dimension = dimension;
 }
예제 #10
0
        /// <summary>
        /// Builds the stop word pool from the first available source, in this order:
        /// (1) the comma-separated <c>loaderArgs.Stopwords</c> string,
        /// (2) the <c>loaderArgs.Stopword</c> collection,
        /// (3) a text column read through the loader returned by <c>LoadStopwords</c>.
        /// Every accepted word is passed through <c>AddLowerCaseToStringBuilder</c> before being added to the pool.
        /// </summary>
        /// <param name="env">The host environment; forwarded to <c>LoadStopwords</c>.</param>
        /// <param name="ch">The channel used for warnings and user-argument checks.</param>
        /// <param name="loaderArgs">The arguments describing where the stop words come from.</param>
        /// <param name="stopWordsMap">Receives the newly created pool of stop words.</param>
        private void LoadStopWords(IHostEnvironment env, IChannel ch, ArgumentsBase loaderArgs, out NormStr.Pool stopWordsMap)
        {
            Contracts.AssertValue(env);
            env.AssertValue(ch);
            ch.AssertValue(loaderArgs);

            // Warn when an explicit list is given together with any data-file related argument:
            // the explicit list takes precedence below.
            if ((!string.IsNullOrEmpty(loaderArgs.Stopwords) || Utils.Size(loaderArgs.Stopword) > 0) &&
                (!string.IsNullOrWhiteSpace(loaderArgs.DataFile) || loaderArgs.Loader != null ||
                 !string.IsNullOrWhiteSpace(loaderArgs.StopwordsColumn)))
            {
                ch.Warning("Explicit stopwords list specified. Data file arguments will be ignored");
            }

            // Scratch value for the row getter; only used in the data-file branch.
            var src = default(ReadOnlyMemory <char>);

            stopWordsMap = new NormStr.Pool();
            // Reused for every word; cleared before each use.
            var buffer = new StringBuilder();

            var stopwords = loaderArgs.Stopwords.AsMemory();

            stopwords = ReadOnlyMemoryUtils.TrimSpaces(stopwords);
            if (!stopwords.IsEmpty)
            {
                // Source 1: comma-separated string. Empty items are skipped, with a single warning.
                bool warnEmpty = true;
                for (bool more = true; more;)
                {
                    ReadOnlyMemory <char> stopword;
                    // Splits off the next item; 'stopwords' is replaced by the remainder.
                    more     = ReadOnlyMemoryUtils.SplitOne(stopwords, ',', out stopword, out stopwords);
                    stopword = ReadOnlyMemoryUtils.TrimSpaces(stopword);
                    if (!stopword.IsEmpty)
                    {
                        buffer.Clear();
                        ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(stopword.Span, buffer);
                        stopWordsMap.Add(buffer);
                    }
                    else if (warnEmpty)
                    {
                        ch.Warning("Empty strings ignored in 'stopwords' specification");
                        warnEmpty = false;
                    }
                }
                ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.Stopwords), "stopwords is empty");
            }
            else if (Utils.Size(loaderArgs.Stopword) > 0)
            {
                // Source 2: collection of words. Items that are empty after trimming spaces are skipped,
                // with a single warning.
                // NOTE(review): unlike the other two branches, this one does not check that the pool
                // ended up non-empty — confirm whether that is intentional.
                bool warnEmpty = true;
                foreach (string word in loaderArgs.Stopword)
                {
                    var stopword = word.AsSpan();
                    stopword = stopword.Trim(' ');
                    if (!stopword.IsEmpty)
                    {
                        buffer.Clear();
                        ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(stopword, buffer);
                        stopWordsMap.Add(buffer);
                    }
                    else if (warnEmpty)
                    {
                        ch.Warning("Empty strings ignored in 'stopword' specification");
                        warnEmpty = false;
                    }
                }
            }
            else
            {
                // Source 3: a text column of a data view. LoadStopwords may update srcCol (passed by ref).
                string srcCol = loaderArgs.StopwordsColumn;
                var    loader = LoadStopwords(env, ch, loaderArgs.DataFile, loaderArgs.Loader, ref srcCol);
                int    colSrc;
                if (!loader.Schema.TryGetColumnIndex(srcCol, out colSrc))
                {
                    throw ch.ExceptUserArg(nameof(Arguments.StopwordsColumn), "Unknown column '{0}'", srcCol);
                }
                var typeSrc = loader.Schema[colSrc].Type;
                ch.CheckUserArg(typeSrc.IsText, nameof(Arguments.StopwordsColumn), "Must be a scalar text column");

                // Accumulate the stopwords.
                // Only the source column is requested from the cursor. Rows are not trimmed here;
                // empty rows are skipped, with a single warning.
                using (var cursor = loader.GetRowCursor(col => col == colSrc))
                {
                    bool warnEmpty = true;
                    var  getter    = cursor.GetGetter <ReadOnlyMemory <char> >(colSrc);
                    while (cursor.MoveNext())
                    {
                        getter(ref src);
                        if (!src.IsEmpty)
                        {
                            buffer.Clear();
                            ReadOnlyMemoryUtils.AddLowerCaseToStringBuilder(src.Span, buffer);
                            stopWordsMap.Add(buffer);
                        }
                        else if (warnEmpty)
                        {
                            ch.Warning("Empty rows ignored in data file");
                            warnEmpty = false;
                        }
                    }
                }
                ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.DataFile), "dataFile is empty");
            }
        }
 /// <summary>
 /// Queries <paramref name="pool"/> for <paramref name="memory"/> via <c>pool.Get</c>.
 /// </summary>
 /// <param name="memory">The characters to look up.</param>
 /// <param name="pool">The pool to query. Must not be null.</param>
 /// <returns>Whatever <c>pool.Get</c> returns for <paramref name="memory"/>.</returns>
 public static NormStr FindInPool(ReadOnlyMemory<char> memory, NormStr.Pool pool)
 {
     Contracts.CheckValue(pool, nameof(pool));

     NormStr found = pool.Get(memory);
     return found;
 }
        /// <summary>
        /// The current writer position should be the end of the model blob. Records the model size, writes the string table,
        /// completes and writes the header, and writes the tail.
        /// </summary>
        /// <remarks>
        /// When <paramref name="pool"/> is non-empty, the string table is written as one <see cref="long"/> per string
        /// holding the cumulative end offset (in bytes) of that string's characters, followed by the characters
        /// of all strings, each written as a 16-bit value.
        /// </remarks>
        /// <param name="writer">The writer positioned at the end of the model blob. Must not be null.</param>
        /// <param name="fpMin">The non-negative base position that header offsets are relative to.</param>
        /// <param name="header">The header to complete; its string table fields must still be zero on entry.</param>
        /// <param name="pool">The optional string pool to serialize; nothing is written for a null or empty pool.</param>
        /// <param name="loaderAssemblyName">Optional loader assembly name, forwarded to <c>WriteLoaderAssemblyName</c>.</param>
        public static void EndWrite(BinaryWriter writer, long fpMin, ref ModelHeader header, NormStr.Pool pool = null, string loaderAssemblyName = null)
        {
            Contracts.CheckValue(writer, nameof(writer));
            Contracts.CheckParam(fpMin >= 0, nameof(fpMin));
            Contracts.CheckValueOrNull(pool);

            // Record the model size.
            EndModelCore(writer, fpMin, ref header);

            Contracts.Check(header.FpStringTable == 0);
            Contracts.Check(header.CbStringTable == 0);
            Contracts.Check(header.FpStringChars == 0);
            Contracts.Check(header.CbStringChars == 0);

            // Write the strings.
            if (pool != null && pool.Count > 0)
            {
                header.FpStringTable = writer.FpCur() - fpMin;
                long offset = 0;
                int  cv     = 0;
                // REVIEW: Implement an indexer on pool!
                foreach (var ns in pool)
                {
                    Contracts.Assert(ns.Id == cv);
                    // Widen before multiplying so the byte count cannot wrap in 32-bit arithmetic.
                    offset += (long)ns.Value.Length * sizeof(char);
                    writer.Write(offset);
                    cv++;
                }
                Contracts.Assert(cv == pool.Count);
                // Same here: compute the table size in 64-bit arithmetic.
                header.CbStringTable = (long)pool.Count * sizeof(long);
                header.FpStringChars = writer.FpCur() - fpMin;
                Contracts.Assert(header.FpStringChars == header.FpStringTable + header.CbStringTable);
                foreach (var ns in pool)
                {
                    foreach (var ch in ns.Value.Span)
                    {
                        writer.Write((short)ch);
                    }
                }
                header.CbStringChars = writer.FpCur() - header.FpStringChars - fpMin;
                // The last cumulative offset must equal the number of character bytes written.
                Contracts.Assert(offset == header.CbStringChars);
            }

            WriteLoaderAssemblyName(writer, fpMin, ref header, loaderAssemblyName);

            WriteHeaderAndTailCore(writer, fpMin, ref header);
        }
예제 #13
0
 // REVIEW: Add a method to NormStr.Pool that deals with DvText instead of the other way around.
 /// <summary>
 /// Adds this value's character range to <paramref name="pool"/> via <c>pool.Add</c>.
 /// The value must not be NA.
 /// </summary>
 /// <param name="pool">The pool to add to. Must not be null.</param>
 /// <returns>
 /// The result of <c>pool.Add</c> for the range [<c>_ichMin</c>, <c>IchLim</c>) of the underlying buffer.
 /// </returns>
 public NormStr AddToPool(NormStr.Pool pool)
 {
     // The NA check deliberately runs before the null check on the pool.
     Contracts.Check(!IsNA);
     Contracts.CheckValue(pool, nameof(pool));

     NormStr added = pool.Add(_outerBuffer, _ichMin, IchLim);
     return added;
 }