/// <summary>
/// Serializes this LDA state to <paramref name="ctx"/>: a header followed by one record per term,
/// and optionally a text summary stream.
/// </summary>
/// <param name="ctx">Save context whose binary writer receives the model.</param>
/// <param name="saveText">When true, the text summary stream is also written.</param>
/// <param name="mapping">Passed to GetTopicSummaryWriter to build the text summary writer.</param>
public void Save(ModelSaveContext ctx, bool saveText, VBuffer <ReadOnlyMemory <char> > mapping)
            {
                Contracts.AssertValue(ctx);

                // Both sizes are reported by the trainer and stored in the header below.
                long modelBlockSize;
                long aliasBlockSize;
                _ldaTrainer.GetModelStat(out modelBlockSize, out aliasBlockSize);

                // *** Binary format ***
                // <ColInfoEx>
                // int: vocabnum
                // long: memblocksize
                // long: aliasMemBlockSize
                // (serializing term by term, for one term)
                // int: term_id, int: topic_num, KeyValuePair<int, int>[]: termTopicVector

                InfoEx.Save(ctx);
                ctx.Writer.Write(_ldaTrainer.NumVocab);
                ctx.Writer.Write(modelBlockSize);
                ctx.Writer.Write(aliasBlockSize);

                // One record per term id, in increasing id order.
                for (int termId = 0; termId < _ldaTrainer.NumVocab; termId++)
                {
                    KeyValuePair <int, int>[] termTopics = _ldaTrainer.GetModel(termId);

                    ctx.Writer.Write(termId);
                    ctx.Writer.Write(termTopics.Length);

                    for (int k = 0; k < termTopics.Length; k++)
                    {
                        ctx.Writer.Write(termTopics[k].Key);
                        ctx.Writer.Write(termTopics[k].Value);
                    }
                }

                // The summary writer is built unconditionally, but only used when text output is requested.
                var summaryWriter = GetTopicSummaryWriter(mapping);

                if (saveText)
                {
                    ctx.SaveTextStream(WordTopicModelFilename, summaryWriter);
                }
            }
Exemple #2
0
        /// <summary>
        /// Writes the transform to <paramref name="ctx"/>: the float size, the base state, and then
        /// per-column data. Columns that have a slot-names type also get an "-ngrams.txt" text stream.
        /// </summary>
        /// <param name="ctx">Save context; must be non-null.</param>
        public override void Save(ModelSaveContext ctx)
        {
            Host.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel();
            ctx.SetVersionInfo(GetVersionInfo());

            // *** Binary format ***
            // int: sizeof(Float)
            // <base>
            // for each added column
            //   ColInfoEx
            //   the ngram SequencePool
            //   the ngram inverse document frequencies

            ctx.Writer.Write(sizeof(Float));
            SaveBase(ctx);

            // Single buffer shared across columns; GetSlotNames refills it for each column that needs it.
            var slotNames = default(VBuffer <ReadOnlyMemory <char> >);

            for (int col = 0; col < _exes.Length; col++)
            {
                _exes[col].Save(ctx);
                _ngramMaps[col].Save(ctx.Writer);
                ctx.Writer.WriteDoubleArray(_invDocFreqs[col]);

                // Columns without a slot-names type get no text listing.
                if (_slotNamesTypes[col] == null)
                {
                    continue;
                }

                GetSlotNames(col, ref slotNames);
                Host.Assert(_ngramMaps[col].Count == slotNames.GetValues().Length);
                Host.Assert(slotNames.IsDense);

                string streamName = string.Format("{0}-ngrams.txt", Infos[col].Name);
                ctx.SaveTextStream(streamName, writer =>
                {
                    // The values are fetched inside the callback, from the captured buffer.
                    var names = slotNames.GetValues();
                    writer.WriteLine("# Number of Ngrams terms = {0}", names.Length);
                    for (int slot = 0; slot < names.Length; slot++)
                    {
                        writer.WriteLine("{0}\t{1}", slot, names[slot]);
                    }
                });
            }
        }
Exemple #3
0
        /// <summary>
        /// Writes the base state to <paramref name="ctx"/>, then saves the stopword list under a
        /// "Stopwords" sub-model together with a "Stopwords.txt" text listing.
        /// </summary>
        /// <param name="ctx">Save context; must be non-null.</param>
        public override void Save(ModelSaveContext ctx)
        {
            Host.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel();
            ctx.SetVersionInfo(GetVersionInfo());

            // *** Binary format ***
            // <base>
            SaveBase(ctx);

            ctx.SaveSubModel("Stopwords", c =>
            {
                Host.CheckValue(c, nameof(ctx));
                c.CheckAtModel();
                c.SetVersionInfo(GetStopwrodsManagerVersionInfo());

                // *** Binary format ***
                // int: number of stopwords
                // int[]: stopwords string ids

                // NOTE(review): everything below writes through the outer ctx, not the sub-model
                // context c. That looks unintended, but switching to c would change where the data
                // lands; confirm against the matching load path before changing it.
                Host.Assert(_stopWordsMap.Count > 0);
                ctx.Writer.Write(_stopWordsMap.Count);

                // Entries are expected to enumerate in id order, starting from zero.
                int expectedId = 0;
                foreach (var entry in _stopWordsMap)
                {
                    Host.Assert(entry.Id == expectedId);
                    ctx.SaveString(entry.Value);
                    expectedId++;
                }

                ctx.SaveTextStream("Stopwords.txt", output =>
                {
                    foreach (var entry in _stopWordsMap)
                    {
                        output.WriteLine("{0}\t{1}", entry.Id, entry.Value);
                    }
                });
            });
        }
        /// <summary>
        /// Writes <paramref name="values"/> to <paramref name="ctx"/> in codec-encoded binary form and
        /// also emits a "Terms.txt" text stream listing the non-empty entries.
        /// </summary>
        /// <param name="ch">Channel used for the argument check and the assertions.</param>
        /// <param name="ctx">Save context; must be non-null.</param>
        /// <param name="factory">Factory that supplies and serializes the text-vector codec.</param>
        /// <param name="values">The text vector to save.</param>
        private static void Save(IChannel ch, ModelSaveContext ctx, CodecFactory factory, ref VBuffer <ReadOnlyMemory <char> > values)
        {
            Contracts.AssertValue(ch);
            ch.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel();
            ctx.SetVersionInfo(GetVersionInfo());

            // *** Binary format ***
            // Codec parameterization: A codec parameterization that should be a ReadOnlyMemory codec
            // int: n, the number of bytes used to write the values
            // byte[n]: As encoded using the codec

            // Get the codec from the factory
            IValueCodec codec;
            var         result = factory.TryGetCodec(new VectorType(TextType.Instance), out codec);

            ch.Assert(result);
            ch.Assert(codec.Type.IsVector);
            ch.Assert(codec.Type.VectorSize == 0);
            ch.Assert(codec.Type.ItemType.RawType == typeof(ReadOnlyMemory <char>));
            IValueCodec <VBuffer <ReadOnlyMemory <char> > > textCodec = (IValueCodec <VBuffer <ReadOnlyMemory <char> > >)codec;

            factory.WriteCodec(ctx.Writer.BaseStream, codec);

            // Encode into memory first so the payload can be written as one byte array.
            using (var mem = new MemoryStream())
            {
                using (var writer = textCodec.OpenWriter(mem))
                {
                    writer.Write(ref values);
                    writer.Commit();
                }
                ctx.Writer.WriteByteArray(mem.ToArray());
            }

            // Make this resemble, more or less, the auxiliary output from the TermTransform.
            // It will differ somewhat due to the vector being possibly sparse. Empty values
            // are skipped rather than written as blank lines.

            // Copy to a local: a ref parameter cannot be captured by the lambda below.
            var v = values;

            // Scratch buffer reused across terms; grown on demand by Utils.EnsureSize.
            char[] buffer = null;
            ctx.SaveTextStream("Terms.txt",
                               writer =>
            {
                writer.WriteLine("# Number of terms = {0} of length {1}", v.Count, v.Length);
                foreach (var pair in v.Items())
                {
                    var text = pair.Value;
                    if (text.IsEmpty)
                    {
                        continue;
                    }
                    writer.Write("{0}\t", pair.Key);
                    // REVIEW: What about escaping this, *especially* for linebreaks?
                    // Do C# and .NET really have no equivalent to Python's "repr"? :(
                    Utils.EnsureSize(ref buffer, text.Length);
                    text.Span.CopyTo(buffer);

                    writer.WriteLine(buffer, 0, text.Length);
                }
            });
        }