/// <summary>
/// Loads a previously trained LDA model into the native trainer so this state can be
/// used for prediction without re-training.
/// </summary>
/// <param name="model">Pretrained model whose per-term sparse topic vectors
/// (<c>LDA_Data[termID]</c> = list of (topicId, count) pairs) are copied into the
/// native trainer's topic table.</param>
internal void InitializePretrained(LDAModel model)
{
    // Size the native-side buffers to match the pretrained model before copying data in.
    _ldaTrainer.AllocateModelMemory(model.VocabularyBuckets, model.NumberOfTopics, model.MemBlockSize, model.AliasMemBlockSize);
    Debug.Assert(model.VocabularyBuckets == model.LDA_Data.Length);

    // Copy each term's sparse topic vector into the native trainer's topic table.
    for (int termID = 0; termID < model.VocabularyBuckets; termID++)
    {
        var kvs = model.LDA_Data[termID];
        var topicId = kvs.Select(kv => kv.Key).ToArray();
        var topicProb = kvs.Select(kv => kv.Value).ToArray();
        var termTopicNum = topicId.Length;
        _ldaTrainer.SetModel(termID, topicId, topicProb, termTopicNum);
    }

    // Prepare the native prediction tables. LdaSingleBox.InitializeBeforeTest() is not
    // thread-safe (see Output), so use proper double-checked locking: the flag must be
    // re-checked INSIDE the lock, otherwise two racing callers could both pass the outer
    // check and serially run the non-idempotent native initialization twice.
    if (!_predictionPreparationDone)
    {
        lock (_preparationSyncRoot)
        {
            if (!_predictionPreparationDone)
            {
                _ldaTrainer.InitializeBeforeTest();
                _predictionPreparationDone = true;
            }
        }
    }
}
/// <summary>
/// Deserializing constructor: rebuilds the LDA state (column options, the native
/// trainer, and the per-term topic table) from a model load context.
/// </summary>
/// <param name="ectx">Exception context used for decode-time validation.</param>
/// <param name="ctx">Model load context whose reader supplies the serialized bytes
/// in exactly the binary layout documented below.</param>
public LdaState(IExceptionContext ectx, ModelLoadContext ctx)
    : this()
{
    ectx.AssertValue(ctx);

    // *** Binary format ***
    // <ColInfoEx>
    // int: vocabnum
    // long: memblocksize
    // long: aliasMemBlockSize
    // (serializing term by term, for one term)
    // int: term_id, int: topic_num, KeyValuePair<int, int>[]: termTopicVector

    // Column options are serialized first; they carry the hyperparameters needed
    // to reconstruct the native trainer below.
    InfoEx = new ColInfoEx(ectx, ctx);

    _numVocab = ctx.Reader.ReadInt32();
    ectx.CheckDecode(_numVocab > 0);

    long memBlockSize = ctx.Reader.ReadInt64();
    ectx.CheckDecode(memBlockSize > 0);

    long aliasMemBlockSize = ctx.Reader.ReadInt64();
    ectx.CheckDecode(aliasMemBlockSize > 0);

    // Recreate the native trainer with the deserialized hyperparameters.
    _ldaTrainer = new LdaSingleBox(
        InfoEx.NumTopic,
        _numVocab, /* Need to set number of vocabulary here */
        InfoEx.AlphaSum,
        InfoEx.Beta,
        InfoEx.NumIter,
        InfoEx.LikelihoodInterval,
        InfoEx.NumThread,
        InfoEx.MHStep,
        InfoEx.NumSummaryTermPerTopic,
        false,
        InfoEx.NumMaxDocToken);

    _ldaTrainer.AllocateModelMemory(_numVocab, InfoEx.NumTopic, memBlockSize, aliasMemBlockSize);

    // Read one sparse topic vector per vocabulary term: term id, number of
    // (topic, count) pairs, then the pairs themselves interleaved as int32s.
    for (int i = 0; i < _numVocab; i++)
    {
        int termID = ctx.Reader.ReadInt32();
        ectx.CheckDecode(termID >= 0);
        int termTopicNum = ctx.Reader.ReadInt32();
        ectx.CheckDecode(termTopicNum >= 0);

        int[] topicId = new int[termTopicNum];
        int[] topicProb = new int[termTopicNum];

        for (int j = 0; j < termTopicNum; j++)
        {
            topicId[j] = ctx.Reader.ReadInt32();
            topicProb[j] = ctx.Reader.ReadInt32();
        }

        //set the topic into _ldaTrainer inner topic table
        _ldaTrainer.SetModel(termID, topicId, topicProb, termTopicNum);
    }

    //do the preparation
    // No lock is taken here: the instance is still being constructed and is not
    // yet visible to other threads, so the flag check cannot race.
    if (!_predictionPreparationDone)
    {
        _ldaTrainer.InitializeBeforeTest();
        _predictionPreparationDone = true;
    }
}
/// <summary>
/// Runs prediction for a single document: converts the term-frequency vector in
/// <paramref name="src"/> into a normalized topic-distribution vector in
/// <paramref name="dst"/> (length = number of topics).
/// </summary>
/// <param name="src">Term-frequency vector for one document (dense or sparse).</param>
/// <param name="dst">Receives the topic distribution; its existing Values/Indices
/// buffers are reused when large enough to reduce allocation.</param>
/// <param name="numBurninIter">Number of burn-in iterations passed to the native sampler.</param>
/// <param name="reset">Whether the native sampler should reset its state for this document.</param>
public void Output(ref VBuffer<Double> src, ref VBuffer<Float> dst, int numBurninIter, bool reset)
{
    // Prediction for a single document.
    // LdaSingleBox.InitializeBeforeTest() is NOT thread-safe.
    // Lazily build the native prediction tables with double-checked locking: the
    // flag is re-checked inside the lock so the non-idempotent native call runs once.
    if (!_predictionPreparationDone)
    {
        lock (_preparationSyncRoot)
        {
            if (!_predictionPreparationDone)
            {
                //do some preparation for building tables in native c++
                _ldaTrainer.InitializeBeforeTest();
                _predictionPreparationDone = true;
            }
        }
    }

    int len = InfoEx.NumTopic;
    // Capture dst's existing buffers so they can be reused for the result.
    var values = dst.Values;
    var indices = dst.Indices;

    // Empty document: emit an all-zero (empty sparse) topic vector.
    if (src.Count == 0)
    {
        dst = new VBuffer<Float>(len, 0, values, indices);
        return;
    }

    // Make sure all the frequencies are valid and truncate if the sum gets too large.
    int docSize = 0;
    int termNum = 0;
    for (int i = 0; i < src.Count; i++)
    {
        int termFreq = GetFrequency(src.Values[i]);
        if (termFreq < 0)
        {
            // REVIEW: Should this log a warning message? And what should it produce?
            // It currently produces a vbuffer of all NA values.
            // REVIEW: Need a utility method to do this...
            // Invalid frequency (e.g. non-integral value): the whole output becomes NaN.
            if (Utils.Size(values) < len)
            {
                values = new Float[len];
            }
            for (int k = 0; k < len; k++)
            {
                values[k] = Float.NaN;
            }
            dst = new VBuffer<Float>(len, values, indices);
            return;
        }

        // Stop accumulating terms once adding this frequency would exceed the
        // per-document token cap; terms beyond this point are ignored.
        if (docSize >= InfoEx.NumMaxDocToken - termFreq)
        {
            break;
        }

        docSize += termFreq;
        termNum++;
    }

    // REVIEW: Too much memory allocation here on each prediction.
    List<KeyValuePair<int, float>> retTopics;
    if (src.IsDense)
    {
        retTopics = _ldaTrainer.TestDocDense(src.Values, termNum, numBurninIter, reset);
    }
    else
    {
        // Sparse input: pass only the populated prefix of the index/value buffers.
        retTopics = _ldaTrainer.TestDoc(src.Indices.Take(src.Count).ToArray(), src.Values.Take(src.Count).ToArray(), termNum, numBurninIter, reset);
    }

    int count = retTopics.Count;
    Contracts.Assert(count <= len);
    // Grow the reused buffers only when too small. Indices are only needed when
    // the result is sparse (count < len).
    if (Utils.Size(values) < count)
    {
        values = new Float[count];
    }
    if (count < len && Utils.Size(indices) < count)
    {
        indices = new int[count];
    }

    // Copy the native result into the output buffers, accumulating the sum of
    // topic weights so the vector can be normalized to a distribution below.
    double normalizer = 0;
    for (int i = 0; i < count; i++)
    {
        int index = retTopics[i].Key;
        Float value = retTopics[i].Value;
        Contracts.Assert(value >= 0);
        Contracts.Assert(0 <= index && index < len);
        if (count < len)
        {
            // Sparse result: native topic ids must arrive in strictly increasing order.
            Contracts.Assert(i == 0 || indices[i - 1] < index);
            indices[i] = index;
        }
        else
        {
            // Dense result: topic ids are expected to be exactly 0..len-1 in order.
            Contracts.Assert(index == i);
        }

        values[i] = value;
        normalizer += value;
    }

    // Normalize to sum to 1 (skipped when all weights are zero to avoid divide-by-zero).
    if (normalizer > 0)
    {
        for (int i = 0; i < count; i++)
        {
            values[i] = (Float)(values[i] / normalizer);
        }
    }

    dst = new VBuffer<Float>(len, count, values, indices);
}