/// <summary>
/// Loads the corpus vocabulary from <c>corpusVocabularyFileName</c> the first time it is needed
/// and keeps it in <c>corpusVocabulary</c> for later calls.
/// </summary>
/// <returns>
/// <c>true</c> when the vocabulary was already loaded or has been loaded by this call;
/// <c>false</c> when the vocabulary file does not exist.
/// </returns>
/// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
private bool EnsureCorpusVocabulary()
{
    if (this.disposed)
    {
        // The single-argument constructor interprets its argument as the object NAME, not the
        // message, so the name and the message are passed separately.
        throw new ObjectDisposedException(this.GetType().FullName, "LDA Object has already been disposed.");
    }

    if (this.corpusVocabulary != null)
    {
        return true;
    }

    if (!File.Exists(this.corpusVocabularyFileName))
    {
        StatusMessage.Write(string.Format("LDA.GetTopicAllocations: Error. Cannot find Corpus Vocabulary {0}", this.corpusVocabularyFileName));
        return false;
    }

    StatusMessage.Write("LDA.GetTopicAllocations: Loading Corpus Vocabulary");
    this.corpusVocabulary = CorpusVocabulary.NewInstance(File.ReadLines(this.corpusVocabularyFileName));
    return true;
}
/// <summary>
/// Initializes a new instance of the <see cref="LDA"/> class.
/// </summary>
/// <param name="numTopic">Number of topics; sizes the bad-topic table and the topic allocation vectors.</param>
/// <param name="modelFileName">Model file handed to Vowpal Wabbit through the <c>-i</c> argument.</param>
/// <param name="corpusVocabularyFileName">Path of the corpus vocabulary file; only stored here.</param>
/// <param name="badTopicIds">
/// Ids of the topics to flag in <c>badTopics</c>. May be <c>null</c> or empty.
/// Ids outside the range <c>[0, numTopic)</c> are ignored.
/// </param>
/// <param name="language">Language used to create the document vocabulary factory.</param>
private LDA(int numTopic, string modelFileName, string corpusVocabularyFileName, int[] badTopicIds, Language language)
{
    this.numTopics = numTopic;
    this.dvFactory = DocumentVocabularyFactory.NewInstance(language);
    this.topicAllocationsFactory = VectorFactory.NewInstance(VectorType.DenseVector, numTopic);
    this.corpusVocabularyFileName = corpusVocabularyFileName;

    StatusMessage.Write(string.Format("LDA: Initializing Vowpal Wabbit Interface with model file {0}", modelFileName));
    this.vwLDAModel = VowpalWabbitInterface.Initialize(string.Format("-i {0} -t --quiet", modelFileName));

    // A newly allocated bool[] is already all false, so no explicit clearing is needed.
    this.badTopics = new bool[this.numTopics];
    this.RecommendedCompressionType = VectorType.DenseVector; // Default

    if ((badTopicIds != null) && (badTopicIds.Length > 0))
    {
        foreach (var topicId in badTopicIds)
        {
            // Check both bounds: a negative id would otherwise index outside the array.
            if ((topicId >= 0) && (topicId < this.numTopics))
            {
                this.badTopics[topicId] = true;
            }
        }

        // Recommend the sparse representation only when it is strictly smaller than the dense one.
        var sizeOfSparseVector = VectorBase.SizeOfSparseVectors(numTopic, badTopicIds.Length);
        var sizeOfDenseVector = VectorBase.BytesPerDimension * numTopic;
        this.RecommendedCompressionType = (sizeOfSparseVector < sizeOfDenseVector)
            ? VectorType.SparseVector
            : VectorType.DenseVector;
    }
}
/// <summary>
/// Extracts the metrics of the model described by <paramref name="ldaConfig"/>, registers the model
/// through <c>modelsDb.AddModel</c>, and opens a <see cref="ModelDatabase"/> using the database name
/// that call produced.
/// </summary>
/// <param name="modelsDb">Models database that receives the new model record; its <c>serverName</c> is reused for the model database.</param>
/// <param name="ldaConfig">Configuration of the model being registered.</param>
/// <param name="modelRepositoryPath">Passed through to <c>ExtractModelMetrics</c>.</param>
/// <param name="success">Set to <c>true</c> once <c>AddModel</c> has returned; otherwise left untouched.</param>
/// <returns>The opened model database object, or <c>null</c> when no metrics could be extracted.</returns>
private static ModelDatabase AddModelParametersToModelsDb(ModelsDb modelsDb, LDAConfig ldaConfig, string modelRepositoryPath, ref bool success)
{
    var modelMetrics = ExtractModelMetrics(ref ldaConfig, modelRepositoryPath);
    if (modelMetrics == null)
    {
        return null;
    }

    string newModelDbName;
    int newModelId; // Required by AddModel's signature; not used afterwards.

    try
    {
        StatusMessage.Write("Adding metrics to Db: " + ldaConfig.ExtrinsicMetricsProcessed);
        modelsDb.AddModel(ldaConfig, modelMetrics, out newModelDbName, out newModelId);
        success = true;
    }
    catch (Exception ex)
    {
        // Log and let the caller deal with the failure.
        StatusMessage.Write("Could not add a record to the Topic models db:" + ex.ToString());
        throw;
    }

    var modelDatabase = new ModelDatabase("", modelsDb.serverName, newModelDbName, false);

    // The result of Open() is deliberately not inspected: the object is returned either way.
    modelDatabase.Open();
    return modelDatabase;
}
/// <summary>
/// Looks up the config file path under <c>Options.Config</c> in <paramref name="parameters"/> and
/// loads the LDA configuration from it.
/// </summary>
/// <param name="parameters">Command-line parameters keyed by option name.</param>
/// <returns>
/// The result of the string overload of <c>GetLdaConfig</c>, or <c>null</c> when the config
/// option is missing.
/// </returns>
private static LDAConfig GetLdaConfig(IDictionary<string, string> parameters)
{
    string configPath;
    if (parameters.TryGetValue(Options.Config, out configPath))
    {
        return GetLdaConfig(configPath);
    }

    StatusMessage.Write("Missing config parameter");
    return null;
}
/// <summary>
/// Creates a <see cref="ModelsDb"/> for the given server and database name, using the dictionary
/// returned by <c>GetMetricsDictionary</c>, and opens it.
/// </summary>
/// <param name="sqlServer">Server name passed to the <see cref="ModelsDb"/> constructor.</param>
/// <param name="modelsDbName">Database name passed to the <see cref="ModelsDb"/> constructor.</param>
/// <returns>The opened models database, or <c>null</c> when any step threw an exception.</returns>
private static ModelsDb GetModelsDb(string sqlServer, string modelsDbName)
{
    try
    {
        var metricDictionary = GetMetricsDictionary();
        var modelsDb = new ModelsDb(sqlServer, modelsDbName, metricDictionary);
        modelsDb.Open();
        return modelsDb;
    }
    catch (Exception e)
    {
        // This method only creates and opens the database, so the message describes that
        // operation and names the target to make the log actionable.
        StatusMessage.Write(string.Format("Could not open the Topic models db '{0}' on server '{1}':{2}", modelsDbName, sqlServer, e));
        return null;
    }
}
/// <summary>
/// Runs <c>vw.exe</c> (expected next to the application's base directory) to learn an LDA model
/// with the settings in <paramref name="ldaConfig"/>.
/// </summary>
/// <param name="ldaConfig">Model configuration: input/output file paths, LDA parameters and model statistics.</param>
/// <param name="copyFeaturizedDoc">
/// When <c>true</c>, the featurized documents file is first copied into the directory of
/// <c>ldaConfig.WordTopicAllocations</c>; the copy and its <c>.cache</c> file are deleted afterwards.
/// </param>
/// <returns>
/// <c>false</c> when the model file already exists (nothing is run); <c>true</c> after the command
/// has been invoked. The outcome of the command itself is not inspected.
/// </returns>
public static bool Learn(LDAConfig ldaConfig, bool copyFeaturizedDoc)
{
    if (File.Exists(ldaConfig.Model))
    {
        StatusMessage.Write("Skipping, model already exists. " + ldaConfig.Model);
        return false;
    }

    var featurizedDocFile = ldaConfig.FeaturizedDocuments;
    if (copyFeaturizedDoc)
    {
        featurizedDocFile = string.Format(
            @"{0}\{1}",
            Path.GetDirectoryName(ldaConfig.WordTopicAllocations),
            Path.GetFileName(ldaConfig.FeaturizedDocuments));
        File.Copy(ldaConfig.FeaturizedDocuments, featurizedDocFile);
    }

    StatusMessage.Write("Running VW to learn LDA...");
    var command = AppDomain.CurrentDomain.BaseDirectory + "vw.exe";

    // Value passed as -b: ceil(log2(vocabulary size)). Per the original comment this is the size
    // of the hash table used to store the topic allocations for each word.
    var hashBits = (int)Math.Ceiling(Math.Log(ldaConfig.ModelStatistics.VocabularySize, 2.0));

    // The command line is machine-readable, so numbers are formatted with the invariant culture;
    // a decimal-comma locale would otherwise emit values such as "0,1".
    var args = string.Format(
        System.Globalization.CultureInfo.InvariantCulture,
        " {0} --hash strings" +
        " --lda {1}" +
        " --lda_alpha {2}" +
        " --lda_rho {3}" +
        " --lda_D {4}" +
        " --minibatch {5}" +
        " --power_t {6}" +
        " --initial_t {7}" +
        " -b {8}" +
        " --passes {9}" +
        " -c " +
        " --readable_model {10}" +
        " -p {11}" +
        " -f {12}",
        featurizedDocFile,
        ldaConfig.LDAParameters.NumTopics,
        ldaConfig.LDAParameters.Alpha,
        ldaConfig.LDAParameters.Rho,
        ldaConfig.ModelStatistics.DocumentCount,
        ldaConfig.LDAParameters.Minibatch,
        ldaConfig.LDAParameters.PowerT,
        ldaConfig.LDAParameters.InitialT,
        hashBits,
        ldaConfig.LDAParameters.Passes,
        ldaConfig.WordTopicAllocations,
        ldaConfig.DocumentTopicAllocations,
        ldaConfig.Model);

    try
    {
        Console.RunCommand(command, args);
    }
    finally
    {
        // Remove the temporary copy (and the cache file created next to it) even when the
        // command throws, so a failed run does not leave stray files behind.
        if (copyFeaturizedDoc)
        {
            ConsoleColor color;
            FileManager.DeleteFile(featurizedDocFile, out color);
            FileManager.DeleteFile(featurizedDocFile + ".cache", out color);
        }
    }

    return true;
}
/// <summary>
/// Executes the configured command synchronously and converts its textual result into an exit code.
/// </summary>
/// <returns>
/// The integer found in the first tab-separated field of the result; <c>1</c> when the command
/// produced no result or when that field is not a valid integer.
/// </returns>
public int Run()
{
    StatusMessage.Write(string.Format("Starting '{0}' for config file\r\n\t{1} ...", CommandOption, ConfigFilePath), ConsoleColor.Green);

    string result = ExecuteCommandSync();
    if (result == null)
    {
        StatusMessage.Write(string.Format("The {0} process has failed", CommandOption), ConsoleColor.Red);
        return 1;
    }

    StatusMessage.Write(result);

    // The first tab-separated field is expected to carry the exit code. Empty or malformed
    // output is reported as a failure instead of escaping as a FormatException.
    int exitCode;
    if (int.TryParse(result.Split('\t')[0], out exitCode))
    {
        return exitCode;
    }

    StatusMessage.Write(string.Format("The {0} process returned a result without a leading exit code", CommandOption), ConsoleColor.Red);
    return 1;
}
/// <summary>
/// Writes one tab-separated row describing the model to <paramref name="writer"/>: its configuration
/// values, every extracted metric value, the model name and the metrics file path. Optionally
/// writes the matching header row first.
/// </summary>
/// <param name="writer">Destination of the tab-separated output.</param>
/// <param name="ldaConfig">Configuration of the model being reported.</param>
/// <param name="modelRepositoryPath">Passed through to <c>ExtractModelMetrics</c>.</param>
/// <param name="success">Set to <c>true</c> after the row has been written; left untouched when no metrics are available.</param>
/// <param name="needWriteTableHeader">When <c>true</c>, a header row (fixed columns plus one column per metric key) precedes the data row.</param>
private static void AddModelParametersToExcel(StreamWriter writer, LDAConfig ldaConfig, string modelRepositoryPath, ref bool success, bool needWriteTableHeader)
{
    var modelMetrics = ExtractModelMetrics(ref ldaConfig, modelRepositoryPath);
    if (modelMetrics == null)
    {
        return;
    }

    if (needWriteTableHeader)
    {
        StatusMessage.Write("Writing table header");
        writer.Write("Locale\tCorpus\tSample\tMin\tMax\tK\tAlpha\tRho\tMinibatch\tPasses\tInitialT\tPowerT");
        foreach (var entry in modelMetrics)
        {
            writer.Write("\t{0}", entry.Key);
        }

        writer.WriteLine("\tmodelName\tmetricsFilePath");
    }

    StatusMessage.Write("Adding metrics to EXCEL: " + ldaConfig.ExtrinsicMetricsProcessed);

    // Fixed columns, in the same order as the header above.
    var fixedColumns = new object[]
    {
        ldaConfig.Locale,
        ldaConfig.Corpus,
        ldaConfig.SampleName,
        ldaConfig.FeaturizationParameters.MinWordDocumentFrequency,
        ldaConfig.FeaturizationParameters.MaxRalativeWordDocumentFrequency,
        ldaConfig.LDAParameters.NumTopics,
        ldaConfig.LDAParameters.Alpha,
        ldaConfig.LDAParameters.Rho,
        ldaConfig.LDAParameters.Minibatch,
        ldaConfig.LDAParameters.Passes,
        ldaConfig.LDAParameters.InitialT,
        ldaConfig.LDAParameters.PowerT,
    };
    writer.Write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}", fixedColumns);

    // One column per metric, in enumeration order.
    foreach (var entry in modelMetrics)
    {
        writer.Write("\t{0}", entry.Value);
    }

    writer.WriteLine("\t{0}\t{1}", ldaConfig.modelName, ldaConfig.ExtrinsicMetricsProcessed);
    success = true;
}
/// <summary>
/// Determines the database name for a model: the value supplied under <c>Options.DatabaseName</c>
/// when present, otherwise a name built from the sample name and the two featurization
/// frequency settings.
/// </summary>
/// <param name="ldaConfig">Model configuration supplying the sample name and featurization parameters.</param>
/// <param name="parameters">Command-line parameters keyed by option name.</param>
/// <returns>
/// The database name, or <see cref="string.Empty"/> when the configuration has no sample name.
/// </returns>
private static string GetDatabaseName(LDAConfig ldaConfig, IDictionary<string, string> parameters)
{
    if (string.IsNullOrWhiteSpace(ldaConfig.SampleName))
    {
        StatusMessage.Write("Sample name not specified.");
        return string.Empty;
    }

    // An explicitly supplied name takes precedence over the generated one.
    string explicitName;
    if (parameters.TryGetValue(Options.DatabaseName, out explicitName))
    {
        return explicitName;
    }

    return string.Format(
        "{0}_{1}_{2}",
        ldaConfig.SampleName,
        ldaConfig.FeaturizationParameters.MinWordDocumentFrequency,
        ldaConfig.FeaturizationParameters.MaxRalativeWordDocumentFrequency);
}
/// <summary>
/// Program entry point. Prints the usage text and exits with code 1 when no command is given;
/// otherwise lower-cases the first argument, parses the parameters and dispatches the command.
/// </summary>
/// <param name="args">Command-line arguments; the first one is the command name.</param>
static void Main(string[] args)
{
    if (args.Length == 0)
    {
        System.Console.WriteLine("Usage: {0} <command> [options]", AppDomain.CurrentDomain.FriendlyName);
        System.Console.WriteLine(DetailUsage);
        Environment.Exit(1);
    }

    // new command-line handling
    var command = args[0].ToLowerInvariant();
    var parameters = GetParameters(args);

    bool handled = ProcessCommands(command, parameters);
    if (!handled)
    {
        StatusMessage.Write("Unknown command");
    }

    System.Console.WriteLine("\n\nFinished execution.");
}