Ejemplo n.º 1
0
        /// <summary>
        /// Makes sure the corpus vocabulary is available, loading it from the file named by
        /// <c>corpusVocabularyFileName</c> when it has not been loaded yet.
        /// </summary>
        /// <returns>
        /// <c>true</c> when the vocabulary was already loaded or has just been loaded;
        /// <c>false</c> when the vocabulary file does not exist.
        /// </returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        private bool EnsureCorpusVocabulary()
        {
            if (this.disposed)
            {
                // The single-argument constructor interprets its argument as the object name,
                // not as the message, so the (objectName, message) overload is used here.
                throw new ObjectDisposedException(this.GetType().Name, "LDA Object has already been disposed.");
            }

            if (this.corpusVocabulary != null)
            {
                return true;
            }

            if (!File.Exists(this.corpusVocabularyFileName))
            {
                StatusMessage.Write(string.Format("LDA.GetTopicAllocations: Error. Cannot find Corpus Vocabulary {0}", this.corpusVocabularyFileName));
                return false;
            }

            StatusMessage.Write("LDA.GetTopicAllocations: Loading Corpus Vocabulary");
            this.corpusVocabulary = CorpusVocabulary.NewInstance(File.ReadLines(this.corpusVocabularyFileName));
            return true;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Initializes a new instance of the <see cref="LDA"/> class.
        /// </summary>
        /// <param name="numTopic">Number of topics; sizes the bad-topic mask and the dense topic-allocation vector factory.</param>
        /// <param name="modelFileName">Model file handed to Vowpal Wabbit through the <c>-i</c> option.</param>
        /// <param name="corpusVocabularyFileName">Path of the corpus vocabulary file; stored on the instance.</param>
        /// <param name="badTopicIds">
        /// Ids of the topics to flag as bad. May be <c>null</c> or empty. Ids outside
        /// <c>[0, numTopic)</c> are ignored.
        /// </param>
        /// <param name="language">Language used to create the document vocabulary factory.</param>
        private LDA(int numTopic, string modelFileName, string corpusVocabularyFileName, int[] badTopicIds, Language language)
        {
            this.numTopics = numTopic;
            this.dvFactory = DocumentVocabularyFactory.NewInstance(language);
            this.topicAllocationsFactory  = VectorFactory.NewInstance(VectorType.DenseVector, numTopic);
            this.corpusVocabularyFileName = corpusVocabularyFileName;

            StatusMessage.Write(string.Format("LDA: Initializing Vowpal Wabbit Interface with model file {0}", modelFileName));
            this.vwLDAModel = VowpalWabbitInterface.Initialize(string.Format("-i {0} -t --quiet", modelFileName));

            // A freshly allocated bool[] is already all false, so no explicit clearing is needed.
            this.badTopics = new bool[this.numTopics];

            this.RecommendedCompressionType = VectorType.DenseVector; // Default

            if (badTopicIds == null || badTopicIds.Length == 0)
            {
                return;
            }

            foreach (var topicId in badTopicIds)
            {
                // Ignore ids on either side of the valid range; a negative id would otherwise
                // index outside the array.
                if (topicId >= 0 && topicId < this.numTopics)
                {
                    this.badTopics[topicId] = true;
                }
            }

            // The sparse size is computed from the number of ids supplied, as before.
            var sizeOfSparseVector = VectorBase.SizeOfSparseVectors(numTopic, badTopicIds.Length);
            var sizeOfDenseVector  = VectorBase.BytesPerDimension * numTopic;

            this.RecommendedCompressionType = (sizeOfSparseVector < sizeOfDenseVector) ? VectorType.SparseVector : VectorType.DenseVector;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Extracts the metrics of a model, adds the model to the models database and then creates a
        /// <see cref="ModelDatabase"/> for the database name reported by that registration and
        /// calls <c>Open</c> on it.
        /// </summary>
        /// <param name="modelsDb">Models database that receives the new model record.</param>
        /// <param name="ldaConfig">Configuration of the model being registered.</param>
        /// <param name="modelRepositoryPath">Passed through to <c>ExtractModelMetrics</c>.</param>
        /// <param name="success">Set to <c>true</c> once the model record has been added; otherwise left untouched.</param>
        /// <returns>The <see cref="ModelDatabase"/> for the model, or <c>null</c> when no metrics could be extracted.</returns>
        private static ModelDatabase AddModelParametersToModelsDb(ModelsDb modelsDb, LDAConfig ldaConfig, string modelRepositoryPath, ref bool success)
        {
            var modelMetrics = ExtractModelMetrics(ref ldaConfig, modelRepositoryPath);

            if (modelMetrics == null)
            {
                return null;
            }

            string newDbName;
            int    newModelId;

            try
            {
                StatusMessage.Write("Adding metrics to Db: " + ldaConfig.ExtrinsicMetricsProcessed);
                modelsDb.AddModel(ldaConfig, modelMetrics, out newDbName, out newModelId);
                success = true;
            }
            catch (Exception addError)
            {
                // Log for the operator, then let the caller see the original exception.
                StatusMessage.Write("Could not add a record to the Topic models db:" + addError);
                throw;
            }

            var modelDatabase = new ModelDatabase(string.Empty, modelsDb.serverName, newDbName, false);

            // The result of Open() is deliberately not acted upon; the original code only noted
            // that a true result means the database had already been created.
            modelDatabase.Open();

            return modelDatabase;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Looks up the config entry (<c>Options.Config</c>) in the parameters and delegates to the
        /// <c>GetLdaConfig(string)</c> overload with its value.
        /// </summary>
        /// <param name="parameters">Command parameters keyed by option name.</param>
        /// <returns>The result of the string overload, or <c>null</c> when the config parameter is missing.</returns>
        private static LDAConfig GetLdaConfig(IDictionary <string, string> parameters)
        {
            string configPath;

            if (parameters.TryGetValue(Options.Config, out configPath))
            {
                return GetLdaConfig(configPath);
            }

            StatusMessage.Write("Missing config parameter");
            return null;
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Creates a <see cref="ModelsDb"/> for the given server and database name, using the
 /// dictionary returned by <c>GetMetricsDictionary</c>, and opens it.
 /// </summary>
 /// <param name="sqlServer">Server passed to the <see cref="ModelsDb"/> constructor.</param>
 /// <param name="modelsDbName">Database name passed to the <see cref="ModelsDb"/> constructor.</param>
 /// <returns>The opened models database, or <c>null</c> when creating or opening it throws.</returns>
 private static ModelsDb GetModelsDb(string sqlServer, string modelsDbName)
 {
     try
     {
         var metricDictionary = GetMetricsDictionary();
         var modelsDb         = new ModelsDb(sqlServer, modelsDbName, metricDictionary);
         modelsDb.Open();
         return modelsDb;
     }
     catch (Exception e)
     {
         // This method opens the database; it never adds a record, so say what actually failed.
         StatusMessage.Write("Could not open the Topic models db:" + e.ToString());
         return null;
     }
 }
Ejemplo n.º 6
0
        /// <summary>
        /// Learns an LDA model by running <c>vw.exe</c> from the application base directory with
        /// arguments built from <paramref name="ldaConfig"/>.
        /// </summary>
        /// <param name="ldaConfig">Configuration supplying the input, output and LDA parameter values.</param>
        /// <param name="copyFeaturizedDoc">
        /// When <c>true</c>, the featurized documents file is first copied next to the word-topic
        /// allocations file, the copy is used as input, and the copy and its <c>.cache</c> file
        /// are deleted afterwards.
        /// </param>
        /// <returns><c>false</c> when the model file already exists and nothing was run; otherwise <c>true</c>.</returns>
        public static bool Learn(LDAConfig ldaConfig, bool copyFeaturizedDoc)
        {
            if (File.Exists(ldaConfig.Model))
            {
                StatusMessage.Write("Skipping, model already exists. " + ldaConfig.Model);
                return false;
            }

            var featurizedDocFile = ldaConfig.FeaturizedDocuments;

            if (copyFeaturizedDoc)
            {
                // Path.Combine copes with an empty or already separator-terminated directory,
                // which the previous hand-built "{0}\{1}" path did not.
                var targetDirectory = Path.GetDirectoryName(ldaConfig.WordTopicAllocations) ?? string.Empty;
                featurizedDocFile = Path.Combine(targetDirectory, Path.GetFileName(ldaConfig.FeaturizedDocuments));
                File.Copy(ldaConfig.FeaturizedDocuments, featurizedDocFile);
            }

            try
            {
                StatusMessage.Write("Running VW to learn LDA...");

                var command = AppDomain.CurrentDomain.BaseDirectory + "vw.exe";

                // Gets size of the hash table used to store the topic allocations for each word.
                var hashBits = (int)Math.Ceiling(Math.Log(ldaConfig.ModelStatistics.VocabularySize, 2.0));

                // Format with the invariant culture so numeric options never pick up a
                // culture-specific decimal separator on the command line.
                var args = string.Format(
                    System.Globalization.CultureInfo.InvariantCulture,
                    " {0} --hash strings --lda {1} --lda_alpha {2} --lda_rho {3} --lda_D {4}" +
                    " --minibatch {5} --power_t {6} --initial_t {7} -b {8} --passes {9}" +
                    " -c  --readable_model {10} -p {11} -f {12}",
                    featurizedDocFile,
                    ldaConfig.LDAParameters.NumTopics,
                    ldaConfig.LDAParameters.Alpha,
                    ldaConfig.LDAParameters.Rho,
                    ldaConfig.ModelStatistics.DocumentCount,
                    ldaConfig.LDAParameters.Minibatch,
                    ldaConfig.LDAParameters.PowerT,
                    ldaConfig.LDAParameters.InitialT,
                    hashBits,
                    ldaConfig.LDAParameters.Passes,
                    ldaConfig.WordTopicAllocations,
                    ldaConfig.DocumentTopicAllocations,
                    ldaConfig.Model);

                Console.RunCommand(command, args);
            }
            finally
            {
                // Remove the temporary copy (and its cache file) even when the command throws.
                if (copyFeaturizedDoc)
                {
                    ConsoleColor color;
                    FileManager.DeleteFile(featurizedDocFile, out color);
                    FileManager.DeleteFile(featurizedDocFile + ".cache", out color);
                }
            }

            return true;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Executes the command synchronously and turns its result into an exit code.
        /// </summary>
        /// <returns>
        /// The integer parsed from the first tab-separated field of the result; <c>1</c> when
        /// the command returns no result or that field is not a valid integer.
        /// </returns>
        public int Run()
        {
            StatusMessage.Write(string.Format("Starting '{0}' for config file\r\n\t{1} ...", CommandOption, ConfigFilePath), ConsoleColor.Green);

            string result = ExecuteCommandSync();

            if (result == null)
            {
                StatusMessage.Write(string.Format("The {0} process has failed", CommandOption), ConsoleColor.Red);
                return 1;
            }

            StatusMessage.Write(result);

            // Parse defensively: malformed output is reported as a failure instead of
            // surfacing as an unhandled FormatException/OverflowException.
            int exitCode;
            var firstField = result.Split('\t')[0];

            if (int.TryParse(firstField, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out exitCode))
            {
                return exitCode;
            }

            StatusMessage.Write(string.Format("The {0} process returned an unrecognized result", CommandOption), ConsoleColor.Red);
            return 1;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Writes one tab-separated row describing a model and its metrics to
        /// <paramref name="writer"/>, optionally preceded by a header row.
        /// </summary>
        /// <param name="writer">Destination of the tab-separated output.</param>
        /// <param name="ldaConfig">Model configuration supplying the values of the row.</param>
        /// <param name="modelRepositoryPath">Passed through to <c>ExtractModelMetrics</c>.</param>
        /// <param name="success">Set to <c>true</c> after the row has been written; left untouched when no metrics are available.</param>
        /// <param name="needWriteTableHeader">When <c>true</c>, the header row is written before the data row.</param>
        private static void AddModelParametersToExcel(StreamWriter writer, LDAConfig ldaConfig, string modelRepositoryPath, ref bool success, bool needWriteTableHeader)
        {
            var modelMetrics = ExtractModelMetrics(ref ldaConfig, modelRepositoryPath);

            if (modelMetrics == null)
            {
                return;
            }

            if (needWriteTableHeader)
            {
                // Fixed columns first, then one column per metric key, then the two trailing columns.
                StatusMessage.Write("Writing table header");
                writer.Write("Locale\tCorpus\tSample\tMin\tMax\tK\tAlpha\tRho\tMinibatch\tPasses\tInitialT\tPowerT");

                foreach (var column in modelMetrics)
                {
                    writer.Write("\t{0}", column.Key);
                }

                writer.WriteLine("\tmodelName\tmetricsFilePath");
            }

            StatusMessage.Write("Adding metrics to EXCEL: " + ldaConfig.ExtrinsicMetricsProcessed);

            var featurization = ldaConfig.FeaturizationParameters;
            var ldaParameters = ldaConfig.LDAParameters;

            var fixedColumns = new object[]
            {
                ldaConfig.Locale,
                ldaConfig.Corpus,
                ldaConfig.SampleName,
                featurization.MinWordDocumentFrequency,
                featurization.MaxRalativeWordDocumentFrequency,
                ldaParameters.NumTopics,
                ldaParameters.Alpha,
                ldaParameters.Rho,
                ldaParameters.Minibatch,
                ldaParameters.Passes,
                ldaParameters.InitialT,
                ldaParameters.PowerT
            };

            writer.Write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}", fixedColumns);

            // Metric values are written in the same enumeration order as the header keys.
            foreach (var cell in modelMetrics)
            {
                writer.Write("\t{0}", cell.Value);
            }

            writer.WriteLine("\t{0}\t{1}", ldaConfig.modelName, ldaConfig.ExtrinsicMetricsProcessed);
            success = true;
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Determines the database name for a model: the value of the <c>Options.DatabaseName</c>
        /// parameter when present, otherwise a name composed of the sample name and the two
        /// featurization frequency settings.
        /// </summary>
        /// <param name="ldaConfig">Model configuration supplying the sample name and featurization parameters.</param>
        /// <param name="parameters">Command parameters keyed by option name.</param>
        /// <returns>The database name, or an empty string when the sample name is null, empty or whitespace.</returns>
        private static string GetDatabaseName(LDAConfig ldaConfig, IDictionary <string, string> parameters)
        {
            if (string.IsNullOrWhiteSpace(ldaConfig.SampleName))
            {
                StatusMessage.Write("Sample name not specified.");
                return string.Empty;
            }

            string explicitName;

            if (parameters.TryGetValue(Options.DatabaseName, out explicitName))
            {
                return explicitName;
            }

            // No explicit name given: derive one from the sample and featurization settings.
            var featurization = ldaConfig.FeaturizationParameters;

            return string.Format(
                "{0}_{1}_{2}",
                ldaConfig.SampleName,
                featurization.MinWordDocumentFrequency,
                featurization.MaxRalativeWordDocumentFrequency);
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Program entry point: prints usage and exits with code 1 when no command is given,
        /// otherwise dispatches the lower-cased first argument through <c>ProcessCommands</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; the first one is the command.</param>
        static void Main(string[] args)
        {
            if (args.Length < 1)
            {
                System.Console.WriteLine("Usage: {0} <command> [options]", AppDomain.CurrentDomain.FriendlyName);
                System.Console.WriteLine(DetailUsage);
                Environment.Exit(1);
            }

            // new command-line handling
            var command    = args[0].ToLowerInvariant();
            var parameters = GetParameters(args);

            if (!ProcessCommands(command, parameters))
            {
                StatusMessage.Write("Unknown command");

                // Report the failure to the calling shell/script as well, consistent with the
                // usage error above, instead of exiting with 0.
                Environment.ExitCode = 1;
            }

            System.Console.WriteLine("\n\nFinished execution.");
        }