示例#1
0
        /// <summary>
        /// Loads a classifier via <c>classifierFactory</c>, classifies the vectors in
        /// <c>vectorFile</c>, and writes a per-vector report to <c>sys_output</c>.
        /// </summary>
        /// <param name="vectorFile">The feature vectors (SVM-Light format) to classify.</param>
        /// <param name="sys_output">Path of the report on the classification details.</param>
        /// <param name="classifierFactory">Provides the necessary classifier.</param>
        /// <param name="getDetailsFunc">Produces the per-vector detail strings for the report.</param>
        /// <returns>The classification accuracy over the vectors.</returns>
        internal static double ReportOnModel(
            FeatureVectorFile vectorFile
            , string sys_output
            , Func <TextIdMapper, TextIdMapper, Classifier> classifierFactory
            , Func <Classifier, List <FeatureVector>, TextIdMapper, TextIdMapper, string[]> getDetailsFunc
            )
        {
            const int gold_i = 0;

            var featureToFeatureId = new TextIdMapper();
            var classToClassId     = new TextIdMapper();
            var headerToHeaderIds  = new[] { classToClassId };

            Classifier classifier = classifierFactory(classToClassId, featureToFeatureId);

            // Loading also populates vectorFile.Headers, which is read below.
            List <FeatureVector> vectors = vectorFile.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);

            int[] goldClasses   = vectorFile.Headers[gold_i];
            int[] systemClasses = classifier.Classify(vectors);

            string[] details = getDetailsFunc(classifier, vectors, classToClassId, featureToFeatureId);

            return(ProgramOutput.GenerateSysOutput(
                       sys_output, FileCreationMode.CreateNew, vectors, classToClassId,
                       goldClasses, systemClasses, details,
                       heading: Path.GetFileName(vectorFile.Path)));
        }
示例#2
0
        // Methods

        /// <summary>
        /// Loads a MaxEnt POS model, classifies the vectors in <c>vector_file</c>, writes a
        /// report to <c>sys_output</c>, and returns the classification accuracy.
        /// </summary>
        public override double ExecuteCommand()
        {
            // Header columns: 0 = instance name, 1 = gold class.
            int instanceName_i = 0;
            int gold_i         = 1;

            featureToFeatureId = new TextIdMapper();
            classToClassId     = new TextIdMapper();
            var instanceNameToId = new TextIdMapper();

            // One mapper per header column, in column order.
            TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { instanceNameToId, classToClassId };

            FeatureVectorFile vectorFile = new FeatureVectorFile(path: vector_file, noOfHeaderColumns: 2, featureDelimiter: ' ', isSortRequired: false);

            // Read the boundaries:
            int[] sentenceLengths = ReadBoundaryFile(boundary_file);

            // Read the classifier model:
            classifier = MaxEntPOSClassifier.LoadModel(model_file, classToClassId, featureToFeatureId);

            // Read the vectors (this populates vectorFile.Headers, read below):
            var testVectors = vectorFile.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Continuous);

            // Get the output ready for display.
            // NOTE(review): goldClassIds is captured but not used here — presumably the
            // gold labels are consumed inside GenerateSysOutput via gold_i; confirm.
            int[]    goldClassIds    = vectorFile.Headers[gold_i];
            int[]    instanceNameIds = vectorFile.Headers[instanceName_i];
            string[] instanceNames   = instanceNameToId.GetValues(instanceNameIds);

            // Generate sys_output:
            ConfusionMatrix confusionMatrix;

            File.WriteAllText(sys_output, GenerateSysOutput(instanceNames, testVectors, sentenceLengths, out confusionMatrix, gold_i));
            return(confusionMatrix.CalculateAccuracy());
        }
示例#3
0
 /// <summary>
 /// Recursively serializes this node into <c>sb</c>. Leaves emit one line:
 /// the path, the vector count, then each class label with its probability.
 /// Interior nodes descend into the false branch first, then the true branch.
 /// </summary>
 private void Serialize_Recursive(StringBuilder sb, TextIdMapper classToClassId, TextIdMapper wordToWordId, int depth)
 {
     bool isLeaf = TrueBranch == null && FalseBranch == null;

     if (!isLeaf)
     {
         // Interior node: recurse, false branch before true branch (preserves output order).
         if (FalseBranch != null)
         {
             FalseBranch.Serialize_Recursive(sb, classToClassId, wordToWordId, depth + 1);
         }
         if (TrueBranch != null)
         {
             TrueBranch.Serialize_Recursive(sb, classToClassId, wordToWordId, depth + 1);
         }
         return;
     }

     // Leaf node: write the path, the vector count, and the per-class distribution.
     sb.AppendFormat("{0} {1}", GetPath(wordToWordId), FeatureVectors.Count);
     double[] distribution = GetDistributionByClass();
     for (int c_i = 0; c_i < distribution.Length; c_i++)
     {
         sb.AppendFormat(" {0} {1}", classToClassId[c_i], distribution[c_i]);
     }
     sb.AppendLine();
 }
示例#4
0
        /// <summary>Loads a TBL classifier from the model file at the specified location.</summary>
        /// <param name="model_file">Path of the model file: the default class name on line 1,
        /// then one transformation per line in the format "featName from_class to_class net_gain".</param>
        /// <param name="classToClassId">Maps class names to their ids.</param>
        /// <param name="featureToFeatureId">Maps feature names to their ids.</param>
        /// <param name="N">The maximum number of transformations to load.</param>
        /// <param name="gold_i">The header column index of the gold class (forwarded to the classifier).</param>
        public static TBLClassifier LoadModel(string model_file, TextIdMapper classToClassId, TextIdMapper featureToFeatureId, int N, int gold_i)
        {
            int defaultClass;
            List <Transformation> transformations = new List <Transformation>();

            using (StreamReader sr = File.OpenText(model_file))
            {
                // Read the default class, which is presented in the first line:
                string line = sr.ReadLine();
                defaultClass = classToClassId[line.Trim()];
                // Read each of the transformations stored in the model file (at most N):
                Regex parser = new Regex(@"(?<featName>[^\s]+)\s+(?<from_class>[^\s]+)\s+(?<to_class>[^\s]+)\s+(?<net_gain>[^\s]+)");
                while (!sr.EndOfStream && transformations.Count < N)
                {
                    line = sr.ReadLine();
                    var match = parser.Match(line);

                    // NOTE(review): match success is not checked, and net_gain is parsed as an
                    // int — a malformed or non-integer line throws. Confirm that model files
                    // (written by SaveModel, which emits an int net_gain) are always well-formed.
                    int feat_id       = featureToFeatureId[match.Groups["featName"].Value.Trim()];
                    int from_class_id = classToClassId[match.Groups["from_class"].Value.Trim()];
                    int to_class_id   = classToClassId[match.Groups["to_class"].Value.Trim()];
                    int net_gain      = int.Parse(match.Groups["net_gain"].Value.Trim());
                    transformations.Add(new Transformation(feat_id, from_class_id, to_class_id, net_gain));
                }
            }
            return(new TBLClassifier(transformations, classToClassId.Count, defaultClass, gold_i));
        }
示例#5
0
        // Methods

        /// <summary>Reports the accuracy of the classifier, based on the specified <c>confusionMatrix</c>.</summary>
        /// <param name="confusionMatrix">The confusion matrix to report.</param>
        /// <param name="classToclassId">A lookup that maps class identifiers to their text representations.</param>
        /// <param name="reportTitle">A text description of the set of data being classified.</param>
        /// <returns>The overall accuracy derived from the confusion matrix.</returns>
        private static double ReportAccuracy(ConfusionMatrix confusionMatrix, TextIdMapper classToclassId, string reportTitle)
        {
            int noOfClasses = confusionMatrix.NoOfDimensions;

            // Header:
            Console.WriteLine("Confusion matrix for '{0}':", reportTitle);
            Console.WriteLine("row is the truth, column is the system output");
            Console.WriteLine();

            // Column labels:
            Console.Write("            ");
            for (int col = 0; col < noOfClasses; col++)
            {
                Console.Write(" {0}", classToclassId[col]);
            }
            Console.WriteLine();

            // One row per true class: the row label, then one tab-separated cell per system class.
            for (int row = 0; row < noOfClasses; row++)
            {
                Console.Write("{0}", classToclassId[row]);
                for (int col = 0; col < noOfClasses; col++)
                {
                    Console.Write("\t{0}", confusionMatrix[row, col]);
                }
                Console.WriteLine();
            }
            Console.WriteLine();

            double accuracy = confusionMatrix.CalculateAccuracy();
            Console.WriteLine($"  {reportTitle} accuracy={accuracy:0.00000}");
            Console.WriteLine();
            return(accuracy);
        }
示例#6
0
        /// <summary>Saves this model to the specified location, training first if necessary.</summary>
        /// <param name="model_file">Path of the file to write the model to (created or overwritten).</param>
        /// <param name="classToClassId">Maps class ids to class names.</param>
        /// <param name="featureToFeatureId">Maps feature ids to feature names.</param>
        public void SaveModel(string model_file, TextIdMapper classToClassId, TextIdMapper featureToFeatureId)
        {
            // Training must have happened before the model can be serialized.
            if (!HasTrained)
            {
                PerformTraining();
            }

            using (StreamWriter sw = File.CreateText(model_file))
            {
                // Line 1: the default class name (i.e., the first class in the training data).
                string defaultClassName = classToClassId[0];
                sw.WriteLine($"{defaultClassName} ");

                // Then one transformation per line, format: featName from_class to_class net_gain
                foreach (var transformation in _transformations)
                {
                    string featName   = featureToFeatureId[transformation.FeatureId];
                    string from_class = classToClassId[transformation.FromClass];
                    string to_class   = classToClassId[transformation.ToClass];
                    int    net_gain   = transformation.NetGain;
                    sw.WriteLine($"{featName} {from_class} {to_class} {net_gain}");
                }
            }
        }
示例#7
0
        // Public Methods

        /// <summary>
        /// Trains a kNN classifier on the training data, classifies both the training and the
        /// test vectors, writes a combined report to <c>sys_output</c>, and returns the test accuracy.
        /// </summary>
        public override double ExecuteCommand()
        {
            FeatureVectorFile vectorFile_train = new FeatureVectorFile(path: training_data_file, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: true);
            FeatureVectorFile vectorFile_test  = new FeatureVectorFile(path: test_data_file, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: true);

            // The single header column holds the gold class.
            int          gold_i             = 0;
            TextIdMapper featureToFeatureId = new TextIdMapper();
            TextIdMapper classToClassId     = new TextIdMapper();

            TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { classToClassId };

            // Both files share the same id mappers; loading populates each file's Headers.
            var trainingVectors   = vectorFile_train.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
            var goldClasses_train = vectorFile_train.Headers[gold_i];

            var testVectors      = vectorFile_test.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
            var goldClasses_test = vectorFile_test.Headers[gold_i];

            var classifier = new kNNClassifier(k_val, (SimilarityFunction)similarity_func, trainingVectors, classToClassId.Count, gold_i);

            var systemClasses_train = classifier.Classify(trainingVectors);
            var systemClasses_test  = classifier.Classify(testVectors);

            var details_train = ProgramOutput.GetDistributionDetails(classifier, trainingVectors, classToClassId);
            var details_test  = ProgramOutput.GetDistributionDetails(classifier, testVectors, classToClassId);

            // Write the training report first (new file), then append the test report to it.
            ProgramOutput.GenerateSysOutput(sys_output, FileCreationMode.CreateNew, trainingVectors, classToClassId, goldClasses_train, systemClasses_train, details_train, "training data");
            var testAccuracy = ProgramOutput.GenerateSysOutput(sys_output, FileCreationMode.Append, testVectors, classToClassId, goldClasses_test, systemClasses_test, details_test, "test data");

            return(testAccuracy);
        }
示例#8
0
        /// <summary>
        /// Trains the classifier provided by the <c>classifierFactory</c> on the training vectors.
        /// Then, the classifier is used to evaluate the accuracy of both the training data and the test data.
        /// A report on the classification details is printed to the <c>output_file</c>.
        /// </summary>
        /// <param name="vectorFile_train">The training vectors (SVM-Light format).</param>
        /// <param name="vectorFile_test">The test vectors (SVM-Light format).</param>
        /// <param name="output_file">A report on the classification details.</param>
        /// <param name="classifierFactory">Provides the necessary classifier.</param>
        /// <param name="getDetailsFunc">NOTE(review): this parameter is never used — the body
        /// calls ProgramOutput.GetDistributionDetails directly; confirm whether it should be invoked instead.</param>
        internal static void ReportOnTrainingAndTesting(
            FeatureVectorFile vectorFile_train
            , FeatureVectorFile vectorFile_test
            , string output_file
            , Func <List <FeatureVector>, int, int, Classifier> classifierFactory
            , Func <Classifier, List <FeatureVector>, TextIdMapper, string[]> getDetailsFunc
            )
        {
            const int gold_i = 0;

            var featureToFeatureId = new TextIdMapper();
            var classToClassId     = new TextIdMapper();
            var headerToHeaderIds  = new[] { classToClassId };

            // Load both files with shared id mappers; loading populates each file's Headers.
            List <FeatureVector> trainingVectors = vectorFile_train.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
            int[] goldClasses_train              = vectorFile_train.Headers[gold_i];

            List <FeatureVector> testVectors = vectorFile_test.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
            int[] goldClasses_test           = vectorFile_test.Headers[gold_i];

            Classifier classifier = classifierFactory(trainingVectors, classToClassId.Count, gold_i);

            int[] systemClasses_train = classifier.Classify(trainingVectors);
            int[] systemClasses_test  = classifier.Classify(testVectors);

            string[] details_train = ProgramOutput.GetDistributionDetails(classifier, trainingVectors, classToClassId);
            string[] details_test  = ProgramOutput.GetDistributionDetails(classifier, testVectors, classToClassId);

            // Training report first (new file), then the test report appended to it.
            ProgramOutput.GenerateSysOutput(output_file, FileCreationMode.CreateNew, trainingVectors, classToClassId, goldClasses_train, systemClasses_train, details_train, "training data");
            ProgramOutput.GenerateSysOutput(output_file, FileCreationMode.Append, testVectors, classToClassId, goldClasses_test, systemClasses_test, details_test, "test data");
        }
示例#9
0
            // Methods

            /// <summary>Serializes this model tree to its text representation.</summary>
            /// TODO: Move a serialization method to the classifier class.
            internal string GetModelAsText(TextIdMapper classToClassId, TextIdMapper wordToWordId)
            {
                var sb = new StringBuilder();
                Serialize_Recursive(sb, classToClassId, wordToWordId, 0);
                return(sb.ToString());
            }
        // Static Methods

        /// <summary>Loads a MaxEnt POS classifier from a serialized model file.</summary>
        /// <param name="model_file">Path of the serialized model.</param>
        /// <param name="classToClassId">Maps class names to their ids.</param>
        /// <param name="featureToFeatureId">Maps feature names to their ids.</param>
        public new static MaxEntPOSClassifier LoadModel(string model_file, TextIdMapper classToClassId, TextIdMapper featureToFeatureId)
        {
            List <double>        lambda_c;
            List <FeatureVector> vectors;

            // Parse the model text into the per-class lambdas and feature vectors.
            LoadModel(File.ReadAllText(model_file), classToClassId, featureToFeatureId, out lambda_c, out vectors);

            return(new MaxEntPOSClassifier(vectors, classToClassId.Count, lambda_c.ToArray()));
        }
示例#11
0
        // Private Methods

        /// <summary>
        /// Writes one line per vector — "{gold_class_label}\t{sys_class_label}\t{details}" —
        /// to <c>output_file</c>, tallies a confusion matrix, and reports the accuracy to the console.
        /// </summary>
        /// <param name="output_file">The location of the sys_output file.</param>
        /// <param name="fileCreationMode">Whether to create a new file or append to an existing one.</param>
        /// <param name="vectors">A collection of classified vectors to report on.</param>
        /// <param name="classToclassId">A class for providing human-readable class labels.</param>
        /// <param name="goldClasses">The gold class id for each vector.</param>
        /// <param name="systemClasses">The system-assigned class id for each vector.</param>
        /// <param name="details">A pre-formatted detail string for each vector.</param>
        /// <param name="heading">Usually, "Training" or "Test".</param>
        /// <returns>The classification accuracy (derived from the confusion matrix).</returns>
        /// <exception cref="NotSupportedException">An unknown <c>fileCreationMode</c> was supplied.</exception>
        internal static double GenerateSysOutput(
            string output_file
            , FileCreationMode fileCreationMode
            , List <FeatureVector> vectors
            , TextIdMapper classToclassId
            , int[] goldClasses
            , int[] systemClasses
            , string[] details
            , string heading
            )
        {
            Debug.Assert(vectors != null && vectors.Count > 0);
            Debug.Assert(systemClasses != null && systemClasses.Length == vectors.Count);
            // Fix: the parallel goldClasses/details arrays were previously unchecked.
            Debug.Assert(goldClasses != null && goldClasses.Length == vectors.Count);
            Debug.Assert(details != null && details.Length == vectors.Count);

            StreamWriter writer;

            switch (fileCreationMode)
            {
            case FileCreationMode.CreateNew: writer = File.CreateText(output_file); break;

            case FileCreationMode.Append:    writer = File.AppendText(output_file); break;

            // Fix: throw the specific NotSupportedException (still catchable as Exception)
            // rather than the base Exception type.
            default: throw new NotSupportedException($"Internal error: ProgramOutput.FileCreationMode with value '{fileCreationMode}' is not supported by this version of the application.");
            }

            // Fix: 'using' replaces the manual try/finally + Close, guaranteeing disposal.
            using (writer)
            {
                writer.Write($"%%%%% {heading}:{Environment.NewLine}");

                // For each vector, write its report line and tally the confusion matrix.
                var confusionMatrix = new ConfusionMatrix(classToclassId.Count);
                for (int v_i = 0; v_i < vectors.Count; v_i++)
                {
                    string trueLabel = classToclassId[goldClasses[v_i]];
                    string sysLabel  = classToclassId[systemClasses[v_i]];

                    writer.WriteLine($"{trueLabel}\t{sysLabel}\t{details[v_i]}");
                    confusionMatrix[goldClasses[v_i], systemClasses[v_i]]++;
                }
                writer.WriteLine();
                double accuracy = ReportAccuracy(confusionMatrix, classToclassId, heading);
                return(accuracy);
            }
        }
示例#12
0
        // Methods

        /// <summary>
        /// Reads SVM-Light vectors from STDIN, builds a per-feature contingency table of
        /// gold class vs. feature value, computes a chi-square statistic per feature,
        /// and reports the results.
        /// </summary>
        public override bool ExecuteCommand()
        {
            // Initialize the text-to-Id mappers:
            int gold_i = 0;

            featureToFeatureId = new TextIdMapper();
            classToClassId     = new TextIdMapper();
            TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { classToClassId };

            // Workaround: Read everything from STDIN to a file. (Files are used as the text source throughout this application.)
            var svmLight_data = Console.In.ReadToEnd();

            Console.Error.WriteLine("{0} characters of input received.", svmLight_data.Length);
            string tempFile = Path.GetTempFileName();

            int[] goldClasses;
            List <FeatureVector> vectors;

            try
            {
                File.WriteAllText(tempFile, svmLight_data);
                FeatureVectorFile vectorFile = new FeatureVectorFile(path: tempFile, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: false);

                vectors     = vectorFile.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
                goldClasses = vectorFile.Headers[gold_i];
            }
            finally
            {
                // Always remove the temporary file, even if loading fails.
                File.Delete(tempFile);
            }
            Debug.Assert(vectors.Count > 0);

            // NOTE(review): goldClasses is captured but unused below; the per-vector gold
            // label is read via v.Headers[gold_i] instead — confirm this is intentional.
            IdValuePair <double>[] chiSquare = new IdValuePair <double> [featureToFeatureId.Count];
            //TODO: Make the implementation less binary dependent (i.e. the hardcoded 2 below).
            double[][,] contingencyTable_f = new double[featureToFeatureId.Count][, ];
            for (int f_i = 0; f_i < featureToFeatureId.Count; f_i++)
            {
                // Create a contingency table for this feature: rows = gold class, columns = feature value.
                contingencyTable_f[f_i] = new double[classToClassId.Count, 2];
                for (int v_i = 0; v_i < vectors.Count; v_i++)
                {
                    FeatureVector v = vectors[v_i];
                    // NOTE(review): assumes binary features so (int)v.Features[f_i] is 0 or 1 — confirm.
                    contingencyTable_f[f_i][v.Headers[gold_i], (int)v.Features[f_i]]++;
                }
                chiSquare[f_i] = new IdValuePair <double>(f_i, StatisticsHelper.CalculateChiSquare(contingencyTable_f[f_i]));
            }
            ReportChiSquareResults(contingencyTable_f, chiSquare);
            return(true);
        }
示例#13
0
        // Public Methods

        /// <summary>
        /// Renders this vector as space-separated "featureName:value" pairs, one pair per
        /// used feature, with values shown to at most 5 decimal places.
        /// </summary>
        /// <param name="featureToFeatureId">Maps feature ids to their display names.</param>
        /// <returns>The formatted feature string (empty if no features are used).</returns>
        public string Display(TextIdMapper featureToFeatureId)
        {
            var sb = new StringBuilder();

            foreach (int u_i in UsedFeatures)
            {
                // Separate consecutive pairs with a single space.
                if (sb.Length > 0)
                {
                    sb.AppendFormat(" ");
                }
                sb.AppendFormat("{0}:{1:0.#####}", featureToFeatureId[u_i], Features[u_i]);
            }
            return(sb.ToString());
        }
示例#14
0
        // Methods

        /// <summary>
        /// Loads a MaxEnt POS model, classifies the vectors in <c>vector_file</c>, writes a
        /// report to <c>sys_output</c>, and returns the classification accuracy.
        /// </summary>
        public override double ExecuteCommand()
        {
            // NOTE(review): noOfHeaderColumns is 1 here, yet two header columns
            // (instance name and gold class) are read below — confirm this is correct.
            FeatureVectorFile vectorFile = new FeatureVectorFile(path: vector_file, noOfHeaderColumns: 1, featureDelimiter: ':', isSortRequired: false);

            // Initialize the text-to-Id mappers:
            featureToFeatureId = new TextIdMapper();
            int instanceName_i = 0;
            int gold_i         = 1;

            classToClassId = new TextIdMapper();
            var instanceNameToInstanceNameId = new TextIdMapper();

            // One mapper per header column, in column order.
            TextIdMapper[] headerToHeaderIds = new TextIdMapper[]
            {
                instanceNameToInstanceNameId
                , classToClassId
            };

            // Read the boundaries:
            int[] sentenceLengths = ReadBoundaryFile(boundary_file);

            // Read the classifier model:
            classifier = MaxEntPOSClassifier.LoadModel(model_file, classToClassId, featureToFeatureId);

            // Read the vectors (this populates vectorFile.Headers, read below):
            var testVectors = vectorFile.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Continuous);

            int[] instanceNameIds = vectorFile.Headers[instanceName_i];
            int[] goldClasses     = vectorFile.Headers[gold_i];

            // BUG FIX: the previous loop resolved each name with the loop index
            // (mapper[i]) instead of the instance-name id (mapper[instanceNameId]),
            // so names were wrong whenever ids and positions diverged. GetValues
            // performs the correct id-to-name lookup (as done elsewhere in this file).
            string[] instanceNames = instanceNameToInstanceNameId.GetValues(instanceNameIds);

            // Generate sys_output:
            var confusionMatrix = GenerateSysOutput(sys_output, instanceNames, testVectors, sentenceLengths, gold_i);

            return(confusionMatrix.CalculateAccuracy());
        }
        // Methods

        /// <summary>
        /// Loads the training vectors, computes the observed counts and the empirical
        /// expectation, and prints them.
        /// </summary>
        public override bool ExecuteCommand()
        {
            var vectorFile = new FeatureVectorFile(path: vector_file, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: false);

            // Load the training file; the single header column holds the gold class.
            const int gold_i = 0;

            var featureToFeatureId = new TextIdMapper();
            var classToClassId     = new TextIdMapper();
            var headerToHeaderIds  = new[] { classToClassId };

            List <FeatureVector> trainingVectors = vectorFile.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
            int[] goldClasses                    = vectorFile.Headers[gold_i];

            double[,] observation, expectation;
            CalculateObservationAndEmpiricalExpectation(trainingVectors, out observation, out expectation);

            OutputEmpiricalCount(observation, expectation);
            return(true);
        }
        /// <summary>
        /// Loads an SVM model and reports its accuracy over the vectors in <c>vector_file</c>.
        /// </summary>
        public override double ExecuteCommand()
        {
            var vectorFile = new FeatureVectorFile(path: vector_file, noOfHeaderColumns: 1, featureDelimiter: ':', isSortRequired: true);
            var modelFile  = new FeatureVectorFile(path: model_file, noOfHeaderColumns: 1, featureDelimiter: ':', isSortRequired: true);

            // The model file's single header column holds the alpha values.
            int alphaColumn_i = 0;

            var headerToHeaderIds_model = new TextIdMapper[modelFile.NoOfHeaderColumns];
            headerToHeaderIds_model[alphaColumn_i] = new TextIdMapper();

            return(Program.ReportOnModel(
                       vectorFile, sys_output
                       , classifierFactory: (classToClassId, featureToFeatureId) =>
                           SVMClassifier.LoadModel(modelFile, classToClassId, featureToFeatureId, alphaColumn_i, headerToHeaderIds_model)
                       , getDetailsFunc: GetDetails
                       ));
        }
示例#17
0
            /// <summary>
            /// Builds the feature path from the root down to this node, e.g. "featA&amp;!featB".
            /// A "!" prefix marks a feature taken via a parent's false branch; the root yields "".
            /// </summary>
            public string GetPath(TextIdMapper wordToWordId)
            {
                // The root node contributes nothing to the path.
                if (Parent == null)
                {
                    return(string.Empty);
                }

                // This node's segment: the parent's split feature, negated on the false branch.
                string segment = wordToWordId[this.Parent.f_i];
                if (object.ReferenceEquals(this, Parent.FalseBranch))
                {
                    segment = "!" + segment;
                }

                // Prepend the ancestors' path (if any), separated by '&'.
                string prefix = Parent.GetPath(wordToWordId);
                return(prefix == string.Empty ? segment : prefix + "&" + segment);
            }
示例#18
0
        /// <summary>
        /// Trains a classifier on the vectors in the specified <c>vector_file</c>.
        /// Output the model to the specified <c>model_file</c>.
        /// </summary>
        /// <param name="vector_file">The training vectors (SVM-Light format).</param>
        /// <param name="model_file">A file containing a serialization of the classifier model.</param>
        /// <param name="classifierFactory">Provides the necessary classifier, which must implement ISaveModel.</param>
        internal static void TrainModel <T>(
            FeatureVectorFile vector_file
            , string model_file
            , Func <List <FeatureVector>, TextIdMapper, TextIdMapper, T> classifierFactory
            )
            where T : Classifier, ISaveModel
        {
            var featureToFeatureId = new TextIdMapper();
            var classToClassId     = new TextIdMapper();
            var headerToHeaderIds  = new[] { classToClassId };

            List <FeatureVector> vectors = vector_file.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);

            T classifier = classifierFactory(vectors, classToClassId, featureToFeatureId);

            // NOTE(review): the classification result is deliberately discarded — Classify
            // appears to be invoked for its side effects before saving; confirm.
            classifier.Classify(vectors);

            classifier.SaveModel(model_file, classToClassId, featureToFeatureId);
        }
示例#19
0
        // Methods

        /// <summary>
        /// Computes the model expectation over the training vectors and prints it.
        /// If a model file is given, p(c_i|v) comes from the loaded MaxEnt classifier;
        /// otherwise a uniform distribution 1/|C| is used.
        /// </summary>
        public override bool ExecuteCommand()
        {
            FeatureVectorFile vectorFile_train = new FeatureVectorFile(path: training_data_file, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: false);

            // The single header column holds the gold class.
            int          gold_i             = 0;
            TextIdMapper featureToFeatureId = new TextIdMapper();
            TextIdMapper classToClassId     = new TextIdMapper();

            TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { classToClassId };

            var trainingVectors   = vectorFile_train.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
            var goldClasses_train = vectorFile_train.Headers[gold_i];

            // model_file is optional.
            Func <int, FeatureVector, double> calculate_Prob_c_v;

            // If it is not given, p(c_i|v) = 1/|C|, where |C| is the number of class labels.
            if (string.IsNullOrWhiteSpace(model_file))
            {
                double kProbability = 1D / classToClassId.Count;
                // FIX: the parameters were previously named (v, c_i) — the reverse of the
                // delegate's (classId, vector) order used by the branch below. The returned
                // value was unaffected (both are ignored), but the names were misleading.
                calculate_Prob_c_v = (c_i, v) => { return(kProbability); };
            }
            // If it is given, the classifier's distribution supplies p(c_i|v).
            else
            {
                MaxEntClassifier classifier = MaxEntClassifier.LoadModel(model_file, classToClassId, featureToFeatureId);
                calculate_Prob_c_v =
                    (c_i, v) =>
                {
                    double[] details;
                    // Classify to obtain the per-class distribution; the predicted class is unused.
                    classifier.Classify(v, out details);
                    return(details[c_i]);
                };
            }

            double[,] expectation = CalculateModelExpectation(trainingVectors, calculate_Prob_c_v);

            OutputEmpiricalCount(expectation, trainingVectors.Count, requiresSort: true);
            return(true);
        }
        /// <summary>
        /// For each vector, replays the TBL transformations from the default class and records
        /// the ones that fired, producing a " featName from_class to_class" entry per applied
        /// transformation in that vector's detail string.
        /// </summary>
        private static string[] GetDetails(Classifier classifier, List <FeatureVector> vectors, TextIdMapper classToClassId, TextIdMapper featureToFeatureId)
        {
            var tblClassifier = (TBLClassifier)classifier;
            var systemClasses = new int[vectors.Count];
            var details       = new string[vectors.Count];

            for (int v_i = 0; v_i < vectors.Count; v_i++)
            {
                var sb           = new StringBuilder();
                int currentClass = tblClassifier.DefaultClass;

                foreach (TBLClassifier.Transformation t in tblClassifier.Transformations)
                {
                    int newClass = tblClassifier.Transform(currentClass, t, vectors[v_i]);

                    // Only transformations that actually change the class are reported.
                    if (newClass != currentClass)
                    {
                        string featName   = featureToFeatureId[t.FeatureId];
                        string from_class = classToClassId[t.FromClass];
                        string to_class   = classToClassId[t.ToClass];
                        sb.Append($" {featName} {from_class} {to_class}");
                        currentClass = newClass;
                    }
                }
                systemClasses[v_i] = currentClass;
                details[v_i]       = sb.ToString();
            }
            return(details);
        }
示例#21
0
        /// <summary>
        /// For each vector, builds a detail string listing every class and its probability —
        /// "\t{label}\t{probability}" per class — ordered from most to least probable.
        /// </summary>
        public static string[] GetDistributionDetails(Classifier classifier, List <FeatureVector> vectors, TextIdMapper classToClassId)
        {
            var details = new string[vectors.Count];

            for (int v_i = 0; v_i < vectors.Count; v_i++)
            {
                // Classify to obtain the per-class distribution (the predicted class itself is unused).
                double[] distribution;
                classifier.Classify(vectors[v_i], out distribution);

                // List the classes from highest to lowest probability.
                var sb = new StringBuilder();
                foreach (var classId in SearchHelper.GetMaxNItems(distribution.Length, distribution))
                {
                    sb.AppendFormat("\t{0}\t{1:0.00000}", classToClassId[classId], distribution[classId]);
                }
                details[v_i] = sb.ToString();
            }
            return(details);
        }
示例#22
0
        /// <summary>
        /// Parses a serialized classifier model from <c>text</c>, growing the class/feature mappers as
        /// new names are encountered, and reconstructs one probability vector per class.
        /// The model alternates "FEATURES FOR CLASS ..." header lines with "feature probability" lines.
        /// </summary>
        /// <param name="text">The full text of the model file.</param>
        /// <param name="classToClassId">Maps class names to internal numeric identifiers (grows on lookup).</param>
        /// <param name="featureToFeatureId">Maps feature names to internal numeric identifiers (grows on lookup).</param>
        /// <param name="lambda_c">Outputs the per-class &lt;default&gt; weights, indexed by class id.</param>
        /// <param name="vectors">Outputs one FeatureVector per class, holding that class's feature probabilities.</param>
        protected static void LoadModel(string text, TextIdMapper classToClassId, TextIdMapper featureToFeatureId, out List <double> lambda_c, out List <FeatureVector> vectors)
        {
            var probability_c_uf = new Dictionary <int, Dictionary <int, double> >();

            lambda_c = new List <double>();
            int    classId          = -1;
            string className        = null;
            Regex  classNamePattern = new Regex(@"FEATURES FOR CLASS (?<className>.+)");
            Regex  featurePattern   = new Regex(@"(?<feature>\S+)\s+(?<probability>.+)");
            int    lineNo           = 0;

            foreach (var line in TextHelper.SplitOnNewline(text))
            {
                lineNo++;
                Match match = classNamePattern.Match(line);
                // Branch A: a class-header line switches the current class.
                // (Use Match.Success rather than probing Groups.Count — same result, clearer intent.)
                if (match.Success)
                {
                    className = match.Groups["className"].Value;
                    int newClassId = classToClassId[className];

                    // If the class changes, make sure that the dictionary for it exists.
                    if (newClassId != classId)
                    {
                        if (probability_c_uf.ContainsKey(newClassId))
                        {
                            Console.Error.WriteLine("Line {0}:\t Category {1} might be listed twice.", lineNo, className);
                        }
                        else
                        {
                            probability_c_uf[newClassId] = new Dictionary <int, double>();
                        }
                    }
                    classId = newClassId;
                }
                // Branch B: a feature line adds a probability to the current class.
                else
                {
                    // A feature line must never precede the first class header.
                    Debug.Assert(classId != -1);
                    Match featureMatch = featurePattern.Match(line);
                    if (featureMatch.Success)
                    {
                        string featureName = featureMatch.Groups["feature"].Value;
                        // Model files are machine-written; parse with the invariant culture so a
                        // comma-decimal locale cannot misread "0.12345".
                        double probability = double.Parse(featureMatch.Groups["probability"].Value, System.Globalization.CultureInfo.InvariantCulture);

                        // The <default> entry is the per-class lambda rather than a real feature.
                        if (featureName == "<default>")
                        {
                            // Classes are assumed to appear in id order — TODO confirm against the writer.
                            Debug.Assert(classId == lambda_c.Count);
                            lambda_c.Add(probability);
                        }
                        else
                        {
                            int featureId = featureToFeatureId[featureName];
                            // Warn on duplicates; the last value read wins.
                            if (probability_c_uf[classId].ContainsKey(featureId))
                            {
                                Console.Error.WriteLine("Line {0}:\tFeature: {1} appears twice in category {2}.", lineNo, featureName, className);
                            }
                            probability_c_uf[classId][featureId] = probability;
                        }
                    }
                }
            }

            // Create feature vectors based on the information we've extracted.
            vectors = new List <FeatureVector>();
            foreach (int c_i in probability_c_uf.Keys)
            {
                ValueCollection features = new ValueCollection(featureToFeatureId.Count);
                foreach (int usedFeatureId in probability_c_uf[c_i].Keys)
                {
                    features[usedFeatureId] = probability_c_uf[c_i][usedFeatureId];
                }
                FeatureVector vector = new FeatureVector(new int[] { c_i }, features, probability_c_uf[c_i].Keys.ToArray(), false);
                vectors.Add(vector);
            }
        }
        /// <summary>Loads and returns a collection of FeatureVectors from this file, stored in SVM-light format.</summary>
        /// <param name="featureToFeatureId">
        /// A mapping between the features' text values and the internal numeric identifiers that represent them
        /// (grows as new features are seen).
        /// </param>
        /// <param name="headerToHeaderIds">
        /// One mapper per header column (e.g. the gold class), translating that column's text to a numeric id.
        /// Must contain exactly <c>NoOfHeaderColumns</c> non-null entries.
        /// </param>
        /// <param name="featureType">Controls how raw word counts are converted into feature values (via <c>GetFeatureValue</c>).</param>
        /// <returns>One FeatureVector per data row of the file.</returns>
        public List <FeatureVector> LoadFromSVMLight(
            TextIdMapper featureToFeatureId
            , TextIdMapper[] headerToHeaderIds
            , FeatureType featureType)
        {
            Debug.Assert(headerToHeaderIds != null && headerToHeaderIds.Length == this.NoOfHeaderColumns);

            // Step 1: Read the data file:
            string[] lines = File.ReadAllLines(this.Path);

            var wordBags_i = new List <Dictionary <int, int> >();

            // Now that we know the number of lines, we can create the arrays for storing the header columns.
            for (int j = 0; j < Headers.Length; j++)
            {
                Headers[j] = new int[lines.Length];
                Debug.Assert(headerToHeaderIds[j] != null);
            }

            // Store the header rows verbatim; callers may re-parse them later (e.g. for model meta-data).
            HeaderRows = new string[NoOfHeaderRows];
            for (int i = 0; i < NoOfHeaderRows; i++)
            {
                HeaderRows[i] = lines[i];
            }

            // Parse 1: tokenize each data row into header ids plus a bag of (featureId -> count).
            for (int i = NoOfHeaderRows; i < lines.Length; i++)
            {
                string line   = lines[i];
                var    chunks = TextHelper.SplitOnWhitespaceOr(line, FeatureDelimiter);

                // The leading chunks are the header columns (e.g. the gold class):
                int j = 0;
                for (; j < Headers.Length; j++)
                {
                    Headers[j][i - NoOfHeaderRows] = headerToHeaderIds[j][chunks[j]];
                }

                // The remaining chunks alternate feature-name / count pairs.
                var wordToWordCount = new Dictionary <int, int>();
                for (; j < chunks.Length; j += 2)
                {
                    // Data files are machine-written; use the invariant culture for parsing.
                    int count     = Int32.Parse(chunks[j + 1], System.Globalization.CultureInfo.InvariantCulture);
                    var featureId = featureToFeatureId[chunks[j]];
                    // Accumulate, in case the same feature is listed more than once on a row.
                    // (TryGetValue already sets sum to 0 on a miss, so no extra reset is needed.)
                    int sum;
                    wordToWordCount.TryGetValue(featureId, out sum);
                    wordToWordCount[featureId] = sum + count;
                }
                wordBags_i.Add(wordToWordCount);
            }

            // Parse 2:
            // This array is a matrix where each row represents a class and each column represents a word in our dictionary
            // (where the dictionary itself is a dictionary of ALL words in ALL classes).
            var vectors = new List <FeatureVector>();

            for (int i = NoOfHeaderRows; i < lines.Length; i++)
            {
                var   wordCounts   = wordBags_i[i - NoOfHeaderRows];
                var   allFeatures  = new ValueCollection(featureToFeatureId.Count);
                var   usedFeatures = new int[wordCounts.Keys.Count];
                int[] headers_j    = new int[NoOfHeaderColumns];
                for (int j = 0; j < NoOfHeaderColumns; j++)
                {
                    headers_j[j] = Headers[j][i - NoOfHeaderRows];
                }
                int w_i = 0;
                foreach (int f_i in wordCounts.Keys)
                {
                    // Convert the raw count into the requested feature representation.
                    allFeatures[f_i]    = GetFeatureValue(featureType, wordCounts[f_i]);
                    usedFeatures[w_i++] = f_i;
                }
                vectors.Add(new FeatureVector(headers_j, allFeatures, usedFeatures, IsSortRequired));
            }
            return(vectors);
        }
        /// <summary>
        /// Produces one detail string per vector: the single value reported by the classifier's
        /// <c>Classify</c> out-parameter, formatted to five decimal places.
        /// </summary>
        /// <param name="classifier">The classifier whose per-vector output is reported.</param>
        /// <param name="vectors">The vectors to classify.</param>
        /// <param name="classToClassId">Unused here; present so the signature matches the getDetailsFunc delegate.</param>
        /// <param name="featureToFeatureId">Unused here; present so the signature matches the getDetailsFunc delegate.</param>
        /// <returns>One formatted detail string per input vector.</returns>
        private static string[] GetDetails(Classifier classifier, List <FeatureVector> vectors, TextIdMapper classToClassId, TextIdMapper featureToFeatureId)
        {
            var detailsAsText = new string[vectors.Count];

            for (int v_i = 0; v_i < vectors.Count; v_i++)
            {
                double[] details;
                classifier.Classify(vectors[v_i], out details);

                // This variant expects a classifier that reports exactly one detail value.
                Debug.Assert(details.Length == 1);

                // The interpolated string already applies the "0.00000" format; the previous
                // string.Format(...) wrapper around it was redundant.
                detailsAsText[v_i] = $"{details[0]:0.00000}";
            }
            return(detailsAsText);
        }
// ---- Example #25 ----
        /// <summary>
        /// Loads a libSVM (c_svc) model file and returns the classifier matching its kernel type.
        /// </summary>
        /// <param name="vectorFile_model">The model file. Its <c>NoOfHeaderRows</c> is overwritten to match the detected kernel.</param>
        /// <param name="classToclassId">Class-name mapper. NOTE(review): not referenced in this method — confirm whether callers still need to pass it.</param>
        /// <param name="featureToFeatureId">Maps feature names to internal numeric identifiers.</param>
        /// <param name="alphaColumn_i">Index of the header column holding each support vector's alpha weight.</param>
        /// <param name="headerToHeaderIds">One mapper per header column of the model file.</param>
        /// <returns>A kernel-specific <c>Classifier</c> built from the model's support vectors.</returns>
        public static Classifier LoadModel(FeatureVectorFile vectorFile_model, TextIdMapper classToclassId, TextIdMapper featureToFeatureId, int alphaColumn_i, TextIdMapper[] headerToHeaderIds)
        {
            // Peek at the first two lines of the file to determine the kernel type,
            // which dictates how many header rows precede the support vectors.
            int i = 0;
            LibSVM_KernelType kernel_type = LibSVM_KernelType.linear;

            foreach (var line in File.ReadLines(vectorFile_model.Path))
            {
                if (i == 0)
                {
                    // Only C-SVC models are supported.
                    Debug.Assert(line.StartsWith("svm_type", StringComparison.Ordinal) && line.EndsWith("c_svc", StringComparison.Ordinal));
                }
                else if (i == 1)
                {
                    // The kernel name is the last whitespace-separated token (Enum.Parse trims whitespace).
                    kernel_type = (LibSVM_KernelType)Enum.Parse(typeof(LibSVM_KernelType), line.Substring(line.LastIndexOfAny(TextHelper.WhiteSpace)));
                }
                else
                {
                    break;
                }
                i++;
            }

            // Override the number of header rows according to the model type.
            switch (kernel_type)
            {
            case LibSVM_KernelType.linear: vectorFile_model.NoOfHeaderRows = 8; break;

            case LibSVM_KernelType.polynomial: vectorFile_model.NoOfHeaderRows = 11; break;

            case LibSVM_KernelType.rbf: vectorFile_model.NoOfHeaderRows = 9; break;

            case LibSVM_KernelType.sigmoid: vectorFile_model.NoOfHeaderRows = 10; break;

            default: throw new NotImplementedException();
            }

            // Read each of the support vectors:
            var modelVectors = vectorFile_model.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Continuous);

            // Extract the kernel parameters from the remaining header rows (rows 0-1 were consumed above).
            double rho    = 0;
            double gamma  = 0;
            double coef   = 0;
            double degree = 0;

            // The last header row must be the "SV" marker that precedes the support vectors.
            Debug.Assert(vectorFile_model.HeaderRows[vectorFile_model.NoOfHeaderRows - 1] == "SV");
            for (i = 2; i < vectorFile_model.NoOfHeaderRows - 1; i++)
            {
                string line = vectorFile_model.HeaderRows[i];

                // Ignore non-informative meta-data:
                if (line.StartsWith("nr_class", StringComparison.Ordinal) || line.StartsWith("total_sv", StringComparison.Ordinal) || line.StartsWith("label", StringComparison.Ordinal) || line.StartsWith("nr_sv", StringComparison.Ordinal))
                {
                    continue;
                }

                // The value is the last whitespace-separated token; libSVM always writes '.' decimals,
                // so parse with the invariant culture (a comma-decimal locale would otherwise misparse).
                string text = line.Substring(line.LastIndexOfAny(TextHelper.WhiteSpace));
                if (line.StartsWith("rho", StringComparison.Ordinal))
                {
                    rho = double.Parse(text, System.Globalization.CultureInfo.InvariantCulture);
                }
                else if (line.StartsWith("gamma", StringComparison.Ordinal))
                {
                    gamma = double.Parse(text, System.Globalization.CultureInfo.InvariantCulture);
                }
                else if (line.StartsWith("degree", StringComparison.Ordinal))
                {
                    degree = double.Parse(text, System.Globalization.CultureInfo.InvariantCulture);
                }
                else if (line.StartsWith("coef", StringComparison.Ordinal))
                {
                    coef = double.Parse(text, System.Globalization.CultureInfo.InvariantCulture);
                }
                else
                {
                    throw new NotImplementedException();
                }
            }

            // The alpha column of each support vector's header row is its weight.
            double[] weights = new double[modelVectors.Count];
            for (i = 0; i < weights.Length; i++)
            {
                weights[i] = Convert.ToDouble(headerToHeaderIds[alphaColumn_i][vectorFile_model.Headers[alphaColumn_i][i]], System.Globalization.CultureInfo.InvariantCulture);
            }

            // Build the classifier matching the detected kernel.
            switch (kernel_type)
            {
            case LibSVM_KernelType.linear:
                return(new LibSVMClassifier_Linear(modelVectors, weights, rho));

            case LibSVM_KernelType.polynomial:
                return(new LibSVMClassifier_Polynomial(modelVectors, weights, rho, degree, gamma, coef));

            case LibSVM_KernelType.rbf:
                return(new LibSVMClassifier_RBF(modelVectors, weights, rho, gamma));

            case LibSVM_KernelType.sigmoid:
                return(new LibSVMClassifier_Sigmoid(modelVectors, weights, rho, gamma, coef));

            default:
                throw new NotImplementedException();
            }
        }