Exemplo n.º 1
0
        // Public Methods

        /// <summary>
        /// Loads the training and test files, builds a <c>kNNClassifier</c> from the training vectors,
        /// classifies both sets and writes a report for each of them to <c>sys_output</c>.
        /// </summary>
        /// <returns>The value returned by <c>ProgramOutput.GenerateSysOutput</c> for the test data.</returns>
        public override double ExecuteCommand()
        {
            var trainFile = new FeatureVectorFile(path: training_data_file, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: true);
            var testFile  = new FeatureVectorFile(path: test_data_file, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: true);

            // Index of the header column that is read as the gold class.
            int goldColumn = 0;
            var featureIds = new TextIdMapper();
            var classIds   = new TextIdMapper();

            // Both files are loaded with the same mappers, so they share one id space.
            var headerMappers = new TextIdMapper[] { classIds };

            var trainVectors = trainFile.LoadFromSVMLight(featureIds, headerMappers, FeatureType.Binary);
            var trainGold    = trainFile.Headers[goldColumn];

            var testVectors = testFile.LoadFromSVMLight(featureIds, headerMappers, FeatureType.Binary);
            var testGold    = testFile.Headers[goldColumn];

            var knn = new kNNClassifier(k_val, (SimilarityFunction)similarity_func, trainVectors, classIds.Count, goldColumn);

            var trainPredictions = knn.Classify(trainVectors);
            var testPredictions  = knn.Classify(testVectors);

            var trainDetails = ProgramOutput.GetDistributionDetails(knn, trainVectors, classIds);
            var testDetails  = ProgramOutput.GetDistributionDetails(knn, testVectors, classIds);

            // The training report creates the output file; the test report is appended to it.
            ProgramOutput.GenerateSysOutput(sys_output, FileCreationMode.CreateNew, trainVectors, classIds, trainGold, trainPredictions, trainDetails, "training data");

            return ProgramOutput.GenerateSysOutput(sys_output, FileCreationMode.Append, testVectors, classIds, testGold, testPredictions, testDetails, "test data");
        }
Exemplo n.º 2
0
        // Methods

        /// <summary>
        /// Loads the boundary file, the stored MaxEnt POS model and the test vectors, writes the text
        /// produced by <c>GenerateSysOutput</c> to <c>sys_output</c>, and returns the accuracy of the
        /// resulting confusion matrix.
        /// </summary>
        /// <returns>The result of <c>CalculateAccuracy()</c> on the confusion matrix filled by <c>GenerateSysOutput</c>.</returns>
        public override double ExecuteCommand()
        {
            // Header layout: column 0 is the instance name, column 1 is the gold class.
            int nameColumn = 0;
            int goldColumn = 1;

            featureToFeatureId = new TextIdMapper();
            classToClassId     = new TextIdMapper();
            var nameIds = new TextIdMapper();

            var headerMappers = new TextIdMapper[] { nameIds, classToClassId };

            var vectorFile = new FeatureVectorFile(path: vector_file, noOfHeaderColumns: 2, featureDelimiter: ' ', isSortRequired: false);

            // Sentence boundaries.
            int[] sentenceLengths = ReadBoundaryFile(boundary_file);

            // The model is loaded before the vectors, using the same class and feature mappers.
            classifier = MaxEntPOSClassifier.LoadModel(model_file, classToClassId, featureToFeatureId);

            var testVectors = vectorFile.LoadFromSVMLight(featureToFeatureId, headerMappers, FeatureType.Continuous);

            // The gold ids are read here but not used further in this method.
            int[]    goldClassIds    = vectorFile.Headers[goldColumn];
            int[]    instanceNameIds = vectorFile.Headers[nameColumn];
            string[] instanceNames   = nameIds.GetValues(instanceNameIds);

            // Build the report text, then write it out in one go.
            ConfusionMatrix matrix;
            string report = GenerateSysOutput(instanceNames, testVectors, sentenceLengths, out matrix, goldColumn);

            File.WriteAllText(sys_output, report);
            return matrix.CalculateAccuracy();
        }
Exemplo n.º 3
0
        /// <summary>
        /// Obtains a classifier from <paramref name="classifierFactory"/>, classifies the vectors loaded
        /// from <paramref name="vectorFile"/>, and writes a report on the classification to
        /// <paramref name="sys_output"/>.
        /// </summary>
        /// <param name="vectorFile">The vector file to load (via <c>LoadFromSVMLight</c>, binary features). Header column 0 is read as the gold class.</param>
        /// <param name="sys_output">Path of the report; passed to <c>ProgramOutput.GenerateSysOutput</c> with <c>FileCreationMode.CreateNew</c>.</param>
        /// <param name="classifierFactory">
        /// Provides the classifier. It is invoked with the class mapper and the feature mapper (in that order)
        /// before any vectors are loaded — presumably so a stored model can populate the mappers first; confirm against callers.
        /// </param>
        /// <param name="getDetailsFunc">
        /// Invoked with the classifier, the loaded vectors, the class mapper and the feature mapper;
        /// its result is passed to the report as the per-vector details.
        /// </param>
        /// <returns>The value returned by <c>ProgramOutput.GenerateSysOutput</c> (treated by this method as the accuracy).</returns>
        internal static double ReportOnModel(
            FeatureVectorFile vectorFile
            , string sys_output
            , Func <TextIdMapper, TextIdMapper, Classifier> classifierFactory
            , Func <Classifier, List <FeatureVector>, TextIdMapper, TextIdMapper, string[]> getDetailsFunc
            )
        {
            // Index of the header column that holds the gold class.
            int          gold_i             = 0;
            TextIdMapper featureToFeatureId = new TextIdMapper();
            TextIdMapper classToClassId     = new TextIdMapper();

            TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { classToClassId };

            // The factory runs first and receives the (still empty) mappers; the vectors are loaded afterwards with the same mappers.
            Classifier classifier = classifierFactory(classToClassId, featureToFeatureId);

            var vectors     = vectorFile.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
            var goldClasses = vectorFile.Headers[gold_i];

            var systemClasses = classifier.Classify(vectors);

            string[] details = getDetailsFunc(classifier, vectors, classToClassId, featureToFeatureId);

            // The report heading is the file name of the vector file.
            var accuracy = ProgramOutput.GenerateSysOutput(sys_output, FileCreationMode.CreateNew, vectors, classToClassId, goldClasses, systemClasses, details, heading: Path.GetFileName(vectorFile.Path));

            return(accuracy);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Loads the training and test vectors, obtains a classifier from <paramref name="classifierFactory"/>
        /// for the training vectors, classifies both sets, and writes a report for each of them to
        /// <paramref name="output_file"/>.
        /// </summary>
        /// <param name="vectorFile_train">The training vector file (loaded via <c>LoadFromSVMLight</c>, binary features). Header column 0 is read as the gold class.</param>
        /// <param name="vectorFile_test">The test vector file; it is loaded with the same mappers as the training file.</param>
        /// <param name="output_file">Path of the report. The training section is written with <c>FileCreationMode.CreateNew</c>, the test section with <c>FileCreationMode.Append</c>.</param>
        /// <param name="classifierFactory">Provides the classifier. It is invoked with the training vectors, the number of classes and the gold header index.</param>
        /// <param name="getDetailsFunc">
        /// Produces the per-vector details for the report; invoked with the classifier, the vectors and the class mapper.
        /// When <c>null</c>, <c>ProgramOutput.GetDistributionDetails</c> is used instead.
        /// </param>
        internal static void ReportOnTrainingAndTesting(
            FeatureVectorFile vectorFile_train
            , FeatureVectorFile vectorFile_test
            , string output_file
            , Func <List <FeatureVector>, int, int, Classifier> classifierFactory
            , Func <Classifier, List <FeatureVector>, TextIdMapper, string[]> getDetailsFunc
            )
        {
            // Previously this parameter was ignored and GetDistributionDetails was always used.
            // Honour the caller's delegate, keeping the old behaviour as the fallback.
            if (getDetailsFunc == null)
            {
                getDetailsFunc = (c, v, m) => ProgramOutput.GetDistributionDetails(c, v, m);
            }

            // Index of the header column that holds the gold class.
            int          gold_i             = 0;
            TextIdMapper featureToFeatureId = new TextIdMapper();
            TextIdMapper classToClassId     = new TextIdMapper();

            TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { classToClassId };

            var trainingVectors   = vectorFile_train.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
            var goldClasses_train = vectorFile_train.Headers[gold_i];

            var testVectors      = vectorFile_test.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
            var goldClasses_test = vectorFile_test.Headers[gold_i];

            // The class count is read only after both files are loaded.
            Classifier classifier = classifierFactory(trainingVectors, classToClassId.Count, gold_i);

            var systemClasses_train = classifier.Classify(trainingVectors);
            var systemClasses_test  = classifier.Classify(testVectors);

            var details_train = getDetailsFunc(classifier, trainingVectors, classToClassId);
            var details_test  = getDetailsFunc(classifier, testVectors, classToClassId);

            ProgramOutput.GenerateSysOutput(output_file, FileCreationMode.CreateNew, trainingVectors, classToClassId, goldClasses_train, systemClasses_train, details_train, "training data");
            ProgramOutput.GenerateSysOutput(output_file, FileCreationMode.Append, testVectors, classToClassId, goldClasses_test, systemClasses_test, details_test, "test data");
        }
Exemplo n.º 5
0
        // Methods

        /// <summary>
        /// Reads SVM-light formatted data from STDIN, builds a contingency table (class x feature value)
        /// for every feature, computes a chi-square value per feature and reports the results.
        /// </summary>
        /// <returns>Always <c>true</c>.</returns>
        public override bool ExecuteCommand()
        {
            // Index of the header column that holds the gold class.
            int gold_i = 0;

            featureToFeatureId = new TextIdMapper();
            classToClassId     = new TextIdMapper();
            var headerMappers = new TextIdMapper[] { classToClassId };

            // The loader works on files, so STDIN is first spooled to a temporary file.
            string input = Console.In.ReadToEnd();

            Console.Error.WriteLine("{0} characters of input received.", input.Length);
            string tempPath = Path.GetTempFileName();

            int[] goldClasses;
            List<FeatureVector> vectors;

            try
            {
                File.WriteAllText(tempPath, input);
                var vectorFile = new FeatureVectorFile(path: tempPath, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: false);

                vectors     = vectorFile.LoadFromSVMLight(featureToFeatureId, headerMappers, FeatureType.Binary);
                goldClasses = vectorFile.Headers[gold_i];
            }
            finally
            {
                // The temporary file is removed whether or not loading succeeded.
                File.Delete(tempPath);
            }
            Debug.Assert(vectors.Count > 0);

            int featureCount = featureToFeatureId.Count;
            int classCount   = classToClassId.Count;

            var chiSquare = new IdValuePair<double>[featureCount];
            //TODO: Make the implementation less binary dependent (i.e. the hardcoded 2 below).
            var tables = new double[featureCount][,];

            for (int f = 0; f < featureCount; f++)
            {
                // One table per feature: rows are classes, columns are the feature values 0 and 1.
                double[,] table = new double[classCount, 2];
                foreach (FeatureVector vector in vectors)
                {
                    table[vector.Headers[gold_i], (int)vector.Features[f]]++;
                }
                tables[f] = table;

                double score = StatisticsHelper.CalculateChiSquare(table);
                chiSquare[f] = new IdValuePair<double>(f, score);
            }

            ReportChiSquareResults(tables, chiSquare);
            return true;
        }
Exemplo n.º 6
0
        // Methods

        /// <summary>
        /// Loads the boundary file, the stored MaxEnt POS model and the test vectors, resolves the name of
        /// every instance, generates the report via <c>GenerateSysOutput</c> and returns the accuracy of
        /// the resulting confusion matrix.
        /// </summary>
        /// <returns>The result of <c>CalculateAccuracy()</c> on the confusion matrix returned by <c>GenerateSysOutput</c>.</returns>
        public override double ExecuteCommand()
        {
            // NOTE(review): noOfHeaderColumns is 1 here, yet two header mappers are supplied and
            // Headers[1] is read below — confirm this matches what FeatureVectorFile expects.
            FeatureVectorFile vectorFile = new FeatureVectorFile(path: vector_file, noOfHeaderColumns: 1, featureDelimiter: ':', isSortRequired: false);

            // Initialize the text-to-Id mappers:
            featureToFeatureId = new TextIdMapper();
            int instanceName_i = 0;
            int gold_i         = 1;

            classToClassId = new TextIdMapper();
            var instanceNameToInstanceNameId = new TextIdMapper();

            TextIdMapper[] headerToHeaderIds = new TextIdMapper[]
            {
                instanceNameToInstanceNameId
                , classToClassId
            };

            // Read the boundaries:
            int[] sentenceLengths = ReadBoundaryFile(boundary_file);

            // Read the classifier model:
            classifier = MaxEntPOSClassifier.LoadModel(model_file, classToClassId, featureToFeatureId);

            // Read the vectors:
            var testVectors = vectorFile.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Continuous);

            int[] instanceNameIds = vectorFile.Headers[instanceName_i];
            // The gold classes are read here but not used further in this method.
            int[] goldClasses     = vectorFile.Headers[gold_i];

            // Map every instance-name id back to its text.
            // The lookup key must be the id stored at position i, not the position itself:
            // the two differ whenever ids are not assigned in strict row order.
            string[] instanceNames = new string[instanceNameIds.Length];
            for (int i = 0; i < instanceNameIds.Length; i++)
            {
                int instanceNameId = instanceNameIds[i];
                instanceNames[i] = instanceNameToInstanceNameId[instanceNameId];
            }

            // Generate sys_output:
            var confusionMatrix = GenerateSysOutput(sys_output, instanceNames, testVectors, sentenceLengths, gold_i);

            return(confusionMatrix.CalculateAccuracy());
        }
        // Methods

        /// <summary>
        /// Loads the vector file, calculates the observation and empirical expectation tables for the
        /// loaded vectors, and outputs them via <c>OutputEmpiricalCount</c>.
        /// </summary>
        /// <returns>Always <c>true</c>.</returns>
        public override bool ExecuteCommand()
        {
            var source = new FeatureVectorFile(path: vector_file, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: false);

            // Index of the header column that holds the gold class.
            int goldColumn = 0;
            var featureIds = new TextIdMapper();
            var classIds   = new TextIdMapper();

            var headerMappers = new TextIdMapper[] { classIds };

            // Load the training file.
            var vectors = source.LoadFromSVMLight(featureIds, headerMappers, FeatureType.Binary);

            // The gold classes are read here but not used further in this method.
            var gold = source.Headers[goldColumn];

            double[,] observed;
            double[,] expected;
            CalculateObservationAndEmpiricalExpectation(vectors, out observed, out expected);

            OutputEmpiricalCount(observed, expected);
            return true;
        }
Exemplo n.º 8
0
        /// <summary>
        /// Loads the vectors of <paramref name="vector_file"/>, obtains a classifier for them from
        /// <paramref name="classifierFactory"/>, runs the classifier over the same vectors, and saves the
        /// model to <paramref name="model_file"/>.
        /// </summary>
        /// <typeparam name="T">A classifier type that can also save its model (<c>Classifier</c> and <c>ISaveModel</c>).</typeparam>
        /// <param name="vector_file">The vector file to load (via <c>LoadFromSVMLight</c>, binary features, one class header mapper).</param>
        /// <param name="model_file">Destination passed to <c>SaveModel</c>, together with the class and feature mappers.</param>
        /// <param name="classifierFactory">Provides the classifier. It is invoked with the loaded vectors, the class mapper and the feature mapper.</param>
        internal static void TrainModel<T>(
            FeatureVectorFile vector_file
            , string model_file
            , Func<List<FeatureVector>, TextIdMapper, TextIdMapper, T> classifierFactory
            )
            where T : Classifier, ISaveModel
        {
            var featureIds = new TextIdMapper();
            var classIds   = new TextIdMapper();

            var headerMappers = new TextIdMapper[] { classIds };

            var loaded = vector_file.LoadFromSVMLight(featureIds, headerMappers, FeatureType.Binary);

            T model = classifierFactory(loaded, classIds, featureIds);

            // The classification result is deliberately discarded; the call itself is kept.
            model.Classify(loaded);

            model.SaveModel(model_file, classIds, featureIds);
        }
Exemplo n.º 9
0
        // Methods

        /// <summary>
        /// Loads the training vectors, calculates the model expectation for them and outputs it via
        /// <c>OutputEmpiricalCount</c>. The per-class probability comes from the model in
        /// <c>model_file</c> when one is given, and is uniform over the classes otherwise.
        /// </summary>
        /// <returns>Always <c>true</c>.</returns>
        public override bool ExecuteCommand()
        {
            var trainFile = new FeatureVectorFile(path: training_data_file, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: false);

            // Index of the header column that holds the gold class.
            int goldColumn = 0;
            var featureIds = new TextIdMapper();
            var classIds   = new TextIdMapper();

            var headerMappers = new TextIdMapper[] { classIds };

            var vectors = trainFile.LoadFromSVMLight(featureIds, headerMappers, FeatureType.Binary);

            // The gold classes are read here but not used further in this method.
            var gold = trainFile.Headers[goldColumn];

            // Probability function: takes a class id and a vector.
            Func<int, FeatureVector, double> probabilityOf;

            if (!string.IsNullOrWhiteSpace(model_file))
            {
                // A model was supplied: take the probability from the details array filled by Classify.
                MaxEntClassifier model = MaxEntClassifier.LoadModel(model_file, classIds, featureIds);
                probabilityOf = (c_i, v) =>
                {
                    double[] distribution;
                    model.Classify(v, out distribution);
                    return distribution[c_i];
                };
            }
            else
            {
                // model_file is optional. Without it every class gets 1/|C|, where |C| is the number of class labels.
                double uniform = 1D / classIds.Count;
                probabilityOf = (c_i, v) => uniform;
            }

            double[,] expectation = CalculateModelExpectation(vectors, probabilityOf);

            OutputEmpiricalCount(expectation, vectors.Count, requiresSort: true);
            return true;
        }