Пример #1
0
        /// <summary>
        /// Writes each internal array of the provided sparse matrix to a different file. All these filenames share a common
        /// prefix, extracted from <paramref name="pathBase"/>, and end with a suffix naming the array they contain.
        /// </summary>
        /// <param name="matrix">The sparse matrix to write.</param>
        /// <param name="pathBase">An absolute path. This filename will not be used directly. Instead one file per internal array
        ///     of <paramref name="matrix"/> will be written, named
        ///     <c>{directory}/{file name without extension}-{lower-cased array title}{extension}</c>, where directory, file
        ///     name and extension are taken from <paramref name="pathBase"/>.</param>
        /// <param name="writeArrayLengthFirst">Forwarded unchanged to <c>WriteArray</c>. If true, the first line of each file
        ///     will contain the length of the corresponding array. If false, there will be only one line per file, which will
        ///     contain the array.</param>
        public void WriteToMultipleFiles(ISparseMatrix matrix, string pathBase, bool writeArrayLengthFirst = true) // TODO: this should be a different writer
        {
            // GetDirectoryName yields null for a root path and "" for a bare file name. Path.Combine rejects null but
            // treats "" as "no directory", so the file lands next to the working directory instead of at the drive root.
            string       directory    = Path.GetDirectoryName(pathBase) ?? string.Empty;
            string       nameOnly     = Path.GetFileNameWithoutExtension(pathBase);
            string       ext          = Path.GetExtension(pathBase);
            SparseFormat sparseFormat = matrix.GetSparseFormat();

            // Values array
            // ToLowerInvariant keeps the generated file names identical regardless of the current culture.
            string valuesSuffix = "-" + sparseFormat.RawValuesTitle.ToLowerInvariant();
            // Path.Combine inserts the platform's separator instead of a hard-coded backslash.
            string valuesPath   = Path.Combine(directory, nameOnly + valuesSuffix + ext);

            using (var writer = new StreamWriter(valuesPath))
            {
#if DEBUG
                writer.AutoFlush = true; // To look at intermediate output at certain breakpoints
#endif
                WriteArray(sparseFormat.RawValuesArray, writer, writeArrayLengthFirst);
            }

            // Indexing arrays: one file each, named after the array's title
            foreach (var nameArrayPair in sparseFormat.RawIndexArrays)
            {
                string indexerSuffix = "-" + nameArrayPair.Key.ToLowerInvariant();
                string indexerPath   = Path.Combine(directory, nameOnly + indexerSuffix + ext);

                using (var writer = new StreamWriter(indexerPath))
                {
#if DEBUG
                    writer.AutoFlush = true; // To look at intermediate output at certain breakpoints
#endif
                    WriteArray(nameArrayPair.Value, writer, writeArrayLengthFirst);
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Saves a small dense data set with boolean labels to a GZip-compressed sparse file, loads it back, re-saves it
        /// uncompressed and checks the resulting text.
        /// </summary>
        public void WriteBoolTest_compressed()
        {
            double[][] samples = new[]
            {
                new double[] { 1, 2, 0, 3, 0 },
                new double[] { 6, 0, 4, 2, 0 },
                new double[] { 0, 0, 0, 0, 0 },
            };

            bool[] outputs = { false, true, false };

            // Compressed round trip: write, then read back into sparse vectors.
            SparseFormat.Save(samples, outputs, "test.txt.gz", compression: SerializerCompression.GZip);

            Sparse<double>[] newSamples;
            bool[]           newOutput;
            SparseFormat.Load("test.txt.gz", out newSamples, out newOutput, compression: SerializerCompression.GZip);

            // Re-save uncompressed so the content can be compared as plain text.
            SparseFormat.Save(newSamples, newOutput, "test2.txt");

            string actual   = File.ReadAllText("test2.txt");
            string expected = @"-1 1:1 2:2 4:3
1 1:6 3:4 4:2
-1 
";

            // The verbatim literal inherits this source file's line endings, and the written file may use a different
            // convention. Normalize both so the comparison checks content rather than the newline style.
            expected = expected.Replace("\r\n", "\n");
            actual   = actual.Replace("\r\n", "\n");

            Assert.AreEqual(expected, actual);
        }
Пример #3
0
        /// <summary>
        /// Writes the raw values array of <paramref name="matrix"/>, followed by each of its raw index arrays, to
        /// <paramref name="writer"/>. Every array is preceded by its title and ": ". The <c>titlesOnOtherLines</c> and
        /// <c>lineBetweenArrays</c> settings of this writer control the extra line breaks.
        /// </summary>
        /// <param name="matrix">The sparse matrix whose internal arrays will be written.</param>
        /// <param name="writer">The destination writer. It is not closed by this method.</param>
        private void WriteToStream(ISparseMatrix matrix, StreamWriter writer)
        {
            SparseFormat format = matrix.GetSparseFormat();

            // The values array comes first, so nothing has to be separated from it yet.
            string valuesHeader = format.RawValuesTitle + ": ";
            writer.Write(valuesHeader);
            if (titlesOnOtherLines)
            {
                writer.WriteLine();
            }
            WriteArray(format.RawValuesArray, writer, false);

            foreach (var indexArray in format.RawIndexArrays)
            {
                // One line break is always emitted so this array does not continue on the previous array's line.
                writer.WriteLine();
                if (lineBetweenArrays)
                {
                    // A second line break leaves an empty line between consecutive arrays.
                    writer.WriteLine();
                }

                string indexHeader = indexArray.Key + ": ";
                writer.Write(indexHeader);
                if (titlesOnOtherLines)
                {
                    writer.WriteLine();
                }
                WriteArray(indexArray.Value, writer, false);
            }
        }
Пример #4
0
        /// <summary>
        /// Saves a small dense data set with boolean labels to a GZip-compressed sparse file, loads it back, re-saves it
        /// uncompressed and checks the resulting text.
        /// </summary>
        public void WriteBoolTest_compressed()
        {
            double[][] samples = new[]
            {
                new double[] { 1, 2, 0, 3, 0 },
                new double[] { 6, 0, 4, 2, 0 },
                new double[] { 0, 0, 0, 0, 0 },
            };

            bool[] outputs = { false, true, false };

            // Compressed round trip: write, then read back into sparse vectors.
            SparseFormat.Save(samples, outputs, test_txt_gz, compression: SerializerCompression.GZip);

            Sparse<double>[] newSamples;
            bool[]           newOutput;
            SparseFormat.Load(test_txt_gz, out newSamples, out newOutput, compression: SerializerCompression.GZip);

            // Write to the very same path that is read back below, so the round trip cannot drift apart if the
            // definition of test2_txt ever changes.
            SparseFormat.Save(newSamples, newOutput, test2_txt);

            string actual   = File.ReadAllText(test2_txt);
            string expected = @"-1 1:1 2:2 4:3
1 1:6 3:4 4:2
-1 
";

            // Align the literal's line endings with the platform's before comparing.
            expected = expected.Replace("\r\n", Environment.NewLine);

            Assert.AreEqual(expected, actual);
        }
Пример #5
0
        /// <summary>
        /// See <see cref="ISparseMatrix.GetSparseFormat"/>. Exposes the values array under the title "Values" and the
        /// two indexing arrays under "Row indices" and "Column offsets".
        /// </summary>
        public SparseFormat GetSparseFormat()
        {
            var sparseFormat = new SparseFormat
            {
                RawValuesTitle = "Values",
                RawValuesArray = values,
            };

            // Registration order is kept: row indices first, then column offsets.
            sparseFormat.RawIndexArrays.Add("Row indices", rowIndices);
            sparseFormat.RawIndexArrays.Add("Column offsets", colOffsets);

            return sparseFormat;
        }
Пример #6
0
        /// <summary>
        /// Saves a small dense data set with boolean labels in sparse text format and checks the written text.
        /// </summary>
        public void WriteBoolTest()
        {
            double[][] samples = new[]
            {
                new double[] { 1, 2, 0, 3, 0 },
                new double[] { 6, 0, 4, 2, 0 },
                new double[] { 0, 0, 0, 0, 0 },
            };

            bool[] outputs = { false, true, false };

            SparseFormat.Save(samples, outputs, "test.txt");

            string actual   = File.ReadAllText("test.txt");
            string expected = @"-1 1:1 2:2 4:3
1 1:6 3:4 4:2
-1 
";

            // The verbatim literal inherits this source file's line endings, and the written file may use a different
            // convention. Normalize both so the comparison checks content rather than the newline style.
            expected = expected.Replace("\r\n", "\n");
            actual   = actual.Replace("\r\n", "\n");

            Assert.AreEqual(expected, actual);
        }
Пример #7
0
        /// <summary>
        /// Saves a small dense data set with boolean labels to <c>test_txt</c> in sparse text format and checks the
        /// written text against the expected content.
        /// </summary>
        public void WriteBoolTest()
        {
            var inputs = new double[][]
            {
                new double[] { 1, 2, 0, 3, 0 },
                new double[] { 6, 0, 4, 2, 0 },
                new double[] { 0, 0, 0, 0, 0 },
            };
            var labels = new[] { false, true, false };

            SparseFormat.Save(inputs, labels, test_txt);

            // The literal's CRLF line endings (if any) are mapped to the platform's newline before comparing.
            string expected = @"-1 1:1 2:2 4:3
1 1:6 3:4 4:2
-1 
".Replace("\r\n", Environment.NewLine);
            string actual = File.ReadAllText(test_txt);

            Assert.AreEqual(expected, actual);
        }
Пример #8
0
        /// <summary>
        /// Trains a linear sparse classifier with averaged stochastic gradient descent on the RCV1-v2 data set
        /// (positive class "CCAT") and prints timings and accuracies for the training and testing sets.
        /// The TF-IDF codebook and the transformed training/testing sets are cached in the working directory as
        /// "codebook.bin", "x_train.txt.gz" and "x_test.txt.gz"; whatever is missing is regenerated.
        /// </summary>
        private static void TestLinearASGD()
        {
            // http://leon.bottou.org/projects/sgd

            string codebookPath = "codebook.bin";
            string x_train_fn   = "x_train.txt.gz";
            string x_test_fn    = "x_test.txt.gz";

            Sparse<double>[] xTrain = null, xTest = null;
            bool[]           yTrain = null, yTest = null;

            // Check if we have the precomputed dataset on disk.
            // Both cache files must exist; if either one is missing it has to be regenerated.
            if (!File.Exists(x_train_fn) || !File.Exists(x_test_fn))
            {
                Console.WriteLine("Downloading dataset");
                RCV1v2 rcv1v2 = new RCV1v2(@"C:\Temp\");

                // Note: Leon Bottou's SGD inverts training and
                // testing when benchmarking in this dataset
                var trainWords = rcv1v2.Testing.Item1;
                var testWords  = rcv1v2.Training.Item1;

                string positiveClass = "CCAT";
                yTrain = rcv1v2.Testing.Item2.Apply(x => x.Contains(positiveClass));
                yTest  = rcv1v2.Training.Item2.Apply(x => x.Contains(positiveClass));

                TFIDF tfidf;
                if (!File.Exists(codebookPath))
                {
                    Console.WriteLine("Learning TF-IDF");
                    // Create a TF-IDF considering only words that
                    // exist in both the training and testing sets
                    tfidf = new TFIDF(testWords)
                    {
                        Tf  = TermFrequency.Log,
                        Idf = InverseDocumentFrequency.Default,
                    };

                    // Learn the training set
                    tfidf.Learn(trainWords);

                    Console.WriteLine("Saving codebook");
                    tfidf.Save(codebookPath);
                }
                else
                {
                    Console.WriteLine("Loading codebook");
                    Serializer.Load(codebookPath, out tfidf);
                }

                if (!File.Exists(x_train_fn))
                {
                    // Transform and normalize training set
                    Console.WriteLine("Pre-processing training set");
                    xTrain = tfidf.Transform(trainWords, out xTrain);

                    Console.WriteLine("Post-processing training set");
                    xTrain = xTrain.Divide(Norm.Euclidean(xTrain, dimension: 1), result: xTrain);

                    Console.WriteLine("Saving training set to disk");
                    SparseFormat.Save(xTrain, yTrain, x_train_fn, compression: SerializerCompression.GZip);
                }

                if (!File.Exists(x_test_fn))
                {
                    // Transform and normalize testing set
                    Console.WriteLine("Pre-processing testing set");
                    xTest = tfidf.Transform(testWords, out xTest);

                    Console.WriteLine("Post-processing testing set");
                    xTest = xTest.Divide(Norm.Euclidean(xTest, dimension: 1), result: xTest);

                    Console.WriteLine("Saving testing set to disk");
                    SparseFormat.Save(xTest, yTest, x_test_fn, compression: SerializerCompression.GZip);
                }
            }

            // Anything that was not generated above is already cached on disk. Loading here (rather than in an
            // else-branch) also covers the case where only one of the two files had to be regenerated, which
            // would otherwise leave the other set null.
            bool loadTrain = xTrain == null || yTrain == null;
            bool loadTest  = xTest == null || yTest == null;

            if (loadTrain || loadTest)
            {
                Console.WriteLine("Loading dataset from disk");
            }
            if (loadTrain)
            {
                SparseFormat.Load(x_train_fn, out xTrain, out yTrain, compression: SerializerCompression.GZip);
            }
            if (loadTest)
            {
                SparseFormat.Load(x_test_fn, out xTest, out yTest, compression: SerializerCompression.GZip);
            }

            int positiveTrain = yTrain.Count(x => x);
            int positiveTest  = yTest.Count(x => x);
            int negativeTrain = yTrain.Length - positiveTrain;
            int negativeTest  = yTest.Length - positiveTest;

            Console.WriteLine("Training samples: {0} [{1}+, {2}-]", positiveTrain + negativeTrain, positiveTrain, negativeTrain);
            Console.WriteLine("Testing samples: {0} [{1}+, {2}-]", positiveTest + negativeTest, positiveTest, negativeTest);

            // Create and learn a linear sparse binary support vector machine
            var learn = new AveragedStochasticGradientDescent<Linear, Sparse<double>>()
            {
                MaxIterations = 5,
                Tolerance     = 0,
            };

            Console.WriteLine("Learning training set");
            Stopwatch sw  = Stopwatch.StartNew();
            var       svm = learn.Learn(xTrain, yTrain);

            Console.WriteLine(sw.Elapsed);


            Console.WriteLine("Predicting training set");
            sw = Stopwatch.StartNew();
            bool[] trainPred = svm.Decide(xTrain);
            Console.WriteLine(sw.Elapsed);

            var train = new ConfusionMatrix(trainPred, yTrain);

            Console.WriteLine("Train acc: " + train.Accuracy);


            Console.WriteLine("Predicting testing set");
            sw = Stopwatch.StartNew();
            bool[] testPred = svm.Decide(xTest);
            Console.WriteLine(sw.Elapsed);

            var test = new ConfusionMatrix(testPred, yTest);

            Console.WriteLine("Test acc: " + test.Accuracy);
        }