/// <summary>
/// Writes each internal array of the provided sparse matrix to a different file. All these filenames have a common
/// prefix, extracted from <paramref name="pathBase"/>, and a suffix corresponding to the purpose of each array.
/// </summary>
/// <param name="matrix">The sparse matrix to write.</param>
/// <param name="pathBase">The base path. This filename will not be used directly. Instead one file per internal array
/// of <paramref name="matrix"/> will be used. Each file is named after the file name of
/// <paramref name="pathBase"/> (without extension), followed by "-" and the lower-cased title of that array, and then
/// the extension specified in <paramref name="pathBase"/>. The files are placed in the directory of
/// <paramref name="pathBase"/>.</param>
/// <param name="writeArrayLengthFirst">If true, the first line of each file will contain the length of the corresponding
/// array. If false, there will be only one line per file, which will contain the array.</param>
public void WriteToMultipleFiles(ISparseMatrix matrix, string pathBase, bool writeArrayLengthFirst = true)
{
    // TODO: this should be a different writer

    // Path.GetDirectoryName yields null for a root path and "" when pathBase carries no directory part.
    // Path.Combine returns its second argument when the first one is empty, so a bare file name stays relative
    // instead of being prefixed with a separator (which would point at the root of the current drive).
    string directory = Path.GetDirectoryName(pathBase) ?? string.Empty;
    string nameOnly = Path.GetFileNameWithoutExtension(pathBase);
    string ext = Path.GetExtension(pathBase);
    SparseFormat sparseFormat = matrix.GetSparseFormat();

    // Values array. The invariant culture keeps the generated file name independent of the current culture.
    string valuesSuffix = "-" + sparseFormat.RawValuesTitle.ToLowerInvariant();
    string valuesPath = Path.Combine(directory, nameOnly + valuesSuffix + ext);
    using (var writer = new StreamWriter(valuesPath))
    {
#if DEBUG
        writer.AutoFlush = true; // To look at intermediate output at certain breakpoints
#endif
        WriteArray(sparseFormat.RawValuesArray, writer, writeArrayLengthFirst);
    }

    // Indexing arrays: one file per (title, array) pair.
    foreach (var nameArrayPair in sparseFormat.RawIndexArrays)
    {
        string indexerSuffix = "-" + nameArrayPair.Key.ToLowerInvariant();
        string indexerPath = Path.Combine(directory, nameOnly + indexerSuffix + ext);
        using (var writer = new StreamWriter(indexerPath))
        {
#if DEBUG
            writer.AutoFlush = true; // To look at intermediate output at certain breakpoints
#endif
            WriteArray(nameArrayPair.Value, writer, writeArrayLengthFirst);
        }
    }
}
// Saves a small labelled data set GZip-compressed, loads it back, saves the loaded data uncompressed
// and compares the resulting text file with the expected contents.
public void WriteBoolTest_compressed()
{
    double[][] samples =
    {
        new double[] { 1, 2, 0, 3, 0 },
        new double[] { 6, 0, 4, 2, 0 },
        new double[] { 0, 0, 0, 0, 0 },
    };
    bool[] outputs = { false, true, false };

    // Write the compressed file.
    SparseFormat.Save(samples, outputs, "test.txt.gz", compression: SerializerCompression.GZip);

    // Read it back.
    Sparse<double>[] loadedSamples;
    bool[] loadedOutputs;
    SparseFormat.Load("test.txt.gz", out loadedSamples, out loadedOutputs, compression: SerializerCompression.GZip);

    // Re-save what was loaded as plain text so it can be compared as a string.
    SparseFormat.Save(loadedSamples, loadedOutputs, "test2.txt");

    string expected = @"-1 1:1 2:2 4:3 1 1:6 3:4 4:2 -1 ";
    string actual = File.ReadAllText("test2.txt");

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Writes the values array of <paramref name="matrix"/> and then each of its index arrays to
/// <paramref name="writer"/>. Every array is preceded by its title followed by ": ".
/// </summary>
/// <param name="matrix">The sparse matrix whose raw arrays are written.</param>
/// <param name="writer">The destination of the output.</param>
private void WriteToStream(ISparseMatrix matrix, StreamWriter writer)
{
    SparseFormat format = matrix.GetSparseFormat();

    // Values array: title first, optionally on a line of its own.
    string valuesHeader = format.RawValuesTitle + ": ";
    writer.Write(valuesHeader);
    if (titlesOnOtherLines)
    {
        writer.WriteLine();
    }
    WriteArray(format.RawValuesArray, writer, false);

    // Index arrays, in the order the format enumerates them.
    foreach (var titledArray in format.RawIndexArrays)
    {
        // Always break the line after the previous array, otherwise everything would be on the same line.
        writer.WriteLine();
        if (lineBetweenArrays)
        {
            // One extra line break leaves an empty line between consecutive arrays.
            writer.WriteLine();
        }

        string indexHeader = titledArray.Key + ": ";
        writer.Write(indexHeader);
        if (titlesOnOtherLines)
        {
            writer.WriteLine();
        }
        WriteArray(titledArray.Value, writer, false);
    }
}
// Saves a small labelled data set GZip-compressed to test_txt_gz, loads it back, saves the loaded data
// uncompressed to test2_txt and compares that file with the expected contents.
public void WriteBoolTest_compressed()
{
    double[][] samples = new[]
    {
        new double[] { 1, 2, 0, 3, 0 },
        new double[] { 6, 0, 4, 2, 0 },
        new double[] { 0, 0, 0, 0, 0 },
    };

    bool[] outputs = { false, true, false };

    SparseFormat.Save(samples, outputs, test_txt_gz, compression: SerializerCompression.GZip);

    Sparse<double>[] newSamples;
    bool[] newOutput;
    SparseFormat.Load(test_txt_gz, out newSamples, out newOutput, compression: SerializerCompression.GZip);

    // Write to the very same path that is read below. Previously the file was written to a separately
    // built path, so the test only worked while test2_txt happened to resolve to that same location.
    SparseFormat.Save(newSamples, newOutput, test2_txt);

    string actual = File.ReadAllText(test2_txt);

    string expected = @"-1 1:1 2:2 4:3 1 1:6 3:4 4:2 -1 ";

    // Normalize the line endings of the literal to the ones used on the current platform.
    expected = expected.Replace("\r\n", Environment.NewLine);

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// See <see cref="ISparseMatrix.GetSparseFormat"/>. The returned format exposes the values array under the title
/// "Values" and the two index arrays under the titles "Row indices" and "Column offsets".
/// </summary>
public SparseFormat GetSparseFormat()
{
    var result = new SparseFormat
    {
        RawValuesTitle = "Values",
        RawValuesArray = values
    };

    // Index arrays are registered by title, in this order.
    result.RawIndexArrays.Add("Row indices", rowIndices);
    result.RawIndexArrays.Add("Column offsets", colOffsets);

    return result;
}
// Saves a small labelled data set to "test.txt" and compares the written text with the expected contents.
public void WriteBoolTest()
{
    double[][] samples =
    {
        new double[] { 1, 2, 0, 3, 0 },
        new double[] { 6, 0, 4, 2, 0 },
        new double[] { 0, 0, 0, 0, 0 },
    };
    bool[] outputs = { false, true, false };

    string expected = @"-1 1:1 2:2 4:3 1 1:6 3:4 4:2 -1 ";

    // Write the file, then read it back as a single string.
    SparseFormat.Save(samples, outputs, "test.txt");
    string actual = File.ReadAllText("test.txt");

    Assert.AreEqual(expected, actual);
}
// Saves a small labelled data set to test_txt and compares the written text with the expected contents,
// after adapting the expected line endings to the current platform.
public void WriteBoolTest()
{
    double[][] samples =
    {
        new double[] { 1, 2, 0, 3, 0 },
        new double[] { 6, 0, 4, 2, 0 },
        new double[] { 0, 0, 0, 0, 0 },
    };
    bool[] outputs = { false, true, false };

    // Write the file, then read it back as a single string.
    SparseFormat.Save(samples, outputs, test_txt);
    string actual = File.ReadAllText(test_txt);

    string expectedRaw = @"-1 1:1 2:2 4:3 1 1:6 3:4 4:2 -1 ";
    string expected = expectedRaw.Replace("\r\n", Environment.NewLine);

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Prepares (or loads from disk) a TF-IDF transformed training and testing set, learns a linear sparse binary
/// support vector machine with averaged stochastic gradient descent, and prints timings and accuracies for
/// both sets to the console.
/// </summary>
private static void TestLinearASGD()
{
    // http://leon.bottou.org/projects/sgd

    string codebookPath = "codebook.bin";
    string x_train_fn = "x_train.txt.gz";
    string x_test_fn = "x_test.txt.gz";

    Sparse<double>[] xTrain = null, xTest = null;
    bool[] yTrain = null, yTest = null;

    // Check if we have the precomputed dataset on disk. Both files are required: the original condition
    // tested the training file twice, so a missing testing file went unnoticed.
    if (!File.Exists(x_train_fn) || !File.Exists(x_test_fn))
    {
        Console.WriteLine("Downloading dataset");
        RCV1v2 rcv1v2 = new RCV1v2(@"C:\Temp\");

        // Note: Leon Bottou's SGD inverts training and
        // testing when benchmarking in this dataset
        var trainWords = rcv1v2.Testing.Item1;
        var testWords = rcv1v2.Training.Item1;

        string positiveClass = "CCAT";
        yTrain = rcv1v2.Testing.Item2.Apply(x => x.Contains(positiveClass));
        yTest = rcv1v2.Training.Item2.Apply(x => x.Contains(positiveClass));

        TFIDF tfidf;
        if (!File.Exists(codebookPath))
        {
            Console.WriteLine("Learning TD-IDF");

            // Create a TF-IDF considering only words that
            // exist in both the training and testing sets
            tfidf = new TFIDF(testWords)
            {
                Tf = TermFrequency.Log,
                Idf = InverseDocumentFrequency.Default,
            };

            // Learn the training set
            tfidf.Learn(trainWords);

            Console.WriteLine("Saving codebook");
            tfidf.Save(codebookPath);
        }
        else
        {
            Console.WriteLine("Loading codebook");
            Serializer.Load(codebookPath, out tfidf);
        }

        if (!File.Exists(x_train_fn))
        {
            // Transform and normalize training set
            Console.WriteLine("Pre-processing training set");
            xTrain = tfidf.Transform(trainWords, out xTrain);

            Console.WriteLine("Post-processing training set");
            xTrain = xTrain.Divide(Norm.Euclidean(xTrain, dimension: 1), result: xTrain);

            Console.WriteLine("Saving training set to disk");
            SparseFormat.Save(xTrain, yTrain, x_train_fn, compression: SerializerCompression.GZip);
        }

        if (!File.Exists(x_test_fn))
        {
            // Transform and normalize testing set
            Console.WriteLine("Pre-processing testing set");
            xTest = tfidf.Transform(testWords, out xTest);

            Console.WriteLine("Post-processing testing set");
            xTest = xTest.Divide(Norm.Euclidean(xTest, dimension: 1), result: xTest);

            Console.WriteLine("Saving testing set to disk");
            SparseFormat.Save(xTest, yTest, x_test_fn, compression: SerializerCompression.GZip);
        }
    }

    // Load from disk whatever has not been built above. This used to live in the else branch, where the
    // null checks were always true, while a set whose file already existed was left null when only the
    // other file was missing.
    bool trainMissing = xTrain == null || yTrain == null;
    bool testMissing = xTest == null || yTest == null;
    if (trainMissing || testMissing)
    {
        Console.WriteLine("Loading dataset from disk");
        if (trainMissing)
        {
            SparseFormat.Load(x_train_fn, out xTrain, out yTrain, compression: SerializerCompression.GZip);
        }
        if (testMissing)
        {
            SparseFormat.Load(x_test_fn, out xTest, out yTest, compression: SerializerCompression.GZip);
        }
    }

    int positiveTrain = yTrain.Count(x => x);
    int positiveTest = yTest.Count(x => x);
    int negativeTrain = yTrain.Length - positiveTrain;
    int negativeTest = yTest.Length - positiveTest;

    Console.WriteLine("Training samples: {0} [{1}+, {2}-]", positiveTrain + negativeTrain, positiveTrain, negativeTrain);
    // These are the statistics of the testing set (the label used to read "Negative samples").
    Console.WriteLine("Testing samples: {0} [{1}+, {2}-]", positiveTest + negativeTest, positiveTest, negativeTest);

    // Create and learn a linear sparse binary support vector machine
    var learn = new AveragedStochasticGradientDescent<Linear, Sparse<double>>()
    {
        MaxIterations = 5,
        Tolerance = 0,
    };

    Console.WriteLine("Learning training set");
    Stopwatch sw = Stopwatch.StartNew();
    var svm = learn.Learn(xTrain, yTrain);
    Console.WriteLine(sw.Elapsed);

    Console.WriteLine("Predicting training set");
    sw = Stopwatch.StartNew();
    bool[] trainPred = svm.Decide(xTrain);
    Console.WriteLine(sw.Elapsed);

    var train = new ConfusionMatrix(trainPred, yTrain);
    Console.WriteLine("Train acc: " + train.Accuracy);

    Console.WriteLine("Predicting testing set");
    sw = Stopwatch.StartNew();
    bool[] testPred = svm.Decide(xTest);
    Console.WriteLine(sw.Elapsed);

    var test = new ConfusionMatrix(testPred, yTest);
    Console.WriteLine("Test acc: " + test.Accuracy);
}