/// <summary>
///   Benchmarks a linear sparse binary SVM trained with Averaged Stochastic
///   Gradient Descent on the RCV1-v2 dataset, following Leon Bottou's SGD
///   experiments (http://leon.bottou.org/projects/sgd). Downloads and
///   pre-processes the dataset on first run, caching the TF-IDF codebook and
///   the transformed train/test sets to disk; subsequent runs load the caches.
/// </summary>
private static void TestLinearASGD()
{
    // http://leon.bottou.org/projects/sgd

    string codebookPath = "codebook.bin";
    string x_train_fn = "x_train.txt.gz";
    string x_test_fn = "x_test.txt.gz";

    Sparse<double>[] xTrain = null, xTest = null;
    bool[] yTrain = null, yTest = null;

    // Check if we have the precomputed dataset on disk.
    // BUGFIX: the second condition previously re-tested x_train_fn, so a
    // missing test file (with a present train file) skipped this branch and
    // the later load of x_test_fn failed.
    if (!File.Exists(x_train_fn) || !File.Exists(x_test_fn))
    {
        Console.WriteLine("Downloading dataset");
        RCV1v2 rcv1v2 = new RCV1v2(@"C:\Temp\");

        // Note: Leon Bottou's SGD inverts training and
        // testing when benchmarking in this dataset
        var trainWords = rcv1v2.Testing.Item1;
        var testWords = rcv1v2.Training.Item1;

        // Binary task: "is the document in category CCAT?"
        string positiveClass = "CCAT";
        yTrain = rcv1v2.Testing.Item2.Apply(x => x.Contains(positiveClass));
        yTest = rcv1v2.Training.Item2.Apply(x => x.Contains(positiveClass));

        TFIDF tfidf;
        if (!File.Exists(codebookPath))
        {
            // BUGFIX: message previously read "TD-IDF".
            Console.WriteLine("Learning TF-IDF");

            // Create a TF-IDF considering only words that
            // exist in both the training and testing sets
            tfidf = new TFIDF(testWords)
            {
                Tf = TermFrequency.Log,
                Idf = InverseDocumentFrequency.Default,
            };

            // Learn the training set
            tfidf.Learn(trainWords);

            Console.WriteLine("Saving codebook");
            tfidf.Save(codebookPath);
        }
        else
        {
            Console.WriteLine("Loading codebook");
            Serializer.Load(codebookPath, out tfidf);
        }

        if (!File.Exists(x_train_fn))
        {
            // Transform and normalize training set
            Console.WriteLine("Pre-processing training set");
            xTrain = tfidf.Transform(trainWords, out xTrain);

            // L2-normalize each sample so the SVM sees unit-length vectors.
            Console.WriteLine("Post-processing training set");
            xTrain = xTrain.Divide(Norm.Euclidean(xTrain, dimension: 1), result: xTrain);

            Console.WriteLine("Saving training set to disk");
            SparseFormat.Save(xTrain, yTrain, x_train_fn, compression: SerializerCompression.GZip);
        }

        if (!File.Exists(x_test_fn))
        {
            // Transform and normalize testing set
            Console.WriteLine("Pre-processing testing set");
            xTest = tfidf.Transform(testWords, out xTest);

            Console.WriteLine("Post-processing testing set");
            xTest = xTest.Divide(Norm.Euclidean(xTest, dimension: 1), result: xTest);

            Console.WriteLine("Saving testing set to disk");
            SparseFormat.Save(xTest, yTest, x_test_fn, compression: SerializerCompression.GZip);
        }
    }
    else
    {
        Console.WriteLine("Loading dataset from disk");

        if (xTrain == null || yTrain == null)
            SparseFormat.Load(x_train_fn, out xTrain, out yTrain, compression: SerializerCompression.GZip);

        if (xTest == null || yTest == null)
            SparseFormat.Load(x_test_fn, out xTest, out yTest, compression: SerializerCompression.GZip);
    }

    // Report class balance for both sets.
    int positiveTrain = yTrain.Count(x => x);
    int positiveTest = yTest.Count(x => x);
    int negativeTrain = yTrain.Length - positiveTrain;
    int negativeTest = yTest.Length - positiveTest;
    Console.WriteLine("Training samples: {0} [{1}+, {2}-]",
        positiveTrain + negativeTrain, positiveTrain, negativeTrain);
    // BUGFIX: label previously read "Negative samples" for the testing totals.
    Console.WriteLine("Testing samples: {0} [{1}+, {2}-]",
        positiveTest + negativeTest, positiveTest, negativeTest);

    // Create and learn a linear sparse binary support vector machine
    var learn = new AveragedStochasticGradientDescent<Linear, Sparse<double>>()
    {
        MaxIterations = 5,
        Tolerance = 0, // run all iterations; do not stop early
    };

    Console.WriteLine("Learning training set");
    Stopwatch sw = Stopwatch.StartNew();
    var svm = learn.Learn(xTrain, yTrain);
    Console.WriteLine(sw.Elapsed);

    Console.WriteLine("Predicting training set");
    sw = Stopwatch.StartNew();
    bool[] trainPred = svm.Decide(xTrain);
    Console.WriteLine(sw.Elapsed);

    var train = new ConfusionMatrix(trainPred, yTrain);
    Console.WriteLine("Train acc: " + train.Accuracy);

    Console.WriteLine("Predicting testing set");
    sw = Stopwatch.StartNew();
    bool[] testPred = svm.Decide(xTest);
    Console.WriteLine(sw.Elapsed);

    var test = new ConfusionMatrix(testPred, yTest);
    Console.WriteLine("Test acc: " + test.Accuracy);
}