// (0.5,0.5):
// weight distribution = Gaussian(-0.02787, 0.2454)
// error rate = 0.0527452589744697 = 1221/23149
// (1,1):
// weight distribution = Gaussian(-0.03117, 0.3967)
// error rate = 0.0522268780508877 = 1209/23149
// (2,2):
// weight distribution = Gaussian(-0.03522, 0.6794)
// error rate = 0.0530476478465593 = 1228/23149
// (10,10):
// weight distribution = Gaussian(-0.05455, 2.96)
// error rate = 0.0580586634411854 = 1344/23149
#if SUPPRESS_UNREACHABLE_CODE_WARNINGS
#pragma warning restore 162
#endif

/// <summary>
/// Reads a weight posterior and a bias posterior from <c>weights.bin</c>,
/// prints a Gaussian summary of the posterior weight means, and then prints
/// the misclassification rate of <c>BpmPredict2</c> on <c>rcv1.test.vw.gz</c>.
/// </summary>
/// <remarks>
/// The two objects are read back in the order weights, then bias.
/// NOTE(review): <c>BinaryFormatter</c> deserialization is unsafe on untrusted
/// input; this is only acceptable for a locally produced file.
/// </remarks>
public static void Rcv1Test2()
{
    GaussianArray weightPosterior;
    Gaussian biasPosterior;
    BinaryFormatter formatter = new BinaryFormatter();
    //TODO: change path
    using (Stream input = File.OpenRead(@"c:\Users\minka\Downloads\rcv1\weights.bin"))
    {
        weightPosterior = (GaussianArray)formatter.Deserialize(input);
        biasPosterior = (Gaussian)formatter.Deserialize(input);
    }

    // Summarize the posterior means of the individual weights with a single Gaussian.
    GaussianEstimator meanEstimator = new GaussianEstimator();
    foreach (Gaussian weight in weightPosterior)
    {
        meanEstimator.Add(weight.GetMean());
    }
    Gaussian weightSummary = meanEstimator.GetDistribution(new Gaussian());
    Console.WriteLine("weight distribution = {0}", weightSummary);

    var predict = new BpmPredict2();
    predict.SetPriors(weightPosterior, biasPosterior);

    int total = 0;
    int mistakes = 0;
    //TODO: change path
    foreach (Instance example in new VwReader(@"c:\Users\minka\Downloads\rcv1\rcv1.test.vw.gz"))
    {
        if (predict.Predict(example) != example.label)
        {
            mistakes++;
        }
        total++;
    }

    double errorRate = (double)mistakes / total;
    Console.WriteLine("error rate = {0} = {1}/{2}", errorRate, mistakes, total);
}
/// <summary>
/// Trains <c>BpmTrain2</c> one instance at a time on <c>rcv1.train.vw.gz</c>
/// (read from <c>dataFolder</c>) and tracks the running error rate.
/// Each instance is first classified with the current posterior and only then
/// used for training, so the reported rate is measured on not-yet-seen instances.
/// </summary>
/// <remarks>
/// Every 1000 instances the count, error rate and bias posterior are written to
/// the console, and the count and error rate are appended to <c>log.txt</c>.
/// On .NET Framework builds the final weight and bias posteriors are serialized
/// to <c>weights.bin</c> (weights first, then bias).
/// </remarks>
/// <param name="wVariance">Passed to <c>BpmTrain2.SetPriors</c>; presumably the prior variance of each weight.</param>
/// <param name="biasVariance">Passed to <c>BpmTrain2.SetPriors</c>; presumably the prior variance of the bias.</param>
public static void Rcv1Test(double wVariance, double biasVariance)
{
    int count = 0;
    if (false)
    {
        // Disabled one-off scan that reports the largest feature index in the training data.
        // It keeps its own counter so that enabling it cannot skew the error-rate
        // denominator used by the training loop below.
        int scanned = 0;
        int maxFeatureIndex = 0;
        foreach (Instance instance in new VwReader(Path.Combine(dataFolder, "rcv1.train.vw.gz")))
        {
            scanned++;
            if (scanned % 10000 == 0)
            {
                Console.WriteLine(scanned);
            }
            foreach (int index in instance.featureIndices)
            {
                if (index > maxFeatureIndex)
                {
                    maxFeatureIndex = index;
                }
            }
        }
        Console.WriteLine("{0} features", maxFeatureIndex + 1);
    }

    // Feature count handed to SetPriors; presumably the value printed by the scan above.
    int nf = 47152;
    var train = new BpmTrain2();
    var predict = new BpmPredict2();
    train.SetPriors(nf, wVariance, biasVariance);
    int errors = 0;
    //int errors2 = 0;
    //StreamReader reader = new StreamReader(Path.Combine(dataFolder, "preds.txt");
    // takes 92s to train
    // takes 74s just to read the data
    // takes 15s just to do 'wc' on the data
    // there are 781265 data points in train, 23149 in test

    // The using block guarantees the log file is closed even if reading,
    // prediction or training throws part-way through the data.
    using (StreamWriter writer = new StreamWriter(Path.Combine(dataFolder, "log.txt")))
    {
        foreach (Instance instance in new VwReader(Path.Combine(dataFolder, "rcv1.train.vw.gz")))
        {
            // Predict with the posterior learned so far, before this instance is trained on.
            predict.SetPriors(train.wPost, train.biasPost);
            bool yPred = predict.Predict(instance);
            if (yPred != instance.label)
            {
                errors++;
            }
            //double pred2 = double.Parse(reader.ReadLine());
            //if ((pred2 > 0.5) != instance.label) errors2++;
            train.Train(instance);
            count++;
            if (count % 1000 == 0)
            {
                Console.WriteLine("{0} {1} {2}", count, (double)errors / count, train.biasPost);
                //Console.WriteLine("{0} {1} {2} {3}", count, (double)errors/count, (double)errors2/count, train.biasPost);
                writer.WriteLine("{0} {1}", count, (double)errors / count);
                // Flush each progress line so the log is usable while the run is still going.
                writer.Flush();
                //if (count == 10000) break;
            }
        }
    }

#if NETFRAMEWORK
    // In the .NET 5.0 BinaryFormatter is obsolete
    // and would produce errors. This test code should be migrated.
    // See https://aka.ms/binaryformatter
    if (true)
    {
        BinaryFormatter serializer = new BinaryFormatter();
        using (Stream stream = File.Create(Path.Combine(dataFolder, "weights.bin")))
        {
            serializer.Serialize(stream, train.wPost);
            serializer.Serialize(stream, train.biasPost);
        }
    }
#endif
}
#pragma warning disable 162
#endif

/// <summary>
/// Trains <c>BpmTrain2</c> one instance at a time on <c>rcv1.train.vw.gz</c>
/// and tracks the running error rate. Each instance is first classified with
/// the current posterior and only then used for training, so the reported rate
/// is measured on not-yet-seen instances.
/// </summary>
/// <remarks>
/// Every 1000 instances the count, error rate and bias posterior are written to
/// the console, and the count and error rate are appended to <c>log.txt</c>.
/// Afterwards the weight and bias posteriors are serialized to <c>weights.bin</c>
/// (weights first, then bias). All paths are hard-coded; see the TODO markers.
/// </remarks>
/// <param name="wVariance">Passed to <c>BpmTrain2.SetPriors</c>; presumably the prior variance of each weight.</param>
/// <param name="biasVariance">Passed to <c>BpmTrain2.SetPriors</c>; presumably the prior variance of the bias.</param>
public static void Rcv1Test(double wVariance, double biasVariance)
{
    int count = 0;
    if (false)
    {
        // Disabled one-off scan that reports the largest feature index in the training data.
        // It keeps its own counter so that enabling it cannot skew the error-rate
        // denominator used by the training loop below.
        int scanned = 0;
        int maxFeatureIndex = 0;
        //TODO: change path
        foreach (Instance instance in new VwReader(@"c:\Users\minka\Downloads\rcv1\rcv1.train.vw.gz"))
        {
            scanned++;
            if (scanned % 10000 == 0)
            {
                Console.WriteLine(scanned);
            }
            foreach (int index in instance.featureIndices)
            {
                if (index > maxFeatureIndex)
                {
                    maxFeatureIndex = index;
                }
            }
        }
        Console.WriteLine("{0} features", maxFeatureIndex + 1);
    }

    // Feature count handed to SetPriors; presumably the value printed by the scan above.
    int nf = 47152;
    var train = new BpmTrain2();
    var predict = new BpmPredict2();
    train.SetPriors(nf, wVariance, biasVariance);
    int errors = 0;
    //int errors2 = 0;
    //TODO: change path
    //StreamReader reader = new StreamReader(@"c:\Users\minka\Downloads\rcv1\preds.txt");
    // takes 92s to train
    // takes 74s just to read the data
    // takes 15s just to do 'wc' on the data
    // there are 781265 data points in train, 23149 in test

    // The using block guarantees the log file is closed even if reading,
    // prediction or training throws part-way through the data.
    //TODO: change path
    using (StreamWriter writer = new StreamWriter(@"c:\Users\minka\Downloads\rcv1\log.txt"))
    {
        //TODO: change path
        foreach (Instance instance in new VwReader(@"c:\Users\minka\Downloads\rcv1\rcv1.train.vw.gz"))
        {
            // Predict with the posterior learned so far, before this instance is trained on.
            predict.SetPriors(train.wPost, train.biasPost);
            bool yPred = predict.Predict(instance);
            if (yPred != instance.label)
            {
                errors++;
            }
            //double pred2 = double.Parse(reader.ReadLine());
            //if ((pred2 > 0.5) != instance.label) errors2++;
            train.Train(instance);
            count++;
            if (count % 1000 == 0)
            {
                Console.WriteLine("{0} {1} {2}", count, (double)errors / count, train.biasPost);
                //Console.WriteLine("{0} {1} {2} {3}", count, (double)errors/count, (double)errors2/count, train.biasPost);
                writer.WriteLine("{0} {1}", count, (double)errors / count);
                // Flush each progress line so the log is usable while the run is still going.
                writer.Flush();
                //if (count == 10000) break;
            }
        }
    }

    if (true)
    {
        BinaryFormatter serializer = new BinaryFormatter();
        //TODO: change path
        using (Stream stream = File.Create(@"c:\Users\minka\Downloads\rcv1\weights.bin"))
        {
            serializer.Serialize(stream, train.wPost);
            serializer.Serialize(stream, train.biasPost);
        }
    }
}