public static IEnumerable<OneHotRecordReadOnly> EnumerateBinLines(string path)
{
    var fileStream = File.OpenRead(path);
    var reader = new BinaryReader(fileStream);
    var lineNo = 0;
    OneHotRecordReadOnly rec;
    while (true)
    {
        lineNo++;
        try
        {
            rec = new OneHotRecordReadOnly(reader);
        }
        catch (EndOfStreamException)
        {
            rec = null;
        }
        if (rec == null) break;
        yield return rec;
        if (lineNo % 1000000 == 0) Console.WriteLine("Line : " + lineNo);
    }
    fileStream.Close();
}
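// Hypothetical usage sketch (not part of the original source): EnumerateBinLines yields records
// lazily, so it can scan a file without holding everything in memory the way LoadBinary does.
// The CountPositiveLabels name below is an illustration only.
public static long CountPositiveLabels(string path)
{
    long positives = 0;
    foreach (var rec in EnumerateBinLines(path))
    {
        // Labels are 0/1 click indicators stored as floats.
        if (rec.Label > 0.5f) positives++;
    }
    return positives;
}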
public static void MakeSubmission(Network network, string testsetPath, string targetPath)
{
    Console.WriteLine("Making submission");
    var batchSize = Constants.MINIBATCH_SIZE;
    var recs = OneHotRecordReadOnly.LoadBinary(testsetPath);
    var sparseIndices = new int[batchSize][];
    var sparseValues = new float[batchSize][];
    var labels = new float[batchSize];
    var ids = new int[batchSize];
    var submissionLines = new List<SubmissionLine>();
    for (var recNo = 0; recNo < recs.Count; recNo++)
    {
        var record = recs[recNo];
        labels[recNo % batchSize] = record.Label;
        ids[recNo % batchSize] = record.Id;
        record.CopyDataToSparseArray(sparseIndices, sparseValues, recNo % batchSize);

        // Run the network once a full minibatch is assembled, or on the final (possibly partial) batch.
        if (((recNo + 1) % batchSize == 0) || (recNo == recs.Count - 1))
        {
            network.InputLayer.SetSparseData(sparseValues.ToList(), sparseIndices.ToList());
            network.LabelLayer.SetData(labels);
            network.Calculate(train: false);
            network.CostLayer.CopyToHost();
            for (var i = 0; i < batchSize; i++)
            {
                // Slots never filled in the final partial batch keep id 0 and are skipped.
                if (ids[i] == 0) continue;
                var line = new SubmissionLine();
                line.Id = ids[i];
                line.Chance = network.CostLayer.Outputs[i, 1];
                submissionLines.Add(line);
            }
            Array.Clear(ids, 0, ids.Length);
        }
        if (recNo % 100000 == 0) Console.WriteLine("line : " + recNo);
    }
    SubmissionLine.SaveSubmission(targetPath, submissionLines);
}
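// The SubmissionLine class used above is defined elsewhere in the repo. As a rough sketch, the Criteo
// submission format is a CSV with an "Id,Predicted" header, so SaveSubmission presumably writes something
// along these lines. Member and method names other than Id and Chance are assumptions for illustration.
public class SubmissionLineSketch
{
    public int Id;
    public float Chance;

    public static void SaveSubmission(string path, List<SubmissionLineSketch> lines)
    {
        using (var writer = new StreamWriter(path))
        {
            writer.WriteLine("Id,Predicted");
            foreach (var line in lines)
            {
                // Invariant culture so the decimal separator is always '.'
                writer.WriteLine(line.Id + "," + line.Chance.ToString(System.Globalization.CultureInfo.InvariantCulture));
            }
        }
    }
}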
public static List<OneHotRecordReadOnly> LoadBinary(string path, bool decompress = true)
{
    Console.WriteLine("Loading records into memory");
    var fileStream = File.OpenRead(path);
    var reader = new BinaryReader(fileStream);
    DeflateStream deflateStream = null;
    if (decompress)
    {
        deflateStream = new DeflateStream(fileStream, CompressionMode.Decompress);
        reader = new BinaryReader(deflateStream);
    }
    var res = new List<OneHotRecordReadOnly>();
    var lineNo = 0;
    while (true)
    {
        lineNo++;
        OneHotRecordReadOnly rec = null;
        try
        {
            rec = new OneHotRecordReadOnly(reader);
        }
        catch (EndOfStreamException)
        {
            rec = null;
        }
        if (rec == null) break;
        res.Add(rec);
        if (lineNo % 1000000 == 0) Console.WriteLine("Line : " + lineNo);
    }
    if (decompress) deflateStream.Close();
    fileStream.Close();
    return res;
}
public void LoadNextBatchSet()
{
    if (_currentEpoch >= 0)
    {
        // Guard: the original code dereferenced Batches even when it was still null.
        if (Batches == null) Batches = new List<DataBatch>();
        for (int setIdx = 0; setIdx < Constants.BATCHES_PER_SET; setIdx++)
        {
            // Reuse the preallocated batch for this slot, or allocate it on the first pass.
            DataBatch batch;
            if (Batches.Count < Constants.BATCHES_PER_SET)
            {
                batch = new DataBatch(Constants.TOTAL_VALUE_COUNT, 1, sparse: true);
                Batches.Add(batch);
            }
            else
            {
                batch = Batches[setIdx];
            }
            for (var minibatchIdx = 0; minibatchIdx < Constants.MINIBATCH_SIZE; minibatchIdx++)
            {
                // Copy one record into the next minibatch slot.
                OneHotRecordReadOnly rec = _records[_recNo];
                rec.CopyDataToSparseArray(batch.SparseIndices, batch.SparseValues, minibatchIdx);
                batch.Labels[minibatchIdx] = rec.Label;
                _recNo++;
                if (_recNo >= _records.Count)
                {
                    // Wrapped around the dataset: start a new epoch and optionally reshuffle.
                    _recNo = 0;
                    if (_totalBatchCount == 0) _totalBatchCount = _batchesRead;
                    _batchesRead = 0;
                    _currentEpoch++;
                    CurrentEpochBatch = 0;
                    if (_shuffle) _records.Shuffle();
                }
                _batchesRead++;
            }
        }
    }
    _currentBatch = 0;
    CurrentSet++;
    _loaded = true;
}
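// Hypothetical consumption sketch (not the author's Train() method): it only shows how one set of
// minibatches produced by LoadNextBatchSet could be pushed through the same Network API that
// MakeSubmission uses (SetSparseData / SetData / Calculate). Weight updates and holdout handling are
// deliberately omitted, the DataLoader type name is a placeholder for whatever class hosts
// LoadNextBatchSet, and the batch members are assumed to line up with the arrays MakeSubmission builds.
public static void RunOneBatchSet(Network network, DataLoader loader)
{
    foreach (var batch in loader.Batches)
    {
        network.InputLayer.SetSparseData(batch.SparseValues.ToList(), batch.SparseIndices.ToList());
        network.LabelLayer.SetData(batch.Labels);
        network.Calculate(train: true);
        // ...the backward pass / weight update would go here in a real training loop...
    }
    loader.LoadNextBatchSet(); // advance to the next Constants.BATCHES_PER_SET minibatches
}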
static void Main(string[] args)
{
    var dataDir = Directory.GetCurrentDirectory();
    var csvTrainPath = Path.Combine(dataDir, "train.csv");
    var csvTestPath = Path.Combine(dataDir, "test.csv");
    var binTrainPath = Path.Combine(dataDir, "train_bin.bin");
    var binTestPath = Path.Combine(dataDir, "test_bin.bin");
    var recodedTrainPath = Path.Combine(dataDir, "train_recoded.bin");
    var recodedTestPath = Path.Combine(dataDir, "test_recoded.bin");
    var oneHotTrainPath = Path.Combine(dataDir, "train_onehot.bin");
    var oneHotTestPath = Path.Combine(dataDir, "test_onehot.bin");
    var scaledTrainPath = Path.Combine(dataDir, "train_scaled.bin");
    var scaledTestPath = Path.Combine(dataDir, "test_scaled.bin");

    // Like 'b' in Vowpal Wabbit but much smaller: GPU memory is limited and the hash space is multiplied
    // by the number of nodes in the first layer. When you change this value you need to preprocess again.
    Constants.HASH_SPACE_SIZE = 32768 * 2;
    Constants.InitOneHotIndices();

    // *** Remove processed files to reprocess ***

    // First process the CSV data into "zipped" binary data, useful for when we have to reprocess. Faster and more compact.
    if (!File.Exists(binTrainPath)) PreprocessingRawValues.ConvertCSVToBinary(csvTrainPath, binTrainPath);
    if (!File.Exists(binTestPath)) PreprocessingRawValues.ConvertCSVToBinary(csvTestPath, binTestPath);

    // Recode categorical values: MISSING = missing, TRAINNOTTEST = in trainset but not testset,
    // TESTNOTTRAIN = in testset but not trainset, LOWFREQUENCY = value occurs below a certain threshold.
    if (!File.Exists(recodedTrainPath) || !File.Exists(recodedTestPath))
    {
        // Vary for ensembling. Medium or mild filtering yields more feature values = more GPU memory usage,
        // potentially better accuracy but also potential overfitting. Make sure you also increase HASH_SPACE_SIZE.
        var frequencyFilter = Constants.FREQUENCY_FILTER_AGGRESSIVE;
        PreprocessingRawValues.RecodeCategoricalValues(binTrainPath, binTestPath, recodedTrainPath, recodedTestPath, frequencyFilter);
    }

    // Now one-hot encode the raw records. (Actually it one-hot encodes the categories with few values and
    // hashes the categories with many values.) This is probably way too complicated; perhaps we could hash
    // everything, even the numeric values.
    var encodeMissingValues = true; // vary for ensembling
    var logTransformNumerics = true; // vary for ensembling
    var encodeTestNotTrainAs = Constants.VALUE_MISSING; // vary for ensembling
    if (!File.Exists(oneHotTrainPath) || !File.Exists(oneHotTestPath))
    {
        PreprocessingRawToOneHot.ConvertRawToOneHot(recodedTrainPath, recodedTestPath, oneHotTrainPath, oneHotTestPath, encodeMissingValues, encodeTestNotTrainAs, logTransformNumerics);
    }

    // Now scale the numeric values. This leads to faster convergence.
    if (!File.Exists(scaledTrainPath) || !File.Exists(scaledTestPath))
    {
        PreprocessingScale.ScaleNumericValues(oneHotTrainPath, oneHotTestPath, scaledTrainPath, scaledTestPath);
    }

    // We create an "ensemble" of a ReLU net and a maxout net.
    var gpuModule = new GPUModule();
    gpuModule.InitGPU();
    var learnRate = 0.03f; // 0.01 - 0.04 also worked fine for me, 0.04 was the fastest.
    var momentum = 0.5f; // Did not play with this much since the 1st layer is without momentum for performance reasons.
    var epochsBeforeMergeHoldout = 15; // When we add the holdout set to the trainset (no more validation information after this).
    var totalEpochs = 20; // How many epochs to train. Usually I saw no improvement after 40.
    var trainRecords = OneHotRecordReadOnly.LoadBinary(scaledTrainPath);

    // Train a maxout network (~LB 0.4556)
    var maxoutNet = CriteoNet.CreateNetworkMaxout(gpuModule, Constants.MINIBATCH_SIZE); // Example network that worked fine
    Train(gpuModule, trainRecords, maxoutNet, learnRate, momentum, epochsBeforeMergeHoldout, totalEpochs, tmpDir: dataDir);
    maxoutNet.SaveWeightsAndParams(dataDir, "maxoutnet_done");
    maxoutNet.Free();

    // Train a ReLU network (~LB 0.4555)
    var reluNet = CriteoNet.CreateNetworkRelu(gpuModule, Constants.MINIBATCH_SIZE); // Example network that worked fine
    Train(gpuModule, trainRecords, reluNet, learnRate, momentum, epochsBeforeMergeHoldout, totalEpochs, tmpDir: dataDir);
    reluNet.SaveWeightsAndParams(dataDir, "relunet_done");
    reluNet.Free();

    // Create the maxout submission (~LB 0.456, train longer for better scores)
    var submissionMaxoutNet = CriteoNet.CreateNetworkMaxout(gpuModule, Constants.MINIBATCH_SIZE);
    var submissionMaxoutPath = Path.Combine(dataDir, "submissionMaxout.csv");
    submissionMaxoutNet.LoadStructureWeightsAndParams(dataDir, "maxoutnet_done");
    MakeSubmission(submissionMaxoutNet, scaledTestPath, submissionMaxoutPath);

    // Create the ReLU submission (~LB 0.455, train longer for better scores)
    var submissionReluNet = CriteoNet.CreateNetworkRelu(gpuModule, Constants.MINIBATCH_SIZE);
    var submissionReluPath = Path.Combine(dataDir, "submissionRelu.csv");
    submissionReluNet.LoadStructureWeightsAndParams(dataDir, "relunet_done");
    MakeSubmission(submissionReluNet, scaledTestPath, submissionReluPath);

    // Now make a combined submission (~LB 0.45267). A simple averaging sketch follows after this method.
    var submissionCombinedPath = Path.Combine(dataDir, "submissionCombined.csv");
    CombineSubmission(submissionCombinedPath, new string[] { submissionReluPath, submissionMaxoutPath });

    Console.WriteLine("Done press enter");
    Console.ReadLine();
}
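// CombineSubmission is defined elsewhere in the repo. As an illustrative sketch only (not necessarily
// the author's implementation), one simple way to combine submissions is to average the predicted
// chances per row across the individual CSV files. This assumes all files list the same Ids in the
// same order and share the "Id,Predicted" header.
public static void AverageSubmissions(string targetPath, string[] sourcePaths)
{
    var files = sourcePaths.Select(p => File.ReadAllLines(p)).ToArray();
    using (var writer = new StreamWriter(targetPath))
    {
        writer.WriteLine("Id,Predicted");
        for (var row = 1; row < files[0].Length; row++) // skip the header line
        {
            var id = files[0][row].Split(',')[0];
            var avg = files.Average(f => float.Parse(f[row].Split(',')[1], System.Globalization.CultureInfo.InvariantCulture));
            writer.WriteLine(id + "," + avg.ToString(System.Globalization.CultureInfo.InvariantCulture));
        }
    }
}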