Example #1
        public static IEnumerable<OneHotRecordReadOnly> EnumerateBinLines(string path)
        {
            // Stream records lazily so the whole file never has to fit in memory.
            // The using blocks also close the file when the caller stops enumerating early.
            using (var fileStream = File.OpenRead(path))
            using (var reader = new BinaryReader(fileStream))
            {
                var lineNo = 0;
                while (true)
                {
                    lineNo++;
                    OneHotRecordReadOnly rec;
                    try
                    {
                        rec = new OneHotRecordReadOnly(reader);
                    }
                    catch (EndOfStreamException)
                    {
                        // End of file: stop enumerating.
                        rec = null;
                    }
                    if (rec == null)
                    {
                        break;
                    }

                    yield return rec;

                    if (lineNo % 1000000 == 0)
                    {
                        Console.WriteLine("Line : " + lineNo);
                    }
                }
            }
        }
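A minimal usage sketch for the streaming reader above, assuming EnumerateBinLines lives on OneHotRecordReadOnly (as LoadBinary does); the path and the label count are purely illustrative:

        // Hypothetical caller: stream records once without materializing the whole file.
        var positives = 0;
        foreach (var rec in OneHotRecordReadOnly.EnumerateBinLines("records.bin")) // path is an assumption
        {
            if (rec.Label > 0)
            {
                positives++;
            }
        }
        Console.WriteLine("Positive labels: " + positives);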
Example #2
        public static void MakeSubmission(Network network, string testsetPath, string targetPath)
        {
            Console.WriteLine("Making submission");
            var batchSize     = Constants.MINIBATCH_SIZE;
            var recs          = OneHotRecordReadOnly.LoadBinary(testsetPath);
            var sparseIndices = new int[batchSize][];
            var sparseValues  = new float[batchSize][];
            var labels        = new float[batchSize];
            var ids           = new int[batchSize];

            var submissionLines = new List<SubmissionLine>();

            var firstBatch = true;

            for (var recNo = 0; recNo < recs.Count; recNo++)
            {
                var record = recs[recNo];
                var label  = record.Label;
                var id     = record.Id;

                labels[recNo % batchSize] = label;
                ids[recNo % batchSize]    = id;
                record.CopyDataToSparseArray(sparseIndices, sparseValues, recNo % batchSize);

                // Flush when the minibatch is full, or at the final (possibly partial) batch.
                if ((((recNo + 1) % batchSize) == 0) || (recNo == (recs.Count - 1)))
                {
                    network.InputLayer.SetSparseData(sparseValues.ToList(), sparseIndices.ToList());
                    network.LabelLayer.SetData(labels);
                    network.Calculate(train: false);
                    network.CostLayer.CopyToHost();
                    for (var i = 0; i < batchSize; i++)
                    {
                        if (ids[i] == 0)
                        {
                            // Slot was not filled (final partial batch); skip it.
                            continue;
                        }
                        var chance = network.CostLayer.Outputs[i, 1]; // predicted probability of the positive class
                        var line   = new SubmissionLine();
                        line.Id     = ids[i];
                        line.Chance = chance;
                        submissionLines.Add(line);
                    }

                    Array.Clear(ids, 0, ids.Length); // reset so unfilled slots in a later partial batch read as 0
                }
                if (recNo % 100000 == 0)
                {
                    Console.WriteLine("line : " + recNo);
                }
            }
            SubmissionLine.SaveSubmission(targetPath, submissionLines);
        }
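The flush condition above fires on every full minibatch and once more for the trailing partial batch. A standalone sketch with made-up sizes (no network involved) shows where the flushes land:

        // batchSize = 4, 10 records: flushes after records 3, 7 and 9 (0-based).
        var batchSize = 4;
        var count     = 10;
        for (var recNo = 0; recNo < count; recNo++)
        {
            var flush = (((recNo + 1) % batchSize) == 0) || (recNo == count - 1);
            if (flush)
            {
                Console.WriteLine("flush after record " + recNo);
            }
        }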
Example #3
        public static List<OneHotRecordReadOnly> LoadBinary(string path, bool decompress = true)
        {
            Console.WriteLine("Loading records into memory");
            var fileStream = File.OpenRead(path);
            var reader = new BinaryReader(fileStream);
            DeflateStream deflateStream = null;

            if (decompress)
            {
                // The binary file is deflate-compressed; wrap the stream before reading.
                deflateStream = new DeflateStream(fileStream, CompressionMode.Decompress);
                reader = new BinaryReader(deflateStream);
            }

            var res = new List<OneHotRecordReadOnly>();
            var lineNo = 0;

            while (true)
            {
                lineNo++;
                OneHotRecordReadOnly rec;
                try
                {
                    rec = new OneHotRecordReadOnly(reader);
                }
                catch (EndOfStreamException)
                {
                    // End of file: stop reading.
                    rec = null;
                }

                if (rec == null)
                {
                    break;
                }

                res.Add(rec);
                if (lineNo % 1000000 == 0)
                {
                    Console.WriteLine("Line : " + lineNo);
                }
            }

            if (decompress)
            {
                deflateStream.Close();
            }

            fileStream.Close();
            return res;
        }
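LoadBinary materializes every record, which is what the training code needs. For a quick peek at a file without loading it all, the same stream stack can be used directly; a sketch, assuming the file was written through a DeflateStream and using a hypothetical path:

        using (var fileStream = File.OpenRead("records.bin")) // hypothetical path
        using (var deflate = new DeflateStream(fileStream, CompressionMode.Decompress))
        using (var reader = new BinaryReader(deflate))
        {
            var first = new OneHotRecordReadOnly(reader); // deserializes a single record
            Console.WriteLine("First label: " + first.Label);
        }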
Example #4
        public void LoadNextBatchSet()
        {
            if (_currentEpoch >= 0)
            {
                for (int setIdx = 0; setIdx < Constants.BATCHES_PER_SET; setIdx++)
                {
                    DataBatch batch;
                    if (Batches == null)
                    {
                        Batches = new List<DataBatch>();
                    }
                    if (Batches.Count < Constants.BATCHES_PER_SET)
                    {
                        // First pass: allocate the batch set lazily; later calls reuse it.
                        batch = new DataBatch(Constants.TOTAL_VALUE_COUNT, 1, sparse: true);
                        Batches.Add(batch);
                    }
                    else
                    {
                        batch = Batches[setIdx];
                    }

                    for (var minibatchIdx = 0; minibatchIdx < Constants.MINIBATCH_SIZE; minibatchIdx++)
                    {
                        OneHotRecordReadOnly rec = _records[_recNo];
                        rec.CopyDataToSparseArray(batch.SparseIndices, batch.SparseValues, minibatchIdx);
                        batch.Labels[minibatchIdx] = rec.Label;
                        _recNo++;
                        if (_recNo >= _records.Count)
                        {
                            // Wrapped past the last record: one full epoch done.
                            _recNo = 0;
                            if (_totalBatchCount == 0)
                            {
                                _totalBatchCount = _batchesRead; // the first epoch fixes the total batch count
                            }
                            _batchesRead = 0;
                            _currentEpoch++;
                            CurrentEpochBatch = 0;
                            if (_shuffle)
                            {
                                // Reshuffle so the next epoch sees a different record order.
                                _records.Shuffle();
                            }
                        }
                        _batchesRead++;
                    }
                }
            }

            _currentBatch = 0;
            CurrentSet++;
            _loaded = true;
        }
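The only wrap-around in the reader is the epoch bookkeeping above. A stripped-down sketch of that cursor logic, with hypothetical counts standing in for _records.Count:

        // Cursor wraps to 0 at the end of the record list; each wrap is one epoch.
        var recNo = 0;
        var epoch = 0;
        var recordCount = 10; // stand-in for _records.Count
        for (var step = 0; step < 25; step++)
        {
            recNo++;
            if (recNo >= recordCount)
            {
                recNo = 0;
                epoch++; // a reshuffle of the records would happen here
            }
        }
        Console.WriteLine("epochs completed: " + epoch); // prints 2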
Example #6
        static void Main(string[] args)
        {
            var dataDir          = Directory.GetCurrentDirectory();
            var csvTrainPath     = Path.Combine(dataDir, "train.csv");
            var csvTestPath      = Path.Combine(dataDir, "test.csv");
            var binTrainPath     = Path.Combine(dataDir, "train_bin.bin");
            var binTestPath      = Path.Combine(dataDir, "test_bin.bin");
            var recodedTrainPath = Path.Combine(dataDir, "train_recoded.bin");
            var recodedTestPath  = Path.Combine(dataDir, "test_recoded.bin");
            var oneHotTrainPath  = Path.Combine(dataDir, "train_onehot.bin");
            var oneHotTestPath   = Path.Combine(dataDir, "test_onehot.bin");
            var scaledTrainPath  = Path.Combine(dataDir, "train_scaled.bin");
            var scaledTestPath   = Path.Combine(dataDir, "test_scaled.bin");

            Constants.HASH_SPACE_SIZE = 32768 * 2; // Like 'b' in Vowpal Wabbit, but much smaller: GPU memory is limited, and the space is multiplied by the number of nodes in the first layer.
                                                   // If you change this value you need to preprocess again. (The hashing idea is sketched after this example.)
            Constants.InitOneHotIndices();

            // *** Remove processed files to reprocess ***

            // First convert the CSV data into zipped binary data, which is faster and more compact when we have to reprocess.
            if (!File.Exists(binTrainPath))
            {
                PreprocessingRawValues.ConvertCSVToBinary(csvTrainPath, binTrainPath);
            }
            if (!File.Exists(binTestPath))
            {
                PreprocessingRawValues.ConvertCSVToBinary(csvTestPath, binTestPath);
            }

            // Recode categorical values: MISSING = missing, TRAINNOTTEST = in the train set but not the test set, TESTNOTTRAIN = in the test set but not the train set,
            // LOWFREQUENCY = a value that occurs below a certain threshold is recoded to this value.
            if ((!File.Exists(recodedTrainPath)) || (!File.Exists(recodedTestPath)))
            {
                var frequencyFilter = Constants.FREQUENCY_FILTER_AGGRESSIVE; // Vary for ensembling. Medium or mild filtering keeps more feature values = more GPU memory usage, potentially better accuracy but also potential overfitting. Make sure you also increase HASH_SIZE.
                PreprocessingRawValues.RecodeCategoricalValues(binTrainPath, binTestPath, recodedTrainPath, recodedTestPath, frequencyFilter);
            }

            // Now one-hot encode the raw records (more precisely: one-hot encode the categories with few values and hash the categories with many values).
            // This is probably more complicated than it needs to be; perhaps we could hash everything, even the numeric values.
            var encodeMissingValues  = true;                    // vary for ensembling
            var logTransformNumerics = true;                    // vary for ensembling
            var encodeTestNotTrainAs = Constants.VALUE_MISSING; // vary for ensembling

            if ((!File.Exists(oneHotTrainPath)) || (!File.Exists(oneHotTestPath)))
            {
                PreprocessingRawToOneHot.ConvertRawToOneHot(recodedTrainPath, recodedTestPath, oneHotTrainPath, oneHotTestPath, encodeMissingValues, encodeTestNotTrainAs, logTransformNumerics);
            }

            // Now scale the numeric values. This leads to faster convergence..
            if ((!File.Exists(scaledTrainPath)) || (!File.Exists(scaledTestPath)))
            {
                PreprocessingScale.ScaleNumericValues(oneHotTrainPath, oneHotTestPath, scaledTrainPath, scaledTestPath);
            }

            // We create an "ensemble" of a relunet and a maxout net.
            var gpuModule = new GPUModule();

            gpuModule.InitGPU();
            var learnRate = 0.03f;             // 0.01 - 0.04 also worked fine for me, 0.04 was the fastest.
            var momentum  = 0.5f;              // Did not play with this much since 1st layer is without momentum for performance reasons.
            var epochsBeforeMergeHoldout = 15; // When do we add the holdout set to the trainset (no more validation information after this)
            var totalEpochs = 20;              // How many epochs to train.. Usually I saw no improvement after 40

            var trainRecords = OneHotRecordReadOnly.LoadBinary(scaledTrainPath);

            // Train a maxout network (~LB 0.4556)
            var maxoutNet = CriteoNet.CreateNetworkMaxout(gpuModule, Constants.MINIBATCH_SIZE); // Example network that worked fine

            Train(gpuModule, trainRecords, maxoutNet, learnRate, momentum, epochsBeforeMergeHoldout, totalEpochs, tmpDir: dataDir);
            maxoutNet.SaveWeightsAndParams(dataDir, "maxoutnet_done");
            maxoutNet.Free();

            // Train a relu network (~LB 0.4555)
            var reluNet = CriteoNet.CreateNetworkRelu(gpuModule, Constants.MINIBATCH_SIZE); // Example network that worked fine

            Train(gpuModule, trainRecords, reluNet, learnRate, momentum, epochsBeforeMergeHoldout, totalEpochs, tmpDir: dataDir);
            reluNet.SaveWeightsAndParams(dataDir, "relunet_done");
            reluNet.Free();


            // Create the maxout submission (~LB 0.456, train longer for better scores)
            var submissionMaxoutNet  = CriteoNet.CreateNetworkMaxout(gpuModule, Constants.MINIBATCH_SIZE); // Example network that worked fine
            var submissionMaxoutPath = Path.Combine(dataDir, "submissionMaxout.csv");

            submissionMaxoutNet.LoadStructureWeightsAndParams(dataDir, "maxoutnet_done");
            MakeSubmission(submissionMaxoutNet, scaledTestPath, submissionMaxoutPath);

            // Create the relu submission (~LB 0.455, train longer for better scores)
            var submissionReluNet  = CriteoNet.CreateNetworkRelu(gpuModule, Constants.MINIBATCH_SIZE); // Example network that worked fine
            var submissionReluPath = Path.Combine(dataDir, "submissionRelu.csv");

            submissionReluNet.LoadStructureWeightsAndParams(dataDir, "relunet_done");
            MakeSubmission(submissionReluNet, scaledTestPath, submissionReluPath);

            // Now make a combined submission (~LB 0.45267)
            var submissionCombinedPath = Path.Combine(dataDir, "submissionCombined.csv");

            CombineSubmission(submissionCombinedPath, new string[] { submissionReluPath, submissionMaxoutPath });

            Console.WriteLine("Done press enter");
            Console.ReadLine();
        }
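The HASH_SPACE_SIZE comment at the top of Main refers to the hashing trick that Vowpal Wabbit's b parameter controls: categorical values are mapped into a fixed-size index space instead of an ever-growing dictionary. A sketch of the idea with a made-up hash function, not the project's actual encoding:

        // Hypothetical illustration of hashing a categorical value into a fixed space.
        static int HashToSlot(string value, int hashSpaceSize)
        {
            unchecked
            {
                var h = 17;
                foreach (var c in value)
                {
                    h = h * 31 + c;
                }
                // Force a non-negative slot index in [0, hashSpaceSize).
                return ((h % hashSpaceSize) + hashSpaceSize) % hashSpaceSize;
            }
        }
        // e.g. HashToSlot("category_abc", 32768 * 2) always lands in [0, 65536).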