/// <summary>
        /// Reads every raw record from <paramref name="rawSrcPath"/>, converts it with
        /// <c>ConvertRecord</c> and writes the converted record in binary form to
        /// <paramref name="oneHotDstPath"/>. Any existing destination file is deleted first.
        /// The static <c>_hashIndices</c> and <c>_categoricalIdices</c> tables are replaced with
        /// fresh, empty dictionaries at the start of every call.
        /// </summary>
        /// <param name="rawSrcPath">Path handed to <c>RawRecord.EnumerateBinLines</c>.</param>
        /// <param name="oneHotDstPath">Path of the binary output file (recreated on each call).</param>
        /// <param name="isTestSet">Forwarded to <c>ConvertRecord</c>; also selects the "test"/"train" log label.</param>
        /// <param name="encodeMissingValues">Not referenced by this method body; kept for caller compatibility.</param>
        /// <param name="encodeTestNotrainAs">Not referenced by this method body; kept for caller compatibility.</param>
        /// <param name="logTransformNumericValues">Not referenced by this method body; kept for caller compatibility.</param>
        protected static void ConvertRawToOneHot(string rawSrcPath, string oneHotDstPath, bool isTestSet, bool encodeMissingValues = true, int encodeTestNotrainAs = Constants.VALUE_MISSING, bool logTransformNumericValues = false)
        {
            _hashIndices       = new Dictionary <string, int>();
            _categoricalIdices = new Dictionary <int, Dictionary <int, int> >();

            if (File.Exists(oneHotDstPath))
            {
                File.Delete(oneHotDstPath);
            }

            // 'using' guarantees the file handle is released even when a record
            // fails to convert or write part-way through the source file.
            using (var stream = File.OpenWrite(oneHotDstPath))
            using (var writer = new BinaryWriter(stream))
            {
                var label = (isTestSet) ? "test" : "train";

                Console.WriteLine("Converting " + label + " records");

                // Zero-based index of the record within the source file, passed to ConvertRecord.
                var recNo = 0;

                foreach (var raw in RawRecord.EnumerateBinLines(rawSrcPath))
                {
                    var click = ConvertRecord(raw, recNo, isTestSet);
                    click.WriteBinary(writer);
                    recNo++;
                }

                writer.Flush();
            }
        }
Beispiel #2
0
        /// <summary>
        /// Computes, for each of the first <c>NUMERIC_COUNT</c> value columns, the square root of
        /// the mean squared deviation from the supplied mean, over all records read from
        /// <paramref name="srcPaths"/>. Only values strictly greater than zero contribute.
        /// </summary>
        /// <param name="means">Per-column means the deviations are measured against.</param>
        /// <param name="srcPaths">Files enumerated through <c>RawRecord.EnumerateBinLines</c>.</param>
        /// <returns>
        /// An array of length <c>NUMERIC_COUNT</c>. A column with no positive values yields NaN
        /// (0 divided by 0).
        /// </returns>
        public static float[] GetStdDevs(float[] means, params string[] srcPaths)
        {
            var sumOfSquares = new double[NUMERIC_COUNT];
            var observations = new int[NUMERIC_COUNT];

            foreach (var path in srcPaths)
            {
                foreach (var record in RawRecord.EnumerateBinLines(path))
                {
                    for (var col = 0; col < NUMERIC_COUNT; col++)
                    {
                        var value = (double)record.Values[col];
                        if (!(value > 0))
                        {
                            // Non-positive values are excluded from the statistic.
                            continue;
                        }

                        var deviation = value - (double)means[col];
                        sumOfSquares[col] += deviation * deviation;
                        observations[col]++;
                    }
                }
            }

            var stdDevs = new float[NUMERIC_COUNT];
            for (var col = 0; col < NUMERIC_COUNT; col++)
            {
                var variance = sumOfSquares[col] / (double)observations[col];
                stdDevs[col] = (float)Math.Sqrt(variance);
            }
            return stdDevs;
        }
        /// <summary>
        /// Rewrites the records of <paramref name="srcPath"/> into a deflate-compressed binary file
        /// at <paramref name="dstPath"/>, replacing categorical values according to the counts held
        /// in <c>_testCounts</c> and <c>_trainCounts</c>:
        /// a test count of 0 becomes <c>Constants.VALUE_TRAINNOTTEST</c>; otherwise a train count of 0
        /// becomes <c>Constants.VALUE_TESTNOTTRAIN</c>; otherwise a train count below the category's
        /// entry in <c>_categoricalValueFrequencyFilter</c> becomes <c>Constants.VALUE_LOWFREQUENCY</c>.
        /// Any existing destination file is deleted first.
        /// </summary>
        /// <param name="srcPath">Path handed to <c>RawRecord.EnumerateBinLines</c>.</param>
        /// <param name="dstPath">Path of the compressed output file (recreated on each call).</param>
        /// <param name="test">Not referenced by this method body; kept for caller compatibility.</param>
        protected static void Process(string srcPath, string dstPath, bool test)
        {
            if (File.Exists(dstPath))
            {
                File.Delete(dstPath);
            }

            // Nested 'using' blocks dispose writer -> deflate stream -> file stream in that order,
            // so the compressed data is finalised and the handle released even if a record throws.
            using (var fileStream = File.OpenWrite(dstPath))
            using (var deflateStream = new DeflateStream(fileStream, CompressionMode.Compress))
            using (var writer = new BinaryWriter(deflateStream))
            {
                foreach (var rec in RawRecord.EnumerateBinLines(srcPath))
                {
                    for (var i = 0; i < RawRecord.CATEGORICAL_COUNT; i++)
                    {
                        // Count tables are indexed by 1-based category number; the value itself
                        // sits after the numeric columns in rec.Values.
                        var catNo      = i + 1;
                        var idx        = RawRecord.NUMERIC_COUNT + i;
                        var val        = rec.Values[idx];
                        var testCount  = _testCounts[catNo][val];
                        var trainCount = _trainCounts[catNo][val];

                        if (testCount == 0)
                        {
                            rec.Values[idx] = Constants.VALUE_TRAINNOTTEST;
                            continue;
                        }
                        if (trainCount == 0)
                        {
                            rec.Values[idx] = Constants.VALUE_TESTNOTTRAIN;
                            continue;
                        }

                        var threshHold = _categoricalValueFrequencyFilter[catNo];
                        if (trainCount < threshHold)
                        {
                            rec.Values[idx] = Constants.VALUE_LOWFREQUENCY;
                        }
                    }
                    rec.WriteBinary(writer);
                }

                writer.Flush();
            }
        }
        /// <summary>
        /// Feeds every categorical value of every record in <paramref name="path"/> to
        /// <c>IncFeature</c>, then prints a per-category summary for the selected count table
        /// (<c>_testCounts</c> when <paramref name="test"/> is true, otherwise <c>_trainCounts</c>).
        /// Both count tables are allocated on first use, when <c>_trainCounts</c> is null.
        /// </summary>
        /// <param name="path">Path handed to <c>RawRecord.EnumerateBinLines</c>.</param>
        /// <param name="test">Forwarded to <c>IncFeature</c>; also selects which table is reported.</param>
        /// <param name="checkValues">Forwarded to <c>IncFeature</c> unchanged.</param>
        public static void CountFeatures(string path, bool test, bool checkValues = true)
        {
            var setName = test ? "test" : "train";
            Console.WriteLine("Counting features.." + setName);

            if (_trainCounts == null)
            {
                // One slot per category plus slot 0; this method only addresses slots 1..CATEGORICAL_COUNT.
                var slotCount = RawRecord.CATEGORICAL_COUNT + 1;
                _trainCounts = new Dictionary<int, int>[slotCount];
                _testCounts  = new Dictionary<int, int>[slotCount];
                for (var slot = 0; slot < slotCount; slot++)
                {
                    _trainCounts[slot] = new Dictionary<int, int>();
                    _testCounts[slot]  = new Dictionary<int, int>();
                }
            }

            foreach (var record in RawRecord.EnumerateBinLines(path))
            {
                for (var catNo = 1; catNo <= RawRecord.CATEGORICAL_COUNT; catNo++)
                {
                    // Categorical values follow the numeric columns inside record.Values.
                    var value = record.Values[RawRecord.NUMERIC_COUNT + catNo - 1];
                    IncFeature(_trainCounts[catNo], _testCounts[catNo], value, test, checkValues: checkValues);
                }
            }

            var counts = test ? _testCounts : _trainCounts;

            for (var catNo = 1; catNo <= RawRecord.CATEGORICAL_COUNT; catNo++)
            {
                Console.WriteLine("CAT : " + catNo + " : " + counts[catNo].Count);
                Console.WriteLine("  MISSING : " + GetCount(counts, catNo, Constants.VALUE_MISSING));
                Console.WriteLine("  TESTNOTTRAIN : " + GetCount(counts, catNo, Constants.VALUE_TESTNOTTRAIN));
                Console.WriteLine("  TOOLOWCOUNT : " + GetCount(counts, catNo, Constants.VALUE_LOWFREQUENCY));
                Console.WriteLine("  TRAINNOTTEST : " + GetCount(counts, catNo, Constants.VALUE_TRAINNOTTEST));
            }

            var distinctTotal = counts.Sum(table => table.Count);
            Console.WriteLine("Total : " + " : " + distinctTotal);
        }
Beispiel #5
0
        /// <summary>
        /// Computes the mean of each of the first <c>NUMERIC_COUNT</c> value columns over all
        /// records read from <paramref name="srcPaths"/>. Only values strictly greater than zero
        /// contribute to a column's total and count. Also prints how many records had a non-zero
        /// label out of the total number of records read.
        /// </summary>
        /// <param name="srcPaths">Files enumerated through <c>RawRecord.EnumerateBinLines</c>.</param>
        /// <returns>
        /// An array of length <c>NUMERIC_COUNT</c>. A column with no positive values yields NaN
        /// (0 divided by 0).
        /// </returns>
        public static float[] GetMeans(params string[] srcPaths)
        {
            var means       = new float[NUMERIC_COUNT];
            var totals      = new double[NUMERIC_COUNT];
            var counts      = new int[NUMERIC_COUNT];
            var label1count = 0;
            var recordCount = 0;

            foreach (var srcPath in srcPaths)
            {
                foreach (var src in RawRecord.EnumerateBinLines(srcPath))
                {
                    if (src.Label != 0)
                    {
                        label1count++;
                    }
                    recordCount++;
                    for (var i = 0; i < NUMERIC_COUNT; i++)
                    {
                        var val = src.Values[i];
                        if (val > 0)
                        {
                            totals[i] += val;
                            counts[i] += 1;
                        }
                    }
                }
            }

            Console.WriteLine("1 labels : " + label1count + "//" + recordCount);
            for (var i = 0; i < NUMERIC_COUNT; i++)
            {
                // Divide in double and narrow once: casting the total to float first would
                // discard low-order digits as soon as it exceeds float's 24-bit mantissa.
                means[i] = (float)(totals[i] / counts[i]);
            }
            return means;
        }