コード例 #1
0
        /// <summary>
        /// Computes, per numeric column, the standard deviation of the values around the
        /// supplied means, reading every record of every given binary file.
        /// </summary>
        /// <param name="means">Per-column means; indexed by the same column index as the record values.</param>
        /// <param name="srcPaths">Binary files to scan via <c>RawRecord.EnumerateBinLines</c>.</param>
        /// <returns>
        /// An array of <c>NUMERIC_COUNT</c> values, each <c>sqrt(sum((v - mean)^2) / n)</c> over the
        /// contributing values. A column without any contributing value yields NaN (0 / 0).
        /// </returns>
        /// <remarks>Only values strictly greater than zero contribute to the sums and counts.</remarks>
        public static float[] GetStdDevs(float[] means, params string[] srcPaths)
        {
            var sampleCounts = new int[NUMERIC_COUNT];
            var sumOfSquares = new double[NUMERIC_COUNT];
            var stdDevs      = new float[NUMERIC_COUNT];

            foreach (var path in srcPaths)
            {
                foreach (var record in RawRecord.EnumerateBinLines(path))
                {
                    for (var col = 0; col < NUMERIC_COUNT; col++)
                    {
                        var value = (double)record.Values[col];
                        if (value > 0)
                        {
                            // Accumulate the squared deviation from the column mean in double precision.
                            var deviation = value - (double)means[col];
                            sumOfSquares[col] += deviation * deviation;
                            sampleCounts[col]++;
                        }
                    }
                }
            }

            for (var col = 0; col < NUMERIC_COUNT; col++)
            {
                var variance = sumOfSquares[col] / (double)sampleCounts[col];
                stdDevs[col] = (float)Math.Sqrt(variance);
            }
            return stdDevs;
        }
コード例 #2
0
        /// <summary>
        /// Converts every raw record read from <paramref name="rawSrcPath"/> with <c>ConvertRecord</c>
        /// and writes the results in binary form to <paramref name="oneHotDstPath"/>.
        /// </summary>
        /// <param name="rawSrcPath">Binary source file, read via <c>RawRecord.EnumerateBinLines</c>.</param>
        /// <param name="oneHotDstPath">Destination file; deleted first when it already exists.</param>
        /// <param name="isTestSet">Selects the "test"/"train" label in the console message and is forwarded to <c>ConvertRecord</c>.</param>
        /// <param name="encodeMissingValues">Forwarded to <c>ConvertRecord</c>.</param>
        /// <param name="encodeTestNotrainAs">Forwarded to <c>ConvertRecord</c>.</param>
        /// <param name="logTransformNumericValues">Forwarded to <c>ConvertRecord</c>.</param>
        /// <remarks>
        /// Resets the static <c>_hashIndices</c> and <c>_categoricalIdices</c> dictionaries before converting.
        /// </remarks>
        protected static void ConvertRawToOneHot(string rawSrcPath, string oneHotDstPath, bool isTestSet, bool encodeMissingValues = true, int encodeTestNotrainAs = Constants.VALUE_MISSING, bool logTransformNumericValues = false)
        {
            _hashIndices       = new Dictionary <string, int>();
            _categoricalIdices = new Dictionary <int, Dictionary <int, int> >();

            if (File.Exists(oneHotDstPath))
            {
                File.Delete(oneHotDstPath);
            }

            // The using blocks guarantee the file handle is released even when conversion throws.
            using (var stream = File.OpenWrite(oneHotDstPath))
            using (var writer = new BinaryWriter(stream))
            {
                var label = (isTestSet) ? "test" : "train";

                Console.WriteLine("Converting " + label + " records");

                var recNo = 0;

                foreach (var raw in RawRecord.EnumerateBinLines(rawSrcPath))
                {
                    // Pass the caller's encoding options through; previously they were dropped and
                    // ConvertRecord always ran with its own defaults.
                    var click = ConvertRecord(raw, recNo, isTestSet, encodeMissingValues, encodeTestNotrainAs, logTransformNumericValues);
                    click.WriteBinary(writer);
                    recNo++;
                }

                writer.Flush();
            }
        }
コード例 #3
0
        /// <summary>
        /// Lazily reads <see cref="RawRecord"/> instances from a Deflate-compressed binary file.
        /// </summary>
        /// <param name="path">Path of the compressed binary file to read.</param>
        /// <returns>
        /// A sequence yielding one record per successful <c>new RawRecord(reader)</c>; the sequence ends
        /// when that constructor throws <see cref="EndOfStreamException"/>.
        /// </returns>
        /// <remarks>
        /// A progress line is written to the console after every 1,000,000th record has been consumed.
        /// The underlying streams are disposed when enumeration finishes, fails, or is abandoned
        /// (the enumerator is disposed), not only when the end of the file is reached.
        /// </remarks>
        public static IEnumerable <RawRecord> EnumerateBinLines(string path)
        {
            using (var fileStream = File.OpenRead(path))
            using (var deflateStream = new DeflateStream(fileStream, CompressionMode.Decompress))
            using (var reader = new BinaryReader(deflateStream))
            {
                var lineNo = 0;

                while (true)
                {
                    lineNo++;

                    RawRecord rec;
                    // The read is kept in its own try/catch because C# forbids 'yield return'
                    // inside a try block that has a catch clause.
                    try
                    {
                        rec = new RawRecord(reader);
                    }
                    catch (EndOfStreamException)
                    {
                        rec = null;
                    }
                    if (rec == null)
                    {
                        break;
                    }
                    yield return(rec);

                    if (lineNo % 1000000 == 0)
                    {
                        Console.WriteLine("Line :  " + lineNo);
                    }
                }
            }
        }
コード例 #4
0
        /// <summary>
        /// Reads all <see cref="RawRecord"/> instances from a Deflate-compressed binary file into a list.
        /// </summary>
        /// <param name="path">Path of the compressed binary file to read.</param>
        /// <returns>
        /// The records in file order; reading stops when <c>new RawRecord(reader)</c> throws
        /// <see cref="EndOfStreamException"/>.
        /// </returns>
        /// <remarks>
        /// A progress line is written to the console after every 1,000,000th record.
        /// The streams are disposed even when reading fails with another exception.
        /// </remarks>
        public static List <RawRecord> LoadBin(string path)
        {
            var res = new List <RawRecord>();

            using (var fileStream = File.OpenRead(path))
            using (var deflateStream = new DeflateStream(fileStream, CompressionMode.Decompress))
            using (var reader = new BinaryReader(deflateStream))
            {
                var lineNo = 0;

                while (true)
                {
                    lineNo++;

                    RawRecord rec;
                    try
                    {
                        rec = new RawRecord(reader);
                    }
                    catch (EndOfStreamException)
                    {
                        // End of data: the record constructor ran out of bytes.
                        rec = null;
                    }
                    if (rec == null)
                    {
                        break;
                    }
                    res.Add(rec);
                    if (lineNo % 1000000 == 0)
                    {
                        Console.WriteLine("Line :  " + lineNo);
                    }
                }
            }
            return(res);
        }
コード例 #5
0
ファイル: RawRecord.cs プロジェクト: zhimingz/kaggle_criteo
        /// <summary>
        /// Lazily parses a text file into <see cref="RawRecord"/> instances, one per line.
        /// </summary>
        /// <param name="path">Path of the text file to read.</param>
        /// <returns>One record for every line except the first, in file order.</returns>
        /// <remarks>
        /// The first line is skipped (presumably a header row). A progress line is written to the
        /// console for every 1,000,000th line, just before that line's record is yielded.
        /// </remarks>
        public static IEnumerable<RawRecord> EnumerateCSVFile(string path)
        {
            var lineNumber = 0;
            foreach (var text in File.ReadLines(path))
            {
                lineNumber += 1;

                // Everything after the first line is data.
                if (lineNumber != 1)
                {
                    var record = new RawRecord(text);

                    var atProgressMark = (lineNumber % 1000000) == 0;
                    if (atProgressMark)
                    {
                        Console.WriteLine("Line :  " + lineNumber);
                    }
                    yield return record;
                }
            }
        }
コード例 #6
0
        /// <summary>
        /// Copies the records of <paramref name="srcPath"/> to <paramref name="dstPath"/> (Deflate-compressed),
        /// recoding categorical values according to the collected train/test counts.
        /// </summary>
        /// <param name="srcPath">Binary source file, read via <c>RawRecord.EnumerateBinLines</c>.</param>
        /// <param name="dstPath">Destination file; deleted first when it already exists.</param>
        /// <param name="test">Not used by this method.</param>
        /// <remarks>
        /// For each categorical value, the first matching rule wins:
        /// test count of 0 → <c>Constants.VALUE_TRAINNOTTEST</c>;
        /// train count of 0 → <c>Constants.VALUE_TESTNOTTRAIN</c>;
        /// train count below the category's entry in <c>_categoricalValueFrequencyFilter</c> →
        /// <c>Constants.VALUE_LOWFREQUENCY</c>; otherwise the value is left unchanged.
        /// NOTE(review): the counts are read with an indexer, so this presumably relies on every value
        /// having an entry in both count tables — confirm against IncFeature.
        /// </remarks>
        protected static void Process(string srcPath, string dstPath, bool test)
        {
            if (File.Exists(dstPath))
            {
                File.Delete(dstPath);
            }

            // Disposing the writer flushes it and closes the deflate and file streams beneath it;
            // the using blocks make sure that also happens when a lookup or write throws.
            using (var fileStream = File.OpenWrite(dstPath))
            using (var deflateStream = new DeflateStream(fileStream, CompressionMode.Compress))
            using (var writer = new BinaryWriter(deflateStream))
            {
                foreach (var rec in RawRecord.EnumerateBinLines(srcPath))
                {
                    for (var i = 0; i < RawRecord.CATEGORICAL_COUNT; i++)
                    {
                        // Count tables are indexed by 1-based category number;
                        // categorical values follow the numeric ones in rec.Values.
                        var catNo      = i + 1;
                        var idx        = RawRecord.NUMERIC_COUNT + i;
                        var val        = rec.Values[idx];
                        var testCount  = _testCounts[catNo][val];
                        var trainCount = _trainCounts[catNo][val];

                        if (testCount == 0)
                        {
                            rec.Values[idx] = Constants.VALUE_TRAINNOTTEST;
                            continue;
                        }
                        if (trainCount == 0)
                        {
                            rec.Values[idx] = Constants.VALUE_TESTNOTTRAIN;
                            continue;
                        }

                        var threshHold = _categoricalValueFrequencyFilter[catNo];
                        if (trainCount < threshHold)
                        {
                            rec.Values[idx] = Constants.VALUE_LOWFREQUENCY;
                        }
                    }
                    rec.WriteBinary(writer);
                }
                writer.Flush();
            }
        }
コード例 #7
0
        /// <summary>
        /// Scans a binary record file, feeds every categorical value to <c>IncFeature</c>, and prints
        /// a per-category summary of the selected (train or test) count tables.
        /// </summary>
        /// <param name="path">Binary file to scan via <c>RawRecord.EnumerateBinLines</c>.</param>
        /// <param name="test">Passed to <c>IncFeature</c>; also selects which count tables are summarized.</param>
        /// <param name="checkValues">Passed through to <c>IncFeature</c>.</param>
        /// <remarks>
        /// The static count tables are created on first use (when <c>_trainCounts</c> is null) with
        /// <c>CATEGORICAL_COUNT + 1</c> slots each, so they can be indexed by 1-based category number.
        /// </remarks>
        public static void CountFeatures(string path, bool test, bool checkValues = true)
        {
            var setName = test ? "test" : "train";
            Console.WriteLine("Counting features.." + setName);

            if (_trainCounts == null)
            {
                var slots = RawRecord.CATEGORICAL_COUNT + 1;
                _trainCounts = new Dictionary <int, int> [slots];
                _testCounts  = new Dictionary <int, int> [slots];
                for (var slot = 0; slot < RawRecord.CATEGORICAL_COUNT + 1; slot++)
                {
                    _trainCounts[slot] = new Dictionary <int, int>();
                    _testCounts[slot]  = new Dictionary <int, int>();
                }
            }

            foreach (var record in RawRecord.EnumerateBinLines(path))
            {
                for (var catNo = 1; catNo <= RawRecord.CATEGORICAL_COUNT; catNo++)
                {
                    // Categorical values are stored after the numeric ones.
                    var val = record.Values[RawRecord.NUMERIC_COUNT + (catNo - 1)];
                    IncFeature(_trainCounts[catNo], _testCounts[catNo], val, test, checkValues: checkValues);
                }
            }

            // Summarize whichever set was just counted.
            var counts = test ? _testCounts : _trainCounts;

            for (var catNo = 1; catNo <= RawRecord.CATEGORICAL_COUNT; catNo++)
            {
                Console.WriteLine("CAT : " + catNo + " : " + counts[catNo].Count);
                Console.WriteLine("  MISSING : " + GetCount(counts, catNo, Constants.VALUE_MISSING));
                Console.WriteLine("  TESTNOTTRAIN : " + GetCount(counts, catNo, Constants.VALUE_TESTNOTTRAIN));
                Console.WriteLine("  TOOLOWCOUNT : " + GetCount(counts, catNo, Constants.VALUE_LOWFREQUENCY));
                Console.WriteLine("  TRAINNOTTEST : " + GetCount(counts, catNo, Constants.VALUE_TRAINNOTTEST));
            }
            Console.WriteLine("Total : " + " : " + counts.Sum(x => x.Count));
        }
コード例 #8
0
        /// <summary>
        /// Lazily turns the lines of a text file into <see cref="RawRecord"/> instances.
        /// </summary>
        /// <param name="path">Path of the text file to read.</param>
        /// <returns>One record per line, starting with the second line of the file.</returns>
        /// <remarks>
        /// Line 1 is never parsed (presumably it is a header row). Whenever the line number is a
        /// multiple of 1,000,000 a progress message is printed before the record is yielded.
        /// </remarks>
        public static IEnumerable <RawRecord> EnumerateCSVFile(string path)
        {
            var currentLine = 0;

            foreach (var content in File.ReadLines(path))
            {
                currentLine = currentLine + 1;

                var isFirstLine = (currentLine == 1);
                if (!isFirstLine)
                {
                    var parsed = new RawRecord(content);

                    if (currentLine % 1000000 == 0)
                    {
                        Console.WriteLine("Line :  " + currentLine);
                    }
                    yield return parsed;
                }
            }
        }
コード例 #9
0
        /// <summary>
        /// Reads the records of a CSV file and writes them, in binary form and Deflate-compressed,
        /// to <paramref name="binaryPath"/>.
        /// </summary>
        /// <param name="csvPath">Source file, read via <c>RawRecord.EnumerateCSVFile</c>.</param>
        /// <param name="binaryPath">Destination file; deleted first when it already exists.</param>
        /// <remarks>
        /// The output streams are disposed even when parsing or writing throws, so the destination
        /// file handle is never left open.
        /// </remarks>
        public static void ConvertCSVToBinary(string csvPath, string binaryPath)
        {
            Console.WriteLine("Converting CSV to binary");
            if (File.Exists(binaryPath))
            {
                File.Delete(binaryPath);
            }

            // Disposing the writer flushes it and closes the deflate and file streams beneath it.
            using (var fileStream = File.OpenWrite(binaryPath))
            using (var deflateStream = new DeflateStream(fileStream, CompressionMode.Compress))
            using (var writer = new BinaryWriter(deflateStream))
            {
                foreach (var rawRecord in RawRecord.EnumerateCSVFile(csvPath))
                {
                    rawRecord.WriteBinary(writer);
                }

                writer.Flush();
            }
        }
コード例 #10
0
        /// <summary>
        /// Computes the mean of each numeric column over all records of the given binary files.
        /// </summary>
        /// <param name="srcPaths">Binary files to scan via <c>RawRecord.EnumerateBinLines</c>.</param>
        /// <returns>
        /// An array of <c>NUMERIC_COUNT</c> means. Only values strictly greater than zero contribute;
        /// a column without any contributing value yields NaN (0 / 0).
        /// </returns>
        /// <remarks>
        /// Also prints how many records have a non-zero label next to the total record count.
        /// </remarks>
        public static float[] GetMeans(params string[] srcPaths)
        {
            var means       = new float[NUMERIC_COUNT];
            var totals      = new double[NUMERIC_COUNT];
            var counts      = new int[NUMERIC_COUNT];
            var label1count = 0;
            var recordCount = 0;

            foreach (var srcPath in srcPaths)
            {
                foreach (var src in RawRecord.EnumerateBinLines(srcPath))
                {
                    if (src.Label != 0)
                    {
                        label1count++;
                    }
                    recordCount++;
                    for (var i = 0; i < NUMERIC_COUNT; i++)
                    {
                        var val = src.Values[i];
                        if (val > 0)
                        {
                            totals[i] += val;
                            counts[i] += 1;
                        }
                    }
                }
            }

            Console.WriteLine("1 labels : " + label1count + "//" + recordCount);
            for (var i = 0; i < NUMERIC_COUNT; i++)
            {
                // Divide in double precision and narrow once: casting the operands to float first
                // rounds both the total and any count above 2^24 before the division.
                means[i] = (float)(totals[i] / counts[i]);
            }
            return(means);
        }
コード例 #11
0
        /// <summary>
        /// Parses a text file into a list of <see cref="RawRecord"/> instances, one per line.
        /// </summary>
        /// <param name="path">Path of the text file to read.</param>
        /// <returns>The records for every line except the first, in file order.</returns>
        /// <remarks>
        /// The first line is skipped (presumably a header row). A progress line is written to the
        /// console for every 1,000,000th line.
        /// </remarks>
        public static List <RawRecord> LoadCSV(string path)
        {
            var records    = new List <RawRecord>();
            var lineNumber = 0;

            foreach (var text in File.ReadLines(path))
            {
                lineNumber += 1;

                // Only lines after the first one carry data.
                if (lineNumber > 1)
                {
                    var record = new RawRecord(text);

                    if ((lineNumber % 1000000) == 0)
                    {
                        Console.WriteLine("Line :  " + lineNumber);
                    }
                    records.Add(record);
                }
            }

            return records;
        }
コード例 #12
0
        /// <summary>
        /// Builds a <see cref="OneHotRecord"/> from a raw record: copies label and id, converts the
        /// numeric columns, then encodes each categorical column either through the categorical
        /// index table or through the hashing trick.
        /// </summary>
        /// <param name="raw">Record to convert.</param>
        /// <param name="recordIndex">Not used by this method.</param>
        /// <param name="isTestSet">Not used by this method.</param>
        /// <param name="encodeMissingValues">When false, categorical values equal to <c>Constants.VALUE_MISSING</c> are skipped.</param>
        /// <param name="encodeTestNotrainAs">Replacement for categorical values equal to <c>Constants.VALUE_TESTNOTTRAIN</c>.</param>
        /// <param name="logTransformNumericValues">When true, non-zero numeric values are replaced by <c>(int)(log(v + offset) * 100)</c>.</param>
        /// <returns>The converted record.</returns>
        public static OneHotRecord ConvertRecord(RawRecord raw, int recordIndex, bool isTestSet, bool encodeMissingValues = true, int encodeTestNotrainAs = Constants.VALUE_MISSING, bool logTransformNumericValues = false)
        {
            var converted = new OneHotRecord();

            converted.Label = raw.Label;
            converted.Id    = raw.Id;

            // ---- numeric columns ----
            for (short i = 0; i < RawRecord.NUMERIC_COUNT; i++)
            {
                var numeric = raw.Values[i];

                if (numeric == Int32.MinValue)
                {
                    // Int32.MinValue is stored as NaN and the column is registered as N/A.
                    converted.NumericData[i] = float.NaN;
                    converted.SetNA(i);
                    continue;
                }

                if (logTransformNumericValues && numeric != 0)
                {
                    numeric += 2;
                    // The second column (1-based) gets an additional offset of 2.
                    if (i + 1 == 2)
                    {
                        numeric += 2;
                    }
                    numeric = (int)(Math.Log(numeric) * 100d);
                }
                converted.NumericData[i] = numeric;
            }

            // ---- categorical columns ----
            for (short catNo = 0; catNo < RawRecord.CATEGORICAL_COUNT; catNo++)
            {
                var rawVal = raw.Values[RawRecord.NUMERIC_COUNT + catNo];

                // Recode "test but not train" values first, so the missing-value check sees the recoded value.
                if (rawVal == Constants.VALUE_TESTNOTTRAIN)
                {
                    rawVal = encodeTestNotrainAs;
                }
                if (!encodeMissingValues && (rawVal == Constants.VALUE_MISSING))
                {
                    continue;
                }

                var oneBasedCat = catNo + 1;
                if (!_categoricalIdices.ContainsKey(oneBasedCat))
                {
                    // Hashing trick: the sign of the hash picks the stored value (+1 / -1),
                    // its magnitude modulo the hash space picks the slot.
                    var   hash = GetMurmurHash(catNo, rawVal);
                    sbyte sign = 1;
                    if (hash < 0)
                    {
                        sign = -1;
                        hash = -hash;
                    }
                    converted.StoreHashedValue(hash % Constants.HASH_SPACE_SIZE, sign);
                    continue;
                }

                bool isNew;
                var  catVal = GetCategorical(oneBasedCat, rawVal, out isNew);
                converted.SetCategorical(oneBasedCat, catVal);
            }

            return converted;
        }
コード例 #13
0
ファイル: RawRecord.cs プロジェクト: zhimingz/kaggle_criteo
        /// <summary>
        /// Reads a text file and returns a <see cref="RawRecord"/> for each line after the first.
        /// </summary>
        /// <param name="path">Path of the text file to read.</param>
        /// <returns>The parsed records in file order.</returns>
        /// <remarks>
        /// Line 1 is not parsed (presumably it is a header row). A progress message is printed
        /// whenever the line number is a multiple of 1,000,000.
        /// </remarks>
        public static List<RawRecord> LoadCSV(string path)
        {
            var loaded = new List<RawRecord>();
            var currentLine = 0;
            foreach (var content in File.ReadLines(path))
            {
                currentLine = currentLine + 1;

                var isFirstLine = (currentLine == 1);
                if (isFirstLine)
                {
                    continue;
                }

                var parsed = new RawRecord(content);
                if (currentLine % 1000000 == 0)
                {
                    Console.WriteLine("Line :  " + currentLine);
                }
                loaded.Add(parsed);
            }

            return loaded;
        }
コード例 #14
0
ファイル: RawRecord.cs プロジェクト: zhimingz/kaggle_criteo
        /// <summary>
        /// Lazily reads <see cref="RawRecord"/> instances from a Deflate-compressed binary file.
        /// </summary>
        /// <param name="path">Path of the compressed binary file to read.</param>
        /// <returns>
        /// A sequence yielding one record per successful <c>new RawRecord(reader)</c>; the sequence ends
        /// when that constructor throws <see cref="EndOfStreamException"/>.
        /// </returns>
        /// <remarks>
        /// A progress line is written to the console after every 1,000,000th record has been consumed.
        /// The underlying streams are disposed when enumeration finishes, fails, or is abandoned
        /// (the enumerator is disposed), not only when the end of the file is reached.
        /// </remarks>
        public static IEnumerable<RawRecord> EnumerateBinLines(string path)
        {
            using (var fileStream = File.OpenRead(path))
            using (var deflateStream = new DeflateStream(fileStream, CompressionMode.Decompress))
            using (var reader = new BinaryReader(deflateStream))
            {
                var lineNo = 0;
                while (true)
                {
                    lineNo++;

                    RawRecord rec;
                    // The read is kept in its own try/catch because C# forbids 'yield return'
                    // inside a try block that has a catch clause.
                    try
                    {
                        rec = new RawRecord(reader);
                    }
                    catch (EndOfStreamException)
                    {
                        rec = null;
                    }
                    if (rec == null) break;
                    yield return rec;
                    if (lineNo % 1000000 == 0) Console.WriteLine("Line :  " + lineNo);
                }
            }
        }
コード例 #15
0
ファイル: RawRecord.cs プロジェクト: zhimingz/kaggle_criteo
 /// <summary>
 /// Reads all <see cref="RawRecord"/> instances from a Deflate-compressed binary file into a list.
 /// </summary>
 /// <param name="path">Path of the compressed binary file to read.</param>
 /// <returns>
 /// The records in file order; reading stops when <c>new RawRecord(reader)</c> throws
 /// <see cref="EndOfStreamException"/>.
 /// </returns>
 /// <remarks>
 /// A progress line is written to the console after every 1,000,000th record.
 /// The streams are disposed even when reading fails with another exception.
 /// </remarks>
 public static List<RawRecord> LoadBin(string path)
 {
     var res = new List<RawRecord>();
     using (var fileStream = File.OpenRead(path))
     using (var deflateStream = new DeflateStream(fileStream, CompressionMode.Decompress))
     using (var reader = new BinaryReader(deflateStream))
     {
         var lineNo = 0;
         while (true)
         {
             lineNo++;

             RawRecord rec;
             try
             {
                 rec = new RawRecord(reader);
             }
             catch (EndOfStreamException)
             {
                 // End of data: the record constructor ran out of bytes.
                 rec = null;
             }
             if (rec == null) break;
             res.Add(rec);
             if (lineNo % 1000000 == 0) Console.WriteLine("Line :  " + lineNo);
         }
     }
     return res;
 }
コード例 #16
0
        /// <summary>
        /// Converts a raw record into a <see cref="OneHotRecord"/>: label and id are copied, numeric
        /// columns are stored (optionally log-transformed), and categorical columns are encoded via
        /// the categorical index table when one exists for the column, otherwise via hashing.
        /// </summary>
        /// <param name="raw">Record to convert.</param>
        /// <param name="recordIndex">Not used by this method.</param>
        /// <param name="isTestSet">Not used by this method.</param>
        /// <param name="encodeMissingValues">When false, categorical values equal to <c>Constants.VALUE_MISSING</c> are skipped.</param>
        /// <param name="encodeTestNotrainAs">Replacement for categorical values equal to <c>Constants.VALUE_TESTNOTTRAIN</c>.</param>
        /// <param name="logTransformNumericValues">When true, non-zero numeric values are replaced by <c>(int)(log(v + offset) * 100)</c>.</param>
        /// <returns>The converted record.</returns>
        public static OneHotRecord ConvertRecord(RawRecord raw, int recordIndex, bool isTestSet, bool encodeMissingValues = true, int encodeTestNotrainAs = Constants.VALUE_MISSING, bool logTransformNumericValues = false)
        {
            var output = new OneHotRecord();
            output.Label = raw.Label;
            output.Id = raw.Id;

            for (short i = 0; i < RawRecord.NUMERIC_COUNT; i++)
            {
                var columnNumber = i + 1;
                var number = raw.Values[i];
                var isMissing = (number == Int32.MinValue);

                if (!isMissing)
                {
                    var transform = logTransformNumericValues && (number != 0);
                    if (transform)
                    {
                        // Offset before taking the logarithm; column 2 gets the offset twice.
                        number += 2;
                        if (columnNumber == 2)
                        {
                            number += 2;
                        }
                        number = (int)(Math.Log(number) * 100d);
                    }
                    output.NumericData[i] = number;
                }
                else
                {
                    // Int32.MinValue: store NaN and register the column as N/A.
                    output.NumericData[i] = float.NaN;
                    output.SetNA(i);
                }
            }

            bool wasAdded = false;
            for (short catNo = 0; catNo < RawRecord.CATEGORICAL_COUNT; catNo++)
            {
                var rawVal = raw.Values[RawRecord.NUMERIC_COUNT + catNo];

                // Recode "test but not train" values before the missing-value check.
                if (rawVal == Constants.VALUE_TESTNOTTRAIN)
                {
                    rawVal = encodeTestNotrainAs;
                }

                // Optionally leave missing values unencoded.
                var skip = (rawVal == Constants.VALUE_MISSING) && (!encodeMissingValues);
                if (skip)
                {
                    continue;
                }

                var hasIndexTable = _categoricalIdices.ContainsKey(catNo + 1);
                if (hasIndexTable)
                {
                    var encoded = GetCategorical(catNo + 1, rawVal, out wasAdded);
                    output.SetCategorical(catNo + 1, encoded);
                }
                else
                {
                    // Hashing trick: the hash sign selects +1 / -1, its magnitude selects the slot.
                    var hash = GetMurmurHash(catNo, rawVal);
                    sbyte signedOne = 1;
                    if (hash < 0)
                    {
                        signedOne = -1;
                        hash = -hash;
                    }
                    var slot = hash % Constants.HASH_SPACE_SIZE;
                    output.StoreHashedValue(slot, signedOne);
                }
            }

            return output;
        }