Ejemplo n.º 1
0
        /// <summary>
        /// Computes, per numeric column, the square root of the mean squared
        /// deviation from the supplied mean, over every record in the given files.
        /// </summary>
        /// <param name="means">Per-column means; indexed 0..RawRecord.NUMERIC_COUNT-1.</param>
        /// <param name="srcPaths">Binary record files read through <c>OneHotRecord.EnumerateBinLines</c>.</param>
        /// <returns>
        /// One value per numeric column. NaN entries in the data are skipped; a column
        /// with no non-NaN observation yields NaN (0 / 0).
        /// </returns>
        public static float[] GetStdDevs(float[] means, params string[] srcPaths)
        {
            var sumOfSquares = new double[RawRecord.NUMERIC_COUNT];
            var observed     = new int[RawRecord.NUMERIC_COUNT];

            foreach (var path in srcPaths)
            {
                foreach (var record in OneHotRecord.EnumerateBinLines(path))
                {
                    for (var col = 0; col < RawRecord.NUMERIC_COUNT; col++)
                    {
                        var value = (double)record.NumericData[col];
                        if (double.IsNaN(value))
                        {
                            // Missing value: contributes neither to the sum nor to the count.
                            continue;
                        }

                        var deviation = value - (double)means[col];
                        sumOfSquares[col] += deviation * deviation;
                        observed[col]++;
                    }
                }
            }

            var stdDevs = new float[RawRecord.NUMERIC_COUNT];
            for (var col = 0; col < RawRecord.NUMERIC_COUNT; col++)
            {
                // Divides by the observation count itself (not count - 1).
                stdDevs[col] = (float)Math.Sqrt(sumOfSquares[col] / (double)observed[col]);
            }
            return stdDevs;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Lazily reads records from a binary file, one <c>OneHotRecord</c> per iteration,
        /// until the record constructor throws <see cref="EndOfStreamException"/>.
        /// </summary>
        /// <param name="path">Path of the binary file to read.</param>
        /// <returns>A deferred sequence of the records found in the file.</returns>
        /// <remarks>
        /// The file is opened when enumeration starts and is released when the
        /// enumerator is disposed, including when the caller stops iterating early
        /// or an exception escapes. A progress line is printed every 1,000,000 records.
        /// </remarks>
        public static IEnumerable <OneHotRecord> EnumerateBinLines(string path)
        {
            // The using blocks compile to try/finally inside the iterator, so the
            // handle is closed on enumerator disposal, not only when the loop completes.
            using (var fileStream = File.OpenRead(path))
            using (var reader = new BinaryReader(fileStream))
            {
                var lineNo = 0;

                while (true)
                {
                    lineNo++;

                    // The read sits in its own try/catch because "yield return"
                    // is not permitted inside a try block that has a catch clause.
                    OneHotRecord rec;
                    try
                    {
                        rec = new OneHotRecord(reader);
                    }
                    catch (EndOfStreamException)
                    {
                        // End of data is signalled by the reader running out of bytes.
                        rec = null;
                    }
                    if (rec == null)
                    {
                        break;
                    }

                    yield return(rec);

                    if (lineNo % 1000000 == 0)
                    {
                        Console.WriteLine("Line :  " + lineNo);
                    }
                }
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Standardizes the numeric columns of the train and test files and writes the
        /// results, deflate-compressed, to the corresponding destination files.
        /// </summary>
        /// <param name="srcTrainPath">Binary train file to read.</param>
        /// <param name="srcTestPath">Binary test file to read.</param>
        /// <param name="dstTrainPath">Destination for the standardized train records (overwritten if present).</param>
        /// <param name="dstTestPath">Destination for the standardized test records (overwritten if present).</param>
        /// <remarks>
        /// Means and standard deviations are computed over both source files together.
        /// Each value becomes (value - mean) / stddev clamped to [-3, 3]; NaN values,
        /// and values in columns whose standard deviation is not positive, become 0.
        /// </remarks>
        public static void ScaleNumericValues(string srcTrainPath, string srcTestPath, string dstTrainPath, string dstTestPath)
        {
            Console.WriteLine("Computing means");
            var means = OneHotRecord.GetMeans(srcTrainPath, srcTestPath);

            Console.WriteLine("Computing stddevs");
            var stdevs = OneHotRecord.GetStdDevs(means, srcTrainPath, srcTestPath);

            // Keep source and destination as real pairs; joining them with a
            // separator character would break on any path containing that character.
            var paths = new[]
            {
                new[] { srcTrainPath, dstTrainPath },
                new[] { srcTestPath, dstTestPath }
            };

            foreach (var pathItems in paths)
            {
                var srcPath = pathItems[0];
                var dstPath = pathItems[1];

                if (File.Exists(dstPath))
                {
                    File.Delete(dstPath);
                }

                // Disposal runs innermost-first (writer, deflate stream, file), which
                // flushes the compressed tail and closes the file even on exceptions.
                using (var fileStream = File.OpenWrite(dstPath))
                using (var compressedStream = new DeflateStream(fileStream, CompressionMode.Compress))
                using (var writer = new BinaryWriter(compressedStream))
                {
                    Console.WriteLine("Standardizing" + Path.GetFileName(srcPath));
                    foreach (var rec in OneHotRecord.EnumerateBinLines(srcPath))
                    {
                        for (var i = 0; i < RawRecord.NUMERIC_COUNT; i++)
                        {
                            var val   = rec.NumericData[i];
                            var stdev = stdevs[i];

                            // A zero (or NaN) stddev would turn the division into NaN,
                            // which the clamp below cannot catch; write 0 like for missing values.
                            if (float.IsNaN(val) || !(stdev > 0f))
                            {
                                rec.NumericData[i] = 0f;
                                continue;
                            }

                            var newVal = (val - means[i]) / stdev;
                            rec.NumericData[i] = Math.Max(-3f, Math.Min(3f, newVal));
                        }
                        rec.WriteBinary(writer);
                    }
                    writer.Flush();
                }
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Computes the mean of every numeric column over all records in the given files.
        /// </summary>
        /// <param name="srcPaths">Binary record files read through <c>OneHotRecord.EnumerateBinLines</c>.</param>
        /// <returns>
        /// One mean per numeric column. NaN entries are skipped; a column with no
        /// non-NaN observation yields NaN (0 / 0).
        /// </returns>
        /// <remarks>
        /// Also prints the number of records with a non-zero label and the total record count.
        /// </remarks>
        public static float[] GetMeans(params string[] srcPaths)
        {
            var means       = new float[RawRecord.NUMERIC_COUNT];
            var totals      = new double[RawRecord.NUMERIC_COUNT];
            var counts      = new int[RawRecord.NUMERIC_COUNT];
            var label1count = 0;
            var recordCount = 0;

            foreach (var srcPath in srcPaths)
            {
                foreach (var src in OneHotRecord.EnumerateBinLines(srcPath))
                {
                    if (src.Label != 0)
                    {
                        label1count++;
                    }
                    recordCount++;
                    for (var i = 0; i < RawRecord.NUMERIC_COUNT; i++)
                    {
                        var val = src.NumericData[i];
                        if (!float.IsNaN(val))
                        {
                            totals[i] += val;
                            counts[i] += 1;
                        }
                    }
                }
            }

            Console.WriteLine("Labels : " + label1count + "//" + recordCount);
            for (var i = 0; i < RawRecord.NUMERIC_COUNT; i++)
            {
                // Divide in double and narrow once: casting the operands to float first
                // rounds large totals and any count above 2^24 before the division.
                means[i] = (float)(totals[i] / counts[i]);
            }
            return(means);
        }
        /// <summary>
        /// Builds a <c>OneHotRecord</c> from a raw record: copies label and id, fills the
        /// numeric columns, and encodes each categorical column either through the
        /// categorical index or through a signed hash bucket.
        /// </summary>
        /// <param name="raw">Source record; <c>raw.Values</c> holds the numeric columns followed by the categorical ones.</param>
        /// <param name="recordIndex">Not used by this method.</param>
        /// <param name="isTestSet">Not used by this method.</param>
        /// <param name="encodeMissingValues">When false, categorical values equal to <c>Constants.VALUE_MISSING</c> are skipped.</param>
        /// <param name="encodeTestNotrainAs">Replacement for categorical values equal to <c>Constants.VALUE_TESTNOTTRAIN</c>.</param>
        /// <param name="logTransformNumericValues">When true, non-zero numeric values are replaced by (int)(log(value + offset) * 100).</param>
        /// <returns>The converted record.</returns>
        public static OneHotRecord ConvertRecord(RawRecord raw, int recordIndex, bool isTestSet, bool encodeMissingValues = true, int encodeTestNotrainAs = Constants.VALUE_MISSING, bool logTransformNumericValues = false)
        {
            var res = new OneHotRecord();

            res.Label = raw.Label;
            res.Id    = raw.Id;
            for (short i = 0; i < RawRecord.NUMERIC_COUNT; i++)
            {
                var colNo = i + 1;
                var val   = raw.Values[i];
                if (val == Int32.MinValue)
                {
                    // Int32.MinValue marks a missing numeric value.
                    res.NumericData[i] = float.NaN;
                    // Register N/A
                    res.SetNA(i);
                    continue;
                }

                if (val != 0 && logTransformNumericValues)
                {
                    // Offset is 2, or 4 for column 2, before taking the logarithm.
                    // NOTE(review): values at or below -offset would make Math.Log
                    // return NaN/-Infinity; confirm such inputs cannot occur.
                    val += 2;
                    if (colNo == 2)
                    {
                        val += 2;
                    }
                    val = (int)(Math.Log(val) * 100d);
                }
                res.NumericData[i] = val;
            }

            bool isNew = false;

            for (short catNo = 0; catNo < RawRecord.CATEGORICAL_COUNT; catNo++)
            {
                var rawVal = raw.Values[RawRecord.NUMERIC_COUNT + catNo];

                // Recode testnottrain
                if (rawVal == Constants.VALUE_TESTNOTTRAIN)
                {
                    rawVal = encodeTestNotrainAs;
                }
                // Skip missing values ?
                if ((rawVal == Constants.VALUE_MISSING) && (!encodeMissingValues))
                {
                    continue;
                }

                if (_categoricalIdices.ContainsKey(catNo + 1))
                {
                    var catVal = GetCategorical(catNo + 1, rawVal, out isNew);
                    res.SetCategorical(catNo + 1, catVal);
                }
                else
                {
                    // Hashing trick: the sign of the hash picks +1/-1, its magnitude picks the bucket.
                    var   hash  = GetMurmurHash(catNo, rawVal);
                    sbyte value = 1;
                    if (hash < 0)
                    {
                        value = -1;
                    }
                    // Take the remainder before the absolute value: negating the hash
                    // first overflows for the minimum value and would leave the index negative.
                    var hashIndex = hash % Constants.HASH_SPACE_SIZE;
                    if (hashIndex < 0)
                    {
                        hashIndex = -hashIndex;
                    }
                    res.StoreHashedValue(hashIndex, value);
                }
            }

            return(res);
        }
        /// <summary>
        /// Builds a <c>OneHotRecord</c> from a raw record: copies label and id, fills the
        /// numeric columns, and encodes each categorical column either through the
        /// categorical index or through a signed hash bucket.
        /// </summary>
        /// <param name="raw">Source record; <c>raw.Values</c> holds the numeric columns followed by the categorical ones.</param>
        /// <param name="recordIndex">Not used by this method.</param>
        /// <param name="isTestSet">Not used by this method.</param>
        /// <param name="encodeMissingValues">When false, categorical values equal to <c>Constants.VALUE_MISSING</c> are skipped.</param>
        /// <param name="encodeTestNotrainAs">Replacement for categorical values equal to <c>Constants.VALUE_TESTNOTTRAIN</c>.</param>
        /// <param name="logTransformNumericValues">When true, non-zero numeric values are replaced by (int)(log(value + offset) * 100).</param>
        /// <returns>The converted record.</returns>
        public static OneHotRecord ConvertRecord(RawRecord raw, int recordIndex, bool isTestSet, bool encodeMissingValues = true, int encodeTestNotrainAs = Constants.VALUE_MISSING, bool logTransformNumericValues = false)
        {
            var res = new OneHotRecord();
            res.Label = raw.Label;
            res.Id = raw.Id;
            for (short i = 0; i < RawRecord.NUMERIC_COUNT; i++)
            {
                var colNo = i + 1;
                var val = raw.Values[i];
                if (val == Int32.MinValue)
                {
                    // Int32.MinValue marks a missing numeric value.
                    res.NumericData[i] = float.NaN;
                    // Register N/A
                    res.SetNA(i);
                    continue;
                }

                if (val != 0 && logTransformNumericValues)
                {
                    // Offset is 2, or 4 for column 2, before taking the logarithm.
                    // NOTE(review): values at or below -offset would make Math.Log
                    // return NaN/-Infinity; confirm such inputs cannot occur.
                    val += 2;
                    if (colNo == 2) val += 2;
                    val = (int)(Math.Log(val) * 100d);
                }
                res.NumericData[i] = val;
            }

            bool isNew = false;
            for (short catNo = 0; catNo < RawRecord.CATEGORICAL_COUNT; catNo++)
            {
                var rawVal = raw.Values[RawRecord.NUMERIC_COUNT + catNo];

                // Recode testnottrain
                if (rawVal == Constants.VALUE_TESTNOTTRAIN) rawVal = encodeTestNotrainAs;
                // Skip missing values ?
                if ((rawVal == Constants.VALUE_MISSING) && (!encodeMissingValues)) continue;

                if (_categoricalIdices.ContainsKey(catNo + 1))
                {
                    var catVal = GetCategorical(catNo + 1, rawVal, out isNew);
                    res.SetCategorical(catNo + 1, catVal);
                }
                else
                {
                    // Hashing trick: the sign of the hash picks +1/-1, its magnitude picks the bucket.
                    var hash = GetMurmurHash(catNo, rawVal);
                    sbyte value = 1;
                    if (hash < 0) value = -1;

                    // Take the remainder before the absolute value: negating the hash
                    // first overflows for the minimum value and would leave the index negative.
                    var hashIndex = hash % Constants.HASH_SPACE_SIZE;
                    if (hashIndex < 0) hashIndex = -hashIndex;
                    res.StoreHashedValue(hashIndex, value);
                }
            }

            return res;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Lazily reads records from a binary file, one <c>OneHotRecord</c> per iteration,
        /// until the record constructor throws <see cref="EndOfStreamException"/>.
        /// </summary>
        /// <param name="path">Path of the binary file to read.</param>
        /// <returns>A deferred sequence of the records found in the file.</returns>
        /// <remarks>
        /// The file is opened when enumeration starts and is released when the
        /// enumerator is disposed, including when the caller stops iterating early
        /// or an exception escapes. A progress line is printed every 1,000,000 records.
        /// </remarks>
        public static IEnumerable<OneHotRecord> EnumerateBinLines(string path)
        {
            // The using blocks compile to try/finally inside the iterator, so the
            // handle is closed on enumerator disposal, not only when the loop completes.
            using (var fileStream = File.OpenRead(path))
            using (var reader = new BinaryReader(fileStream))
            {
                var lineNo = 0;
                while (true)
                {
                    lineNo++;

                    // The read sits in its own try/catch because "yield return"
                    // is not permitted inside a try block that has a catch clause.
                    OneHotRecord rec;
                    try
                    {
                        rec = new OneHotRecord(reader);
                    }
                    catch (EndOfStreamException)
                    {
                        // End of data is signalled by the reader running out of bytes.
                        rec = null;
                    }
                    if (rec == null) break;

                    yield return rec;
                    if (lineNo % 1000000 == 0) Console.WriteLine("Line :  " + lineNo);
                }
            }
        }