/// <summary>
/// Computes, for every numeric column, the standard deviation of the non-NaN values found in
/// the given binary record files, measured around the supplied per-column means.
/// </summary>
/// <param name="means">Per-column means; indexed by numeric column.</param>
/// <param name="srcPaths">Binary record files to scan; all files contribute to the same statistics.</param>
/// <returns>
/// One value per numeric column: sqrt(sum of squared deviations / number of non-NaN values).
/// A column without any non-NaN value yields NaN (0 / 0).
/// </returns>
public static float[] GetStdDevs(float[] means, params string[] srcPaths)
{
    var observed = new int[RawRecord.NUMERIC_COUNT];
    var sumOfSquares = new double[RawRecord.NUMERIC_COUNT];
    var result = new float[RawRecord.NUMERIC_COUNT];

    foreach (var path in srcPaths)
    {
        foreach (var record in OneHotRecord.EnumerateBinLines(path))
        {
            for (var col = 0; col < RawRecord.NUMERIC_COUNT; col++)
            {
                double value = record.NumericData[col];
                if (double.IsNaN(value))
                {
                    // Missing values take no part in the statistics.
                    continue;
                }

                double delta = value - means[col];
                sumOfSquares[col] += delta * delta;
                observed[col]++;
            }
        }
    }

    for (var col = 0; col < RawRecord.NUMERIC_COUNT; col++)
    {
        // Accumulation and division stay in double; only the final result is narrowed.
        result[col] = (float)Math.Sqrt(sumOfSquares[col] / observed[col]);
    }

    return result;
}
/// <summary>
/// Lazily reads <see cref="OneHotRecord"/> instances, one after another, from a binary file.
/// </summary>
/// <param name="path">Path of the binary file to read.</param>
/// <returns>
/// A sequence of records in file order. Enumeration ends as soon as constructing a record
/// throws <see cref="EndOfStreamException"/>.
/// </returns>
/// <remarks>
/// A progress line is written to the console every 1,000,000 records. The file is released
/// when the enumerator is disposed, which also covers consumers that stop early.
/// </remarks>
public static IEnumerable<OneHotRecord> EnumerateBinLines(string path)
{
    // The using blocks are compiled into the iterator's Dispose path, so the file handle is
    // released on early exit from a foreach and when record construction throws.
    using (var fileStream = File.OpenRead(path))
    using (var reader = new BinaryReader(fileStream))
    {
        var lineNo = 0;
        while (true)
        {
            lineNo++;

            OneHotRecord rec;
            try
            {
                rec = new OneHotRecord(reader);
            }
            catch (EndOfStreamException)
            {
                // Running out of bytes is the normal end-of-data signal.
                rec = null;
            }

            // The yield statements stay outside the try/catch: C# forbids yield return
            // inside a try block that has catch clauses.
            if (rec == null)
            {
                yield break;
            }

            yield return rec;

            if (lineNo % 1000000 == 0)
            {
                Console.WriteLine("Line : " + lineNo);
            }
        }
    }
}
/// <summary>
/// Standardizes the numeric columns of the train and test files and writes the results as
/// deflate-compressed binary files.
/// </summary>
/// <param name="srcTrainPath">Binary train file to read.</param>
/// <param name="srcTestPath">Binary test file to read.</param>
/// <param name="dstTrainPath">Destination of the standardized train file; replaced when it exists.</param>
/// <param name="dstTestPath">Destination of the standardized test file; replaced when it exists.</param>
/// <remarks>
/// Means and standard deviations are computed over both source files together. Each non-NaN
/// value becomes (value - mean) / stddev clamped to [-3, 3]; NaN values become 0. A column
/// whose standardized value is undefined (for instance 0 / 0 on a zero-variance column) is
/// also written as 0 rather than NaN.
/// </remarks>
public static void ScaleNumericValues(string srcTrainPath, string srcTestPath, string dstTrainPath, string dstTestPath)
{
    Console.WriteLine("Computing means");
    var means = OneHotRecord.GetMeans(srcTrainPath, srcTestPath);
    Console.WriteLine("Computing stddevs");
    var stdevs = OneHotRecord.GetStdDevs(means, srcTrainPath, srcTestPath);

    // Keep source/destination pairs as real pairs; joining them with a separator character
    // and splitting again would break on any path that contains that character.
    var jobs = new[]
    {
        new[] { srcTrainPath, dstTrainPath },
        new[] { srcTestPath, dstTestPath }
    };

    foreach (var job in jobs)
    {
        var srcPath = job[0];
        var dstPath = job[1];

        if (File.Exists(dstPath))
        {
            File.Delete(dstPath);
        }

        // Disposal order (writer, deflate stream, file) flushes the compressed tail and
        // releases the file handle even when reading or writing a record throws.
        using (var fileStream = File.OpenWrite(dstPath))
        using (var compressedStream = new DeflateStream(fileStream, CompressionMode.Compress))
        using (var writer = new BinaryWriter(compressedStream))
        {
            Console.WriteLine("Standardizing" + Path.GetFileName(srcPath));

            foreach (var rec in OneHotRecord.EnumerateBinLines(srcPath))
            {
                for (var i = 0; i < RawRecord.NUMERIC_COUNT; i++)
                {
                    var val = rec.NumericData[i];
                    if (float.IsNaN(val))
                    {
                        // Missing value: encode as 0.
                        rec.NumericData[i] = 0f;
                        continue;
                    }

                    var newVal = (val - means[i]) / stdevs[i];
                    if (float.IsNaN(newVal))
                    {
                        // Undefined quotient (e.g. 0 / 0 when the column has zero variance):
                        // NaN would slip through both clamps below, so write 0 instead.
                        newVal = 0f;
                    }
                    else if (newVal > 3f)
                    {
                        newVal = 3f;
                    }
                    else if (newVal < -3f)
                    {
                        newVal = -3f;
                    }

                    rec.NumericData[i] = newVal;
                }

                rec.WriteBinary(writer);
            }

            writer.Flush();
        }
    }
}
/// <summary>
/// Computes the mean of every numeric column over all records of the given binary files,
/// ignoring NaN values.
/// </summary>
/// <param name="srcPaths">Binary record files to scan; all files contribute to the same statistics.</param>
/// <returns>
/// One mean per numeric column. A column without any non-NaN value yields NaN (0 / 0).
/// </returns>
/// <remarks>
/// As a side effect, writes "Labels : {records with non-zero label}//{total records}" to the console.
/// </remarks>
public static float[] GetMeans(params string[] srcPaths)
{
    var means = new float[RawRecord.NUMERIC_COUNT];
    var totals = new double[RawRecord.NUMERIC_COUNT];
    var counts = new int[RawRecord.NUMERIC_COUNT];
    var label1count = 0;
    var recordCount = 0;

    foreach (var srcPath in srcPaths)
    {
        foreach (var src in OneHotRecord.EnumerateBinLines(srcPath))
        {
            if (src.Label != 0)
            {
                label1count++;
            }
            recordCount++;

            for (var i = 0; i < RawRecord.NUMERIC_COUNT; i++)
            {
                var val = src.NumericData[i];
                if (!float.IsNaN(val))
                {
                    totals[i] += val;
                    counts[i] += 1;
                }
            }
        }
    }

    Console.WriteLine("Labels : " + label1count + "//" + recordCount);

    for (var i = 0; i < RawRecord.NUMERIC_COUNT; i++)
    {
        // Divide in double and narrow once. Casting the operands to float first rounds the
        // total to 24 bits of precision, and counts above 2^24 are not exactly representable.
        means[i] = (float)(totals[i] / counts[i]);
    }

    return means;
}
/// <summary>
/// Converts a raw record into a <see cref="OneHotRecord"/>. Numeric columns are copied
/// (optionally log-transformed); categorical columns are either mapped through their
/// categorical index or folded into the hashed feature space.
/// </summary>
/// <param name="raw">Source record; <c>Values</c> holds the numeric columns first, then the categorical columns.</param>
/// <param name="recordIndex">Not used by this method.</param>
/// <param name="isTestSet">Not used by this method.</param>
/// <param name="encodeMissingValues">When false, categorical values equal to <c>Constants.VALUE_MISSING</c> are skipped.</param>
/// <param name="encodeTestNotrainAs">Value substituted for categorical values equal to <c>Constants.VALUE_TESTNOTTRAIN</c>.</param>
/// <param name="logTransformNumericValues">
/// When true, every non-zero numeric value v is replaced by (int)(ln(v + shift) * 100),
/// where shift is 4 for column number 2 and 2 for every other column.
/// </param>
/// <returns>The converted record, carrying the label and id of <paramref name="raw"/>.</returns>
public static OneHotRecord ConvertRecord(RawRecord raw, int recordIndex, bool isTestSet, bool encodeMissingValues = true, int encodeTestNotrainAs = Constants.VALUE_MISSING, bool logTransformNumericValues = false)
{
    var res = new OneHotRecord();
    res.Label = raw.Label;
    res.Id = raw.Id;

    for (short i = 0; i < RawRecord.NUMERIC_COUNT; i++)
    {
        var colNo = i + 1;
        var val = raw.Values[i];

        if (val == Int32.MinValue)
        {
            // Int32.MinValue marks a missing numeric value: store NaN and register N/A.
            res.NumericData[i] = float.NaN;
            res.SetNA(i);
            continue;
        }

        if (val != 0 && logTransformNumericValues)
        {
            // NOTE(review): assumes the shifted value is positive; Math.Log of a value <= 0
            // is not a finite number - confirm against the value range of the data.
            val += 2;
            if (colNo == 2)
            {
                val += 2;
            }
            val = (int)(Math.Log(val) * 100d);
        }

        res.NumericData[i] = val;
    }

    bool isNew = false;
    for (short catNo = 0; catNo < RawRecord.CATEGORICAL_COUNT; catNo++)
    {
        var rawVal = raw.Values[RawRecord.NUMERIC_COUNT + catNo];

        // Recode testnottrain
        if (rawVal == Constants.VALUE_TESTNOTTRAIN)
        {
            rawVal = encodeTestNotrainAs;
        }

        // Skip missing values ?
        if ((rawVal == Constants.VALUE_MISSING) && (!encodeMissingValues))
        {
            continue;
        }

        if (_categoricalIdices.ContainsKey(catNo + 1))
        {
            var catVal = GetCategorical(catNo + 1, rawVal, out isNew);
            res.SetCategorical(catNo + 1, catVal);
        }
        else
        {
            // Hashing trick: the sign of the hash selects +1/-1, its magnitude the bucket.
            var hash = GetMurmurHash(catNo, rawVal);
            sbyte value = 1;
            if (hash < 0)
            {
                value = -1;
            }

            // Take the remainder first and negate that. Negating the hash itself overflows
            // for the minimum value of its type and would leave a negative bucket index.
            // For every other hash the result equals (-hash) % HASH_SPACE_SIZE.
            var hashIndex = hash % Constants.HASH_SPACE_SIZE;
            if (hashIndex < 0)
            {
                hashIndex = -hashIndex;
            }

            res.StoreHashedValue(hashIndex, value);
        }
    }

    return res;
}
/// <summary>
/// Builds a <see cref="OneHotRecord"/> from a raw record: label and id are copied, numeric
/// columns are stored as floats (optionally log-transformed), and each categorical column is
/// encoded either through its categorical index or through feature hashing.
/// </summary>
/// <param name="raw">Source record; <c>Values</c> holds the numeric columns first, then the categorical columns.</param>
/// <param name="recordIndex">Not used by this method.</param>
/// <param name="isTestSet">Not used by this method.</param>
/// <param name="encodeMissingValues">When false, categorical values equal to <c>Constants.VALUE_MISSING</c> are skipped.</param>
/// <param name="encodeTestNotrainAs">Value substituted for categorical values equal to <c>Constants.VALUE_TESTNOTTRAIN</c>.</param>
/// <param name="logTransformNumericValues">
/// When true, every non-zero numeric value v is replaced by (int)(ln(v + shift) * 100),
/// where shift is 4 for column number 2 and 2 for every other column.
/// </param>
/// <returns>The converted record.</returns>
public static OneHotRecord ConvertRecord(RawRecord raw, int recordIndex, bool isTestSet, bool encodeMissingValues = true, int encodeTestNotrainAs = Constants.VALUE_MISSING, bool logTransformNumericValues = false)
{
    var res = new OneHotRecord();
    res.Label = raw.Label;
    res.Id = raw.Id;

    for (short i = 0; i < RawRecord.NUMERIC_COUNT; i++)
    {
        var colNo = i + 1;
        var val = raw.Values[i];

        if (val == Int32.MinValue)
        {
            // Register N/A: Int32.MinValue is the missing-value marker for numeric columns.
            res.NumericData[i] = float.NaN;
            res.SetNA(i);
        }
        else
        {
            if (logTransformNumericValues && val != 0)
            {
                // NOTE(review): presumably the shift keeps the argument of Math.Log positive;
                // verify against the actual minimum of each column.
                val += (colNo == 2) ? 4 : 2;
                val = (int)(Math.Log(val) * 100d);
            }

            res.NumericData[i] = val;
        }
    }

    bool isNew = false;
    for (short catNo = 0; catNo < RawRecord.CATEGORICAL_COUNT; catNo++)
    {
        var rawVal = raw.Values[RawRecord.NUMERIC_COUNT + catNo];

        // Recode testnottrain
        if (rawVal == Constants.VALUE_TESTNOTTRAIN)
        {
            rawVal = encodeTestNotrainAs;
        }

        // Skip missing values ?
        if (!encodeMissingValues && rawVal == Constants.VALUE_MISSING)
        {
            continue;
        }

        if (_categoricalIdices.ContainsKey(catNo + 1))
        {
            var catVal = GetCategorical(catNo + 1, rawVal, out isNew);
            res.SetCategorical(catNo + 1, catVal);
            continue;
        }

        // Hashing trick: negative hashes contribute -1, non-negative hashes +1.
        var hash = GetMurmurHash(catNo, rawVal);
        sbyte value = 1;
        if (hash < 0)
        {
            value = -1;
        }

        // The remainder keeps the sign of the hash, so folding the remainder to its absolute
        // value gives the same bucket as (-hash) % HASH_SPACE_SIZE. It also stays valid for
        // the minimum value of the hash type, whose negation overflows and remains negative.
        var hashIndex = hash % Constants.HASH_SPACE_SIZE;
        if (hashIndex < 0)
        {
            hashIndex = -hashIndex;
        }

        res.StoreHashedValue(hashIndex, value);
    }

    return res;
}
/// <summary>
/// Streams the <see cref="OneHotRecord"/> instances stored in a binary file, in file order.
/// </summary>
/// <param name="path">Path of the binary file to read.</param>
/// <returns>
/// A lazily evaluated sequence of records; it ends when constructing the next record throws
/// <see cref="EndOfStreamException"/>.
/// </returns>
/// <remarks>
/// Writes "Line : N" to the console after every 1,000,000th record. The underlying file is
/// closed when the enumerator is disposed, including when the caller abandons the
/// enumeration before the end of the file.
/// </remarks>
public static IEnumerable<OneHotRecord> EnumerateBinLines(string path)
{
    // using blocks are compiled into the iterator's Dispose path, so the handle is released
    // both on early exit from a foreach and when record construction throws.
    using (var fileStream = File.OpenRead(path))
    using (var reader = new BinaryReader(fileStream))
    {
        for (var lineNo = 1; ; lineNo++)
        {
            OneHotRecord rec = null;
            try
            {
                rec = new OneHotRecord(reader);
            }
            catch (EndOfStreamException)
            {
                // No more bytes: rec stays null and the loop ends below.
            }

            // yield return must stay outside the try/catch (language restriction).
            if (rec == null)
            {
                break;
            }

            yield return rec;

            if (lineNo % 1000000 == 0)
            {
                Console.WriteLine("Line : " + lineNo);
            }
        }
    }
}