/// <summary>
/// Computes the per-column standard deviation of the numeric data around the
/// supplied means, over every record enumerated from the given files.
/// </summary>
/// <param name="means">
/// Mean of each numeric column; indexed by column for every column that has at
/// least one non-NaN value.
/// </param>
/// <param name="srcPaths">Paths handed to <c>OneHotRecord.EnumerateBinLines</c>.</param>
/// <returns>
/// An array of <c>RawRecord.NUMERIC_COUNT</c> values. Each entry is the square root
/// of the mean squared deviation (dividing by the number of non-NaN values, not
/// by that number minus one). A column with no non-NaN values yields NaN.
/// </returns>
public static float[] GetStdDevs(float[] means, params string[] srcPaths)
{
    var columnCount = RawRecord.NUMERIC_COUNT;
    var observed = new int[columnCount];
    var sumOfSquares = new double[columnCount];
    var stdDevs = new float[columnCount];

    foreach (var path in srcPaths)
    {
        foreach (var record in OneHotRecord.EnumerateBinLines(path))
        {
            for (var col = 0; col < columnCount; col++)
            {
                var value = (double)record.NumericData[col];

                // NaN values are skipped and do not contribute to the count.
                if (double.IsNaN(value))
                {
                    continue;
                }

                var delta = value - (double)means[col];
                sumOfSquares[col] += delta * delta;
                observed[col] += 1;
            }
        }
    }

    for (var col = 0; col < columnCount; col++)
    {
        stdDevs[col] = (float)Math.Sqrt(sumOfSquares[col] / (double)observed[col]);
    }

    return stdDevs;
}
/// <summary>
/// Standardizes the numeric columns of the train and test files and writes the
/// results to the corresponding destination files.
/// </summary>
/// <remarks>
/// Means and standard deviations are computed over both source files together.
/// Each non-NaN value is replaced in place by <c>(value - mean) / stddev</c>, clamped
/// to the range [-3, 3]. NaN values are replaced by 0, as are values of a column
/// whose standard deviation is zero or NaN (the division would otherwise produce
/// NaN or infinity). Every record is then written with <c>WriteBinary</c> through a
/// Deflate-compressed stream. An existing destination file is deleted first.
/// </remarks>
/// <param name="srcTrainPath">Source file for the training records.</param>
/// <param name="srcTestPath">Source file for the test records.</param>
/// <param name="dstTrainPath">Destination file for the standardized training records.</param>
/// <param name="dstTestPath">Destination file for the standardized test records.</param>
public static void ScaleNumericValues(string srcTrainPath, string srcTestPath, string dstTrainPath, string dstTestPath)
{
    Console.WriteLine("Computing means");
    var means = OneHotRecord.GetMeans(srcTrainPath, srcTestPath);
    Console.WriteLine("Computing stddevs");
    var stdevs = OneHotRecord.GetStdDevs(means, srcTrainPath, srcTestPath);

    // Keep sources and destinations in parallel arrays rather than joining and
    // re-splitting them on a separator character that a path could contain.
    var srcPaths = new[] { srcTrainPath, srcTestPath };
    var dstPaths = new[] { dstTrainPath, dstTestPath };

    for (var p = 0; p < srcPaths.Length; p++)
    {
        var srcPath = srcPaths[p];
        var dstPath = dstPaths[p];

        // File.OpenWrite does not truncate, so remove any previous output first.
        if (File.Exists(dstPath))
        {
            File.Delete(dstPath);
        }

        // The using blocks flush and close all three layers even if reading or
        // writing a record throws.
        using (var fileStream = File.OpenWrite(dstPath))
        using (var compressedStream = new DeflateStream(fileStream, CompressionMode.Compress))
        using (var writer = new BinaryWriter(compressedStream))
        {
            Console.WriteLine("Standardizing" + Path.GetFileName(srcPath));

            foreach (var rec in OneHotRecord.EnumerateBinLines(srcPath))
            {
                for (var i = 0; i < RawRecord.NUMERIC_COUNT; i++)
                {
                    var val = rec.NumericData[i];
                    var stdev = stdevs[i];

                    // !(stdev > 0f) is true for both a zero and a NaN deviation.
                    if (float.IsNaN(val) || !(stdev > 0f))
                    {
                        rec.NumericData[i] = 0f;
                        continue;
                    }

                    var newVal = (val - means[i]) / stdev;
                    if (newVal > 3f)
                    {
                        newVal = 3f;
                    }
                    if (newVal < -3f)
                    {
                        newVal = -3f;
                    }
                    rec.NumericData[i] = newVal;
                }

                rec.WriteBinary(writer);
            }
        }
    }
}
/// <summary>
/// Computes the mean of each numeric column over every record enumerated from
/// the given files, ignoring NaN values.
/// </summary>
/// <remarks>
/// Also prints to the console the number of records whose <c>Label</c> is non-zero
/// and the total number of records.
/// </remarks>
/// <param name="srcPaths">Paths handed to <c>OneHotRecord.EnumerateBinLines</c>.</param>
/// <returns>
/// An array of <c>RawRecord.NUMERIC_COUNT</c> means. A column with no non-NaN
/// values yields NaN.
/// </returns>
public static float[] GetMeans(params string[] srcPaths)
{
    var means = new float[RawRecord.NUMERIC_COUNT];
    var totals = new double[RawRecord.NUMERIC_COUNT];
    var counts = new int[RawRecord.NUMERIC_COUNT];
    var label1count = 0;
    var recordCount = 0;

    foreach (var srcPath in srcPaths)
    {
        foreach (var src in OneHotRecord.EnumerateBinLines(srcPath))
        {
            if (src.Label != 0)
            {
                label1count++;
            }
            recordCount++;

            for (var i = 0; i < RawRecord.NUMERIC_COUNT; i++)
            {
                var val = src.NumericData[i];
                if (!float.IsNaN(val))
                {
                    totals[i] += val;
                    counts[i] += 1;
                }
            }
        }
    }

    Console.WriteLine("Labels : " + label1count + "//" + recordCount);

    for (var i = 0; i < RawRecord.NUMERIC_COUNT; i++)
    {
        // Divide in double and narrow once at the end. Casting the total and the
        // count to float before dividing discards the accumulator's precision
        // and rounds counts larger than 2^24.
        means[i] = (float)(totals[i] / counts[i]);
    }

    return means;
}