/// <summary>
/// Reads every raw record from <paramref name="rawSrcPath"/>, converts each one with
/// <c>ConvertRecord</c>, and writes the converted records in binary form to
/// <paramref name="oneHotDstPath"/>.
/// </summary>
/// <remarks>
/// The static <c>_hashIndices</c> and <c>_categoricalIdices</c> tables are replaced with
/// fresh, empty dictionaries before conversion starts. An existing destination file is
/// deleted first so the output never contains stale trailing bytes.
/// </remarks>
/// <param name="rawSrcPath">Path of the binary raw-record file to read.</param>
/// <param name="oneHotDstPath">Path of the output file; overwritten if it exists.</param>
/// <param name="isTestSet">Forwarded to <c>ConvertRecord</c>; also selects the "test"/"train" console label.</param>
/// <param name="encodeMissingValues">Not read by this method; kept for caller compatibility.</param>
/// <param name="encodeTestNotrainAs">Not read by this method; kept for caller compatibility.</param>
/// <param name="logTransformNumericValues">Not read by this method; kept for caller compatibility.</param>
protected static void ConvertRawToOneHot(string rawSrcPath, string oneHotDstPath, bool isTestSet, bool encodeMissingValues = true, int encodeTestNotrainAs = Constants.VALUE_MISSING, bool logTransformNumericValues = false)
{
    _hashIndices = new Dictionary<string, int>();
    _categoricalIdices = new Dictionary<int, Dictionary<int, int>>();

    if (File.Exists(oneHotDstPath))
    {
        File.Delete(oneHotDstPath);
    }

    // The using statements guarantee the file handle is released even when
    // conversion or writing throws part-way through the input.
    using (var stream = File.OpenWrite(oneHotDstPath))
    using (var writer = new BinaryWriter(stream))
    {
        var label = (isTestSet) ? "test" : "train";
        Console.WriteLine("Converting " + label + " records");

        var recNo = 0;
        foreach (var raw in RawRecord.EnumerateBinLines(rawSrcPath))
        {
            var click = ConvertRecord(raw, recNo, isTestSet);
            click.WriteBinary(writer);
            recNo++;
        }

        writer.Flush();
    }
}
/// <summary>
/// Computes, for each of the first <c>NUMERIC_COUNT</c> value columns, the square root of
/// the mean squared deviation from the supplied mean, over all records in the given files.
/// </summary>
/// <remarks>
/// Only values strictly greater than zero take part; all other values are skipped.
/// The divisor is the number of contributing values (not that number minus one).
/// A column with no contributing values yields NaN, because 0.0 / 0 is NaN.
/// </remarks>
/// <param name="means">Per-column means, indexed like the numeric columns.</param>
/// <param name="srcPaths">Binary raw-record files to scan.</param>
/// <returns>An array of length <c>NUMERIC_COUNT</c> holding one deviation per column.</returns>
public static float[] GetStdDevs(float[] means, params string[] srcPaths)
{
    var sampleCounts = new int[NUMERIC_COUNT];
    var squaredDiffSums = new double[NUMERIC_COUNT];
    var stdDevs = new float[NUMERIC_COUNT];

    foreach (var path in srcPaths)
    {
        foreach (var record in RawRecord.EnumerateBinLines(path))
        {
            for (var col = 0; col < NUMERIC_COUNT; col++)
            {
                var value = (double)record.Values[col];

                // Skip anything that is not strictly positive.
                if (!(value > 0))
                {
                    continue;
                }

                var deviation = value - (double)means[col];
                squaredDiffSums[col] += deviation * deviation;
                sampleCounts[col] += 1;
            }
        }
    }

    for (var col = 0; col < NUMERIC_COUNT; col++)
    {
        var meanSquaredDiff = squaredDiffSums[col] / (double)sampleCounts[col];
        stdDevs[col] = (float)Math.Sqrt(meanSquaredDiff);
    }

    return stdDevs;
}
/// <summary>
/// Copies every raw record from <paramref name="srcPath"/> to a deflate-compressed file at
/// <paramref name="dstPath"/>, rewriting categorical values based on the previously
/// collected train/test counts.
/// </summary>
/// <remarks>
/// For each categorical column (1-based category number <c>catNo</c>), the value is replaced:
/// <list type="bullet">
/// <item>with <c>Constants.VALUE_TRAINNOTTEST</c> when its test count is 0;</item>
/// <item>otherwise with <c>Constants.VALUE_TESTNOTTRAIN</c> when its train count is 0;</item>
/// <item>otherwise with <c>Constants.VALUE_LOWFREQUENCY</c> when its train count is below
/// <c>_categoricalValueFrequencyFilter[catNo]</c>.</item>
/// </list>
/// Values that match none of these are written unchanged. The count lookups use the
/// dictionary indexer, so a value absent from <c>_testCounts</c>/<c>_trainCounts</c> throws.
/// </remarks>
/// <param name="srcPath">Path of the binary raw-record file to read.</param>
/// <param name="dstPath">Path of the compressed output file; overwritten if it exists.</param>
/// <param name="test">Not read by this method; kept for caller compatibility.</param>
protected static void Process(string srcPath, string dstPath, bool test)
{
    if (File.Exists(dstPath))
    {
        File.Delete(dstPath);
    }

    // Disposal runs innermost-first (writer, deflate stream, file stream), which flushes the
    // compressor's final block and releases the file handle even if a record fails midway.
    using (var fileStream = File.OpenWrite(dstPath))
    using (var deflateStream = new DeflateStream(fileStream, CompressionMode.Compress))
    using (var writer = new BinaryWriter(deflateStream))
    {
        foreach (var rec in RawRecord.EnumerateBinLines(srcPath))
        {
            for (var i = 0; i < RawRecord.CATEGORICAL_COUNT; i++)
            {
                // Count tables are indexed by 1-based category number; the record stores
                // categorical values after the numeric columns.
                var catNo = i + 1;
                var idx = RawRecord.NUMERIC_COUNT + i;
                var val = rec.Values[idx];
                var testCount = _testCounts[catNo][val];
                var trainCount = _trainCounts[catNo][val];

                if (testCount == 0)
                {
                    rec.Values[idx] = Constants.VALUE_TRAINNOTTEST;
                    continue;
                }

                if (trainCount == 0)
                {
                    rec.Values[idx] = Constants.VALUE_TESTNOTTRAIN;
                    continue;
                }

                var threshHold = _categoricalValueFrequencyFilter[catNo];
                if (trainCount < threshHold)
                {
                    rec.Values[idx] = Constants.VALUE_LOWFREQUENCY;
                }
            }

            rec.WriteBinary(writer);
        }

        writer.Flush();
    }
}
/// <summary>
/// Scans the raw-record file at <paramref name="path"/>, passes every categorical value to
/// <c>IncFeature</c> together with the train and test count tables, and then prints a
/// per-category summary of the table selected by <paramref name="test"/>.
/// </summary>
/// <remarks>
/// The static count tables are allocated on first use with <c>CATEGORICAL_COUNT + 1</c>
/// slots; this method only touches slots 1..CATEGORICAL_COUNT, though the final total
/// sums over every slot.
/// </remarks>
/// <param name="path">Path of the binary raw-record file to scan.</param>
/// <param name="test">Forwarded to <c>IncFeature</c>; also selects which table is summarised.</param>
/// <param name="checkValues">Forwarded to <c>IncFeature</c>.</param>
public static void CountFeatures(string path, bool test, bool checkValues = true)
{
    Console.WriteLine("Counting features.." + (test ? "test" : "train"));

    // First call: create one dictionary per slot in both tables.
    if (_trainCounts == null)
    {
        var slotCount = RawRecord.CATEGORICAL_COUNT + 1;
        _trainCounts = new Dictionary<int, int>[slotCount];
        _testCounts = new Dictionary<int, int>[slotCount];
        for (var slot = 0; slot < slotCount; slot++)
        {
            _trainCounts[slot] = new Dictionary<int, int>();
            _testCounts[slot] = new Dictionary<int, int>();
        }
    }

    foreach (var record in RawRecord.EnumerateBinLines(path))
    {
        for (var catNo = 1; catNo <= RawRecord.CATEGORICAL_COUNT; catNo++)
        {
            // Categorical values are stored after the numeric columns.
            var value = record.Values[RawRecord.NUMERIC_COUNT + (catNo - 1)];
            IncFeature(_trainCounts[catNo], _testCounts[catNo], value, test, checkValues: checkValues);
        }
    }

    var selected = test ? _testCounts : _trainCounts;

    for (var catNo = 1; catNo <= RawRecord.CATEGORICAL_COUNT; catNo++)
    {
        Console.WriteLine("CAT : " + catNo + " : " + selected[catNo].Count);
        Console.WriteLine(" MISSING : " + GetCount(selected, catNo, Constants.VALUE_MISSING));
        Console.WriteLine(" TESTNOTTRAIN : " + GetCount(selected, catNo, Constants.VALUE_TESTNOTTRAIN));
        Console.WriteLine(" TOOLOWCOUNT : " + GetCount(selected, catNo, Constants.VALUE_LOWFREQUENCY));
        Console.WriteLine(" TRAINNOTTEST : " + GetCount(selected, catNo, Constants.VALUE_TRAINNOTTEST));
    }

    var distinctTotal = selected.Sum(table => table.Count);
    Console.WriteLine("Total : " + " : " + distinctTotal);
}
/// <summary>
/// Computes the mean of each of the first <c>NUMERIC_COUNT</c> value columns over all
/// records in the given files, and prints how many records had a non-zero label.
/// </summary>
/// <remarks>
/// Only values strictly greater than zero contribute to a column's total and count; all
/// other values are skipped. A column with no contributing values yields NaN.
/// </remarks>
/// <param name="srcPaths">Binary raw-record files to scan.</param>
/// <returns>An array of length <c>NUMERIC_COUNT</c> holding one mean per column.</returns>
public static float[] GetMeans(params string[] srcPaths)
{
    var means = new float[NUMERIC_COUNT];
    var totals = new double[NUMERIC_COUNT];
    var counts = new int[NUMERIC_COUNT];
    var label1count = 0;
    var recordCount = 0;

    foreach (var srcPath in srcPaths)
    {
        foreach (var src in RawRecord.EnumerateBinLines(srcPath))
        {
            if (src.Label != 0)
            {
                label1count++;
            }

            recordCount++;

            for (var i = 0; i < NUMERIC_COUNT; i++)
            {
                var val = src.Values[i];
                if (val > 0)
                {
                    totals[i] += val;
                    counts[i] += 1;
                }
            }
        }
    }

    Console.WriteLine("1 labels : " + label1count + "//" + recordCount);

    for (var i = 0; i < NUMERIC_COUNT; i++)
    {
        // Divide in double and narrow once: casting the total and the count to float
        // first would round each to 24 bits of precision (counts above ~16.7M are no
        // longer exact) before the division even happens.
        means[i] = (float)(totals[i] / counts[i]);
    }

    return means;
}