/// <summary>
/// Computes, per numeric column, the square root of the mean squared deviation
/// of the column's strictly positive values from the supplied mean, streaming
/// every record of each given binary file.
/// </summary>
/// <param name="means">Per-column means, indexed like the numeric columns.</param>
/// <param name="srcPaths">Paths handed one by one to <c>RawRecord.EnumerateBinLines</c>.</param>
/// <returns>
/// One value per numeric column. A column that never had a positive value
/// yields NaN, because its sum and its count are both zero.
/// </returns>
public static float[] GetStdDevs(float[] means, params string[] srcPaths)
{
    var sumOfSquares = new double[NUMERIC_COUNT];
    var sampleCounts = new int[NUMERIC_COUNT];

    foreach (var path in srcPaths)
    {
        foreach (var record in RawRecord.EnumerateBinLines(path))
        {
            for (var col = 0; col < NUMERIC_COUNT; col++)
            {
                var value = (double)record.Values[col];

                // Only strictly positive values take part in the statistic.
                if (!(value > 0))
                {
                    continue;
                }

                var deviation = value - (double)means[col];
                sumOfSquares[col] += deviation * deviation;
                sampleCounts[col] += 1;
            }
        }
    }

    var stdDevs = new float[NUMERIC_COUNT];
    for (var col = 0; col < NUMERIC_COUNT; col++)
    {
        stdDevs[col] = (float)Math.Sqrt(sumOfSquares[col] / (double)sampleCounts[col]);
    }

    return stdDevs;
}
/// <summary>
/// Streams every raw record from <paramref name="rawSrcPath"/>, converts it with
/// <c>ConvertRecord</c> and writes the converted record in binary form to
/// <paramref name="oneHotDstPath"/>. Resets the static hash-index and
/// categorical-index dictionaries before converting.
/// </summary>
/// <param name="rawSrcPath">Binary raw-record file read through <c>RawRecord.EnumerateBinLines</c>.</param>
/// <param name="oneHotDstPath">Destination file; an existing file is deleted first.</param>
/// <param name="isTestSet">Selects the "test"/"train" console label and is forwarded to <c>ConvertRecord</c>.</param>
/// <param name="encodeMissingValues">Forwarded to <c>ConvertRecord</c>.</param>
/// <param name="encodeTestNotrainAs">Forwarded to <c>ConvertRecord</c>.</param>
/// <param name="logTransformNumericValues">Forwarded to <c>ConvertRecord</c>.</param>
protected static void ConvertRawToOneHot(string rawSrcPath, string oneHotDstPath, bool isTestSet, bool encodeMissingValues = true, int encodeTestNotrainAs = Constants.VALUE_MISSING, bool logTransformNumericValues = false)
{
    _hashIndices = new Dictionary<string, int>();
    _categoricalIdices = new Dictionary<int, Dictionary<int, int>>();

    if (File.Exists(oneHotDstPath))
    {
        File.Delete(oneHotDstPath);
    }

    // 'using' guarantees the output file is flushed and closed even when a
    // record fails to convert or write part-way through.
    using (var stream = File.OpenWrite(oneHotDstPath))
    using (var writer = new BinaryWriter(stream))
    {
        var label = isTestSet ? "test" : "train";
        Console.WriteLine("Converting " + label + " records");

        var recNo = 0;
        foreach (var raw in RawRecord.EnumerateBinLines(rawSrcPath))
        {
            // Pass the encoding options through; previously they were accepted
            // by this method but silently dropped, so ConvertRecord always ran
            // with its own defaults.
            var click = ConvertRecord(raw, recNo, isTestSet, encodeMissingValues, encodeTestNotrainAs, logTransformNumericValues);
            click.WriteBinary(writer);
            recNo++;
        }

        writer.Flush();
    }
}
/// <summary>
/// Lazily reads <see cref="RawRecord"/> instances from a deflate-compressed
/// binary file until the reader signals end of stream.
/// </summary>
/// <param name="path">Path of the compressed binary file.</param>
/// <returns>
/// A lazy sequence of records. The file is opened when enumeration starts and
/// is released when the enumerator is disposed, including when the consumer
/// stops early or an exception propagates.
/// </returns>
public static IEnumerable<RawRecord> EnumerateBinLines(string path)
{
    // The using blocks become the iterator's finally handlers, so the handles
    // are released on early exit as well as on normal completion.
    using (var fileStream = File.OpenRead(path))
    using (var deflateStream = new DeflateStream(fileStream, CompressionMode.Decompress))
    using (var reader = new BinaryReader(deflateStream))
    {
        var lineNo = 0;
        while (true)
        {
            lineNo++;

            RawRecord rec;
            try
            {
                rec = new RawRecord(reader);
            }
            catch (EndOfStreamException)
            {
                // Running out of data is the normal termination condition.
                break;
            }

            // 'yield return' must stay outside the try/catch above.
            yield return rec;

            if (lineNo % 1000000 == 0)
            {
                Console.WriteLine("Line : " + lineNo);
            }
        }
    }
}
/// <summary>
/// Reads every <see cref="RawRecord"/> from a deflate-compressed binary file
/// into memory.
/// </summary>
/// <param name="path">Path of the compressed binary file.</param>
/// <returns>All records in file order.</returns>
public static List<RawRecord> LoadBin(string path)
{
    var res = new List<RawRecord>();

    // Dispose the whole stream chain even if record construction throws
    // something other than EndOfStreamException.
    using (var fileStream = File.OpenRead(path))
    using (var deflateStream = new DeflateStream(fileStream, CompressionMode.Decompress))
    using (var reader = new BinaryReader(deflateStream))
    {
        var lineNo = 0;
        while (true)
        {
            lineNo++;

            RawRecord rec;
            try
            {
                rec = new RawRecord(reader);
            }
            catch (EndOfStreamException)
            {
                // Running out of data is the normal termination condition.
                break;
            }

            res.Add(rec);

            if (lineNo % 1000000 == 0)
            {
                Console.WriteLine("Line : " + lineNo);
            }
        }
    }

    return res;
}
/// <summary>
/// Lazily parses a CSV file into <see cref="RawRecord"/> instances, skipping
/// the first line of the file.
/// </summary>
/// <param name="path">Path of the CSV file.</param>
/// <returns>One record per line after the first, in file order.</returns>
public static IEnumerable<RawRecord> EnumerateCSVFile(string path)
{
    var currentLine = 0;
    foreach (var text in File.ReadLines(path))
    {
        currentLine++;

        // Line 1 is never turned into a record.
        if (currentLine != 1)
        {
            var parsed = new RawRecord(text);

            // Progress message every million lines, emitted before the record is handed out.
            if (currentLine % 1000000 == 0)
            {
                Console.WriteLine("Line : " + currentLine);
            }

            yield return parsed;
        }
    }
}
/// <summary>
/// Copies the records of <paramref name="srcPath"/> to a deflate-compressed
/// binary file at <paramref name="dstPath"/>, replacing categorical values with
/// marker constants based on the train/test counts gathered earlier:
/// values with a zero test count become <c>VALUE_TRAINNOTTEST</c>, values with a
/// zero train count become <c>VALUE_TESTNOTTRAIN</c>, and values whose train
/// count is below the per-category threshold become <c>VALUE_LOWFREQUENCY</c>.
/// </summary>
/// <param name="srcPath">Binary source file read through <c>RawRecord.EnumerateBinLines</c>.</param>
/// <param name="dstPath">Destination file; an existing file is deleted first.</param>
/// <param name="test">Not used by this method.</param>
protected static void Process(string srcPath, string dstPath, bool test)
{
    if (File.Exists(dstPath))
    {
        File.Delete(dstPath);
    }

    // 'using' finalizes the deflate stream and closes the file even when a
    // lookup or write throws part-way through.
    using (var fileStream = File.OpenWrite(dstPath))
    using (var deflateStream = new DeflateStream(fileStream, CompressionMode.Compress))
    using (var writer = new BinaryWriter(deflateStream))
    {
        foreach (var rec in RawRecord.EnumerateBinLines(srcPath))
        {
            for (var i = 0; i < RawRecord.CATEGORICAL_COUNT; i++)
            {
                // Count tables are indexed by 1-based category number; the
                // categorical values follow the numeric ones in rec.Values.
                var catNo = i + 1;
                var idx = RawRecord.NUMERIC_COUNT + i;
                var val = rec.Values[idx];

                // NOTE(review): both lookups assume the value was registered
                // in both tables beforehand - confirm against IncFeature.
                var testCount = _testCounts[catNo][val];
                var trainCount = _trainCounts[catNo][val];

                if (testCount == 0)
                {
                    rec.Values[idx] = Constants.VALUE_TRAINNOTTEST;
                    continue;
                }

                if (trainCount == 0)
                {
                    rec.Values[idx] = Constants.VALUE_TESTNOTTRAIN;
                    continue;
                }

                var threshHold = _categoricalValueFrequencyFilter[catNo];
                if (trainCount < threshHold)
                {
                    rec.Values[idx] = Constants.VALUE_LOWFREQUENCY;
                }
            }

            rec.WriteBinary(writer);
        }

        writer.Flush();
    }
}
/// <summary>
/// Streams the records of <paramref name="path"/> and registers every
/// categorical value through <c>IncFeature</c>, then prints per-category
/// statistics for the table selected by <paramref name="test"/>.
/// The count tables are created on first use and kept across calls.
/// </summary>
/// <param name="path">Binary file read through <c>RawRecord.EnumerateBinLines</c>.</param>
/// <param name="test">True to report on the test table, false for the train table; also forwarded to <c>IncFeature</c>.</param>
/// <param name="checkValues">Forwarded to <c>IncFeature</c>.</param>
public static void CountFeatures(string path, bool test, bool checkValues = true)
{
    Console.WriteLine("Counting features.." + ((test) ? "test" : "train"));

    // Lazily allocate one dictionary per category; slot 0 exists but is never
    // filled here because categories are numbered from 1.
    if (_trainCounts == null)
    {
        var slotCount = RawRecord.CATEGORICAL_COUNT + 1;
        _trainCounts = new Dictionary<int, int>[slotCount];
        _testCounts = new Dictionary<int, int>[slotCount];
        for (var slot = 0; slot < slotCount; slot++)
        {
            _trainCounts[slot] = new Dictionary<int, int>();
            _testCounts[slot] = new Dictionary<int, int>();
        }
    }

    foreach (var record in RawRecord.EnumerateBinLines(path))
    {
        for (var categoryNo = 1; categoryNo <= RawRecord.CATEGORICAL_COUNT; categoryNo++)
        {
            // Categorical values are stored after the numeric ones.
            var categoryValue = record.Values[RawRecord.NUMERIC_COUNT + (categoryNo - 1)];
            IncFeature(_trainCounts[categoryNo], _testCounts[categoryNo], categoryValue, test, checkValues: checkValues);
        }
    }

    var counts = test ? _testCounts : _trainCounts;

    for (var categoryNo = 1; categoryNo <= RawRecord.CATEGORICAL_COUNT; categoryNo++)
    {
        Console.WriteLine("CAT : " + categoryNo + " : " + counts[categoryNo].Count);
        Console.WriteLine(" MISSING : " + GetCount(counts, categoryNo, Constants.VALUE_MISSING));
        Console.WriteLine(" TESTNOTTRAIN : " + GetCount(counts, categoryNo, Constants.VALUE_TESTNOTTRAIN));
        Console.WriteLine(" TOOLOWCOUNT : " + GetCount(counts, categoryNo, Constants.VALUE_LOWFREQUENCY));
        Console.WriteLine(" TRAINNOTTEST : " + GetCount(counts, categoryNo, Constants.VALUE_TRAINNOTTEST));
    }

    Console.WriteLine("Total : " + " : " + counts.Sum(x => x.Count));
}
/// <summary>
/// Lazily converts the lines of a CSV file into <see cref="RawRecord"/>
/// instances. The first line is skipped.
/// </summary>
/// <param name="path">Path of the CSV file.</param>
/// <returns>One record for every line except the first, in file order.</returns>
public static IEnumerable<RawRecord> EnumerateCSVFile(string path)
{
    using (var lines = File.ReadLines(path).GetEnumerator())
    {
        // lineNo is the 1-based number of the line currently being examined.
        for (var lineNo = 1; lines.MoveNext(); lineNo++)
        {
            if (lineNo == 1)
            {
                continue;
            }

            var record = new RawRecord(lines.Current);

            var atProgressMark = lineNo % 1000000 == 0;
            if (atProgressMark)
            {
                Console.WriteLine("Line : " + lineNo);
            }

            yield return record;
        }
    }
}
/// <summary>
/// Converts a CSV file into a deflate-compressed binary file by writing every
/// record produced by <c>RawRecord.EnumerateCSVFile</c> in binary form.
/// </summary>
/// <param name="csvPath">Source CSV file.</param>
/// <param name="binaryPath">Destination file; an existing file is deleted first.</param>
public static void ConvertCSVToBinary(string csvPath, string binaryPath)
{
    Console.WriteLine("Converting CSV to binary");

    if (File.Exists(binaryPath))
    {
        File.Delete(binaryPath);
    }

    // 'using' finalizes the deflate stream and closes the file even when
    // parsing or writing a record throws.
    using (var fileStream = File.OpenWrite(binaryPath))
    using (var deflateStream = new DeflateStream(fileStream, CompressionMode.Compress))
    using (var writer = new BinaryWriter(deflateStream))
    {
        foreach (var rawRecord in RawRecord.EnumerateCSVFile(csvPath))
        {
            rawRecord.WriteBinary(writer);
        }

        writer.Flush();
    }
}
/// <summary>
/// Computes the mean of the strictly positive values of every numeric column
/// across all records of the given binary files, and prints how many records
/// had a non-zero label out of the total record count.
/// </summary>
/// <param name="srcPaths">Paths handed one by one to <c>RawRecord.EnumerateBinLines</c>.</param>
/// <returns>
/// One mean per numeric column. A column that never had a positive value
/// yields NaN, because its total and its count are both zero.
/// </returns>
public static float[] GetMeans(params string[] srcPaths)
{
    var totals = new double[NUMERIC_COUNT];
    var counts = new int[NUMERIC_COUNT];
    var label1count = 0;
    var recordCount = 0;

    foreach (var srcPath in srcPaths)
    {
        foreach (var src in RawRecord.EnumerateBinLines(srcPath))
        {
            if (src.Label != 0)
            {
                label1count++;
            }
            recordCount++;

            for (var i = 0; i < NUMERIC_COUNT; i++)
            {
                var val = src.Values[i];

                // Only strictly positive values contribute to the mean.
                if (val > 0)
                {
                    totals[i] += val;
                    counts[i] += 1;
                }
            }
        }
    }

    Console.WriteLine("1 labels : " + label1count + "//" + recordCount);

    var means = new float[NUMERIC_COUNT];
    for (var i = 0; i < NUMERIC_COUNT; i++)
    {
        // Divide in double precision and narrow once, so a large total is not
        // rounded to float before the division.
        means[i] = (float)(totals[i] / counts[i]);
    }

    return means;
}
/// <summary>
/// Parses a CSV file into a list of <see cref="RawRecord"/> instances,
/// skipping the first line of the file.
/// </summary>
/// <param name="path">Path of the CSV file.</param>
/// <returns>One record per line after the first, in file order.</returns>
public static List<RawRecord> LoadCSV(string path)
{
    var records = new List<RawRecord>();
    var currentLine = 0;

    foreach (var text in File.ReadLines(path))
    {
        currentLine++;

        // Line 1 is never turned into a record.
        if (currentLine != 1)
        {
            var parsed = new RawRecord(text);

            // Progress message every million lines.
            if (currentLine % 1000000 == 0)
            {
                Console.WriteLine("Line : " + currentLine);
            }

            records.Add(parsed);
        }
    }

    return records;
}
/// <summary>
/// Builds a <see cref="OneHotRecord"/> from a raw record: copies label and id,
/// transfers the numeric columns (flagging <c>Int32.MinValue</c> as N/A), and
/// encodes each categorical column either through the categorical index table
/// or, when the column has no table entry, through a signed hash.
/// </summary>
/// <param name="raw">Source record.</param>
/// <param name="recordIndex">Not used by this method.</param>
/// <param name="isTestSet">Not used by this method.</param>
/// <param name="encodeMissingValues">When false, categorical values equal to <c>Constants.VALUE_MISSING</c> are skipped.</param>
/// <param name="encodeTestNotrainAs">Replacement for categorical values equal to <c>Constants.VALUE_TESTNOTTRAIN</c>.</param>
/// <param name="logTransformNumericValues">When true, non-zero numeric values are offset and replaced by <c>(int)(ln(value) * 100)</c>.</param>
/// <returns>The converted record.</returns>
public static OneHotRecord ConvertRecord(RawRecord raw, int recordIndex, bool isTestSet, bool encodeMissingValues = true, int encodeTestNotrainAs = Constants.VALUE_MISSING, bool logTransformNumericValues = false)
{
    var oneHot = new OneHotRecord();
    oneHot.Label = raw.Label;
    oneHot.Id = raw.Id;

    // ---- numeric columns ----
    for (short numIdx = 0; numIdx < RawRecord.NUMERIC_COUNT; numIdx++)
    {
        var numeric = raw.Values[numIdx];

        if (numeric == Int32.MinValue)
        {
            // Register N/A
            oneHot.NumericData[numIdx] = float.NaN;
            oneHot.SetNA(numIdx);
            continue;
        }

        if (logTransformNumericValues && numeric != 0)
        {
            numeric += 2;

            // The second column (1-based) receives an extra offset.
            var columnNo = numIdx + 1;
            if (columnNo == 2)
            {
                numeric += 2;
            }

            numeric = (int)(Math.Log(numeric) * 100d);
        }

        oneHot.NumericData[numIdx] = numeric;
    }

    // ---- categorical columns ----
    bool isNew = false;
    for (short catIdx = 0; catIdx < RawRecord.CATEGORICAL_COUNT; catIdx++)
    {
        var categoryValue = raw.Values[RawRecord.NUMERIC_COUNT + catIdx];

        // Recode testnottrain
        if (categoryValue == Constants.VALUE_TESTNOTTRAIN)
        {
            categoryValue = encodeTestNotrainAs;
        }

        // Skip missing values when they are not to be encoded
        if (!encodeMissingValues && (categoryValue == Constants.VALUE_MISSING))
        {
            continue;
        }

        if (!_categoricalIdices.ContainsKey(catIdx + 1))
        {
            // Hashing trick: the hash's sign becomes the stored value and its
            // magnitude selects the slot.
            var hash = GetMurmurHash(catIdx, categoryValue);
            sbyte sign = 1;
            if (hash < 0)
            {
                sign = -1;
                hash = -hash;
            }

            var slot = hash % Constants.HASH_SPACE_SIZE;
            oneHot.StoreHashedValue(slot, sign);
            continue;
        }

        var encoded = GetCategorical(catIdx + 1, categoryValue, out isNew);
        oneHot.SetCategorical(catIdx + 1, encoded);
    }

    return oneHot;
}
/// <summary>
/// Reads a CSV file into memory as <see cref="RawRecord"/> instances.
/// The first line is skipped.
/// </summary>
/// <param name="path">Path of the CSV file.</param>
/// <returns>One record for every line except the first, in file order.</returns>
public static List<RawRecord> LoadCSV(string path)
{
    var loaded = new List<RawRecord>();

    using (var lines = File.ReadLines(path).GetEnumerator())
    {
        // lineNo is the 1-based number of the line currently being examined.
        for (var lineNo = 1; lines.MoveNext(); lineNo++)
        {
            if (lineNo == 1)
            {
                continue;
            }

            var record = new RawRecord(lines.Current);

            var atProgressMark = lineNo % 1000000 == 0;
            if (atProgressMark)
            {
                Console.WriteLine("Line : " + lineNo);
            }

            loaded.Add(record);
        }
    }

    return loaded;
}
/// <summary>
/// Lazily yields the <see cref="RawRecord"/> instances stored in a
/// deflate-compressed binary file, stopping when the reader runs out of data.
/// </summary>
/// <param name="path">Path of the compressed binary file.</param>
/// <returns>
/// A lazy sequence of records. The underlying streams are disposed when the
/// enumerator is disposed, whether enumeration completed, was abandoned early,
/// or failed with an exception.
/// </returns>
public static IEnumerable<RawRecord> EnumerateBinLines(string path)
{
    // Wrapping the streams in 'using' ties their lifetime to the enumerator,
    // so an early 'break' in the caller's foreach no longer leaks the file handle.
    using (var fileStream = File.OpenRead(path))
    using (var deflateStream = new DeflateStream(fileStream, CompressionMode.Decompress))
    using (var reader = new BinaryReader(deflateStream))
    {
        for (var lineNo = 1; ; lineNo++)
        {
            RawRecord rec;
            try
            {
                rec = new RawRecord(reader);
            }
            catch (EndOfStreamException)
            {
                // End of data: finish the sequence.
                break;
            }

            // Kept outside the try/catch, where 'yield return' is permitted.
            yield return rec;

            if (lineNo % 1000000 == 0)
            {
                Console.WriteLine("Line : " + lineNo);
            }
        }
    }
}
/// <summary>
/// Loads all <see cref="RawRecord"/> instances stored in a deflate-compressed
/// binary file, reading until the reader runs out of data.
/// </summary>
/// <param name="path">Path of the compressed binary file.</param>
/// <returns>All records in file order.</returns>
public static List<RawRecord> LoadBin(string path)
{
    var res = new List<RawRecord>();

    // The stream chain is disposed on every exit path, not only after a
    // clean end-of-stream.
    using (var fileStream = File.OpenRead(path))
    using (var deflateStream = new DeflateStream(fileStream, CompressionMode.Decompress))
    using (var reader = new BinaryReader(deflateStream))
    {
        for (var lineNo = 1; ; lineNo++)
        {
            RawRecord rec;
            try
            {
                rec = new RawRecord(reader);
            }
            catch (EndOfStreamException)
            {
                // End of data: stop reading.
                break;
            }

            res.Add(rec);

            if (lineNo % 1000000 == 0)
            {
                Console.WriteLine("Line : " + lineNo);
            }
        }
    }

    return res;
}
/// <summary>
/// Converts a raw record into a <see cref="OneHotRecord"/>. Label and id are
/// copied; numeric columns are transferred with <c>Int32.MinValue</c> recorded
/// as N/A; categorical columns are encoded through the categorical index table
/// when the column has an entry there and through a signed hash otherwise.
/// </summary>
/// <param name="raw">Source record.</param>
/// <param name="recordIndex">Not used by this method.</param>
/// <param name="isTestSet">Not used by this method.</param>
/// <param name="encodeMissingValues">When false, categorical values equal to <c>Constants.VALUE_MISSING</c> are skipped.</param>
/// <param name="encodeTestNotrainAs">Replacement for categorical values equal to <c>Constants.VALUE_TESTNOTTRAIN</c>.</param>
/// <param name="logTransformNumericValues">When true, non-zero numeric values are offset and replaced by <c>(int)(ln(value) * 100)</c>.</param>
/// <returns>The converted record.</returns>
public static OneHotRecord ConvertRecord(RawRecord raw, int recordIndex, bool isTestSet, bool encodeMissingValues = true, int encodeTestNotrainAs = Constants.VALUE_MISSING, bool logTransformNumericValues = false)
{
    var converted = new OneHotRecord();
    converted.Label = raw.Label;
    converted.Id = raw.Id;

    // Numeric part.
    for (short col = 0; col < RawRecord.NUMERIC_COUNT; col++)
    {
        var number = raw.Values[col];
        var isMissing = number == Int32.MinValue;

        if (!isMissing)
        {
            var shouldTransform = logTransformNumericValues && number != 0;
            if (shouldTransform)
            {
                number += 2;

                // Column number 2 (1-based) is shifted by a further 2.
                if (col + 1 == 2)
                {
                    number += 2;
                }

                number = (int)(Math.Log(number) * 100d);
            }

            converted.NumericData[col] = number;
        }
        else
        {
            // Register N/A
            converted.NumericData[col] = float.NaN;
            converted.SetNA(col);
        }
    }

    // Categorical part.
    bool isNew = false;
    for (short catNo = 0; catNo < RawRecord.CATEGORICAL_COUNT; catNo++)
    {
        var code = raw.Values[RawRecord.NUMERIC_COUNT + catNo];

        // Recode testnottrain
        if (code == Constants.VALUE_TESTNOTTRAIN)
        {
            code = encodeTestNotrainAs;
        }

        // Skip missing values ?
        var skipAsMissing = (code == Constants.VALUE_MISSING) && (!encodeMissingValues);
        if (skipAsMissing)
        {
            continue;
        }

        var oneBasedCategory = catNo + 1;
        if (_categoricalIdices.ContainsKey(oneBasedCategory))
        {
            var mapped = GetCategorical(oneBasedCategory, code, out isNew);
            converted.SetCategorical(oneBasedCategory, mapped);
        }
        else
        {
            // Hashing trick: store +1 or -1 depending on the hash's sign, at
            // the slot chosen by the hash's magnitude.
            var hash = GetMurmurHash(catNo, code);
            sbyte signedUnit = 1;
            if (hash < 0)
            {
                signedUnit = -1;
                hash = -hash;
            }

            converted.StoreHashedValue(hash % Constants.HASH_SPACE_SIZE, signedUnit);
        }
    }

    return converted;
}