/// <summary>
/// Loads the train file and then the test file through their respective loaders,
/// and finishes by calling <c>ModifyData</c>.
/// </summary>
/// <exception cref="FileNotFoundException">
/// Thrown when <c>TrainPath</c> or <c>TestPath</c> does not refer to an existing file.
/// </exception>
public override void LoadData()
{
    // Both loaders are created the same way: bound to the target column when one is configured.
    if (TargetName != null)
    {
        _trainLoader = new DataLoader<FType>(TargetName);
        _testLoader = new DataLoader<FType>(TargetName);
    }
    else
    {
        _trainLoader = new DataLoader<FType>();
        _testLoader = new DataLoader<FType>();
    }

    // Both input files must exist before anything is read.
    if (!File.Exists(TrainPath))
    {
        Logger.Log(string.Format("train file {0} not found", TrainPath));
        throw new FileNotFoundException("", TrainPath);
    }

    if (!File.Exists(TestPath))
    {
        Logger.Log(string.Format("test file {0} not found", TestPath));
        throw new FileNotFoundException("", TestPath);
    }

    // --- train file ---
    _trainLoader.IsLoadForLearning = true;
    if (IdName != null)
    {
        _trainLoader.AddIdsString(IdName);
    }

    _trainLoader.Load(TrainPath);

    // Report the target probabilities gathered by the train loader.
    foreach (var targetValue in _trainLoader.TargetProb.Keys)
    {
        var probability = _trainLoader.TargetProb[targetValue];
        Logger.Log(string.Format("prob[{0}] = {1}", targetValue.ToString("F0"), probability.ToString("F06")));
    }

    int outliersToDrop = (int)(_trainLoader.TotalDataLines * OutliersPrct);
    Logger.Log("Outliers to drop: " + outliersToDrop);

    // --- test file ---
    // The test loader reuses the id columns and the skipped columns of the train loader.
    foreach (var idColumn in _trainLoader.Ids.Keys)
    {
        _testLoader.AddIdColumn(idColumn);
    }

    foreach (var skippedColumn in _trainLoader.SkippedColumns.Keys)
    {
        _testLoader.AddSkipColumn(skippedColumn);
    }

    _testLoader.Load(TestPath);

    ModifyData();
}
/// <summary>
/// Loads the train file, creates the training problem, loads the test file with the same
/// id/skipped columns, and groups the test rows by id into <c>_testDataDict</c> and
/// <c>_resultDict</c>.
/// </summary>
/// <exception cref="FileNotFoundException">
/// Thrown when the train file or the test file does not exist.
/// </exception>
public override void LoadData()
{
    // NOTE(review): all four inputs are hard-coded to empty strings. File.Exists returns false
    // for an empty path, so as written the train-file check below always throws. Presumably
    // these are placeholders to be wired to real configuration - confirm.
    string trainPath = "";
    string testPath = "";
    string ids = "";
    string target = "";

    _trainPath = trainPath;
    _testPath = testPath;
    _target = target;

    _trainLoader = _target != null ? new DataLoader<FType>(_target) : new DataLoader<FType>();
    _testLoader = _target != null ? new DataLoader<FType>(_target) : new DataLoader<FType>();

    if (!File.Exists(_trainPath))
    {
        Logger.Log("train file " + _trainPath + " not found");
        throw new FileNotFoundException("", _trainPath);
    }

    // Fixed: this check previously tested _trainPath a second time, so a missing test file
    // was never reported here.
    if (!File.Exists(_testPath))
    {
        Logger.Log("test file " + _testPath + " not found");
        throw new FileNotFoundException("", _testPath);
    }

    // loading train file
    _trainLoader.AddIdsString(ids);
    _trainLoader.ProceedRowFunc = ProceedRow;
    _trainLoader.Load(_trainPath);

    _trainProblem = _problemCreator.CreateProblem();

    // loading test file: reuse the id columns and the skipped columns of the train loader
    foreach (var id in _trainLoader.Ids.Keys)
    {
        _testLoader.AddIdColumn(id);
    }

    foreach (var col in _trainLoader.SkippedColumns.Keys)
    {
        _testLoader.AddSkipColumn(col);
    }

    _testLoader.Load(_testPath);

    _testDataDict = new Dictionary<string, List<double[]>>(); // test data: id -> list of rows for that id
    _resultDict = new Dictionary<string, int>();              // test data result: id -> target

    // modify the test data
    foreach (var row in _testLoader.Rows)
    {
        // save the result; only the first target seen for an id is kept
        if (!_resultDict.ContainsKey(row.Id))
        {
            _resultDict.Add(row.Id, Convert.ToInt32(row.Target));
        }

        // save the response from the bureau: copy the first NVars values of the row
        var width = _testLoader.NVars;
        var txy = new double[width];
        for (int k = 0; k < width; k++)
        {
            txy[k] = row.Values[k];
        }

        // Single lookup instead of ContainsKey followed by the indexer.
        List<double[]> rowsForId;
        if (!_testDataDict.TryGetValue(row.Id, out rowsForId))
        {
            rowsForId = new List<double[]>();
            _testDataDict.Add(row.Id, rowsForId);
        }

        rowsForId.Add(txy);
    }
}
/// <summary>
/// Reads data from the train and test files and then calls <c>ModifyData</c>.
/// </summary>
/// <remarks>
/// The test loader is configured with the same id columns and skipped columns as the
/// train loader before the test file is read.
/// </remarks>
/// <exception cref="FileNotFoundException">
/// Thrown when <c>TrainPath</c> or <c>TestPath</c> does not refer to an existing file.
/// </exception>
public override void LoadData()
{
    _trainLoader = TargetName != null ? new DataLoader<FType>(TargetName) : new DataLoader<FType>();
    _testLoader = TargetName != null ? new DataLoader<FType>(TargetName) : new DataLoader<FType>();

    if (!File.Exists(TrainPath))
    {
        Logger.Log("train file " + TrainPath + " not found");
        throw new FileNotFoundException("", TrainPath);
    }

    if (!File.Exists(TestPath))
    {
        Logger.Log("test file " + TestPath + " not found");
        throw new FileNotFoundException("", TestPath);
    }

    // loading train file
    _trainLoader.IsLoadForLearning = true;

    // Only register id columns when an id name is configured, so a null IdName is never
    // handed to the loader.
    if (IdName != null)
    {
        _trainLoader.AddIdsString(IdName);
    }

    _trainLoader.Load(TrainPath);

    // Log the target probabilities collected by the train loader.
    foreach (var key in _trainLoader.TargetProb.Keys)
    {
        Logger.Log("prob[" + key.ToString("F0") + "] = " + _trainLoader.TargetProb[key].ToString("F06"));
    }

    Logger.Log("Outliers to drop: " + (int)(_trainLoader.TotalDataLines * OutliersPrct));

    // configuring the test loader
    foreach (var id in _trainLoader.Ids.Keys) // the same id's
    {
        _testLoader.AddIdColumn(id);
    }

    foreach (var col in _trainLoader.SkippedColumns.Keys) // the same columns
    {
        _testLoader.AddSkipColumn(col);
    }

    // loading test file
    _testLoader.Load(TestPath);

    ModifyData();
}
/// <summary>
/// Entry point. Loads the factor configuration and the data file, collects target statistics
/// for every combination produced by <c>CombinationIterator</c> over the visible factors,
/// and writes a new file <c>&lt;dataPath&gt;_cat.csv</c>. Each output row holds the row id,
/// one target probability per combination, and the row target.
/// </summary>
/// <param name="args">
/// Exactly four arguments: data file path, factor config path, id column name, target column name.
/// </param>
/// <remarks>
/// Any exception raised during processing is logged and swallowed; it is not rethrown.
/// </remarks>
static void Main(string[] args)
{
    if (args.Length != 4)
    {
        Logger.Log("usage: program.exe <train.csv> <conf.csv> <id> <target_name>");
        return;
    }

    string dataPath = args[0];
    string confPath = args[1];
    string id = args[2];
    string target = args[3];

    Logger.Log("data: " + dataPath);
    Logger.Log("conf : " + confPath);
    Logger.Log("id : " + id);
    Logger.Log("target : " + target);

    try
    {
        var fmgr = new FactorManager();
        fmgr.Load(confPath, target);
        fmgr.TargDep = 10;
        fmgr.FactorDep = 100;
        fmgr.SelectFactors();
        var cols = fmgr.VisibleFactors.ToArray();

        //_loader.MaxRowsLoaded = 10000;
        _loader.AddTargetColumn(target);
        _loader.AddIdColumn(id);
        _loader.CollectDistrStat = true;
        _loader.Load(dataPath);

        // factor tuple -> (value tuple -> statistics)
        var statDict = new Dictionary<TupleData, Dictionary<TupleData, StatItem>>();

        // collecting stats
        int n = 4; // size passed to CombinationIterator and CreateHeader
        var iter = new CombinationIterator(cols, n);
        while (iter.MoveNext())
        {
            var cval = iter.Current;
            var ftuple = new TupleData(cval);
            var valueStats = new Dictionary<TupleData, StatItem>();
            statDict.Add(ftuple, valueStats);

            foreach (var row in _loader.Rows)
            {
                var vtuple = CreateValueTuple(cval, row);

                StatItem stat;
                if (!valueStats.TryGetValue(vtuple, out stat))
                {
                    stat = new StatItem();
                    valueStats.Add(vtuple, stat);
                }

                // Only rows with a target of at most 1 contribute to the statistics.
                if (row.Target <= 1)
                {
                    stat.Count++;
                    stat.Targets += (int)row.Target;
                }
            }

            foreach (var entry in valueStats.Values)
            {
                // A value tuple seen only on rows with Target > 1 has Count == 0;
                // dividing would yield NaN, so its probability is left unset.
                if (entry.Count > 0)
                {
                    entry.TargetProb = entry.Targets / (double)entry.Count;
                }
            }

            Logger.Log(ftuple + " done;");
        }

        // creating modified file
        using (var sw = new StreamWriter(new FileStream(dataPath + "_cat.csv", FileMode.Create, FileAccess.Write)))
        {
            int idx = 0;
            sw.WriteLine(CreateHeader(cols, n));
            sw.Flush();

            // Fallback probability computed from the loader's target statistics for 1 and 0.
            double defProb = (double)_loader.TargetStat[1] / (_loader.TargetStat[1] + _loader.TargetStat[0]);

            foreach (var row in _loader.Rows)
            {
                idx++;

                var sb = new StringBuilder();
                iter = new CombinationIterator(cols, n);
                sb.Append(row.Id);

                while (iter.MoveNext())
                {
                    var cval = iter.Current;
                    var ftuple = new TupleData(cval);
                    var t = CreateValueTuple(cval, row);

                    // Use the default probability both for unknown value tuples and for
                    // tuples that collected no counted rows (otherwise NaN would be written).
                    StatItem stat;
                    double prob = statDict[ftuple].TryGetValue(t, out stat) && stat.Count > 0
                        ? stat.TargetProb
                        : defProb;

                    sb.Append(";" + prob.ToString("F05"));
                }

                sb.Append(";" + row.Target);
                sw.WriteLine(sb);

                // Periodic progress report and flush.
                if (idx % 12345 == 0)
                {
                    Logger.Log(idx + " lines writed;");
                    sw.Flush();
                }
            }

            Logger.Log(idx + " lines writed; done;");
        }
    }
    catch (Exception e)
    {
        Logger.Log(e);
    }
}
/// <summary>
/// Loads the train file, creates the training problem, loads the test file with the same
/// id/skipped columns, and groups the test rows by id into <c>_testDataDict</c> and
/// <c>_resultDict</c>.
/// </summary>
/// <exception cref="FileNotFoundException">
/// Thrown when the train file or the test file does not exist.
/// </exception>
public override void LoadData()
{
    // NOTE(review): all four inputs are hard-coded to empty strings. File.Exists returns false
    // for an empty path, so as written the train-file check below always throws. Presumably
    // these are placeholders to be wired to real configuration - confirm.
    string trainPath = "";
    string testPath = "";
    string ids = "";
    string target = "";

    _trainPath = trainPath;
    _testPath = testPath;
    _target = target;

    _trainLoader = _target != null ? new DataLoader<FType>(_target) : new DataLoader<FType>();
    _testLoader = _target != null ? new DataLoader<FType>(_target) : new DataLoader<FType>();

    if (!File.Exists(_trainPath))
    {
        Logger.Log("train file " + _trainPath + " not found");
        throw new FileNotFoundException("", _trainPath);
    }

    // Fixed: this check previously tested _trainPath a second time, so a missing test file
    // was never reported here.
    if (!File.Exists(_testPath))
    {
        Logger.Log("test file " + _testPath + " not found");
        throw new FileNotFoundException("", _testPath);
    }

    // loading train file
    _trainLoader.AddIdsString(ids);
    _trainLoader.ProceedRowFunc = ProceedRow;
    _trainLoader.Load(_trainPath);

    _trainProblem = _problemCreator.CreateProblem();

    // loading test file: reuse the id columns and the skipped columns of the train loader
    foreach (var id in _trainLoader.Ids.Keys)
    {
        _testLoader.AddIdColumn(id);
    }

    foreach (var col in _trainLoader.SkippedColumns.Keys)
    {
        _testLoader.AddSkipColumn(col);
    }

    _testLoader.Load(_testPath);

    _testDataDict = new Dictionary<string, List<double[]>>(); // test data: id -> list of rows for that id
    _resultDict = new Dictionary<string, int>();              // test data result: id -> target

    // modify the test data
    foreach (var row in _testLoader.Rows)
    {
        // save the result; only the first target seen for an id is kept
        if (!_resultDict.ContainsKey(row.Id))
        {
            _resultDict.Add(row.Id, Convert.ToInt32(row.Target));
        }

        // save the response from the bureau: copy the first NVars coefficients of the row
        var width = _testLoader.NVars;
        var txy = new double[width];
        for (int k = 0; k < width; k++)
        {
            txy[k] = row.Coeffs[k];
        }

        // Single lookup instead of ContainsKey followed by the indexer.
        List<double[]> rowsForId;
        if (!_testDataDict.TryGetValue(row.Id, out rowsForId))
        {
            rowsForId = new List<double[]>();
            _testDataDict.Add(row.Id, rowsForId);
        }

        rowsForId.Add(txy);
    }
}