/// <summary>
/// Registers a column name that should be excluded when rows are loaded.
/// The name is lower-cased before it is stored in <c>SkippedColumns</c>.
/// </summary>
/// <param name="col">Column name; null, empty or whitespace-only values are ignored.</param>
public void AddSkipColumn(string col)
{
    // Guard first: the original lower-cased before checking and so threw on null.
    if (string.IsNullOrWhiteSpace(col))
    {
        return;
    }

    // NOTE(review): Load() looks header names up in SkippedColumns as returned by
    // GetStringBlocks; confirm those are lower-cased too, otherwise mixed-case
    // headers will never match the keys stored here.
    string ncol = col.ToLower();

    if (!SkippedColumns.ContainsKey(ncol))
    {
        // The value is only a presence marker.
        SkippedColumns.Add(ncol, 1);
    }
}
/// <summary>
/// Removes a previously registered skip column. The name is lower-cased
/// before the lookup, mirroring <c>AddSkipColumn</c>.
/// </summary>
/// <param name="col">Column name; null, empty or whitespace-only values are ignored.</param>
public void RemoveSkipColumn(string col)
{
    // Guard first: the original lower-cased before checking and so threw on null.
    if (string.IsNullOrWhiteSpace(col))
    {
        return;
    }

    // Remove is a no-op (returns false) when the key is absent, so no
    // separate ContainsKey lookup is needed.
    SkippedColumns.Remove(col.ToLower());
}
/// <summary>
/// Reads a delimited text file (code page 1251) line by line. Line 1 is treated
/// as the header; every following non-blank line is a data row that is sampled
/// by <c>LoadFactor</c>, parsed, and stored either into <c>LearnRows</c>
/// (when <c>IsLoadForLearning</c> is set) or into <c>Rows</c> / passed to
/// <c>ProceedRowFunc</c>.
/// </summary>
/// <param name="filename">Path of the file to read.</param>
/// <exception cref="InvalidDataException">
/// A column listed in <c>Ids</c> is not present in the header.
/// </exception>
/// <remarks>
/// Any exception is logged through <c>Logger</c> and rethrown unchanged.
/// </remarks>
public void Load(string filename)
{
    try
    {
        if (IsLoadForLearning)
        {
            TotalDataLines = GetDataLinesCount(filename);
        }

        // Resolve the encoding BEFORE opening the file: if the code page is not
        // available the lookup throws, and the original left the already-opened
        // FileStream undisposed in that case.
        Encoding encoding = Encoding.GetEncoding(1251);

        using (var fs = new FileStream(filename, FileMode.Open, FileAccess.Read))
        using (var sr = new StreamReader(fs, encoding))
        {
            int idx = 0;       // physical line number (1-based, header included)
            int nrow = 0;      // data row number (1-based, non-blank lines after the header)
            int classNum = 0;  // next class id to hand out to an unseen target value
            string nextline;

            while ((nextline = sr.ReadLine()) != null)
            {
                idx++;

                if (string.IsNullOrWhiteSpace(nextline))
                {
                    continue;
                }

                string[] blocks = GetStringBlocks(nextline);

                // ------------------------------ header row ------------------------------
                // NOTE(review): the header is recognised by physical line number, so a
                // blank first line means the header is never parsed.
                if (idx == 1)
                {
                    for (int i = 0; i < blocks.Length; i++)
                    {
                        string cname = blocks[i]; // column name

                        if (!FileColumnByIdx.ContainsKey(i))
                        {
                            FileColumnByIdx.Add(i, cname);
                        }

                        if (!FileIdxByColumn.ContainsKey(cname))
                        {
                            FileIdxByColumn.Add(cname, i);
                        }
                        else
                        {
                            // NOTE(review): the message says "exiting" but loading continues.
                            // A duplicate makes FileIdxByColumn smaller than the real column
                            // count, so full-width rows fail the length check below.
                            Logger.Log("duplicate column name: " + cname + ", exiting");
                        }
                    }

                    if (TargetName != null)
                    {
                        if (!FileIdxByColumn.ContainsKey(TargetName))
                        {
                            Logger.Log("Warning: data (" + filename + ") doesn't have a target (" + TargetName + ") column");
                        }
                        else
                        {
                            TargetIdx = FileIdxByColumn[TargetName]; // target column index
                        }
                    }

                    // id columns: every configured id must be present in the header
                    foreach (var iname in Ids.Keys)
                    {
                        if (!FileIdxByColumn.ContainsKey(iname))
                        {
                            throw new InvalidDataException("id column '" + iname + "' not found");
                        }

                        int sidx = FileIdxByColumn[iname];
                        if (!_idIdx.ContainsKey(sidx))
                        {
                            _idIdx.Add(sidx, 1);
                        }
                    }

                    if (Ids.Count > 0)
                    {
                        IdName = GetStringId(Ids.Keys.ToArray());
                    }

                    // skip columns: drop entries that do not exist in this file so that
                    // the variable count below is exact
                    var toDel = (from t in SkippedColumns.Keys
                                 where !FileIdxByColumn.ContainsKey(t)
                                 select t).ToList();
                    toDel.ForEach(c => SkippedColumns.Remove(c));

                    // count of variables except skipped
                    NVars = FileColumnByIdx.Count - SkippedColumns.Count;
                    continue;
                }

                // ------------------------------- data row -------------------------------
                nrow++;

                if (blocks.Length > FileIdxByColumn.Count)
                {
                    Logger.Log("error parsing row #" + nrow);
                    continue;
                }

                // random sub-sampling of rows
                if (RandomGen.GetDouble() >= LoadFactor)
                {
                    continue;
                }

                var row = new DataRow<T>();

                // parse target
                if (TargetName != null && TargetIdx >= 0)
                {
                    // A row that is too short to contain the target field used to throw
                    // IndexOutOfRangeException and abort the whole load; treat it like
                    // any other malformed row instead. The check sits after the random
                    // draw so the sampling sequence of well-formed files is unchanged.
                    if (TargetIdx >= blocks.Length)
                    {
                        Logger.Log("error parsing row #" + nrow);
                        continue;
                    }

                    row.Target = ParseValue(blocks[TargetIdx]);
                }

                // creating composite id
                row.Id = GetStringId(blocks);
                if (string.IsNullOrEmpty(row.Id))
                {
                    row.Id = nrow.ToString(); // using row number if ids not set
                }

                // save stats for target value
                if (!TargetStat.ContainsKey(row.Target))
                {
                    TargetStat.Add(row.Target, 0);
                }
                TargetStat[row.Target]++; // count by target

                // class id by target
                if (!ClassNumByValue.ContainsKey(row.Target))
                {
                    ClassNumByValue.Add(row.Target, classNum++);
                }

                // target by class id
                if (!ValueByClassNum.ContainsKey(ClassNumByValue[row.Target]))
                {
                    ValueByClassNum.Add(ClassNumByValue[row.Target], row.Target);
                }

                if (IsLoadForLearning)
                {
                    // -------------------------- loading for learning --------------------------
                    if (LearnRows == null)
                    {
                        LearnRows = new T[TotalDataLines, NVars + 1]; // all variables +1 for target
                    }

                    // i walks the file columns, k walks the kept (non-skipped) columns
                    for (int i = 0, k = 0; i < blocks.Length; i++)
                    {
                        string cval = blocks[i];
                        string colname = FileColumnByIdx[i];

                        if (SkippedColumns.ContainsKey(colname))
                        {
                            continue;
                        }

                        T pval = ParseValue(cval);
                        LearnRows[nrow - 1, k++] = pval;
                        SaveVarDistr(colname, pval);
                    }

                    // target goes into the last column
                    LearnRows[nrow - 1, NVars] = row.Target;
                }
                else
                {
                    // -------------------------- loading for analyse ---------------------------
                    var carray = new T[NVars];

                    // i walks the file columns, k walks the kept (non-skipped) columns
                    for (int i = 0, k = 0; i < blocks.Length; i++)
                    {
                        string cval = blocks[i];

                        if (FileColumnByIdx.ContainsKey(i))
                        {
                            string colname = FileColumnByIdx[i];

                            if (SkippedColumns.ContainsKey(colname))
                            {
                                continue;
                            }

                            if (!RowColumnByIdx.ContainsKey(k))
                            {
                                RowColumnByIdx.Add(k, colname);
                            }

                            if (!RowIdxByColumn.ContainsKey(colname))
                            {
                                RowIdxByColumn.Add(colname, k);
                            }

                            T pval = ParseValue(cval);
                            carray[k] = pval;
                            k++;
                            SaveVarDistr(colname, pval);
                        }
                        else
                        {
                            Logger.Log("error parsing id=" + row.Id);
                        }
                    }

                    row.Values = carray;

                    if (ProceedRowFunc == null) // don't save row in case of ProceedRowFunc not null
                    {
                        Rows.Add(row);
                    }
                    else
                    {
                        ProceedRowFunc(row);
                    }

                    TotalDataLines++;
                }

                // periodic progress message
                if (idx % 12345 == 0)
                {
                    Logger.Log(idx + " lines loaded");
                }

                // optional cap; 0 means "no limit"
                if (MaxRowsLoaded != 0 && idx > MaxRowsLoaded)
                {
                    break;
                }
            }

            GetTargetProbs();
            Logger.Log((idx - 1) + " lines loaded;");
        }
    }
    catch (Exception e)
    {
        Logger.Log(e);
        throw; // rethrow without resetting the stack trace
    }
}