Example #1
0
        /// <summary>
        /// Registers a column that should be skipped. The name is stored
        /// lower-cased as a key of <c>SkippedColumns</c> (with the value 1).
        /// </summary>
        /// <param name="col">
        /// Column name. Null, empty or whitespace-only values are ignored.
        /// </param>
        public void AddSkipColumn(string col)
        {
            // Validate before normalising: calling ToLower() on a null
            // reference would throw instead of being silently ignored.
            if (string.IsNullOrWhiteSpace(col))
            {
                return;
            }

            string ncol = col.ToLower();

            // Adding an already registered column is a no-op.
            if (!SkippedColumns.ContainsKey(ncol))
            {
                SkippedColumns.Add(ncol, 1);
            }
        }
Example #2
0
        /// <summary>
        /// Removes a column from the set of skipped columns. The name is
        /// lower-cased before the lookup, mirroring how names are stored.
        /// </summary>
        /// <param name="col">
        /// Column name. Null, empty or whitespace-only values are ignored.
        /// </param>
        public void RemoveSkipColumn(string col)
        {
            // Validate before normalising: calling ToLower() on a null
            // reference would throw instead of being silently ignored.
            if (string.IsNullOrWhiteSpace(col))
            {
                return;
            }

            // Remove is a no-op when the key is absent, so a preceding
            // ContainsKey check would only add a second lookup.
            SkippedColumns.Remove(col.ToLower());
        }
Example #3
0
        /// <summary>
        /// Reads a delimited text file (decoded with code page 1251) line by line.
        /// The first physical line is treated as the header and fills the
        /// column-name/index maps; every following non-blank line is parsed as a
        /// data row. Depending on <c>IsLoadForLearning</c> the parsed values are
        /// written either into the <c>LearnRows</c> matrix or into a
        /// <c>DataRow</c> that is stored in <c>Rows</c> or handed to
        /// <c>ProceedRowFunc</c>.
        /// </summary>
        /// <param name="filename">Path of the file to read.</param>
        /// <exception cref="InvalidDataException">
        /// Thrown when a column listed in <c>Ids</c> is missing from the header.
        /// </exception>
        /// <remarks>
        /// Any exception raised while loading is logged and then rethrown.
        /// </remarks>
        public void Load(string filename)
        {
            try
            {
                // In learning mode the number of data lines is obtained up front;
                // it is used below as the row dimension of LearnRows.
                if (IsLoadForLearning)
                {
                    TotalDataLines = GetDataLinesCount(filename);
                }


                using (var sr = new StreamReader(new FileStream(filename, FileMode.Open, FileAccess.Read), Encoding.GetEncoding(1251)))
                {
                    int    idx  = 0;    // physical line number (1-based, counts blank lines too)
                    int    nrow = 0;    // number of non-blank data lines seen so far
                    string nextline;
                    int    classNum = 0; // next class number to hand out to a new target value
                    while ((nextline = sr.ReadLine()) != null)
                    {
                        idx++;
                        // Blank lines are ignored but still counted in idx.
                        // NOTE(review): if physical line 1 is blank, the header branch
                        // below (idx == 1) is never entered - confirm this cannot happen.
                        if (string.IsNullOrWhiteSpace(nextline))
                        {
                            continue;
                        }
                        string[] blocks = GetStringBlocks(nextline);

                        // header row
                        if (idx == 1)
                        {
                            // Build both directions of the column mapping:
                            // index -> name and name -> index.
                            for (int i = 0; i < blocks.Length; i++)
                            {
                                string cname = blocks[i]; // column name

                                if (!FileColumnByIdx.ContainsKey(i))
                                {
                                    FileColumnByIdx.Add(i, cname);
                                }
                                if (!FileIdxByColumn.ContainsKey(cname))
                                {
                                    FileIdxByColumn.Add(cname, i);
                                }
                                else
                                {
                                    // Only logs: despite the message text, loading continues
                                    // and the first index registered for this name is kept.
                                    Logger.Log("duplicate column name: " + cname + ", exiting");
                                }
                            }

                            // Resolve the target column index when a target name is configured.
                            if (TargetName != null)
                            {
                                if (!FileIdxByColumn.ContainsKey(TargetName))
                                {
                                    // A missing target column is only a warning;
                                    // TargetIdx is left unchanged in that case.
                                    Logger.Log("Warning: data (" + filename + ") doesn't have a target (" + TargetName + ") column");
                                    //break;
                                }
                                else
                                {
                                    TargetIdx = FileIdxByColumn[TargetName]; // target column index
                                }
                            }

                            // id columns
                            // Every configured id column must exist in the header;
                            // their file indexes are recorded in _idIdx.
                            foreach (var iname in Ids.Keys)
                            {
                                if (!FileIdxByColumn.ContainsKey(iname))
                                {
                                    throw new InvalidDataException("id column '" + iname + "' not found");
                                }
                                int sidx = FileIdxByColumn[iname];
                                if (!_idIdx.ContainsKey(sidx))
                                {
                                    _idIdx.Add(sidx, 1);
                                }
                            }
                            if (Ids.Count > 0)
                            {
                                IdName = GetStringId(Ids.Keys.ToArray());
                            }

                            // skip columns
                            // Drop skip entries that do not name a column of this file.
                            // NOTE(review): AddSkipColumn stores lower-cased keys while header
                            // names are used here as returned by GetStringBlocks - confirm the
                            // header names are lower-cased too, otherwise these lookups miss.
                            var toDel = (from t in SkippedColumns.Keys where !FileIdxByColumn.ContainsKey(t) select t).ToList();
                            toDel.ForEach(c => SkippedColumns.Remove(c));

                            // count of variables except skipped
                            NVars = FileColumnByIdx.Count - SkippedColumns.Count;

                            continue;
                        }

                        // data row
                        nrow++;

                        // Only rows with MORE fields than the header are rejected; shorter
                        // rows pass this check.
                        // NOTE(review): for a short row with TargetIdx >= blocks.Length the
                        // blocks[TargetIdx] access below throws IndexOutOfRangeException.
                        if (blocks.Length > FileIdxByColumn.Count)
                        {
                            Logger.Log("error parsing row #" + nrow);
                            continue;
                        }

                        // Sub-sampling: the row is skipped when the drawn value is >= LoadFactor.
                        if (RandomGen.GetDouble() >= LoadFactor)
                        {
                            continue;
                        }

                        var row = new DataRow <T>();

                        // parse target
                        if (TargetName != null && TargetIdx >= 0)
                        {
                            row.Target = ParseValue(blocks[TargetIdx]);
                        }

                        // creating composite id
                        // (presumably built from the id columns registered above - see GetStringId)
                        row.Id = GetStringId(blocks);
                        if (string.IsNullOrEmpty(row.Id))
                        {
                            row.Id = nrow.ToString();                               //using row_number if ids not set
                        }
                        // save stats for target value
                        if (!TargetStat.ContainsKey(row.Target))
                        {
                            TargetStat.Add(row.Target, 0);
                        }
                        TargetStat[row.Target]++;   // count by target

                        // class id by target
                        // Class numbers are assigned in order of first appearance of a target value.
                        if (!ClassNumByValue.ContainsKey(row.Target))
                        {
                            ClassNumByValue.Add(row.Target, classNum++);
                        }

                        // target by class id
                        if (!ValueByClassNum.ContainsKey(ClassNumByValue[row.Target]))
                        {
                            ValueByClassNum.Add(ClassNumByValue[row.Target], row.Target);
                        }


                        // --------------------------- loading for learning -------------------------------
                        if (IsLoadForLearning)
                        {
                            // The matrix is allocated lazily on the first accepted data row.
                            if (LearnRows == null)
                            {
                                LearnRows = new T[TotalDataLines, NVars + 1]; // all variables +1 for target
                            }
                            // i walks the file columns, k the columns of the matrix (skipped
                            // columns do not advance k).
                            // NOTE(review): the matrix row is nrow - 1, and nrow also counts rows
                            // rejected above (parse error / LoadFactor), so those matrix rows are
                            // never written here.
                            for (int i = 0, k = 0; i < blocks.Length; i++)
                            {
                                string cval    = blocks[i];
                                string colname = FileColumnByIdx[i];
                                if (SkippedColumns.ContainsKey(colname))
                                {
                                    continue;
                                }

                                T pval = ParseValue(cval);
                                LearnRows[nrow - 1, k++] = pval;
                                SaveVarDistr(colname, pval);
                            }
                            // The target value goes into the last column of the matrix row.
                            LearnRows[nrow - 1, NVars] = row.Target;
                        }
                        else
                        {
                            // --------------------------- loading for analyse -----------------------------------
                            var carray = new T[NVars];

                            // i walks the file columns, k the positions in carray (skipped
                            // columns do not advance k).
                            for (int i = 0, k = 0; i < blocks.Length; i++)
                            {
                                string cval = blocks[i];
                                if (FileColumnByIdx.ContainsKey(i))
                                {
                                    string colname = FileColumnByIdx[i];
                                    if (SkippedColumns.ContainsKey(colname))
                                    {
                                        continue;
                                    }

                                    // Record the mapping between row positions and column names
                                    // (filled the first time each position/name is seen).
                                    if (!RowColumnByIdx.ContainsKey(k))
                                    {
                                        RowColumnByIdx.Add(k, colname);
                                    }

                                    if (!RowIdxByColumn.ContainsKey(colname))
                                    {
                                        RowIdxByColumn.Add(colname, k);
                                    }

                                    T pval = ParseValue(cval);
                                    carray[k] = pval;
                                    k++;
                                    SaveVarDistr(colname, pval);
                                }
                                else
                                {
                                    // Field index without a matching header column.
                                    Logger.Log("error parsing id=" + row.Id);
                                }
                            }

                            row.Values = carray;
                            if (ProceedRowFunc == null) // don't save row in case of ProceedRowFunc not null
                            {
                                Rows.Add(row);
                            }
                            else
                            {
                                ProceedRowFunc(row);
                            }

                            // In analyse mode TotalDataLines counts the rows actually processed.
                            TotalDataLines++;
                        }

                        // Progress message whenever the physical line number of a processed
                        // row is a multiple of 12345.
                        if (idx % 12345 == 0)
                        {
                            Logger.Log(idx + " lines loaded");
                        }
                        // MaxRowsLoaded == 0 means no limit.
                        // NOTE(review): the limit is compared against idx (physical line number,
                        // header and blank lines included) and is only checked after a row has
                        // been processed - confirm this is the intended meaning of the limit.
                        if (MaxRowsLoaded != 0 && idx > MaxRowsLoaded)
                        {
                            break;
                        }
                    }

                    // Post-processing once all rows are read.
                    GetTargetProbs();
                    // idx - 1: physical lines read minus the header line.
                    Logger.Log((idx - 1) + " lines loaded;");
                }
            }
            catch (Exception e)
            {
                // Log and rethrow with the original stack trace preserved.
                Logger.Log(e);
                throw;
            }
        }