Balance a CSV file. This utility is useful when you have an unbalanced training set. You may have a large number of elements of one particular class, and many fewer elements of other classes. This can hinder many machine learning methods. This class can be used to balance the data. Obviously, this class cannot generate data. You must specify how many items you want per class. Some classes will have fewer than this number if they were already below the specified amount. Any class above this amount will be trimmed to that amount.
Наследование: Encog.App.Analyst.CSV.Basic.BasicFile
Пример #1
0
        /// <summary>
        /// Balances a generated test file that is read without headers and
        /// verifies both the lines written to the output file and the
        /// per-class counts reported by the balancer.
        /// </summary>
        public void TestBalanceCSVNoHeaders()
        {
            GenerateTestFile(false);
            try
            {
                var norm = new BalanceCSV();
                norm.Analyze(InputName, false, CSVFormat.English);
                // Balance on the field at index 1, keeping at most 2 rows per class.
                norm.Process(OutputName, 1, 2);

                // The using block releases the file handle even when an
                // assertion fails, so the cleanup below can delete the file.
                using (var tr = new StreamReader(OutputName.ToString()))
                {
                    Assert.AreEqual("\"field:0\",\"field:1\"", tr.ReadLine());
                    Assert.AreEqual("one,1", tr.ReadLine());
                    Assert.AreEqual("two,1", tr.ReadLine());
                    Assert.AreEqual("four,2", tr.ReadLine());
                    Assert.AreEqual("five,2", tr.ReadLine());
                    Assert.AreEqual("six,3", tr.ReadLine());
                    Assert.AreEqual(2, norm.Counts["1"]);
                    Assert.AreEqual(2, norm.Counts["2"]);
                    Assert.AreEqual(1, norm.Counts["3"]);
                }
            }
            finally
            {
                // Always remove the temporary files, including on test failure.
                InputName.Delete();
                OutputName.Delete();
            }
        }
        /// <summary>
        /// Executes the balance command: reads the source/target file ids, the
        /// per-class count and the balance field from the script properties,
        /// validates the balance field, and runs a <see cref="BalanceCSV"/>
        /// task from the source file to the target file.
        /// </summary>
        /// <param name="args">Command arguments; not used by this implementation.</param>
        /// <returns>The value of <c>ShouldStop()</c> on the balance task after processing.</returns>
        /// <exception cref="AnalystError">
        /// Thrown when the configured balance field cannot be found, or when
        /// it is not a class field.
        /// </exception>
        public override sealed bool ExecuteCommand(String args)
        {
            // get filenames
            String sourceID = Prop.GetPropertyString(
                ScriptProperties.BalanceConfigSourceFile);
            String targetID = Prop.GetPropertyString(
                ScriptProperties.BalanceConfigTargetFile);

            EncogLogging.Log(EncogLogging.LevelDebug, "Beginning balance");
            EncogLogging.Log(EncogLogging.LevelDebug, "source file:" + sourceID);
            EncogLogging.Log(EncogLogging.LevelDebug, "target file:" + targetID);

            FileInfo sourceFile = Script.ResolveFilename(sourceID);
            FileInfo targetFile = Script.ResolveFilename(targetID);

            // get other config data
            int countPer = Prop.GetPropertyInt(
                ScriptProperties.BalanceConfigCountPer);
            String targetFieldStr = Prop.GetPropertyString(
                ScriptProperties.BalanceConfigBalanceField);
            DataField targetFieldDf = Analyst.Script.FindDataField(
                targetFieldStr);
            if (targetFieldDf == null)
            {
                throw new AnalystError("Can't find balance target field: "
                                       + targetFieldStr);
            }
            if (!targetFieldDf.Class)
            {
                throw new AnalystError("Can't balance on non-class field: "
                                       + targetFieldStr);
            }

            int targetFieldIndex = Analyst.Script
                                          .FindDataFieldIndex(targetFieldDf);

            // mark generated
            Script.MarkGenerated(targetID);

            // get the input format
            // NOTE(review): no output format is assigned to the balance task
            // here - confirm that its default output format is intended.
            CSVFormat inputFormat = Script.DetermineFormat();

            // prepare to balance
            var balance = new BalanceCSV {Script = Script};
            Analyst.CurrentQuantTask = balance;
            try
            {
                balance.Report = new AnalystReportBridge(Analyst);

                bool headers = Script.ExpectInputHeaders(sourceID);
                balance.Analyze(sourceFile, headers, inputFormat);
                balance.ProduceOutputHeaders = true;
                balance.Process(targetFile, targetFieldIndex, countPer);
            }
            finally
            {
                // Clear the current task even when analysis or processing
                // throws, so the analyst never keeps a stale task reference.
                Analyst.CurrentQuantTask = null;
            }
            return balance.ShouldStop();
        }
Пример #3
0
 /// <summary>
 /// Executes the balance command: reads the source/target file ids, the
 /// per-class count and the balance field from the properties, validates
 /// the balance field, and runs a BalanceCSV task from the source file to
 /// the target file.
 /// </summary>
 /// <remarks>
 /// This is the straight-line form of decompiled code. The goto labels and
 /// always-true/always-false guards were dropped; the calls are issued in
 /// the same order as the only reachable path of the decompiled code.
 /// </remarks>
 /// <param name="args">Command arguments; not used by this implementation.</param>
 /// <returns>The value of <c>ShouldStop()</c> on the balance task after processing.</returns>
 /// <exception cref="AnalystError">
 /// Thrown when the configured balance field cannot be found, or when it is
 /// not a class field.
 /// </exception>
 public override sealed bool ExecuteCommand(string args)
 {
     // Resolve the source and target file ids and log them.
     string sourceId = base.Prop.GetPropertyString("BALANCE:CONFIG_sourceFile");
     string targetId = base.Prop.GetPropertyString("BALANCE:CONFIG_targetFile");

     EncogLogging.Log(0, "Beginning balance");
     EncogLogging.Log(0, "source file:" + sourceId);
     EncogLogging.Log(0, "target file:" + targetId);

     FileInfo sourceFile = base.Script.ResolveFilename(sourceId);
     FileInfo targetFile = base.Script.ResolveFilename(targetId);

     // Read the remaining configuration and validate the balance field.
     int countPer = base.Prop.GetPropertyInt("BALANCE:CONFIG_countPer");
     string balanceFieldName = base.Prop.GetPropertyString("BALANCE:CONFIG_balanceField");
     DataField balanceField = base.Analyst.Script.FindDataField(balanceFieldName);
     if (balanceField == null)
     {
         throw new AnalystError("Can't find balance target field: " + balanceFieldName);
     }
     if (!balanceField.Class)
     {
         throw new AnalystError("Can't balance on non-class field: " + balanceFieldName);
     }

     int balanceFieldIndex = base.Analyst.Script.FindDataFieldIndex(balanceField);
     base.Script.MarkGenerated(targetId);

     // Determine the formats used to read the source and write the target.
     CSVFormat inputFormat = base.Script.DetermineInputFormat(sourceId);
     CSVFormat outputFormat = base.Script.DetermineOutputFormat();

     BalanceCSV balance = new BalanceCSV();
     balance.Script = base.Script;
     base.Analyst.CurrentQuantTask = balance;
     try
     {
         balance.Report = new AnalystReportBridge(base.Analyst);
         bool headers = base.Script.ExpectInputHeaders(sourceId);
         balance.Analyze(sourceFile, headers, inputFormat);
         balance.OutputFormat = outputFormat;
         balance.ProduceOutputHeaders = true;
         balance.Process(targetFile, balanceFieldIndex, countPer);
     }
     finally
     {
         // Clear the current task even when analysis or processing throws,
         // so the analyst never keeps a stale task reference.
         base.Analyst.CurrentQuantTask = null;
     }
     return balance.ShouldStop();
 }