/// <summary>
/// Builds the dialog for editing a categorical split. When the predictor already
/// has computed children groups, the left/right list boxes mirror group [0] and
/// group [1]; otherwise the user is warned, an empty two-group ValueGroup is
/// attached, and a default 1-vs-rest layout is shown (first value left, rest right).
/// </summary>
public FrmCategoricalSplitProperties(Node _node, Predictor _pred) {
    node = _node;
    pred = _pred;
    InitializeComponent();
    if (pred.ChildrenGroups == null) {
        //No split has been computed for this predictor yet: warn and propose a default.
        MessageBox.Show("This split has not been computed", "Warning!");
        pred.ChildrenGroups = new ValueGroup(_pred, 2);
        lbLeft.Items.Add(pred.ValueSd.Keys[0]);
        for (int i = 1; i < pred.DistinctValuesCount; ++i)
            lbRight.Items.Add(pred.ValueSd.Keys[i]);
        checks();
        return;
    }
    //Mirror the existing grouping: group [0] on the left, group [1] on the right.
    foreach (string v in pred.ChildrenGroups.ValueGroupLst[0])
        lbLeft.Items.Add(v);
    foreach (string v in pred.ChildrenGroups.ValueGroupLst[1])
        lbRight.Items.Add(v);
    checks();
}
/// <summary>
/// Creates _groupCount empty value groups (lists of category values) for the
/// given predictor; the groups are later filled with the values routed to each child.
/// </summary>
public ValueGroup(Predictor _pred, int _groupCount) {
    pred = _pred;
    groupCount = _groupCount;
    ValueGroupLst = new List<List<string>>(groupCount);
    for (int g = 0; g < groupCount; ++g)
        ValueGroupLst.Add(new List<string>());
}
/// Sets the best p.SplitHyperplane and p.Gain
/// <summary>
/// Look for the local minimum by fixing all the coeficients except one and then vary it.
/// Greedy coordinate search over the multivariate-predictor coefficients: starting from
/// the best univariate split, repeatedly tests every non-optimised variable
/// (BestGainMvCategorical / BestGainMvNumeric), commits the single variable whose best
/// coefficient most lowers the impurity, and stops when no variable improves or the
/// node becomes pure. Side effects: sets every n.PredMvLst[*].Coef/Optimised, n.C,
/// n.ImpBestMvSplit, p.Gain, appends to Def.LogMessage, and (outside experiments)
/// shows a FrmGraph and calls DataValidate. Returns p.Gain, or -1 on internal error.
/// </summary>
public static double MinImpMvGreed(NodeTargetCategorical n, Predictor p) {
    bool hadOverallImprovement=false;
    //List<double> bestCoefLst = new List<double>();
    //List<double> testCoefLst = new List<double>(); // VariableToBeTestedCoodinates
    //NOTE(review): leftNode, rightNode, azLst, orderedNominalGain and orderedNumericGain
    //are declared but never used in this method — candidates for removal.
    SortedList<string, int> leftNode = new SortedList<string, int>();
    SortedList<string, int> rightNode = new SortedList<string, int>();
    List<int> nonOptimisedVariableLst = new List<int>(); //Stores the index of MV predictors that have been used
    List<int> optimisedVariableLst = new List<int>(); //Stores the index of MV predictors that have been used // starts with the best split index
    List<double> azLst = new List<double>();
    n.C = 0; // (= best split coordinate) Constant of the equation A1X1 + A2X2 + C = V [0] normalised [1] original value
    double lowestImp = n.ImpBestUniSplit; //best impurity seen so far; seeded with the univariate optimum
    double bestCoef=-1;
    double bestMvGain;
    double bestUniGain;
    int bestSplitIdx=-1;
    SortedList<double, int> orderedNominalGain = new SortedList<double, int>(); // value, index or PredictorLst
    SortedList<double, int> orderedNumericGain = new SortedList<double, int>(); // value, index or PredictorLst
    FrmGraph fg = new FrmGraph();
    //Gain of the best univariate split, weighted by the fraction of non-null rows.
    bestUniGain = (n.Imp - n.ImpBestUniSplit) * 100 / n.Imp;
    bestUniGain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
    Def.LogMessage += "Starting MinImpMvGreed, smallest univariate imp= " + lowestImp + " bestUniSplit gain " + n.BestSplit.Gain + " should be equal bestUniGain " + bestUniGain + Environment.NewLine;
    //Reset all MV predictors: nothing optimised yet, all coefficients "absent".
    for(int i=0; i < n.PredMvLst.Count; ++i){
        n.PredMvLst[i].Optimised = false;
        n.PredMvLst[i].Coef = Def.AbsentCoefficientValue;
        nonOptimisedVariableLst.Add(i);
    }
    if (n.BestSplit.Variable.VariableTypeUserSet == SchemaVariable.VariableTypeEnum.Continuous) {
        //Sets the properties of the best univariate split
        #region
        //Seed the hyperplane with the univariate split: coefficient 1 on the split
        //variable and C = -(normalised split value).
        nonOptimisedVariableLst.Remove(n.BestSplit.PredMvBase);
        optimisedVariableLst.Add(n.BestSplit.PredMvBase);
        n.PredMvLst[n.BestSplit.PredMvBase].Coef = 1;
        n.PredMvLst[n.BestSplit.PredMvBase].Optimised = true;
        n.C = -Fcn.NormP1(n.BestSplit.SplitValue, n.BestSplit.LowerNumber, n.BestSplit.HigherNumber); // c = 0;
        #endregion
    }else{//Nominal
        List<string> caseLst;
        n.C = 0;
        //Sets the coefficients of the values on the left with the Def.PresentCoefficientValue
        caseLst = n.PredictorLst[n.BestSplit.PredictorLstIdx].ChildrenGroups.ValueGroupLst[0];
        for (int i = 0; i < caseLst.Count; ++i) {
            //Not sure if it is right
            //NOTE(review): this debug MessageBox fires once per left-group value and
            //blocks the UI — presumably leftover scaffolding; confirm before removing.
            MessageBox.Show("didn't undestand bellow");
            n.PredMvLst[n.BestSplit.PredMvBase + n.PredictorLst[n.BestSplit.PredictorLstIdx].ValueSd.IndexOfKey(caseLst[i])].Coef = Def.PresentCoefficientValue;
        }
        //If the best split is nominal, then all the column there describe it are already optmised
        for (int i = 0; i < n.PredictorLst[n.BestSplit.PredictorLstIdx].DistinctValuesCount; ++i) {
            nonOptimisedVariableLst.Remove(n.BestSplit.PredMvBase + i);
            optimisedVariableLst.Add(n.BestSplit.PredMvBase + i);
            n.PredMvLst[n.BestSplit.PredMvBase + i].Optimised = true;
        }
    }
    //if (n.BestSplit.Gain == 100) {
    //    Def.LogMessage += "Nothing to be done, univariate gain equals 100%" + Environment.NewLine;
    //    n.MvTb.DataFill();
    //    DataValidate(n, c);
    //    n.MvTb.DataEmpty();
    //    return p.Gain;
    //}
    double[] r= new double[2]; //r[0] = best coefficient tested, r[1] = resulting impurity
    //Main greedy loop: one variable is committed per iteration, or all remaining
    //variables are flushed to "optimised" when no improvement is found.
    while(nonOptimisedVariableLst.Count > 0){
        //while(optimisedVariableLst.Count < 11){
        hadOverallImprovement=false;
        for (int nopVarIdx = 0; nopVarIdx < nonOptimisedVariableLst.Count; ++nopVarIdx) {
            ////Gets lowestCatSplitImp, catSplitImp;
            if (n.PredMvLst[nonOptimisedVariableLst[nopVarIdx]].Variable.VariableTypeUserSet == SchemaVariable.VariableTypeEnum.Categorical) {
                r = BestGainMvCategorical(n, n.C, nonOptimisedVariableLst[nopVarIdx], optimisedVariableLst);
                if (r[1] < lowestImp) {
                    bestCoef = r[0];
                    bestSplitIdx = nonOptimisedVariableLst[nopVarIdx];
                    lowestImp = r[1];
                    hadOverallImprovement=true;
                }
            }
            //Gets lowestNumSplitImp, numSplitImp;
            else {
                r = BestGainMvNumeric(n, n.C, nonOptimisedVariableLst[nopVarIdx], optimisedVariableLst, fg);
                if (r[1] < lowestImp) {
                    bestCoef = r[0];
                    bestSplitIdx = nonOptimisedVariableLst[nopVarIdx];
                    lowestImp = r[1];
                    hadOverallImprovement=true;
                }
            }
            //Log the tested variable; FieldSpan > 0 means a multi-column (nominal) field,
            //whose offset disambiguates the column.
            if (n.PredMvLst[nonOptimisedVariableLst[nopVarIdx]].FieldSpan > 0)
                Def.LogMessage += " nopVarIdx " + nonOptimisedVariableLst[nopVarIdx] + " Name " + n.PredMvLst[nonOptimisedVariableLst[nopVarIdx]].Variable.Name + "(" + n.PredMvLst[nonOptimisedVariableLst[nopVarIdx]].Offset + ")";
            else
                Def.LogMessage += " nopVarIdx " + nonOptimisedVariableLst[nopVarIdx] + " Name " + n.PredMvLst[nonOptimisedVariableLst[nopVarIdx]].Variable.Name;
            Def.LogMessage += " Coefficient " + r[0] + " SplitImp " + r[1] + Environment.NewLine;
        }//End for
        if (hadOverallImprovement) {
            //Sanity check: the winning index must still be pending.
            if (!nonOptimisedVariableLst.Contains(bestSplitIdx)) {
                Def.LogMessage += "Error couldn't find bestSplitIdx " + bestSplitIdx + " inside nonOptimisedVariableLst " + Environment.NewLine;
                MessageBox.Show("Error couldn't find bestSplitIdx " + bestSplitIdx + " inside nonOptimisedVariableLst ");
                return -1;
            }
            //Commit the winning variable's coefficient.
            n.PredMvLst[bestSplitIdx].Coef = bestCoef;
            n.PredMvLst[bestSplitIdx].Optimised = true;
            nonOptimisedVariableLst.Remove(bestSplitIdx);
            optimisedVariableLst.Add(bestSplitIdx);
            Def.LogMessage += "Best variable " + n.PredMvLst[bestSplitIdx].Variable.Name + n.PredMvLst[bestSplitIdx].Offset + " Coef " + bestCoef + " lowestCatSplitImp " + lowestImp + "++++++++++++++++++++" + Environment.NewLine;
        } else {
            //No improvement
            Def.LogMessage += "No more improvement " + (optimisedVariableLst.Count) + " variables of " + n.PredMvLst.Count + " are being combined on the purity function -------------------" + Environment.NewLine;
            //Flush the remaining variables so the while-loop terminates.
            foreach (int predMvIdx in nonOptimisedVariableLst) {
                optimisedVariableLst.Add(predMvIdx);
                n.PredMvLst[predMvIdx].Optimised = true;
            }
            nonOptimisedVariableLst.Clear();
        }
        if (lowestImp==0) {
            Def.LogMessage += "Node is now 100% pure stopping greed seach " + Environment.NewLine;
            break;
        }
    }//End while
    n.ImpBestMvSplit = lowestImp;
    //NOTE(review): p.Gain is assigned the UNIVARIATE best-split gain (n.BestSplit.Gain)
    //and returned, while the multivariate gain computed below is only logged —
    //confirm this is intentional.
    p.Gain = n.BestSplit.Gain;
    bestMvGain = (n.Imp - n.ImpBestMvSplit) * 100 / n.Imp;
    bestMvGain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
    bestUniGain = (n.Imp - n.ImpBestUniSplit) * 100 / n.Imp;
    bestUniGain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
    //Build a readable description of the final hyperplane: coef(variable) ... c = C.
    string coefStr = "";
    for (int i = 0; i < n.PredMvLst.Count; ++i) {
        if (n.PredMvLst[i].Variable.VariableTypeUserSet == SchemaVariable.VariableTypeEnum.Continuous)
            coefStr += n.PredMvLst[i].Coef + "(" + n.PredMvLst[i].Variable.Name + ") ";
        else
            coefStr += n.PredMvLst[i].Coef + "(" + n.PredMvLst[i].Variable.Name + n.PredMvLst[i].Offset + ") ";
    }
    coefStr += "c = " + n.C;
    //Why UniGain is sometimes different to p.Gain
    Def.LogMessage += "MvGain: " + bestMvGain + " UniGain: " + bestUniGain + " P.Gain: " + p.Gain + " MvImp: " + lowestImp + " UniImp: " + n.ImpBestUniSplit + " NdImp: " + n.Imp + Environment.NewLine + coefStr + Environment.NewLine;
    //Interactive run: plot the first two MV predictors and the found hyperplane.
    //NOTE(review): assumes n.PredMvLst has at least 2 entries — confirm for 1-predictor nodes.
    if (!Def.ExperimentRunning) {
        for (int y = 0; y < n.Table.RowCount; ++y) {
            fg.x0Lst.Add((float) n.PredMvLst[0].X(y));
            fg.x1Lst.Add((float) n.PredMvLst[1].X(y));
            if (n.MvTb.Data.TC[y].ToLower() == "n") {
                fg.n.Add(y);
            }
        }
        fg.a0=(float) n.PredMvLst[0].Coef;
        fg.a1 = (float) n.PredMvLst[1].Coef;
        fg.c = (float) n.C;
        fg.N = n;
        fg.ABest = (float)bestCoef;
        fg.Show();
        fg.Invalidate();
        DataValidate(n, n.C);
    }
    //if auto n.Imp =
    return p.Gain;
}
//Sets the best p.SplitValue p.Gain
/// <summary>
/// Finds the best binary split threshold for the continuous predictor p on node n:
/// builds an attribute-value/class histogram via SQL, scans every candidate
/// threshold, and keeps the one with the lowest row-weighted child impurity.
/// Side effects: sets p.SplitValue, p.ImpUniMin, p.SplitStatus, p.Gain and
/// n.DescendentImpPreCalculated. Returns p.Gain (0 when no valid split exists).
/// </summary>
public static double MinImpCont(NodeTargetCategorical n, Predictor p) {
    int i, leftRowCount = 0, rightRowCount = 0;
    // int dfd; //delete
    double minImp, imp, lImp, rImp, instanceCount = n.Table.RowCount;
    lImp = rImp = minImp = imp = double.NaN;
    List<N3T> AvcLst;
    List<int> thresholdIndexLst;
    //Tries each partition:
    //AvcLst[i].N0 = Value of the dependent varible
    //AvcLst[i].N1 = Frequency of y
    //AvcLst[i].N2 = Total of distinct registries until that row
    //AvcLst[i].T = y
    //NOTE(review): table/column names are concatenated into the SQL; assumed to be
    //trusted schema identifiers, not user input — verify.
    string sql =
        @"SELECT ALL " +
        Def.DbBsTb + "." + p.Variable.Name + ", " +
        " count(" + Def.DbBsTb + "." + p.Variable.Name + "),0 , " +
        Def.DbBsTb + "." + Def.Schema.Target.Name + " " +
        "FROM " + Def.DbBsTb + "," + Def.DbTrTb + n.Id + " " +
        "WHERE " + Def.DbBsTb + "." + Def.DbTableIdName + " = " +
        Def.DbTrTb + n.Id + "." + Def.DbTableIdName + " " +
        " AND " + Def.DbBsTb + "." + p.Variable.Name + " IS NOT NULL " +
        "GROUP BY " + Def.DbBsTb + "." + Def.Schema.Target.Name + ", " +
        Def.DbBsTb + "." + p.Variable.Name + " " +
        "ORDER BY " + Def.DbBsTb + "." + p.Variable.Name;
    AvcLst = Def.Db.GetN3TLst(sql);
    //N2 is the number of registries until a given row
    if (AvcLst.Count > 0) {
        AvcLst[0].N2 = AvcLst[0].N1;
        for (i = 1; i < AvcLst.Count; ++i)
            AvcLst[i].N2 = AvcLst[i - 1].N2 + AvcLst[i].N1;
    }
    thresholdIndexLst = Fcn.SetPossibleThresholdIndexLst(AvcLst);
    if (thresholdIndexLst.Count == 0) {
        p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
        p.Gain = 0;
        return 0;
    }
    n.DescendentImpPreCalculated = new List<double>(2);
    n.DescendentImpPreCalculated.Add(0);
    n.DescendentImpPreCalculated.Add(0);
    p.SplitStatus = Predictor.SplitStatusEnum.CanBeUsed;
    //NOTE(review): unreachable in practice — an empty AvcLst would already have
    //produced an empty thresholdIndexLst above.
    if (AvcLst.Count == 0) {
        p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
        p.Gain = 0;
        return 0;
    }
    //Scan every candidate threshold; left = rows at/below it, right = rows above it.
    for (i = 0; i < thresholdIndexLst.Count; ++i) {
        lImp = ImpCont(0, thresholdIndexLst[i], AvcLst, out leftRowCount);
        rImp = ImpCont(thresholdIndexLst[i] + 1, AvcLst.Count - 1, AvcLst, out rightRowCount);
        if (Double.IsNaN(minImp) && leftRowCount >= Def.TreeMinNumberOfCasesPerNode && rightRowCount >= Def.TreeMinNumberOfCasesPerNode) {
            //First valid candidate initialises the running minimum.
            imp = (leftRowCount * lImp + rightRowCount * rImp) / (leftRowCount + rightRowCount);
            if (Double.IsNaN(imp) == false) {
                minImp = imp;
                p.SplitValue = AvcLst[thresholdIndexLst[i]].N0;
                n.DescendentImpPreCalculated[0] = lImp;
                n.DescendentImpPreCalculated[1] = rImp;
            }
        } else {
            imp = (leftRowCount * lImp + rightRowCount * rImp) / (leftRowCount + rightRowCount);
            if (imp < minImp && !Double.IsNaN(imp) && leftRowCount >= Def.TreeMinNumberOfCasesPerNode && rightRowCount >= Def.TreeMinNumberOfCasesPerNode) {
                minImp = imp;
                p.SplitValue = AvcLst[thresholdIndexLst[i]].N0;
                n.DescendentImpPreCalculated[0] = lImp;
                n.DescendentImpPreCalculated[1] = rImp;
            }
        }
    }
    if(Double.IsNaN(minImp)){
        p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
        p.Gain = 0;
        return 0;
    }
    List<double> nLst;
    sql = @" SELECT DISTINCT " + p.Variable.Name + " " +
        "FROM " + Def.DbBsTb + " , " + Def.DbTrTb + n.Id + " " +
        "WHERE " + Def.DbBsTb + "." + Def.DbTableIdName + "=" +
        Def.DbTrTb + n.Id + "." + Def.DbTableIdName +
        " AND " + p.Variable.Name + " IS NOT NULL " +
        "ORDER BY " + p.Variable.Name + " ASC ";
    nLst = Def.Db.GetNumberLst(sql);
    //Finds the angle bisector of the slit
    //Moves the split value to the midpoint between its neighbouring distinct values.
    //NOTE(review): the bounds use nLst.Count - 2 (second-to-last element) — looks
    //like an off-by-one (last element would be nLst.Count - 1); a split at the
    //second-to-last value is left un-bisected. Confirm before changing.
    if (p.SplitValue != nLst[0] && p.SplitValue != nLst[nLst.Count - 2]) {
        for (i = 1; i < nLst.Count - 2; ++i) {
            if (p.SplitValue == nLst[i]) {
                p.SplitValue = (nLst[i - 1] + nLst[i + 1]) / 2;
                break;
            }
        }
    }
    p.ImpUniMin = minImp;
    //Gain as percentage impurity reduction, weighted by the non-null row fraction.
    p.Gain = (n.Imp - minImp) * 100 / n.Imp;
    p.Gain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
    return p.Gain;
}
/// <summary>
/// Progressive (greedy) search for the best two-way grouping of the values of the
/// categorical predictor p on node n. Each value (from the 3rd on) is assigned to
/// whichever side yields the lower impurity; Def.ClfOptimisationLevelForCatSearch
/// then enables one or two local-improvement phases. Side effects: fills
/// p.ChildrenGroups.ValueGroupLst[0]/[1], sets p.SplitStatus, p.ImpUniMin, p.Gain
/// (and n.ImpBestUniSplit in the two-value case). Returns p.Gain, or 0 when no
/// valid split exists.
/// </summary>
public static double MinImpCatProgressive(NodeTargetCategorical n, Predictor p) {
    //For some partitions gets the min Impiance
    int valueCount = p.DistinctValuesCount;
    double impBest= double.NaN;
    int i;
    int instanceCount = n.Table.RowCount;
    double imp = Double.NaN;
    double impIfValueGoesLeft = Double.NaN, impIfValueGoesRight = Double.NaN;
    List<NTT> nttLst;
    double impBeforePhase2; //NOTE(review): assigned below but never read afterwards.
    int improvementCode = 0;
    p.SplitStatus = Predictor.SplitStatusEnum.CanBeUsed;
    if (valueCount < 2) {
        p.SplitStatus = Predictor.SplitStatusEnum.OnlyOneValueAvailable;
        p.Gain = 0;
        return 0;
    }
    List<string> lComb = new List<string>(valueCount); //values currently on the left
    List<string> rComb = new List<string>(valueCount); //values currently on the right
    SortedList<string, int> lPredVal = new SortedList<string, int>();
    SortedList<string, int> rPredVal = new SortedList<string, int>();
    //Per (target class, predictor value) frequency table for this node's rows.
    string sql =
        @"SELECT ALL " +
        " count(*), " +
        Def.DbBsTb + "." + p.Variable.Name + ", " +
        Def.DbBsTb + "." + Def.Schema.Target.Name + " " +
        "FROM " + Def.DbBsTb + "," + Def.DbTrTb + n.Id + " " +
        "WHERE " + Def.DbBsTb + "." + Def.DbTableIdName + " = " +
        Def.DbTrTb + n.Id + "." + Def.DbTableIdName + " AND " +
        Def.DbBsTb + "." + p.Variable.Name + " IS NOT NULL " +
        "GROUP BY " + Def.DbBsTb + "." + Def.Schema.Target.Name + ", " +
        Def.DbBsTb + "." + p.Variable.Name + " ";
    nttLst = Def.Db.GetNTTLst(sql);
    lComb.Clear(); rComb.Clear();
    lPredVal.Clear(); rPredVal.Clear();
    lComb.Add(p.ValueSd.Keys[0]); // Adds the 1st value to the combination of the left node
    rComb.Add(p.ValueSd.Keys[1]); // Adds the 2nd value to the combination of the right node
    //Done only if the number of values is 2
    if (valueCount == 2) {
        imp = fillPredVals(lPredVal, rPredVal, lComb, rComb, nttLst, n);
        p.ChildrenGroups.ValueGroupLst[0].Clear();
        p.ChildrenGroups.ValueGroupLst[1].Clear();
        if (Double.IsNaN(imp)) {
            //The single possible split leaves a child below the minimum case count.
            p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
            p.ImpUniMin = Double.NaN;
            p.Gain = Double.NaN;
            p.ChildrenGroups.ValueGroupLst[0].Clear();
            p.ChildrenGroups.ValueGroupLst[1].Clear();
            return 0;
        }
        foreach(string s in lComb){
            p.ChildrenGroups.ValueGroupLst[0].Add(s);
        }
        foreach(string s in rComb){
            p.ChildrenGroups.ValueGroupLst[1].Add(s);
        }
        n.ImpBestUniSplit = imp;
        p.Gain = (n.Imp - imp) * 100 / n.Imp;
        p.Gain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
        return p.Gain;
    }
    //IF valueCount > 2
    //Greedy phase: each further value goes to whichever side is less impure.
    for (i = 2; i < valueCount; ++i) {
        //try to adding to the left
        lComb.Add(p.ValueSd.Keys[i]);
        impIfValueGoesLeft = fillPredVals(lPredVal, rPredVal, lComb, rComb, nttLst, n);
        //Changes the side
        rComb.Add(p.ValueSd.Keys[i]);
        lComb.RemoveAt(lComb.Count - 1);
        impIfValueGoesRight = fillPredVals(lPredVal, rPredVal, lComb, rComb, nttLst, n);
        if (!Double.IsNaN(impIfValueGoesLeft) && !Double.IsNaN(impIfValueGoesRight)) {
            if (impIfValueGoesLeft < impIfValueGoesRight) {
                imp = impIfValueGoesLeft;
                lComb.Add(p.ValueSd.Keys[i]);
                rComb.RemoveAt(rComb.Count - 1);
            } else {
                imp = impIfValueGoesRight;
            }
        } else {
            //Only one side is feasible (the other violates the minimum case count).
            if (Double.IsNaN(impIfValueGoesLeft)) {
                imp = impIfValueGoesRight;
            }
            if (Double.IsNaN(impIfValueGoesRight)) {
                imp = impIfValueGoesLeft;
                lComb.Add(p.ValueSd.Keys[i]);
                rComb.RemoveAt(rComb.Count - 1);
            }
        }
    }
    impBest = imp;
    if (Def.ClfOptimisationLevelForCatSearch >= 1) {
        #region enhanced progressive phase 1
        //improvementCode: 0 = no improvement // 1 = 1 // 2 = 2 // 3 = 3
        //if (valueCount > 2) {
        // Final combinations for the two 1st values
        // 1- Removes 1st val from left and put in the right
        // 2- Removes 2nd val from right and send to left
        // 3- Puts back the 1st val to left
        // 0) Initial status
        // left  0xxxxxx
        // right 1xxxxxx
        // 1)
        // left  xxxxxx
        // right 1xxxxxx0
        // 2)
        // left  xxxxxx1
        // right xxxxxx0
        // 3)
        // left  xxxxxx10
        // right xxxxxx
        // 1
        lComb.RemoveAt(0);
        rComb.Add(p.ValueSd.Keys[0]);
        imp = fillPredVals(lPredVal, rPredVal, lComb, rComb, nttLst, n);
        if (imp < impBest) { improvementCode = 1; impBest = imp; }
        // 2
        rComb.RemoveAt(0);
        lComb.Add(p.ValueSd.Keys[1]);
        imp = fillPredVals(lPredVal, rPredVal, lComb, rComb, nttLst, n);
        if (imp < impBest) { improvementCode = 2; impBest = imp; }
        // 3
        rComb.RemoveAt(rComb.Count - 1);
        lComb.Add(p.ValueSd.Keys[0]);
        imp = fillPredVals(lPredVal, rPredVal, lComb, rComb, nttLst, n);
        if (imp < impBest) { improvementCode = 3; impBest = imp; }
        //Restore the arrangement that produced impBest (state 3 needs no undo).
        if (improvementCode == 0) {
            rComb.Add(lComb[lComb.Count - 2]);
            lComb.RemoveAt(lComb.Count - 2);
        }
        if (improvementCode == 1) {
            lComb.RemoveAt(lComb.Count - 1);
            lComb.RemoveAt(lComb.Count - 1);
            rComb.Add(p.ValueSd.Keys[0]);
            rComb.Add(p.ValueSd.Keys[1]);
        }
        if (improvementCode == 2) {
            lComb.RemoveAt(lComb.Count - 1);
            rComb.RemoveAt(rComb.Count - 1);
            lComb.Add(p.ValueSd.Keys[1]);
            rComb.Add(p.ValueSd.Keys[0]);
        }
        #endregion enhanced progressive phase 1
        #region enhanced progressive phase 2
        //Phase 2: try moving each value across, keeping any move that lowers impBest.
        if (Def.ClfOptimisationLevelForCatSearch >= 2) {
            impBeforePhase2 = imp;
            int lCombCount = lComb.Count;
            for (int lidx = 0; lidx < lCombCount; ++lidx) {
                rComb.Add(lComb[0]);
                lComb.RemoveAt(0);
                imp = fillPredVals(lPredVal, rPredVal, lComb, rComb, nttLst, n);
                if (imp < impBest) {
                    impBest = imp;
                } else {
                    //Undo the move (also taken when imp is NaN).
                    lComb.Add(rComb[rComb.Count - 1]);
                    rComb.RemoveAt(rComb.Count - 1);
                }
            }
            int rCombCount = rComb.Count;
            for (int ridx = 0; ridx < rCombCount; ++ridx) {
                lComb.Add(rComb[0]);
                rComb.RemoveAt(0);
                imp = fillPredVals(lPredVal, rPredVal, lComb, rComb, nttLst, n);
                if (imp < impBest) {
                    impBest = imp;
                } else {
                    rComb.Add(lComb[lComb.Count - 1]);
                    lComb.RemoveAt(lComb.Count - 1);
                }
            }
        }//If optimisation >=2
        #endregion phase 2
    }//If optimisation >=1
    if (Double.IsNaN(impBest)) {
        p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
        p.ImpUniMin = Double.NaN;
        p.Gain = Double.NaN;
        p.ChildrenGroups.ValueGroupLst[0].Clear();
        p.ChildrenGroups.ValueGroupLst[1].Clear();
        return 0;
    }
    //Publish the winning grouping.
    p.ChildrenGroups.ValueGroupLst[0].Clear();
    p.ChildrenGroups.ValueGroupLst[1].Clear();
    foreach(string s in lComb){
        p.ChildrenGroups.ValueGroupLst[0].Add(s);
    }
    foreach(string s in rComb){
        p.ChildrenGroups.ValueGroupLst[1].Add(s);
    }
    imp = impBest;
    p.ImpUniMin = imp;
    p.Gain = (n.Imp - imp) * 100 / n.Imp;
    p.Gain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
    return p.Gain;
}
//It the number of variables is too large it can't compute
//NOTE(review): a large fully commented-out method (KILLED_MinImpCatRandom, a
//random-partition variant of this search) was removed here as dead code; an
//equivalent live copy survives below as Killed_MinRandomImp.
//Sets the best p.Gain, valueGroup.ValueGroupLst[0] (left child) and valueGroup.ValueGroupLst[1] (right child)
/// <summary>
/// Exhaustive search over every 2-partition of the categorical predictor's values
/// (encoded as bitmasks 1..2^(valueCount-1)-1), keeping the partition with the
/// lowest row-weighted child impurity. Refuses to run when valueCount exceeds
/// Def.ClfMaxNumberOfValuesForFullSearch. Side effects: sets p.ChildrenGroups,
/// p.SplitStatus, p.ImpUniMin, p.Gain and n.DescendentImpPreCalculated.
/// Returns p.Gain, or 0 when no valid split exists.
/// </summary>
public static double MinImpCatFullSearch(NodeTargetCategorical n, Predictor p) {
    //For each possible partition gets the min Imp
    int valueCount = p.DistinctValuesCount;
    if (valueCount > Def.ClfMaxNumberOfValuesForFullSearch) {
        p.SplitStatus = Predictor.SplitStatusEnum.TooManyValuesToSearch;
        p.Gain = 0;
        return 0;
    }
    int partitionCount, pos, instanceI;
    uint c, i, bestPartition = 0; //bestPartition == 0 means "no feasible partition found"
    double minImp = Double.NaN, imp = Double.NaN, lImp = Double.NaN, rImp = Double.NaN;
    int nodeLfItemCount, nodeRtItemCount;
    List<NTT> nttLst;
    p.SplitStatus = Predictor.SplitStatusEnum.CanBeUsed;
    n.DescendentImpPreCalculated = new List<double>(2);
    n.DescendentImpPreCalculated.Add(0);
    n.DescendentImpPreCalculated.Add(0);
    List<string> lComb = new List<string>(valueCount);
    List<string> rComb = new List<string>(valueCount);
    SortedList<string, int> lPredVal = new SortedList<string, int>();
    SortedList<string, int> rPredVal = new SortedList<string, int>();
    //Per (target class, predictor value) frequency table for this node's rows.
    string sql =
        @"SELECT ALL " +
        " count(*), " +
        Def.DbBsTb + "." + p.Variable.Name + ", " +
        Def.DbBsTb + "." + Def.Schema.Target.Name + " " +
        "FROM " + Def.DbBsTb + "," + Def.DbTrTb + n.Id + " " +
        "WHERE " + Def.DbBsTb + "." + Def.DbTableIdName + " = " +
        Def.DbTrTb + n.Id + "." + Def.DbTableIdName + " AND " +
        Def.DbBsTb + "." + p.Variable.Name + " IS NOT NULL " +
        "GROUP BY " + Def.DbBsTb + "." + Def.Schema.Target.Name + ", " +
        Def.DbBsTb + "." + p.Variable.Name + " ";
    nttLst = Def.Db.GetNTTLst(sql);
    int instanceCount = n.Table.RowCount;
    partitionCount = (int)(Math.Pow(2, valueCount - 1) - 1);
    for (i = 1; i <= partitionCount; ++i) { //Enumerates all the possible partition but the empty
        pos = 0;
        lComb.Clear(); rComb.Clear();
        lPredVal.Clear(); rPredVal.Clear();
        nodeLfItemCount = nodeRtItemCount = 0;
        //Decode bitmask i: bit set -> value goes left, otherwise right.
        for (c = 1; c <= partitionCount + 1; c *= 2) {
            if ((i & c) == c) // i & c == c
                lComb.Add(p.ValueSd.Keys[pos]); //if the 'case' is in, put it on the left
            else
                rComb.Add(p.ValueSd.Keys[pos]);//else, in the right side
            ++pos;
        }
        //Accumulate class frequencies and row counts for each side.
        for (instanceI = 0; instanceI < nttLst.Count; ++instanceI) {
            foreach (string ls in lComb) {
                if (nttLst[instanceI].T0 == ls) {
                    if (!lPredVal.ContainsKey(nttLst[instanceI].T1))
                        lPredVal.Add(nttLst[instanceI].T1, (int)nttLst[instanceI].N);
                    else
                        lPredVal[nttLst[instanceI].T1] += (int)nttLst[instanceI].N;
                    nodeLfItemCount += (int)nttLst[instanceI].N;
                    break;
                }
            }
            foreach (string rs in rComb) {
                if (nttLst[instanceI].T0 == rs) {
                    if (!rPredVal.ContainsKey(nttLst[instanceI].T1))
                        rPredVal.Add(nttLst[instanceI].T1, (int)nttLst[instanceI].N);
                    else
                        rPredVal[nttLst[instanceI].T1] += (int)nttLst[instanceI].N;
                    nodeRtItemCount += (int)nttLst[instanceI].N;
                    break;
                }
            }
        }
        //Only partitions whose children both satisfy the minimum case count compete.
        if (nodeLfItemCount >= Def.TreeMinNumberOfCasesPerNode && nodeRtItemCount >= Def.TreeMinNumberOfCasesPerNode) {
            lImp = ImpCat(lPredVal, nodeLfItemCount);
            rImp = ImpCat(rPredVal, nodeRtItemCount);
            if (Double.IsNaN(minImp)) {
                bestPartition = i;
                minImp = (double)(nodeLfItemCount) / instanceCount * lImp + (double)nodeRtItemCount / instanceCount * rImp;
                n.DescendentImpPreCalculated[0] = lImp;
                n.DescendentImpPreCalculated[1] = rImp;
            } else {
                imp = (double)(nodeLfItemCount) / instanceCount * lImp + (double)nodeRtItemCount / instanceCount * rImp;
                if (imp < minImp) {
                    minImp = imp;
                    bestPartition = i;
                    n.DescendentImpPreCalculated[0] = lImp;
                    n.DescendentImpPreCalculated[1] = rImp;
                }
            }
        }
    }
    if (bestPartition == 0) {
        p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
        p.Gain = 0;
        return 0;
    }
    //Set the possible children
    ValueGroup valueGroup = new ValueGroup(p, 2);
    p.ChildrenGroups = valueGroup;
    pos = 0;
    //Re-decode the winning bitmask into the two value groups.
    for (c = 1; c <= partitionCount + 1; c *= 2) {
        if ((bestPartition & c) == c)
            valueGroup.AddValueFromIndex(pos, 0); //if the 'case' is in put it on the left
        else
            valueGroup.AddValueFromIndex(pos, 1);//else, in the right side
        ++pos;
    }
    p.ImpUniMin = minImp;
    p.Gain = (n.Imp - minImp) * 100 / n.Imp;
    p.Gain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
    return p.Gain;
}
/// <summary>
/// Creates the split-properties dialog for a continuous predictor, remembering
/// the node and predictor being edited.
/// </summary>
public FrmContinuousSplitProperties(Node _node, Predictor _pred) {
    InitializeComponent();
    this.node = _node;
    this.pred = _pred;
}
//NOTE(review): a fully commented-out duplicate of MinImpCont (the live version
//appears earlier in this file) was removed from here as dead code.
/// <summary>
/// Random-sampling variant of the categorical split search: evaluates up to 4095
/// randomly chosen 2-partitions (bitmask decoded via Fcn.Decimal2BinaryStr) and
/// keeps the one with the lowest row-weighted child impurity. The "Killed_" prefix
/// suggests this method is retired; kept for reference. Side effects: sets
/// p.ChildrenGroups, p.SplitStatus, p.Gain and n.DescendentImpPreCalculated.
/// Returns p.Gain, or 0 when no valid partition was sampled.
/// </summary>
public static double Killed_MinRandomImp(NodeTargetCategorical n, Predictor p) {
    //For some partitions gets the min Impiance
    int valueCount = p.DistinctValuesCount;
    int pos, instanceI;
    double partitionCount;
    int c, i;
    double bestPartition = 0, minImp = Double.NaN, imp = Double.NaN, lImp = Double.NaN, rImp = Double.NaN;
    int nodeLfItemCount, nodeRtItemCount;
    List<NTT> nttLst;
    string binStr = "";
    p.SplitStatus = Predictor.SplitStatusEnum.CanBeUsed;
    n.DescendentImpPreCalculated = new List<double>(2);
    n.DescendentImpPreCalculated.Add(0);
    n.DescendentImpPreCalculated.Add(0);
    List<string> lComb = new List<string>(valueCount);
    List<string> rComb = new List<string>(valueCount);
    SortedList<string, int> lPredVal = new SortedList<string, int>();
    SortedList<string, int> rPredVal = new SortedList<string, int>();
    //Per (target class, predictor value) frequency table for this node's rows.
    string sql =
        @"SELECT ALL " +
        " count(*), " +
        Def.DbBsTb + "." + p.Variable.Name + ", " +
        Def.DbBsTb + "." + Def.Schema.Target.Name + " " +
        "FROM " + Def.DbBsTb + "," + Def.DbTrTb + n.Id + " " +
        "WHERE " + Def.DbBsTb + "." + Def.DbTableIdName + " = " +
        Def.DbTrTb + n.Id + "." + Def.DbTableIdName + " AND " +
        Def.DbBsTb + "." + p.Variable.Name + " IS NOT NULL " +
        "GROUP BY " + Def.DbBsTb + "." + Def.Schema.Target.Name + ", " +
        Def.DbBsTb + "." + p.Variable.Name + " ";
    nttLst = Def.Db.GetNTTLst(sql);
    int instanceCount = n.Table.RowCount;
    partitionCount = Math.Pow(2, valueCount - 1) - 1;
    //Cap the number of sampled partitions at 4095.
    double partitionCountMax = 0;
    if (partitionCount > 4095) {
        partitionCountMax = 4095;
    } else
        partitionCountMax = partitionCount;
    List<double> partLst = new List<double>((int)partitionCountMax);
    i = 1;
    //CHECK
    //Draw the partition codes to evaluate.
    //NOTE(review): the lower bound i increases each draw — looks like an attempt to
    //avoid repeats, but duplicates are still possible; confirm the intent of RNG.GetUniform here.
    for (int t = 0; t < partitionCountMax; ++t) {
        partLst.Add((int)RNG.GetUniform(i, partitionCount));
        ++i;
    }
    for (i = 0; i < partitionCountMax; ++i) {
        pos = 0;
        //Decode the sampled partition code into a left/right bit string.
        binStr = Fcn.Decimal2BinaryStr(partLst[(int)i]);
        lComb.Clear(); rComb.Clear();
        lPredVal.Clear(); rPredVal.Clear();
        nodeLfItemCount = nodeRtItemCount = 0;
        for (c = 0; c < p.ValueSd.Count; ++c) {
            if (binStr[binStr.Length - 1 - c] == '1')
                lComb.Add(p.ValueSd.Keys[pos]); //if the 'case' is in, put it on the left
            else
                rComb.Add(p.ValueSd.Keys[pos]);//else, in the right side
            ++pos;
        }
        //Accumulate class frequencies and row counts for each side.
        for (instanceI = 0; instanceI < nttLst.Count; ++instanceI) {
            foreach (string ls in lComb) {
                if (nttLst[instanceI].T0 == ls) {
                    if (!lPredVal.ContainsKey(nttLst[instanceI].T1))
                        lPredVal.Add(nttLst[instanceI].T1, (int)nttLst[instanceI].N);
                    else
                        lPredVal[nttLst[instanceI].T1] += (int)nttLst[instanceI].N;
                    nodeLfItemCount += (int)nttLst[instanceI].N;
                    break;
                }
            }
            foreach (string rs in rComb) {
                if (nttLst[instanceI].T0 == rs) {
                    if (!rPredVal.ContainsKey(nttLst[instanceI].T1))
                        rPredVal.Add(nttLst[instanceI].T1, (int)nttLst[instanceI].N);
                    else
                        rPredVal[nttLst[instanceI].T1] += (int)nttLst[instanceI].N;
                    nodeRtItemCount += (int)nttLst[instanceI].N;
                    break;
                }
            }
        }
        //Only partitions whose children both satisfy the minimum case count compete.
        if (nodeLfItemCount >= Def.TreeMinNumberOfCasesPerNode && nodeRtItemCount >= Def.TreeMinNumberOfCasesPerNode) {
            lImp = ImpCat(lPredVal, nodeLfItemCount);
            rImp = ImpCat(rPredVal, nodeRtItemCount);
            if (Double.IsNaN(minImp)) {
                bestPartition = partLst[(int)i];
                minImp = (double)(nodeLfItemCount) / instanceCount * lImp + (double)nodeRtItemCount / instanceCount * rImp;
                n.DescendentImpPreCalculated[0] = lImp;
                n.DescendentImpPreCalculated[1] = rImp;
            } else {
                imp = (double)(nodeLfItemCount) / instanceCount * lImp + (double)nodeRtItemCount / instanceCount * rImp;
                if (imp < minImp) {
                    minImp = imp;
                    bestPartition = partLst[(int)i];
                    n.DescendentImpPreCalculated[0] = lImp;
                    n.DescendentImpPreCalculated[1] = rImp;
                }
            }
        }
    }
    if (bestPartition == 0) {
        p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
        p.Gain = 0;
        return 0;
    }
    //Set the possible children
    ValueGroup valueGroup = new ValueGroup(p, 2);
    p.ChildrenGroups = valueGroup;
    pos = 0;
    //Re-decode the winning partition into the two value groups.
    binStr = Fcn.Decimal2BinaryStr((double)bestPartition);
    for (c = 0; c < p.ValueSd.Count; ++c) {
        // if ((bestPartition & c) == c)
        if (binStr[binStr.Length - 1 - c] == '1')
            valueGroup.AddValueFromIndex(pos, 0); //if the 'case' is in put it on the left
        else
            valueGroup.AddValueFromIndex(pos, 1);//else, in the right side
        ++pos;
    }
    p.Gain = (n.Imp - minImp) * 100 / n.Imp;
    p.Gain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
    return p.Gain;
}
/// <summary>
/// Returns variance(target) * row count over the rows of <paramref name="node"/> whose
/// value of <paramref name="pred"/>'s variable is one of <paramref name="vals"/>.
/// </summary>
/// <param name="vals">Categorical values selecting the partition side.</param>
/// <param name="node">Node whose reference table restricts the rows.</param>
/// <param name="pred">Predictor whose variable is matched against <paramref name="vals"/>.</param>
/// <returns>variance * count (COALESCEd to 0 when the variance is undefined).</returns>
/// NOTE(review): with an empty vals list the WHERE clause is left unterminated
/// (no closing parenthesis) — assumed never called that way; confirm with callers.
public static double VarianceTimesCount(List<string> vals, Node node, Predictor pred) {
    // Build "<var>='v1'  or <var>='v2' )" — the matching '(' is emitted in the WHERE below.
    string predicate = "";
    for (int idx = 0; idx < vals.Count; ++idx) {
        predicate += pred.Variable.Name + "='" + vals[idx] + "' ";
        predicate += (idx == vals.Count - 1) ? ")" : " or ";
    }
    string sql =
        @"SELECT " +
        "COALESCE(variance(" + Def.Tree.Schema.Target.Name + "), 0), count(*) " +
        "FROM " + Def.DbBsTb + " , " + Def.DbTrTb + node.Id + " " +
        "WHERE (" + Def.DbBsTb + "." + Def.DbTableIdName + "=" +
        Def.DbTrTb + node.Id + "." + Def.DbTableIdName + ") and (" + predicate;
    // Row layout: [0] = variance of the target, [1] = number of rows.
    List<double> resultRow = Def.Db.GetNumberRowLst(sql);
    return resultRow[0] * resultRow[1];
}
//============================================================================================================
//============================================================================================================
/// <summary>
/// Weighted average of the left/right sample variances of the target for one candidate
/// partition of a categorical predictor. Rows whose value is in <paramref name="leftLst"/>
/// go left; all others go right.
/// </summary>
/// <param name="leftLst">Predictor values assigned to the left child.</param>
/// <param name="dataLst">Aggregated rows: N0 = target value, N1 = frequency, T = predictor value.</param>
/// <returns>The weighted variance, or Double.NaN when either side has fewer than
/// Def.TreeMinNumberOfCasesPerNode cases (i.e. the variance cannot be used).</returns>
private static double PartitionInfo(List<string> leftLst, List<NNT> dataLst, Node node, Predictor pred) {
    double lSum = 0, lSumSq = 0, lCount = 0;
    double rSum = 0, rSumSq = 0, rCount = 0;
    foreach (NNT row in dataLst) {
        double dep = row.N0;   // target value
        double freq = row.N1;  // how many rows share this (target, predictor) pair
        if (leftLst.Contains(row.T)) {
            lSum += dep * freq;
            lSumSq += dep * dep * freq;
            lCount += freq;
        } else {
            rSum += dep * freq;
            rSumSq += dep * dep * freq;
            rCount += freq;
        }
    }
    // Reject partitions that would create an undersized child node.
    if (lCount < Def.TreeMinNumberOfCasesPerNode || rCount < Def.TreeMinNumberOfCasesPerNode) {
        return double.NaN;
    }
    // Sample variance via the sum / sum-of-squares identity, per side.
    double lVariance = ((lSumSq * lCount) - (lSum * lSum)) / (lCount * (lCount - 1));
    double rVariance = ((rSumSq * rCount) - (rSum * rSum)) / (rCount * (rCount - 1));
    return ((lVariance * lCount) + (rVariance * rCount)) / (lCount + rCount);
}
//============================================================================================================ //============================================================================================================ //Sets the best p.SplitValue p.Gain //public static double MinInfoCatHeuristicSLOW(NodeTargetContinuous n, Predictor p) { // int i, bestPartitionSplitPoint = 0; // // int dfd; //delete // double minVar = 0, var, lVar, rVar, instanceCount = n.Table.RowCount; // lVar = rVar = minVar = var = 0; // List<NNT> nntLst; // List<string> left = new List<string>(); // List<string> right = new List<string>(); // List<string> leftBest = new List<string>(); // List<string> rightBest = new List<string>(); // //Tries each partition: // //AvsLst[i].N0 = y // //AvsLst[i].N1 = Value of the dependent varible // //AvsLst[i].N2 = Frequency of y // //AvsLst[i].N3 = Total of distinct registries until that row // string sql = // @"SELECT count(*), " + // "0, " + // Def.DbBsTb + "." + p.Variable.Name + " " + // "FROM " // + Def.DbBsTb + "," + Def.DbTrTb + n.Id + " " + // "WHERE " // + Def.DbBsTb + "." + Def.DbTableIdName + " = " + // Def.DbTrTb + n.Id + "." + Def.DbTableIdName + " " + // "GROUP BY " + // Def.DbBsTb + "." + p.Variable.Name + " " + // "ORDER BY avg(" // + Def.DbBsTb + "." 
+ Def.Schema.Target.Name + ")"; // nntLst = Def.Db.GetNNTLst(sql); // //N1 is the number of registries until a given row // left.Add(nntLst[0].T); // leftBest.Add(nntLst[0].T); // if (nntLst.Count > 0) { // nntLst[0].N1 = nntLst[0].N0; // for (i = 1; i < nntLst.Count; ++i) { // nntLst[i].N1 = nntLst[i - 1].N1 + nntLst[i].N0; // right.Add(nntLst[i].T); // rightBest.Add(nntLst[i].T); // } // } // n.DescendentImpPreCalculated = new List<double>(2); // n.DescendentImpPreCalculated.Add(0); // n.DescendentImpPreCalculated.Add(0); // p.SplitStatus = Predictor.SplitStatusEnum.CanBeUsed; // if (nntLst.Count == 0) { // p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases; // p.Gain = 0; // return 0; // } // for (i = 0; i < nntLst.Count - 1; ++i) { // //if(i==12 && p.Variable.Name=="apache_2") // // dfd=8; // lVar = Fcn.InfoTimesCount(left, n, p); // rVar = Fcn.InfoTimesCount(right, n, p); // if (i == 0) { // minVar = (lVar + rVar) / instanceCount; // n.DescendentImpPreCalculated[0] = lVar; // n.DescendentImpPreCalculated[1] = rVar; // } else { // var = (lVar + rVar) / instanceCount; // if (var < minVar) { // minVar = var; // bestPartitionSplitPoint = i; // n.DescendentImpPreCalculated[0] = lVar; // n.DescendentImpPreCalculated[1] = rVar; // leftBest.Clear(); // rightBest.Clear(); // foreach (string s in left) // leftBest.Add(s); // foreach (string s in right) // rightBest.Add(s); // } // } // left.Add(right[0]); // right.RemoveAt(0); // } // p.ChildrenGroups.ValueLst[0].Clear(); // p.ChildrenGroups.ValueLst[1].Clear(); // //Add left node values // for (i = 0; i <= bestPartitionSplitPoint; ++i) // p.ChildrenGroups.ValueLst[0].Add(p.CaseSd.IndexOfKey(nntLst[i].T)); // //Add right node values // for (i = bestPartitionSplitPoint + 1; i < nntLst.Count; ++i) // p.ChildrenGroups.ValueLst[1].Add(p.CaseSd.IndexOfKey(nntLst[i].T)); // p.Gain = (n.Imp - minVar) * 100 / n.Imp; // return p.Gain; //} 
////============================================================================================================ ////============================================================================================================ //Sets the best p.SplitValue p.Gain public static double MinInfoCont(NodeTargetContinuous n, Predictor p) { int i, leftRowCount = 0, rightRowCount = 0; // int dfd; //delete double minVar, var, lVar, rVar, instanceCount = n.Table.RowCount; lVar = rVar = minVar = var = double.NaN; List<N4> AvsLst; List<int> thresholdIndexLst; //Tries each partition: //AvsLst[i].N0 = y //AvsLst[i].N1 = Value of the dependent varible //AvsLst[i].N2 = Frequency of y //AvsLst[i].N3 = Total of distinct registries until that row string sql = @"SELECT ALL " + Def.DbBsTb + "." + Def.Schema.Target.Name + ", " + Def.DbBsTb + "." + p.Variable.Name + ", " + " count(" + Def.DbBsTb + "." + p.Variable.Name + "), 0 " + "FROM " + Def.DbBsTb + "," + Def.DbTrTb + n.Id + " " + "WHERE " + Def.DbBsTb + "." + Def.DbTableIdName + " = " + Def.DbTrTb + n.Id + "." + Def.DbTableIdName + " " + " AND " + Def.DbBsTb + "." + p.Variable.Name + " IS NOT NULL " + "GROUP BY " + Def.DbBsTb + "." + Def.Schema.Target.Name + ", " + Def.DbBsTb + "." + p.Variable.Name + " " + "ORDER BY " + Def.DbBsTb + "." 
+ p.Variable.Name; AvsLst = Def.Db.GetN4Lst(sql); //N3 is the number of registries until a given row if (AvsLst.Count > 0) { AvsLst[0].N3 = AvsLst[0].N2; for (i = 1; i < AvsLst.Count; ++i) AvsLst[i].N3 = AvsLst[i - 1].N3 + AvsLst[i].N2; } thresholdIndexLst = Fcn.SetPossibleThresholdIndexLst(AvsLst); if (thresholdIndexLst.Count == 0) { p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases; p.Gain = 0; return 0; } n.DescendentImpPreCalculated = new List<double>(2); n.DescendentImpPreCalculated.Add(0); n.DescendentImpPreCalculated.Add(0); p.SplitStatus = Predictor.SplitStatusEnum.CanBeUsed; if (AvsLst.Count == 0) { p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases; p.Gain = 0; return 0; } for (i = 0; i < thresholdIndexLst.Count; ++i) { lVar = Info(0, thresholdIndexLst[i], AvsLst, out leftRowCount); rVar = Info(thresholdIndexLst[i] + 1, AvsLst.Count - 1, AvsLst, out rightRowCount); if (Double.IsNaN(minVar) && leftRowCount >= Def.TreeMinNumberOfCasesPerNode && rightRowCount >= Def.TreeMinNumberOfCasesPerNode) { var = (leftRowCount * lVar + rightRowCount * rVar) / (leftRowCount + rightRowCount); if (Double.IsNaN(var) == false) { minVar = var; p.SplitValue = AvsLst[thresholdIndexLst[i]].N1; n.DescendentImpPreCalculated[0] = lVar; n.DescendentImpPreCalculated[1] = rVar; } } else { var = (leftRowCount * lVar + rightRowCount * rVar) / (leftRowCount + rightRowCount); if (var < minVar && !Double.IsNaN(var) && leftRowCount >= Def.TreeMinNumberOfCasesPerNode && rightRowCount >= Def.TreeMinNumberOfCasesPerNode) { minVar = var; p.SplitValue = AvsLst[thresholdIndexLst[i]].N1; n.DescendentImpPreCalculated[0] = lVar; n.DescendentImpPreCalculated[1] = rVar; } } } if (Double.IsNaN(minVar)) { p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases; p.Gain = 0; return 0; } p.Gain = (n.Imp - minVar) * 100 / n.Imp; p.Gain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount; return p.Gain; }
//============================================================================================================
//============================================================================================================
//Sets the best p.SplitValue p.Gain
//Used to be: public static double MinInfoCatHeuristic(NodeTargetContinuous n, Predictor p)
//(The dead commented-out MinInfoContSQL variant that lived here was removed; see version control.)
/// <summary>
/// Finds the best two-way grouping of categorical predictor <paramref name="p"/> for the
/// continuous target of <paramref name="n"/>, using the classic heuristic: order the
/// predictor's values by the average target value, then try only the ordered split points.
/// Sets p.ChildrenGroups, p.SplitStatus and p.Gain.
/// </summary>
/// <returns>p.Gain (percentage impurity reduction, scaled down by the null-value fraction),
/// or 0 when no valid split exists (p.SplitStatus is then NotEnoughCases).</returns>
public static double MinInfoCat(NodeTargetContinuous n, Predictor p) {
    int i, bestPartitionSplitPoint = 0;
    double minVar = double.NaN, partitionInfo;
    List<string> valLst;
    List<NNT> DepIndepLst;
    List<string> left = new List<string>();
    // Distinct predictor values, ordered by the average of the target.
    string sqlAverage =
        @"SELECT " + Def.DbBsTb + "." + p.Variable.Name + " " +
        "FROM " + Def.DbBsTb + "," + Def.DbTrTb + n.Id + " " +
        "WHERE " + Def.DbBsTb + "." + Def.DbTableIdName + " = " +
        Def.DbTrTb + n.Id + "." + Def.DbTableIdName + " " +
        "AND " + Def.DbBsTb + "." + p.Variable.Name + " IS NOT NULL " +
        "GROUP BY " + Def.DbBsTb + "." + p.Variable.Name + " " +
        "ORDER BY avg(" + Def.DbBsTb + "." + Def.Schema.Target.Name + ")";
    valLst = Def.Db.GetTextLst(sqlAverage);
    if (valLst.Count == 0) {
        p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
        p.Gain = 0;
        return 0;
    }
    // Frequency of every (target, predictor) pair, consumed by PartitionInfo.
    string sqlDepVar =
        @"SELECT " + Def.DbBsTb + "." + Def.Schema.Target.Name + ", " +
        "count(*), " + Def.DbBsTb + "." + p.Variable.Name + " " +
        "FROM " + Def.DbBsTb + "," + Def.DbTrTb + n.Id + " " +
        "WHERE " + Def.DbBsTb + "." + Def.DbTableIdName + " = " +
        Def.DbTrTb + n.Id + "." + Def.DbTableIdName + " " +
        "AND " + Def.DbBsTb + "." + p.Variable.Name + " IS NOT NULL " +
        "GROUP BY " + Def.DbBsTb + "." + Def.Schema.Target.Name + ", " +
        Def.DbBsTb + "." + p.Variable.Name;
    DepIndepLst = Def.Db.GetNNTLst(sqlDepVar);
    p.SplitStatus = Predictor.SplitStatusEnum.CanBeUsed;
    // Grow the left group one ordered value at a time, keeping the split with the
    // lowest weighted variance. (BUG FIX: the duplicated first-iteration branch was
    // consolidated into one equivalent condition.)
    left.Add(valLst[0]);
    for (i = 0; i < valLst.Count - 1; ++i) {
        partitionInfo = PartitionInfo(left, DepIndepLst, n, p);
        if (Double.IsNaN(partitionInfo) == false && (Double.IsNaN(minVar) || partitionInfo < minVar)) {
            minVar = partitionInfo;
            bestPartitionSplitPoint = i;
        }
        left.Add(valLst[i + 1]);
    }
    if (Double.IsNaN(minVar)) {
        p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
        p.Gain = 0;
        return 0;
    }
    // BUG FIX: ChildrenGroups may still be null here (see FrmCategoricalSplitProperties,
    // which warns on that case); create it first, consistent with MinImpCat.
    if (p.ChildrenGroups == null) {
        p.ChildrenGroups = new ValueGroup(p, 2);
    }
    p.ChildrenGroups.ValueGroupLst[0].Clear();
    p.ChildrenGroups.ValueGroupLst[1].Clear();
    //Add left node values
    for (i = 0; i <= bestPartitionSplitPoint; ++i) {
        p.ChildrenGroups.ValueGroupLst[0].Add(valLst[i]);
    }
    //Add right node values
    for (i = bestPartitionSplitPoint + 1; i < valLst.Count; ++i) {
        p.ChildrenGroups.ValueGroupLst[1].Add(valLst[i]);
    }
    // Percentage impurity reduction, penalised by the fraction of null predictor values.
    p.Gain = (n.Imp - minVar) * 100 / n.Imp;
    p.Gain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
    return p.Gain;
}
/// <summary>
/// Creates one Predictor per schema predictor variable for <paramref name="node"/> and
/// fills its per-node statistics from the database: value frequencies and null count for
/// categorical variables; min/max, distinct-value count and null count for continuous ones.
/// Appends each Predictor to node.PredictorLst and to PredCatLst / PredNumLst.
/// </summary>
/// <param name="node">Node whose reference table (DbTrTb + node.Id) restricts the rows.</param>
/// <returns>true when every predictor was filled; false when a database error occurred.</returns>
public bool PredictorsFill(Node node) {
    Predictor pred = null;
    OdbcCommand cmd = null;
    OdbcDataReader reader = null;
    string qry = "";
    bool r = false;
    if (con.State != ConnectionState.Open) con.Open();
    if (con.State != ConnectionState.Open) {
        MessageBox.Show("Could not open the connection", "Error");
    }
    try {
        foreach (SchemaVariable predVar in Def.Schema.PredictorLst) {
            pred = new Predictor(predVar, node, node.PredictorLst.Count);
            if (predVar.VariableTypeUserSet == SchemaVariable.VariableTypeEnum.Categorical) {
                //Distinct values of the variable and their frequencies
                qry = @"SELECT " + predVar.Name + ", count(" + predVar.Name + ") " +
                    "FROM " + Def.DbBsTb + ", " + Def.DbTrTb + node.Id + " " +
                    "WHERE " + Def.DbBsTb + "." + Def.DbTableIdName + "=" +
                    Def.DbTrTb + node.Id + "." + Def.DbTableIdName + " " +
                    " and " + predVar.Name + " IS NOT NULL " +
                    "GROUP BY " + predVar.Name;
                cmd = new OdbcCommand(qry, con);
                reader = cmd.ExecuteReader();
                while (reader.Read()) {
                    pred.ValueSd.Add(Convert.ToString(reader[0]), Convert.ToInt32(reader[1]));
                }
                reader.Close(); //BUG FIX: release the cursor before the next command on this connection
                //Number of rows with a null value for this variable
                //BUG FIX: a space was missing between COUNT(*) and FROM in the generated SQL
                qry = @"SELECT COUNT(*) " +
                    "FROM " + Def.DbBsTb + ", " + Def.DbTrTb + node.Id + " " +
                    "where " + Def.DbBsTb + "." + Def.DbTableIdName + "=" +
                    Def.DbTrTb + node.Id + "." + Def.DbTableIdName + " " +
                    " and " + predVar.Name + " IS NULL";
                cmd = new OdbcCommand(qry, con);
                reader = cmd.ExecuteReader();
                reader.Read();
                pred.NullCount = Convert.ToInt32(reader[0]);
                reader.Close();
                node.PredictorLst.Add(pred);
                node.PredCatLst.Add(pred);
            } else {
                ////////////////////////////////////////////
                /// CAN BE REDUCED TO ONLY ONE DATABASE PASS, JUST SELECT ALL VARIABLES AT THE SAME TIME (BELOW)
                ////////////////////////////////////////////
                qry = @"SELECT COALESCE(MIN(" + predVar.Name + "),0), COALESCE(MAX(" + predVar.Name + "), 0) " +
                    "FROM " + Def.DbBsTb + ", " + Def.DbTrTb + node.Id + " " +
                    "WHERE " + Def.DbBsTb + "." + Def.DbTableIdName + "=" +
                    Def.DbTrTb + node.Id + "." + Def.DbTableIdName +
                    " and " + predVar.Name + " IS NOT NULL ";
                cmd = new OdbcCommand(qry, con);
                reader = cmd.ExecuteReader();
                reader.Read();
                pred.SetLowerAndHigher(Convert.ToDouble(reader[0]), Convert.ToDouble(reader[1]));
                reader.Close(); //BUG FIX: release the cursor before the next command
                //Number of distinct values
                //BUG FIX: a space was missing between the COUNT expression and FROM
                qry = @"SELECT COUNT(DISTINCT " + predVar.Name + ") " +
                    "FROM " + Def.DbBsTb + ", " + Def.DbTrTb + node.Id + " " +
                    "WHERE " + Def.DbBsTb + "." + Def.DbTableIdName + "=" +
                    Def.DbTrTb + node.Id + "." + Def.DbTableIdName +
                    " and " + predVar.Name + " IS NOT NULL";
                cmd = new OdbcCommand(qry, con);
                reader = cmd.ExecuteReader();
                reader.Read();
                pred.DistinctValuesCount = Convert.ToInt32(reader[0]);
                reader.Close();
                //Number of rows with a null value for this variable
                //BUG FIX: a space was missing between COUNT(*) and FROM
                qry = @"SELECT COUNT(*) " +
                    "FROM " + Def.DbBsTb + ", " + Def.DbTrTb + node.Id + " " +
                    "where " + Def.DbBsTb + "." + Def.DbTableIdName + "=" +
                    Def.DbTrTb + node.Id + "." + Def.DbTableIdName + " " +
                    " and " + predVar.Name + " IS NULL";
                cmd = new OdbcCommand(qry, con);
                reader = cmd.ExecuteReader();
                reader.Read();
                pred.NullCount = Convert.ToInt32(reader[0]);
                node.PredictorLst.Add(pred);
                node.PredNumLst.Add(pred);
            }
        }
        r = true;
    } catch (Exception ex) {
        FE.Show(ex.Message, "Error could not execute PredictorsFill(Node node)", ex.StackTrace);
    } finally {
        //BUG FIX: reader is null when no query ever ran (e.g. the connection failed to open);
        //the unconditional Close() threw a NullReferenceException that masked the real error.
        if (reader != null) {
            reader.Close();
        }
    }
    return r;
}