/// <summary>
/// Searches for the coefficient of one multivariate predictor column that yields the
/// lowest weighted impurity (as computed by Gini.ImpCat), keeping the constant
/// <paramref name="c"/> and the coefficients of the columns in
/// <paramref name="optimisedVariableLst"/> fixed.
/// </summary>
/// <remarks>
/// Step 1: for every row whose test-column value x is non-zero, computes the coefficient
/// value a for which a * x + (sum of Coef * X over every OTHER column in n.PredMvLst) + c
/// equals zero, and collects the distinct values in ascending order.
/// Step 2: evaluates the midpoint between each pair of consecutive values as a candidate
/// coefficient, plus one extra candidate at twice the largest value.
/// A candidate is ignored when either side of the split holds fewer than
/// Def.TreeMinNumberOfCasesPerNode rows. Ties keep the earliest candidate.
/// NOTE(review): step 1 sums over every column of n.PredMvLst while step 2 sums only over
/// optimisedVariableLst; the two agree only if the remaining columns contribute zero - confirm.
/// </remarks>
/// <param name="n">Node providing the rows (n.Table.RowCount), the multivariate columns (n.PredMvLst) and the target classes (n.MvTb.Data.TC).</param>
/// <param name="c">Constant term of the hyperplane expression.</param>
/// <param name="testPredMvIdx">Index in n.PredMvLst of the column whose coefficient is searched.</param>
/// <param name="optimisedVariableLst">Indexes in n.PredMvLst of the columns whose Coef * X terms take part in the split test.</param>
/// <param name="fg">Graph form whose ALst and AbsLst receive the collected values and the candidates; only touched while Def.ExperimentRunning is false.</param>
/// <returns>
/// Two-element array: [0] the best coefficient and [1] its weighted impurity. When no
/// candidate was accepted the array holds the sentinels -9999999999 and 9999999.
/// </returns>
public static double[] BestGainMvNumeric(NodeTargetCategorical n, double c, int testPredMvIdx, List<int> optimisedVariableLst, FrmGraph fg) {
    double bestCoef = -9999999999;      // sentinel: no acceptable candidate found
    double lowestSplitImp = 9999999;    // sentinel: no acceptable candidate found
    bool noCandidateAcceptedYet = true;

    SortedList<double, int> azLst = new SortedList<double, int>(); // distinct zero-crossing coefficients, ascending
    SortedList<string, int> leftNode = new SortedList<string, int>();
    SortedList<string, int> rightNode = new SortedList<string, int>();

    if (!Def.ExperimentRunning) {
        fg.ALst.Clear();
        fg.AbsLst.Clear();
    }

    for (int y = 0; y < n.Table.RowCount; ++y) {
        double optVarSum = 0;
        for (int i = 0; i < n.PredMvLst.Count; ++i) {
            optVarSum += n.PredMvLst[i].Coef * n.PredMvLst[i].X(y);
        }
        double v = optVarSum + c;

        // A zero test value gives no zero-crossing coefficient (it would divide by zero).
        if (n.PredMvLst[testPredMvIdx].X(y) == 0) {
            continue;
        }

        // Remove the test column's own term from v, then solve a * x + rest = 0 for a.
        double az = ((n.PredMvLst[testPredMvIdx].Coef * n.PredMvLst[testPredMvIdx].X(y)) - v) / n.PredMvLst[testPredMvIdx].X(y);
        if (!azLst.ContainsKey(az)) {
            azLst.Add(az, 1);
            if (!Def.ExperimentRunning) {
                fg.ALst.Add((float)az);
            }
        }
    }

    // Candidates half-way between consecutive zero-crossing coefficients.
    for (int azIdx = 0; azIdx < azLst.Count - 1; ++azIdx) {
        double midpoint = (azLst.Keys[azIdx] + azLst.Keys[azIdx + 1]) / 2;
        EvaluateNumericCoefCandidate(n, c, testPredMvIdx, optimisedVariableLst, fg, midpoint,
            leftNode, rightNode, ref noCandidateAcceptedYet, ref lowestSplitImp, ref bestCoef);
    }

    // One extra candidate derived from the largest value.
    // NOTE(review): doubling a negative largest value moves the candidate below it,
    // not beyond it - confirm this is intended.
    if (azLst.Count > 1) {
        double beyondLast = azLst.Keys[azLst.Count - 1] * 2;
        EvaluateNumericCoefCandidate(n, c, testPredMvIdx, optimisedVariableLst, fg, beyondLast,
            leftNode, rightNode, ref noCandidateAcceptedYet, ref lowestSplitImp, ref bestCoef);
    }

    return new double[] { bestCoef, lowestSplitImp };
}

/// <summary>
/// Splits every row of the node with the candidate coefficient, and records the candidate
/// as the best one when both sides are large enough and its weighted impurity is the first
/// accepted or strictly lower than the best so far.
/// </summary>
private static void EvaluateNumericCoefCandidate(NodeTargetCategorical n, double c, int testPredMvIdx, List<int> optimisedVariableLst, FrmGraph fg, double aSplit, SortedList<string, int> leftNode, SortedList<string, int> rightNode, ref bool noCandidateAcceptedYet, ref double lowestSplitImp, ref double bestCoef) {
    if (!Def.ExperimentRunning) {
        fg.AbsLst.Add((float)aSplit);
    }

    leftNode.Clear();
    rightNode.Clear();
    int leftCount = 0;
    int rightCount = 0;

    for (int y = 0; y < n.Table.RowCount; ++y) {
        double xts = n.PredMvLst[testPredMvIdx].X(y);
        double optVarSum = 0;
        for (int i = 0; i < optimisedVariableLst.Count; ++i) {
            optVarSum += n.PredMvLst[optimisedVariableLst[i]].Coef * n.PredMvLst[optimisedVariableLst[i]].X(y);
        }

        // Rows on or below the hyperplane go left, the others go right.
        if ((aSplit * xts + optVarSum + c) <= 0) {
            ++leftCount;
            CountTargetClass(leftNode, n.MvTb.Data.TC[y]);
        } else {
            ++rightCount;
            CountTargetClass(rightNode, n.MvTb.Data.TC[y]);
        }
    }

    if (rightCount < Def.TreeMinNumberOfCasesPerNode || leftCount < Def.TreeMinNumberOfCasesPerNode) {
        return;
    }

    double leftImp = Gini.ImpCat(leftNode, leftCount);
    double rightImp = Gini.ImpCat(rightNode, rightCount);
    double splitImp = (leftCount * leftImp + rightCount * rightImp) / n.Table.RowCount;

    if (noCandidateAcceptedYet || splitImp < lowestSplitImp) {
        noCandidateAcceptedYet = false;
        lowestSplitImp = splitImp;
        if (n.PredMvLst[testPredMvIdx].Variable.VariableTypeDetected == SchemaVariable.VariableTypeEnum.Categorical) {
            MessageBox.Show("Error n.PredMvLst[testPredMvIdx].Variable is categorical ");
        }
        bestCoef = aSplit;
    }
}

/// <summary>
/// Adds one occurrence of <paramref name="targetClass"/> to a per-class tally.
/// </summary>
private static void CountTargetClass(SortedList<string, int> classCounts, string targetClass) {
    int seen;
    if (classCounts.TryGetValue(targetClass, out seen)) {
        classCounts[targetClass] = seen + 1;
    } else {
        classCounts.Add(targetClass, 1);
    }
}
/// <summary>
/// Greedy search for a multivariate (hyperplane) split of a node: starting from the best
/// univariate split, it tries every not-yet-optimised column with all other coefficients
/// fixed, adopts the coefficient of the column that lowers the impurity the most, and
/// repeats until no column improves the impurity or the impurity reaches zero.
/// </summary>
/// <remarks>
/// Side effects: resets and rewrites Coef and Optimised on every entry of n.PredMvLst,
/// writes n.C and n.ImpBestMvSplit, sets p.Gain to n.BestSplit.Gain and appends to
/// Def.LogMessage. While Def.ExperimentRunning is false it also shows a FrmGraph (only
/// when the node has at least two multivariate columns to plot) and calls DataValidate.
/// A graph form that ends up not being shown is disposed before returning.
/// </remarks>
/// <param name="n">Node whose multivariate columns are optimised.</param>
/// <param name="p">Predictor whose NullCount scales the logged gains and whose Gain is overwritten.</param>
/// <returns>
/// p.Gain (which was set to n.BestSplit.Gain), or -1 when the chosen column is unexpectedly
/// missing from the list of non-optimised columns.
/// </returns>
public static double MinImpMvGreed(NodeTargetCategorical n, Predictor p) {
    List<int> nonOptimisedVariableLst = new List<int>(); // indexes in n.PredMvLst still to be tried
    List<int> optimisedVariableLst = new List<int>();    // indexes in n.PredMvLst already fixed; starts with the best split

    n.C = 0; // constant of the equation A1X1 + A2X2 + C = V
    double lowestImp = n.ImpBestUniSplit;
    double bestCoef = -1;
    int bestSplitIdx = -1;

    FrmGraph fg = new FrmGraph();
    bool graphShown = false;
    try {
        double bestUniGain = (n.Imp - n.ImpBestUniSplit) * 100 / n.Imp;
        bestUniGain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
        Def.LogMessage += "Starting MinImpMvGreed, smallest univariate imp= " + lowestImp + " bestUniSplit gain " + n.BestSplit.Gain + " should be equal bestUniGain " + bestUniGain + Environment.NewLine;

        SeedMvSearchFromBestUniSplit(n, nonOptimisedVariableLst, optimisedVariableLst);

        while (nonOptimisedVariableLst.Count > 0) {
            bool hadOverallImprovement = false;

            for (int nopVarIdx = 0; nopVarIdx < nonOptimisedVariableLst.Count; ++nopVarIdx) {
                int candidateIdx = nonOptimisedVariableLst[nopVarIdx];

                // r[0] = best coefficient for this column, r[1] = its impurity.
                double[] r;
                if (n.PredMvLst[candidateIdx].Variable.VariableTypeUserSet == SchemaVariable.VariableTypeEnum.Categorical) {
                    r = BestGainMvCategorical(n, n.C, candidateIdx, optimisedVariableLst);
                } else {
                    r = BestGainMvNumeric(n, n.C, candidateIdx, optimisedVariableLst, fg);
                }

                if (r[1] < lowestImp) {
                    bestCoef = r[0];
                    bestSplitIdx = candidateIdx;
                    lowestImp = r[1];
                    hadOverallImprovement = true;
                }

                if (n.PredMvLst[candidateIdx].FieldSpan > 0) {
                    Def.LogMessage += " nopVarIdx " + candidateIdx + " Name " + n.PredMvLst[candidateIdx].Variable.Name + "(" + n.PredMvLst[candidateIdx].Offset + ")";
                } else {
                    Def.LogMessage += " nopVarIdx " + candidateIdx + " Name " + n.PredMvLst[candidateIdx].Variable.Name;
                }
                Def.LogMessage += " Coefficient " + r[0] + " SplitImp " + r[1] + Environment.NewLine;
            }

            if (hadOverallImprovement) {
                // Consistency check: the winner must be one of the columns just tried.
                if (!nonOptimisedVariableLst.Contains(bestSplitIdx)) {
                    Def.LogMessage += "Error couldn't find bestSplitIdx " + bestSplitIdx + " inside nonOptimisedVariableLst " + Environment.NewLine;
                    MessageBox.Show("Error couldn't find bestSplitIdx " + bestSplitIdx + " inside nonOptimisedVariableLst ");
                    return -1;
                }
                n.PredMvLst[bestSplitIdx].Coef = bestCoef;
                n.PredMvLst[bestSplitIdx].Optimised = true;
                nonOptimisedVariableLst.Remove(bestSplitIdx);
                optimisedVariableLst.Add(bestSplitIdx);
                Def.LogMessage += "Best variable " + n.PredMvLst[bestSplitIdx].Variable.Name + n.PredMvLst[bestSplitIdx].Offset + " Coef " + bestCoef + " lowestCatSplitImp " + lowestImp + "++++++++++++++++++++" + Environment.NewLine;
            } else {
                // No column improved the impurity: flag the rest as optimised, which ends the search.
                Def.LogMessage += "No more improvement " + (optimisedVariableLst.Count) + " variables of " + n.PredMvLst.Count + " are being combined on the purity function -------------------" + Environment.NewLine;
                foreach (int predMvIdx in nonOptimisedVariableLst) {
                    optimisedVariableLst.Add(predMvIdx);
                    n.PredMvLst[predMvIdx].Optimised = true;
                }
                nonOptimisedVariableLst.Clear();
            }

            if (lowestImp == 0) {
                Def.LogMessage += "Node is now 100% pure stopping greed seach " + Environment.NewLine;
                break;
            }
        }

        n.ImpBestMvSplit = lowestImp;
        p.Gain = n.BestSplit.Gain;

        double bestMvGain = (n.Imp - n.ImpBestMvSplit) * 100 / n.Imp;
        bestMvGain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
        bestUniGain = (n.Imp - n.ImpBestUniSplit) * 100 / n.Imp;
        bestUniGain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;

        string coefStr = "";
        for (int i = 0; i < n.PredMvLst.Count; ++i) {
            if (n.PredMvLst[i].Variable.VariableTypeUserSet == SchemaVariable.VariableTypeEnum.Continuous) {
                coefStr += n.PredMvLst[i].Coef + "(" + n.PredMvLst[i].Variable.Name + ") ";
            } else {
                coefStr += n.PredMvLst[i].Coef + "(" + n.PredMvLst[i].Variable.Name + n.PredMvLst[i].Offset + ") ";
            }
        }
        coefStr += "c = " + n.C;

        // Open question from the original author: why is UniGain sometimes different to p.Gain?
        Def.LogMessage += "MvGain: " + bestMvGain + " UniGain: " + bestUniGain + " P.Gain: " + p.Gain + " MvImp: " + lowestImp + " UniImp: " + n.ImpBestUniSplit + " NdImp: " + n.Imp + Environment.NewLine + coefStr + Environment.NewLine;

        if (!Def.ExperimentRunning) {
            graphShown = ShowMvSplitGraph(n, fg, bestCoef);
            DataValidate(n, n.C);
        }

        return p.Gain;
    } finally {
        // A form that was never shown has no window to close it, so release it here.
        if (!graphShown) {
            fg.Dispose();
        }
    }
}

/// <summary>
/// Resets every multivariate column of the node and then seeds the search with the best
/// univariate split: its column(s) are moved to the optimised list and n.C is set.
/// </summary>
private static void SeedMvSearchFromBestUniSplit(NodeTargetCategorical n, List<int> nonOptimisedVariableLst, List<int> optimisedVariableLst) {
    for (int i = 0; i < n.PredMvLst.Count; ++i) {
        n.PredMvLst[i].Optimised = false;
        n.PredMvLst[i].Coef = Def.AbsentCoefficientValue;
        nonOptimisedVariableLst.Add(i);
    }

    if (n.BestSplit.Variable.VariableTypeUserSet == SchemaVariable.VariableTypeEnum.Continuous) {
        // Continuous best split: coefficient 1 on its column, constant = minus the normalised split value.
        nonOptimisedVariableLst.Remove(n.BestSplit.PredMvBase);
        optimisedVariableLst.Add(n.BestSplit.PredMvBase);
        n.PredMvLst[n.BestSplit.PredMvBase].Coef = 1;
        n.PredMvLst[n.BestSplit.PredMvBase].Optimised = true;
        n.C = -Fcn.NormP1(n.BestSplit.SplitValue, n.BestSplit.LowerNumber, n.BestSplit.HigherNumber);
    } else {
        // Nominal best split: the values of the first children group get Def.PresentCoefficientValue.
        n.C = 0;
        List<string> caseLst = n.PredictorLst[n.BestSplit.PredictorLstIdx].ChildrenGroups.ValueGroupLst[0];
        for (int i = 0; i < caseLst.Count; ++i) {
            // The original author was not sure this mapping is right; the dialog is their marker.
            // NOTE(review): IndexOfKey returns -1 for an unknown value, which would address
            // the column before PredMvBase - confirm every case is present in ValueSd.
            MessageBox.Show("didn't undestand bellow");
            n.PredMvLst[n.BestSplit.PredMvBase + n.PredictorLst[n.BestSplit.PredictorLstIdx].ValueSd.IndexOfKey(caseLst[i])].Coef = Def.PresentCoefficientValue;
        }

        // Every column that describes the nominal best split counts as already optimised.
        for (int i = 0; i < n.PredictorLst[n.BestSplit.PredictorLstIdx].DistinctValuesCount; ++i) {
            nonOptimisedVariableLst.Remove(n.BestSplit.PredMvBase + i);
            optimisedVariableLst.Add(n.BestSplit.PredMvBase + i);
            n.PredMvLst[n.BestSplit.PredMvBase + i].Optimised = true;
        }
    }
}

/// <summary>
/// Fills the graph form with the first two multivariate columns and the resulting
/// hyperplane, then shows it. Returns false without touching the form when the node has
/// fewer than two multivariate columns, because the plot reads columns 0 and 1.
/// </summary>
private static bool ShowMvSplitGraph(NodeTargetCategorical n, FrmGraph fg, double bestCoef) {
    if (n.PredMvLst.Count < 2) {
        return false;
    }

    for (int y = 0; y < n.Table.RowCount; ++y) {
        fg.x0Lst.Add((float)n.PredMvLst[0].X(y));
        fg.x1Lst.Add((float)n.PredMvLst[1].X(y));
        // Rows whose target class is "n" (any casing) are collected by row index.
        if (string.Equals(n.MvTb.Data.TC[y], "n", StringComparison.OrdinalIgnoreCase)) {
            fg.n.Add(y);
        }
    }

    fg.a0 = (float)n.PredMvLst[0].Coef;
    fg.a1 = (float)n.PredMvLst[1].Coef;
    fg.c = (float)n.C;
    fg.N = n;
    fg.ABest = (float)bestCoef;
    fg.Show();
    fg.Invalidate();
    return true;
}