Example #1
0
        /// <summary>
        /// Searches for the coefficient of a single multivariate predictor column
        /// (<paramref name="testPredMvIdx"/>) that gives the lowest weighted impurity,
        /// while the constant <paramref name="c"/> and the coefficients of the columns in
        /// <paramref name="optimisedVariableLst"/> are held fixed.
        ///
        /// The search works in three steps:
        /// 1. For every row whose test-column value x is non-zero, compute the "critical"
        ///    coefficient az at which that row lies exactly on the hyperplane
        ///    (az * x + all other current terms + c = 0). Distinct values are kept sorted.
        /// 2. Build the candidate coefficients: the midpoint of every pair of consecutive
        ///    critical values, plus one extra candidate equal to twice the largest one.
        /// 3. For every candidate, send each row left when
        ///    (candidate * x + sum of optimised terms + c) &lt;= 0 and right otherwise, and
        ///    score the split as (leftCount * leftImp + rightCount * rightImp) / rowCount
        ///    using Gini.ImpCat. Candidates leaving fewer than
        ///    Def.TreeMinNumberOfCasesPerNode rows on either side are skipped.
        ///
        /// When Def.ExperimentRunning is false the critical values and the candidates are
        /// also written to fg.ALst and fg.AbsLst (both are cleared first).
        /// </summary>
        /// <param name="n">Node providing the rows (n.Table), the multivariate columns (n.PredMvLst) and the target class of each row (n.MvTb.Data.TC).</param>
        /// <param name="c">Constant term of the hyperplane.</param>
        /// <param name="testPredMvIdx">Index into n.PredMvLst of the column whose coefficient is being searched.</param>
        /// <param name="optimisedVariableLst">Indexes into n.PredMvLst of the columns whose current coefficients take part in the side test.</param>
        /// <param name="fg">Graph form that receives the diagnostic value lists; only touched when Def.ExperimentRunning is false.</param>
        /// <returns>
        /// A two-element array: [0] the best coefficient found, [1] the lowest split impurity.
        /// When no candidate produces a valid split the sentinels -9999999999 and 9999999 are returned.
        /// </returns>
        public static double[] BestGainMvNumeric(NodeTargetCategorical n, double c, int testPredMvIdx, List<int> optimisedVariableLst, FrmGraph fg)
        {
            // Sentinels that are returned unchanged when no valid split is found.
            double bestCoef = -9999999999;
            double lowestSplitImp = 9999999;
            bool recordGraph = !Def.ExperimentRunning;

            if (recordGraph) {
                fg.ALst.Clear();
                fg.AbsLst.Clear();
            }

            // Step 1: distinct critical coefficients, kept sorted (the value part is unused).
            // NOTE(review): this sum runs over ALL columns of n.PredMvLst, whereas the side
            // test below only sums the columns in optimisedVariableLst. The two agree only
            // if non-optimised columns contribute 0 (e.g. their Coef is 0) - confirm.
            SortedList<double, int> azLst = new SortedList<double, int>();
            for (int y = 0; y < n.Table.RowCount; ++y) {
                double allVarSum = 0;
                for (int i = 0; i < n.PredMvLst.Count; ++i) {
                    allVarSum += n.PredMvLst[i].Coef * n.PredMvLst[i].X(y);
                }
                double v = allVarSum + c;
                double xTest = n.PredMvLst[testPredMvIdx].X(y);
                if (xTest != 0) { // rows with x == 0 are unaffected by this coefficient
                    // Removes the test column's own term from v and solves for its coefficient.
                    double az = ((n.PredMvLst[testPredMvIdx].Coef * xTest) - v) / xTest;
                    if (!azLst.ContainsKey(az)) {
                        azLst.Add(az, 1);
                        if (recordGraph)
                            fg.ALst.Add((float)az);
                    }
                }
            }

            // With fewer than two critical values no candidate is evaluated at all.
            if (azLst.Count < 2)
                return new double[] { bestCoef, lowestSplitImp };

            // Step 2: candidate coefficients, in the order they are evaluated.
            IList<double> azKeys = azLst.Keys;
            List<double> candidateLst = new List<double>(azKeys.Count);
            for (int azIdx = 0; azIdx < azKeys.Count - 1; ++azIdx) {
                candidateLst.Add((azKeys[azIdx] + azKeys[azIdx + 1]) / 2);
            }
            // NOTE(review): for a negative largest key, "* 2" lands below that key rather
            // than beyond it, and no candidate is generated below the smallest key - confirm
            // this is intended.
            candidateLst.Add(azKeys[azKeys.Count - 1] * 2);

            // The test-column value and the sum of the optimised terms depend only on the
            // row, not on the candidate, so they are computed once per row instead of once
            // per (candidate, row). Coef and X(y) are only read in this method and are
            // assumed to return the same value on every call.
            List<double> testXLst = new List<double>();
            List<double> optimisedSumLst = new List<double>();
            for (int y = 0; y < n.Table.RowCount; ++y) {
                testXLst.Add(n.PredMvLst[testPredMvIdx].X(y));
                double optimisedSum = 0;
                for (int i = 0; i < optimisedVariableLst.Count; ++i) {
                    optimisedSum += n.PredMvLst[optimisedVariableLst[i]].Coef * n.PredMvLst[optimisedVariableLst[i]].X(y);
                }
                optimisedSumLst.Add(optimisedSum);
            }

            // Step 3: score every candidate and remember the best one.
            SortedList<string, int> leftNode = new SortedList<string, int>();   // target class -> row count, left side
            SortedList<string, int> rightNode = new SortedList<string, int>();  // target class -> row count, right side
            bool infoNotYetCalculated = true; // true until the first valid split has been scored
            for (int candIdx = 0; candIdx < candidateLst.Count; ++candIdx) {
                double aSplit = candidateLst[candIdx];
                if (recordGraph)
                    fg.AbsLst.Add((float)aSplit);

                leftNode.Clear();
                rightNode.Clear();
                int leftCount = 0, rightCount = 0;
                for (int y = 0; y < testXLst.Count; ++y) {
                    string targetClass = n.MvTb.Data.TC[y];
                    SortedList<string, int> sideNode;
                    if ((aSplit * testXLst[y] + optimisedSumLst[y] + c) <= 0) {
                        ++leftCount;
                        sideNode = leftNode;
                    } else {
                        ++rightCount;
                        sideNode = rightNode;
                    }
                    // Single lookup: seenCount stays 0 when the class has not been seen yet,
                    // and the indexer setter adds the key in that case.
                    int seenCount;
                    sideNode.TryGetValue(targetClass, out seenCount);
                    sideNode[targetClass] = seenCount + 1;
                }

                if (rightCount < Def.TreeMinNumberOfCasesPerNode || leftCount < Def.TreeMinNumberOfCasesPerNode)
                    continue;

                double leftImp = Gini.ImpCat(leftNode, leftCount);
                double rightImp = Gini.ImpCat(rightNode, rightCount);
                double splitImp = (leftCount * leftImp + rightCount * rightImp) / n.Table.RowCount;

                // The first valid split is always accepted; later ones only when strictly better.
                if (infoNotYetCalculated || splitImp < lowestSplitImp) {
                    infoNotYetCalculated = false;
                    lowestSplitImp = splitImp;
                    // Sanity check: this routine is meant for numeric columns only.
                    if (n.PredMvLst[testPredMvIdx].Variable.VariableTypeDetected == SchemaVariable.VariableTypeEnum.Categorical) {
                        MessageBox.Show("Error n.PredMvLst[testPredMvIdx].Variable is categorical ");
                    }
                    bestCoef = aSplit;
                }
            }

            return new double[] { bestCoef, lowestSplitImp };
        }
Example #2
0
        /// <summary>
        /// Greedy, one-coefficient-at-a-time search for a multivariate (hyperplane) split of
        /// node <paramref name="n"/>.
        ///
        /// The search starts from the best univariate split (n.BestSplit): its column(s) are
        /// marked as optimised and n.C is initialised from it. Then, while non-optimised
        /// columns remain, every one of them is tried with BestGainMvCategorical or
        /// BestGainMvNumeric (all other coefficients fixed) and the single column giving the
        /// lowest impurity below the current one is adopted. The search stops when no column
        /// improves the impurity, when every column is optimised, or when the impurity reaches 0.
        ///
        /// Side effects: resets and updates Coef / Optimised of every entry of n.PredMvLst,
        /// writes n.C, n.ImpBestMvSplit and p.Gain, and appends progress text to Def.LogMessage.
        /// When Def.ExperimentRunning is false it also shows a FrmGraph of the first two
        /// columns (if at least two exist) and calls DataValidate.
        /// </summary>
        /// <param name="n">Node being split; provides the columns, the rows and the best univariate split.</param>
        /// <param name="p">Predictor whose Gain is written and whose NullCount scales the logged gains.</param>
        /// <returns>
        /// p.Gain, which is set to n.BestSplit.Gain; -1 if the chosen column unexpectedly is not in
        /// the non-optimised list.
        /// </returns>
        public static double MinImpMvGreed(NodeTargetCategorical n, Predictor p)
        {
            bool hadOverallImprovement;
            List<int> nonOptimisedVariableLst = new List<int>(); // indexes into n.PredMvLst still to be tried
            List<int> optimisedVariableLst = new List<int>();    // indexes into n.PredMvLst already fixed
            n.C = 0; // constant of the hyperplane equation A1X1 + A2X2 + C = V
            double lowestImp = n.ImpBestUniSplit;
            double bestCoef = -1;
            double bestMvGain;
            double bestUniGain;
            int bestSplitIdx = -1;

            // Presumably a WinForms form (it exposes Show/Invalidate); it is disposed below on
            // every path where it is not shown.
            FrmGraph fg = new FrmGraph();

            bestUniGain = (n.Imp - n.ImpBestUniSplit) * 100 / n.Imp;
            bestUniGain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;

            Def.LogMessage += "Starting MinImpMvGreed, smallest univariate imp= " + lowestImp + " bestUniSplit gain " + n.BestSplit.Gain + " should be equal bestUniGain " + bestUniGain + Environment.NewLine;

            // Every column starts as non-optimised with the "absent" coefficient.
            for (int i = 0; i < n.PredMvLst.Count; ++i) {
                n.PredMvLst[i].Optimised = false;
                n.PredMvLst[i].Coef = Def.AbsentCoefficientValue;
                nonOptimisedVariableLst.Add(i);
            }

            if (n.BestSplit.Variable.VariableTypeUserSet == SchemaVariable.VariableTypeEnum.Continuous) {
                // Continuous best split: its column gets coefficient 1 and the constant is the
                // negated, normalised split value.
                nonOptimisedVariableLst.Remove(n.BestSplit.PredMvBase);
                optimisedVariableLst.Add(n.BestSplit.PredMvBase);
                n.PredMvLst[n.BestSplit.PredMvBase].Coef = 1;
                n.PredMvLst[n.BestSplit.PredMvBase].Optimised = true;
                n.C = -Fcn.NormP1(n.BestSplit.SplitValue, n.BestSplit.LowerNumber, n.BestSplit.HigherNumber);
            } else { // Nominal
                List<string> caseLst;
                n.C = 0;
                // The values of the first child group get the "present" coefficient.
                // NOTE(review): IndexOfKey returns -1 for a missing key, which would address
                // the column before PredMvBase - the original author also flagged this line
                // as uncertain.
                caseLst = n.PredictorLst[n.BestSplit.PredictorLstIdx].ChildrenGroups.ValueGroupLst[0];
                for (int i = 0; i < caseLst.Count; ++i) {
                    n.PredMvLst[n.BestSplit.PredMvBase + n.PredictorLst[n.BestSplit.PredictorLstIdx].ValueSd.IndexOfKey(caseLst[i])].Coef = Def.PresentCoefficientValue;
                }
                // A nominal best split already fixes every column that describes it.
                for (int i = 0; i < n.PredictorLst[n.BestSplit.PredictorLstIdx].DistinctValuesCount; ++i) {
                    nonOptimisedVariableLst.Remove(n.BestSplit.PredMvBase + i);
                    optimisedVariableLst.Add(n.BestSplit.PredMvBase + i);
                    n.PredMvLst[n.BestSplit.PredMvBase + i].Optimised = true;
                }
            }

            while (nonOptimisedVariableLst.Count > 0) {
                hadOverallImprovement = false;
                for (int nopVarIdx = 0; nopVarIdx < nonOptimisedVariableLst.Count; ++nopVarIdx) {
                    int candidateIdx = nonOptimisedVariableLst[nopVarIdx];

                    // r[0] = best coefficient for this column, r[1] = impurity it achieves.
                    double[] r;
                    if (n.PredMvLst[candidateIdx].Variable.VariableTypeUserSet == SchemaVariable.VariableTypeEnum.Categorical) {
                        r = BestGainMvCategorical(n, n.C, candidateIdx, optimisedVariableLst);
                    } else {
                        r = BestGainMvNumeric(n, n.C, candidateIdx, optimisedVariableLst, fg);
                    }
                    if (r[1] < lowestImp) {
                        bestCoef = r[0];
                        bestSplitIdx = candidateIdx;
                        lowestImp = r[1];
                        hadOverallImprovement = true;
                    }

                    if (n.PredMvLst[candidateIdx].FieldSpan > 0)
                        Def.LogMessage += "     nopVarIdx " + candidateIdx + " Name " + n.PredMvLst[candidateIdx].Variable.Name + "(" + n.PredMvLst[candidateIdx].Offset + ")";
                    else
                        Def.LogMessage += "     nopVarIdx " + candidateIdx + " Name " + n.PredMvLst[candidateIdx].Variable.Name;
                    Def.LogMessage += "     Coefficient " + r[0] + " SplitImp " + r[1] + Environment.NewLine;
                }

                if (hadOverallImprovement) {
                    if (!nonOptimisedVariableLst.Contains(bestSplitIdx)) {
                        Def.LogMessage += "Error couldn't find bestSplitIdx " + bestSplitIdx + " inside nonOptimisedVariableLst " + Environment.NewLine;
                        MessageBox.Show("Error couldn't find bestSplitIdx " + bestSplitIdx + " inside nonOptimisedVariableLst ");
                        fg.Dispose(); // never shown on this path
                        return -1;
                    }
                    // Adopt the winning column and move it to the optimised set.
                    n.PredMvLst[bestSplitIdx].Coef = bestCoef;
                    n.PredMvLst[bestSplitIdx].Optimised = true;
                    nonOptimisedVariableLst.Remove(bestSplitIdx);
                    optimisedVariableLst.Add(bestSplitIdx);
                    Def.LogMessage += "Best variable " + n.PredMvLst[bestSplitIdx].Variable.Name + n.PredMvLst[bestSplitIdx].Offset + " Coef " + bestCoef + " lowestCatSplitImp " + lowestImp + "++++++++++++++++++++" + Environment.NewLine;
                } else {
                    // No column improved the impurity: mark the rest as optimised (their
                    // coefficients stay as they are) so that the loop ends.
                    Def.LogMessage += "No more improvement " + (optimisedVariableLst.Count) + " variables of " + n.PredMvLst.Count + " are being combined on the purity function -------------------" + Environment.NewLine;
                    foreach (int predMvIdx in nonOptimisedVariableLst) {
                        optimisedVariableLst.Add(predMvIdx);
                        n.PredMvLst[predMvIdx].Optimised = true;
                    }
                    nonOptimisedVariableLst.Clear();
                }
                if (lowestImp == 0) {
                    Def.LogMessage += "Node is now 100% pure stopping greed seach " + Environment.NewLine;
                    break;
                }
            }
            n.ImpBestMvSplit = lowestImp;

            // NOTE(review): p.Gain receives the univariate gain; the multivariate gain computed
            // below is only logged - confirm this is intended.
            p.Gain = n.BestSplit.Gain;

            bestMvGain = (n.Imp - n.ImpBestMvSplit) * 100 / n.Imp;
            bestMvGain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;

            bestUniGain = (n.Imp - n.ImpBestUniSplit) * 100 / n.Imp;
            bestUniGain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;

            // Human-readable dump of the final hyperplane for the log.
            string coefStr = "";
            for (int i = 0; i < n.PredMvLst.Count; ++i) {
                if (n.PredMvLst[i].Variable.VariableTypeUserSet == SchemaVariable.VariableTypeEnum.Continuous)
                    coefStr += n.PredMvLst[i].Coef + "(" + n.PredMvLst[i].Variable.Name + ") ";
                else
                    coefStr += n.PredMvLst[i].Coef + "(" + n.PredMvLst[i].Variable.Name + n.PredMvLst[i].Offset + ") ";
            }
            coefStr += "c = " + n.C;
            Def.LogMessage += "MvGain: " + bestMvGain + " UniGain: " + bestUniGain + " P.Gain: " + p.Gain + " MvImp: " + lowestImp + " UniImp: " + n.ImpBestUniSplit + " NdImp: " + n.Imp + Environment.NewLine + coefStr + Environment.NewLine;

            if (!Def.ExperimentRunning) {
                // The graph plots the first two columns only, so it needs at least two of them;
                // indexing [1] unconditionally would throw for a single-column node.
                if (n.PredMvLst.Count >= 2) {
                    for (int y = 0; y < n.Table.RowCount; ++y) {
                        fg.x0Lst.Add((float)n.PredMvLst[0].X(y));
                        fg.x1Lst.Add((float)n.PredMvLst[1].X(y));
                        // Rows whose target class is "n" (any casing) are flagged for the graph.
                        if (string.Equals(n.MvTb.Data.TC[y], "n", StringComparison.OrdinalIgnoreCase)) {
                            fg.n.Add(y);
                        }
                    }
                    fg.a0 = (float)n.PredMvLst[0].Coef;
                    fg.a1 = (float)n.PredMvLst[1].Coef;
                    fg.c = (float)n.C;
                    fg.N = n;
                    fg.ABest = (float)bestCoef;
                    fg.Show();
                    fg.Invalidate();
                } else {
                    fg.Dispose(); // nothing to plot
                }

                DataValidate(n, n.C);
            } else {
                fg.Dispose(); // the form is never shown while an experiment is running
            }
            return p.Gain;
        }