/// <summary>
/// Creates the dialog that shows the categorical split of <paramref name="_pred"/>:
/// the values of the first child group go into <c>lbLeft</c> and the values of the
/// second child group go into <c>lbRight</c>.
/// </summary>
/// <param name="_node">Node the split belongs to; stored in the <c>node</c> field.</param>
/// <param name="_pred">Predictor whose split is displayed; stored in the <c>pred</c> field.</param>
public FrmCategoricalSplitProperties(Node _node, Predictor _pred) {
    node = _node;
    pred = _pred;
    InitializeComponent();

    if (pred.ChildrenGroups != null) {
        // A split is already defined: mirror its two value groups in the list boxes.
        foreach (string leftValue in pred.ChildrenGroups.ValueGroupLst[0]) {
            lbLeft.Items.Add(leftValue);
        }
        foreach (string rightValue in pred.ChildrenGroups.ValueGroupLst[1]) {
            lbRight.Items.Add(rightValue);
        }
    } else {
        // No split could be defined: warn the user, attach a fresh two-group
        // ValueGroup and propose the 1st value on the left and the rest on the right.
        MessageBox.Show("This split has not been computed", "Warning!");
        pred.ChildrenGroups = new ValueGroup(_pred, 2);

        lbLeft.Items.Add(pred.ValueSd.Keys[0]);
        int valueIdx = 1;
        while (valueIdx < pred.DistinctValuesCount) {
            lbRight.Items.Add(pred.ValueSd.Keys[valueIdx]);
            ++valueIdx;
        }
    }

    checks();
}
/// <summary>
/// Creates a predictor for the schema variable <paramref name="var"/> on the node
/// <paramref name="_node"/>. The predictor starts with a new
/// <c>ValueGroup(this, 2)</c> as its children groups and an empty value list.
/// </summary>
/// <param name="var">Schema variable assigned to <c>Variable</c>.</param>
/// <param name="_node">Node assigned to the <c>node</c> field.</param>
/// <param name="predictorLstIdx">Value assigned to <c>PredictorLstIdx</c>.</param>
public Predictor(SchemaVariable var, Node _node, int predictorLstIdx) {
    // The ValueGroup receives this instance, so it is deliberately created
    // first, exactly as before; the statement order is not changed.
    this.ChildrenGroups = new ValueGroup(this, 2);

    this.Variable = var;
    this.node = _node;
    this.PredictorLstIdx = predictorLstIdx;

    // Starts empty; keyed by string with int values.
    this.ValueSd = new SortedList<string, int>();
}
//If the number of variables is too large it can't be computed.
//Retired random-partition search, kept commented out for reference only.
//public static double KILLED_MinImpCatRandom(NodeTargetCategorical n, Predictor p) {
//    //For some partitions gets the min impurity
//    int valueCount = p.DistinctValuesCount;
//    int pos, instanceI;
//    double partitionCount;
//    int c, i;
//    double bestPartition = 0, minImp = Double.NaN, imp = Double.NaN, lImp = Double.NaN, rImp = Double.NaN;
//    int nodeLfItemCount, nodeRtItemCount;
//    List<NTT> nttLst;
//    string binStr = "";
//    p.SplitStatus = Predictor.SplitStatusEnum.CanBeUsed;
//    n.DescendentImpPreCalculated = new List<double>(2);
//    n.DescendentImpPreCalculated.Add(0);
//    n.DescendentImpPreCalculated.Add(0);
//    List<string> lComb = new List<string>(valueCount);
//    List<string> rComb = new List<string>(valueCount);
//    SortedList<string, int> lPredVal = new SortedList<string, int>();
//    SortedList<string, int> rPredVal = new SortedList<string, int>();
//    string sql =
//        @"SELECT ALL " +
//        " count(*), " +
//        Def.DbBsTb + "." + p.Variable.Name + ", " +
//        Def.DbBsTb + "." + Def.Schema.Target.Name + " " +
//        "FROM "
//        + Def.DbBsTb + "," + Def.DbTrTb + n.Id + " " +
//        "WHERE "
//        + Def.DbBsTb + "." + Def.DbTableIdName + " = " +
//        Def.DbTrTb + n.Id + "." + Def.DbTableIdName + " AND " +
//        Def.DbBsTb + "." + p.Variable.Name + " IS NOT NULL " +
//        "GROUP BY " +
//        Def.DbBsTb + "." + Def.Schema.Target.Name + ", " +
//        Def.DbBsTb + "." + p.Variable.Name + " ";
//    nttLst = Def.Db.GetNTTLst(sql);
//    int instanceCount = n.Table.RowCount;
//    partitionCount = Math.Pow(2, valueCount - 1) - 1;
//    double partitionCountMax = 0;
//    if (partitionCount > 4095) { // 4095
//        partitionCountMax = 4095;
//    } else
//        partitionCountMax = partitionCount;
//    List<double> partLst = new List<double>((int)partitionCountMax);
//    i = 1;
//    //CHECK
//    for (int t = 0; t < partitionCountMax; ++t) {
//        partLst.Add((int)RNG.GetUniform(i, partitionCount));
//        ++i;
//    }
//    for (i = 0; i < partitionCountMax; ++i) {
//        pos = 0;
//        binStr = Fcn.Decimal2BinaryStr(partLst[(int)i]);
//        lComb.Clear(); rComb.Clear();
//        lPredVal.Clear(); rPredVal.Clear();
//        nodeLfItemCount = nodeRtItemCount = 0;
//        for (c = 0; c < p.ValueSd.Count; ++c) {
//            if (binStr[binStr.Length - 1 - c] == '1')
//                lComb.Add(p.ValueSd.Keys[pos]); //if the 'case' is in, put it on the left
//            else
//                rComb.Add(p.ValueSd.Keys[pos]);//else, in the right side
//            ++pos;
//        }
//        for (instanceI = 0; instanceI < nttLst.Count; ++instanceI) {
//            foreach (string ls in lComb) {
//                if (nttLst[instanceI].T0 == ls) {
//                    if (!lPredVal.ContainsKey(nttLst[instanceI].T1))
//                        lPredVal.Add(nttLst[instanceI].T1, (int)nttLst[instanceI].N);
//                    else
//                        lPredVal[nttLst[instanceI].T1] += (int)nttLst[instanceI].N;
//                    nodeLfItemCount += (int)nttLst[instanceI].N;
//                    break;
//                }
//            }
//            foreach (string rs in rComb) {
//                if (nttLst[instanceI].T0 == rs) {
//                    if (!rPredVal.ContainsKey(nttLst[instanceI].T1))
//                        rPredVal.Add(nttLst[instanceI].T1, (int)nttLst[instanceI].N);
//                    else
//                        rPredVal[nttLst[instanceI].T1] += (int)nttLst[instanceI].N;
//                    nodeRtItemCount += (int)nttLst[instanceI].N;
//                    break;
//                }
//            }
//        }
//        if (nodeLfItemCount >= Def.TreeMinNumberOfCasesPerNode && nodeRtItemCount >= Def.TreeMinNumberOfCasesPerNode) {
//            lImp = ImpCat(lPredVal, nodeLfItemCount);
//            rImp = ImpCat(rPredVal, nodeRtItemCount);
//            if (Double.IsNaN(minImp)) {
//                bestPartition = partLst[(int)i];
//                minImp = (double)(nodeLfItemCount) / instanceCount * lImp + (double)nodeRtItemCount / instanceCount * rImp;
//                n.DescendentImpPreCalculated[0] = lImp;
//                n.DescendentImpPreCalculated[1] = rImp;
//            } else {
//                imp = (double)(nodeLfItemCount) / instanceCount * lImp + (double)nodeRtItemCount / instanceCount * rImp;
//                if (imp < minImp) {
//                    minImp = imp;
//                    bestPartition = partLst[(int)i];
//                    n.DescendentImpPreCalculated[0] = lImp;
//                    n.DescendentImpPreCalculated[1] = rImp;
//                }
//            }
//        }
//    }
//    if (bestPartition == 0) {
//        p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
//        p.Gain = 0;
//        return 0;
//    }
//    //Set the possible children
//    ValueGroup valueGroup = new ValueGroup(2);
//    p.ChildrenGroups = valueGroup;
//    pos = 0;
//    binStr = Fcn.Decimal2BinaryStr((double)bestPartition);
//    for (c = 0; c < p.ValueSd.Count; ++c) {
//        // if ((bestPartition & c) == c)
//        if (binStr[binStr.Length - 1 - c] == '1')
//            valueGroup.ValueGroupLst[0].Add(pos); //if the 'case' is in put it on the left
//        else
//            valueGroup.ValueGroupLst[1].Add(pos);//else, in the right side
//        ++pos;
//    }
//    n.ImpBestUniSplit = minImp;
//    p.Gain = (n.Imp - minImp) * 100 / n.Imp;
//    p.Gain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
//    return p.Gain;
//}

/// <summary>
/// Exhaustively evaluates every two-way partition of the predictor's distinct
/// values and keeps the one with the lowest case-weighted child impurity.
/// </summary>
/// <remarks>
/// On success the method sets <c>p.ChildrenGroups</c> (group 0 = left child,
/// group 1 = right child), <c>p.ImpUniMin</c>, <c>p.Gain</c> and the two entries of
/// <c>n.DescendentImpPreCalculated</c> (left and right impurity of the best split).
/// When the search cannot be run or no partition leaves at least
/// <c>Def.TreeMinNumberOfCasesPerNode</c> cases on both sides, <c>p.SplitStatus</c>
/// is set accordingly, <c>p.Gain</c> is set to 0 and 0 is returned.
/// </remarks>
/// <param name="n">Node being split; its id selects the node table used in the query.</param>
/// <param name="p">Categorical predictor to evaluate; updated in place.</param>
/// <returns>
/// <c>(n.Imp - minImp) * 100 / n.Imp</c>, scaled by the fraction of rows in which the
/// predictor is not null, or 0 when no split was found.
/// </returns>
public static double MinImpCatFullSearch(NodeTargetCategorical n, Predictor p) {
    // Partitions are enumerated as bit masks (one bit per distinct value) and the
    // partition count is an int, so more than 31 values cannot be enumerated
    // without overflowing; previously 32 values made the search spin uselessly.
    const int MaxValuesForBitMask = 31;

    int valueCount = p.DistinctValuesCount;
    if (valueCount > Def.ClfMaxNumberOfValuesForFullSearch || valueCount > MaxValuesForBitMask) {
        p.SplitStatus = Predictor.SplitStatusEnum.TooManyValuesToSearch;
        p.Gain = 0;
        return 0;
    }

    p.SplitStatus = Predictor.SplitStatusEnum.CanBeUsed;
    n.DescendentImpPreCalculated = new List<double>(2);
    n.DescendentImpPreCalculated.Add(0);
    n.DescendentImpPreCalculated.Add(0);

    // One row per (predictor value, target value) pair present in the node:
    // N = number of cases, T0 = predictor value, T1 = target value.
    string sql =
        @"SELECT ALL " +
        " count(*), " +
        Def.DbBsTb + "." + p.Variable.Name + ", " +
        Def.DbBsTb + "." + Def.Schema.Target.Name + " " +
        "FROM "
        + Def.DbBsTb + "," + Def.DbTrTb + n.Id + " " +
        "WHERE "
        + Def.DbBsTb + "." + Def.DbTableIdName + " = " +
        Def.DbTrTb + n.Id + "." + Def.DbTableIdName + " AND " +
        Def.DbBsTb + "." + p.Variable.Name + " IS NOT NULL " +
        "GROUP BY " +
        Def.DbBsTb + "." + Def.Schema.Target.Name + ", " +
        Def.DbBsTb + "." + p.Variable.Name + " ";
    List<NTT> nttLst = Def.Db.GetNTTLst(sql);
    int instanceCount = n.Table.RowCount;

    // 2^(valueCount-1) - 1 partitions: the last value always stays on the right,
    // which removes mirrored duplicates, and the empty left side is skipped.
    int partitionCount = valueCount > 0 ? (1 << (valueCount - 1)) - 1 : 0;
    if (partitionCount < 1) {
        // Fewer than two distinct values: there is nothing to split.
        p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
        p.Gain = 0;
        return 0;
    }

    // Map every predictor value to its bit position once, so that each row can be
    // assigned to a side with a single bit test instead of scanning string lists
    // for every partition.
    Dictionary<string, int> bitOfValue = new Dictionary<string, int>(valueCount);
    for (int v = 0; v < valueCount; ++v) {
        bitOfValue[p.ValueSd.Keys[v]] = v;
    }
    int rowCount = nttLst.Count;
    int[] rowBit = new int[rowCount];
    for (int r = 0; r < rowCount; ++r) {
        string value = nttLst[r].T0;
        int bit;
        if (value == null || !bitOfValue.TryGetValue(value, out bit)) {
            bit = -1; // value not among the predictor's values: the row is ignored
        }
        rowBit[r] = bit;
    }

    SortedList<string, int> lPredVal = new SortedList<string, int>();
    SortedList<string, int> rPredVal = new SortedList<string, int>();
    uint bestPartition = 0;
    double minImp = Double.NaN;

    for (uint i = 1; i <= partitionCount; ++i) {
        lPredVal.Clear();
        rPredVal.Clear();
        int nodeLfItemCount = 0;
        int nodeRtItemCount = 0;

        // Accumulate the target-class counts of each side for partition i:
        // a set bit means the value goes to the left child.
        for (int r = 0; r < rowCount; ++r) {
            int bit = rowBit[r];
            if (bit < 0) {
                continue;
            }
            int cases = (int)nttLst[r].N;
            string target = nttLst[r].T1;
            int soFar;
            if ((i & (1u << bit)) != 0) {
                lPredVal.TryGetValue(target, out soFar); // soFar is 0 when absent
                lPredVal[target] = soFar + cases;
                nodeLfItemCount += cases;
            } else {
                rPredVal.TryGetValue(target, out soFar);
                rPredVal[target] = soFar + cases;
                nodeRtItemCount += cases;
            }
        }

        if (nodeLfItemCount < Def.TreeMinNumberOfCasesPerNode || nodeRtItemCount < Def.TreeMinNumberOfCasesPerNode) {
            continue;
        }

        double lImp = ImpCat(lPredVal, nodeLfItemCount);
        double rImp = ImpCat(rPredVal, nodeRtItemCount);
        double imp = (double)(nodeLfItemCount) / instanceCount * lImp + (double)nodeRtItemCount / instanceCount * rImp;

        // The first admissible partition always becomes the current best.
        if (Double.IsNaN(minImp) || imp < minImp) {
            minImp = imp;
            bestPartition = i;
            n.DescendentImpPreCalculated[0] = lImp;
            n.DescendentImpPreCalculated[1] = rImp;
        }
    }

    if (bestPartition == 0) {
        p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
        p.Gain = 0;
        return 0;
    }

    //Set the possible children
    ValueGroup valueGroup = new ValueGroup(p, 2);
    p.ChildrenGroups = valueGroup;
    for (int pos = 0; pos < valueCount; ++pos) {
        if ((bestPartition & (1u << pos)) != 0) {
            valueGroup.AddValueFromIndex(pos, 0); //if the 'case' is in put it on the left
        } else {
            valueGroup.AddValueFromIndex(pos, 1); //else, in the right side
        }
    }

    p.ImpUniMin = minImp;
    p.Gain = (n.Imp - minImp) * 100 / n.Imp;
    p.Gain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
    return p.Gain;
}
//Sets the best p.SplitValue p.Gain
//Retired continuous-predictor search, kept commented out for reference only.
// public static double MinImpCont(NodeTargetCategorical n, Predictor p) {
//     int i, leftRowCount = 0, rightRowCount = 0;
//     // int dfd; //delete
//     double minImp, info, lImp, rImp, instanceCount = n.Table.RowCount;
//     lImp = rImp = minImp = info = double.NaN;
//     List<N3T> AvcLst;
//     List<int> thresholdIndexLst;
//     //Tries each partition:
//     //AvcLst[i].N0 = Value of the dependent varible
//     //AvcLst[i].N1 = Frequency of y
//     //AvcLst[i].N2 = Total of distinct registries until that row
//     //AvcLst[i].T = y
//     string sql =
//         @"SELECT ALL " +
//         Def.DbBsTb + "." + p.Variable.Name + ", " +
//         " count(" + Def.DbBsTb + "." + p.Variable.Name + "),0 , " +
//         Def.DbBsTb + "." + Def.Schema.Target.Name + " " +
//         "FROM "
//         + Def.DbBsTb + "," + Def.DbTrTb + n.Id + " " +
//         "WHERE "
//         + Def.DbBsTb + "." + Def.DbTableIdName + " = " +
//         Def.DbTrTb + n.Id + "." + Def.DbTableIdName + " " +
//         " AND " + Def.DbBsTb + "." + p.Variable.Name + " IS NOT NULL " +
//         "GROUP BY " +
//         Def.DbBsTb + "." + Def.Schema.Target.Name + ", " +
//         Def.DbBsTb + "." + p.Variable.Name + " " +
//         "ORDER BY " +
//         Def.DbBsTb + "." + p.Variable.Name;
//     AvcLst = Def.Db.GetN3TLst(sql);
//     //N2 is the number of registries until a given row
//     if (AvcLst.Count > 0) {
//         AvcLst[0].N2 = AvcLst[0].N1;
//         for (i = 1; i < AvcLst.Count; ++i)
//             AvcLst[i].N2 = AvcLst[i - 1].N2 + AvcLst[i].N1;
//     }
//     thresholdIndexLst = Fcn.SetPossibleThresholdIndexLst(AvcLst);
//     if (thresholdIndexLst.Count == 0) {
//         p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
//         p.Gain = 0;
//         return 0;
//     }
//     n.DescendentImpPreCalculated = new List<double>(2);
//     n.DescendentImpPreCalculated.Add(0);
//     n.DescendentImpPreCalculated.Add(0);
//     p.SplitStatus = Predictor.SplitStatusEnum.CanBeUsed;
//     if (AvcLst.Count == 0) {
//         p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
//         p.Gain = 0;
//         return 0;
//     }
//     for (i = 0; i < thresholdIndexLst.Count; ++i) {
//         lImp = ImpCont(0, thresholdIndexLst[i], AvcLst, out leftRowCount);
//         rImp = ImpCont(thresholdIndexLst[i] + 1, AvcLst.Count - 1, AvcLst, out rightRowCount);
//         if (Double.IsNaN(minImp) && leftRowCount >= Def.TreeMinNumberOfCasesPerNode && rightRowCount >= Def.TreeMinNumberOfCasesPerNode) {
//             info = (leftRowCount * lImp + rightRowCount * rImp) / (leftRowCount + rightRowCount);
//             if (Double.IsNaN(info) == false) {
//                 minImp = info;
//                 p.SplitValue = AvcLst[thresholdIndexLst[i]].N0;
//                 n.DescendentImpPreCalculated[0] = lImp;
//                 n.DescendentImpPreCalculated[1] = rImp;
//             }
//         } else {
//             info = (leftRowCount * lImp + rightRowCount * rImp) / (leftRowCount + rightRowCount);
//             if (info < minImp && !Double.IsNaN(info) && leftRowCount >= Def.TreeMinNumberOfCasesPerNode && rightRowCount >= Def.TreeMinNumberOfCasesPerNode) {
//                 minImp = info;
//                 p.SplitValue = AvcLst[thresholdIndexLst[i]].N0;
//                 n.DescendentImpPreCalculated[0] = lImp;
//                 n.DescendentImpPreCalculated[1] = rImp;
//             }
//         }
//     }
//     if (Double.IsNaN(minImp)) {
//         p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
//         p.Gain = 0;
//         return 0;
//     }
//     List<double> nLst;
//     sql = @"
//         SELECT
//         DISTINCT " + p.Variable.Name + " " +
//         "FROM "
//         + Def.DbBsTb + " , " + Def.DbTrTb + n.Id + " " +
//         "WHERE "
//         + Def.DbBsTb + "." + Def.DbTableIdName + "=" +
//         Def.DbTrTb + n.Id + "." + Def.DbTableIdName + " AND " +
//         p.Variable.Name + " IS NOT NULL " +
//         "ORDER BY "
//         + p.Variable.Name + " ASC ";
//     nLst = Def.Db.GetNumberLst(sql);
//     //Finds the angle bisector of the slit
//     if (p.SplitValue != nLst[0] && p.SplitValue != nLst[nLst.Count - 2]) {
//         for (i = 1; i < nLst.Count - 2; ++i) {
//             if (p.SplitValue == nLst[i]) {
//                 p.SplitValue = (nLst[i - 1] + nLst[i + 1]) / 2;
//                 break;
//             }
//         }
//     }
//     p.Gain = (n.Imp - minImp) * 100 / n.Imp;
//     p.Gain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
//     return p.Gain;
// }

/// <summary>
/// Evaluates a random sample of at most 4095 two-way partitions of the predictor's
/// values and keeps the one with the lowest case-weighted child impurity.
/// </summary>
/// <remarks>
/// On success the method sets <c>p.ChildrenGroups</c> (group 0 = left child,
/// group 1 = right child), <c>p.ImpUniMin</c>, <c>p.Gain</c> and the two entries of
/// <c>n.DescendentImpPreCalculated</c>. When no sampled partition leaves at least
/// <c>Def.TreeMinNumberOfCasesPerNode</c> cases on both sides, <c>p.SplitStatus</c>
/// becomes <c>NotEnoughCases</c>, <c>p.Gain</c> is set to 0 and 0 is returned.
/// NOTE(review): partition numbers are cast to int, so very large value counts are
/// not supported - confirm the intended limit before reviving this method.
/// </remarks>
/// <param name="n">Node being split; its id selects the node table used in the query.</param>
/// <param name="p">Categorical predictor to evaluate; updated in place.</param>
/// <returns>
/// <c>(n.Imp - minImp) * 100 / n.Imp</c>, scaled by the fraction of rows in which the
/// predictor is not null, or 0 when no split was found.
/// </returns>
public static double Killed_MinRandomImp(NodeTargetCategorical n, Predictor p) {
    // Upper bound on the number of partitions that are sampled.
    const int MaxSampledPartitions = 4095;

    int valueCount = p.DistinctValuesCount;

    p.SplitStatus = Predictor.SplitStatusEnum.CanBeUsed;
    n.DescendentImpPreCalculated = new List<double>(2);
    n.DescendentImpPreCalculated.Add(0);
    n.DescendentImpPreCalculated.Add(0);

    // One row per (predictor value, target value) pair present in the node:
    // N = number of cases, T0 = predictor value, T1 = target value.
    string sql =
        @"SELECT ALL " +
        " count(*), " +
        Def.DbBsTb + "." + p.Variable.Name + ", " +
        Def.DbBsTb + "." + Def.Schema.Target.Name + " " +
        "FROM "
        + Def.DbBsTb + "," + Def.DbTrTb + n.Id + " " +
        "WHERE "
        + Def.DbBsTb + "." + Def.DbTableIdName + " = " +
        Def.DbTrTb + n.Id + "." + Def.DbTableIdName + " AND " +
        Def.DbBsTb + "." + p.Variable.Name + " IS NOT NULL " +
        "GROUP BY " +
        Def.DbBsTb + "." + Def.Schema.Target.Name + ", " +
        Def.DbBsTb + "." + p.Variable.Name + " ";
    List<NTT> nttLst = Def.Db.GetNTTLst(sql);
    int instanceCount = n.Table.RowCount;

    double partitionCount = Math.Pow(2, valueCount - 1) - 1;
    double partitionCountMax = partitionCount > MaxSampledPartitions ? MaxSampledPartitions : partitionCount;

    // Draw the partition numbers. The lower bound of the draw grows by one per
    // sample, exactly as before (the original code marks this with "CHECK").
    List<double> partLst = new List<double>((int)partitionCountMax);
    int lowerBound = 1;
    for (int t = 0; t < partitionCountMax; ++t) {
        partLst.Add((int)RNG.GetUniform(lowerBound, partitionCount));
        ++lowerBound;
    }

    if (partLst.Count == 0) {
        // Fewer than two distinct values: there is nothing to split.
        p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
        p.Gain = 0;
        return 0;
    }

    // Map every predictor value to its position once, so that each row can be
    // assigned to a side by index instead of scanning string lists per partition.
    int keyCount = p.ValueSd.Count;
    Dictionary<string, int> idxOfValue = new Dictionary<string, int>(keyCount);
    for (int v = 0; v < keyCount; ++v) {
        idxOfValue[p.ValueSd.Keys[v]] = v;
    }
    int rowCount = nttLst.Count;
    int[] rowIdx = new int[rowCount];
    for (int r = 0; r < rowCount; ++r) {
        string value = nttLst[r].T0;
        int idx;
        if (value == null || !idxOfValue.TryGetValue(value, out idx)) {
            idx = -1; // value not among the predictor's values: the row is ignored
        }
        rowIdx[r] = idx;
    }

    SortedList<string, int> lPredVal = new SortedList<string, int>();
    SortedList<string, int> rPredVal = new SortedList<string, int>();
    bool[] onLeft = new bool[keyCount];
    double bestPartition = 0;
    double minImp = Double.NaN;

    for (int s = 0; s < partLst.Count; ++s) {
        double partition = partLst[s];

        // Digit c, counted from the end of the string, decides the side of value c.
        // A digit missing from a short string is treated as '0' (right side)
        // instead of indexing before the start of the string.
        // NOTE(review): assumes Decimal2BinaryStr yields '0'/'1' digits with the
        // least significant digit last - confirm against Fcn.
        string binStr = Fcn.Decimal2BinaryStr(partition);
        for (int c = 0; c < keyCount; ++c) {
            int digitPos = binStr.Length - 1 - c;
            onLeft[c] = digitPos >= 0 && binStr[digitPos] == '1';
        }

        lPredVal.Clear();
        rPredVal.Clear();
        int nodeLfItemCount = 0;
        int nodeRtItemCount = 0;

        for (int r = 0; r < rowCount; ++r) {
            int idx = rowIdx[r];
            if (idx < 0) {
                continue;
            }
            int cases = (int)nttLst[r].N;
            string target = nttLst[r].T1;
            int soFar;
            if (onLeft[idx]) {
                lPredVal.TryGetValue(target, out soFar); // soFar is 0 when absent
                lPredVal[target] = soFar + cases;
                nodeLfItemCount += cases;
            } else {
                rPredVal.TryGetValue(target, out soFar);
                rPredVal[target] = soFar + cases;
                nodeRtItemCount += cases;
            }
        }

        if (nodeLfItemCount < Def.TreeMinNumberOfCasesPerNode || nodeRtItemCount < Def.TreeMinNumberOfCasesPerNode) {
            continue;
        }

        double lImp = ImpCat(lPredVal, nodeLfItemCount);
        double rImp = ImpCat(rPredVal, nodeRtItemCount);
        double imp = (double)(nodeLfItemCount) / instanceCount * lImp + (double)nodeRtItemCount / instanceCount * rImp;

        // The first admissible partition always becomes the current best.
        if (Double.IsNaN(minImp) || imp < minImp) {
            minImp = imp;
            bestPartition = partition;
            n.DescendentImpPreCalculated[0] = lImp;
            n.DescendentImpPreCalculated[1] = rImp;
        }
    }

    if (bestPartition == 0) {
        p.SplitStatus = Predictor.SplitStatusEnum.NotEnoughCases;
        p.Gain = 0;
        return 0;
    }

    //Set the possible children
    ValueGroup valueGroup = new ValueGroup(p, 2);
    p.ChildrenGroups = valueGroup;
    string bestBinStr = Fcn.Decimal2BinaryStr((double)bestPartition);
    for (int pos = 0; pos < keyCount; ++pos) {
        int digitPos = bestBinStr.Length - 1 - pos;
        if (digitPos >= 0 && bestBinStr[digitPos] == '1') {
            valueGroup.AddValueFromIndex(pos, 0); //if the 'case' is in put it on the left
        } else {
            valueGroup.AddValueFromIndex(pos, 1); //else, in the right side
        }
    }

    // Recorded for consistency with MinImpCatFullSearch, which stores the
    // impurity of its best split in the same place.
    p.ImpUniMin = minImp;
    p.Gain = (n.Imp - minImp) * 100 / n.Imp;
    p.Gain *= (double)(n.Table.RowCount - p.NullCount) / n.Table.RowCount;
    return p.Gain;
}