private void btMining_Click(object sender, EventArgs e)
{
    string _sFileData = txtDatabase.Text.Trim();
    double alpha = double.Parse(txtAlpha.Text.Trim());
    double minDev = double.Parse(txtMinDev.Text.Trim()) / 100; // minDev is entered as a percentage
    TreeNode tnode = treeResult.Nodes.Add("alpha = " + txtAlpha.Text + ", minDev = " + txtMinDev.Text + "%");
    Dataset dt = readData(_sFileData);

    // repeated random subsampling (Monte Carlo cross-validation): 90% training, 10% test per run
    int tr_row = (int)Math.Round(dt.row * 0.9);
    int te_row = dt.row - tr_row;

    Stopwatch sw = Stopwatch.StartNew();
    if (chkClassification.Checked) // classification
    {
        int n_run = 10; // use n_run = 1 to mine exceptional csets only
        double[] acc = new double[n_run];
        double[] fsc = new double[n_run];
        double[] kap = new double[n_run];
        for (int run = 0; run < n_run; run++)
        {
            // reset values
            nNode = 0;
            nIdenticalSup = 0;
            nDecreasingSig = 0;
            nIndependence = 0;
            LargeCandidates_1 = new List<Node>();
            LargeCandidates_k = new List<Node>();
            CSs = new List<CSet>();
            Lattice = new Dictionary<int, Node>();
            DEFAULT_CLASS = 0;

            #region generate training and test
            Dataset train = new Dataset(tr_row, dt.col);
            train.cls = dt.cls;
            train.row_cls = new int[train.cls];
            Dataset test = new Dataset(te_row, dt.col);
            test.cls = dt.cls;
            test.row_cls = new int[test.cls];

            Random rnd = new Random(run); // seed with the run index so the splits are reproducible
            // draw train.row random transaction indices; a HashSet makes the Contains test O(1)
            HashSet<int> train_idx = new HashSet<int>(
                Enumerable.Range(0, dt.row).OrderBy(x => rnd.Next()).Take(train.row));
            int tr_id = 0;
            int te_id = 0;
            for (int i = 0; i < dt.row; i++)
            {
                if (train_idx.Contains(i))
                {
                    train.data[tr_id] = dt.data[i];
                    tr_id++;
                }
                else
                {
                    test.data[te_id] = dt.data[i];
                    te_id++;
                }
            }

            // count the transactions in each class (class labels are 0..cls-1)
            for (int i = 0; i < train.row; i++)
            {
                train.row_cls[train.data[i][train.col - 1]]++;
            }
            for (int i = 0; i < test.row; i++)
            {
                test.row_cls[test.data[i][test.col - 1]]++;
            }
            #endregion

            // use train for classification (use dt when only mining exceptional csets)
            Dataset dt_cs = train;

            // obtain candidates that contain 1-itemsets
            findCandidates(dt_cs, minDev);
            List<Node> Lr = Lattice.Values.ToList();

            // create lattice
            CREATE_LATTICE(Lr, minDev, dt_cs.row, dt_cs.cls, dt_cs.row_cls);

            // find large and non-redundant csets
            foreach (Node node in Lattice.Values)
            {
                FIND_LARGE_CS(node, minDev, dt_cs.row, dt_cs.cls, dt_cs.row_cls);
            }
            int nLarge = LargeCandidates_1.Count + LargeCandidates_k.Count;

            #region generate significant csets using FDR method
            if (chkSignificant.Checked)
            {
                // contrast sets of size 1
                List<int> CSs_1 = new List<int>();
                foreach (Node node in LargeCandidates_1)
                {
                    if (node.p_value <= alpha)
                    {
                        CSet cs = new CSet(dt_cs.cls)
                        {
                            g_id = node.id,
                            C_itemset = node.itemset,
                            Sup = (double)node.total / dt_cs.row,
                            Dev = node.dev,
                            Chi = node.chi
                        };
                        for (ushort x = 0; x < dt_cs.cls; x++)
                        {
                            cs.Pro[x] = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                        }
                        CSs.Add(cs);
                        CSs_1.Add(node.id);
                    }
                }

                // contrast sets of size k > 1
                int nLarge_k = LargeCandidates_k.Count;
                // sort large candidates in ascending order of p-value (much faster than a hand-rolled sort)
                List<Node> sortedLargeCandidates_k = LargeCandidates_k.OrderBy(x => x.p_value).ToList();
                int kk = 0;
                for (kk = 0; kk < nLarge_k; kk++)
                {
                    Node node = sortedLargeCandidates_k[kk];
                    double fdr = (double)(kk + 1) * alpha / nLarge_k;
                    if (node.p_value > fdr)
                    {
                        break;
                    }
                }
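                // The loop above is an FDR cutoff in the spirit of Benjamini-Hochberg: with the
                // m = nLarge_k candidates sorted by p-value, candidate i (1-based) must satisfy
                // p(i) <= i * alpha / m. For example, with alpha = 0.05 and m = 4 sorted p-values
                // {0.010, 0.019, 0.040, 0.045}, the thresholds are {0.0125, 0.025, 0.0375, 0.05};
                // the loop breaks at the third candidate (0.040 > 0.0375), so kk = 2 and only the
                // first two candidates survive. Note this stops at the first violation, which is
                // more conservative than the classic step-up rule (that rule would scan on and
                // keep all four here, since 0.045 <= 0.05).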
                for (int j = 0; j < kk; j++)
                {
                    Node node = sortedLargeCandidates_k[j];
                    if (chkExceptional.Checked) // exceptional contrast sets
                    {
                        // g_node_id is not the node's id on the lattice;
                        // it records the group (size-1 parent) this cset belongs to
                        int g_node_id = -1;
                        // check whether this node's dominant group differs from that of each size-1 parent
                        foreach (Item item in node.itemset)
                        {
                            if (CSs_1.Contains(item.id))
                            {
                                Node node_it = Lattice[item.id];
                                if (node.max_group != node_it.max_group)
                                {
                                    g_node_id = item.id;
                                }
                                else if (node.chi < node_it.chi)
                                {
                                    g_node_id = -1;
                                    break;
                                }
                            }
                        }
                        if (g_node_id != -1)
                        {
                            CSet cs = new CSet(dt_cs.cls)
                            {
                                g_id = g_node_id,
                                C_itemset = node.itemset,
                                Sup = (double)node.total / dt_cs.row,
                                Dev = node.dev,
                                Chi = node.chi
                            };
                            for (ushort x = 0; x < dt_cs.cls; x++)
                            {
                                cs.Pro[x] = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                            }
                            CSs.Add(cs);
                        }
                    }
                    else // only significant contrast sets
                    {
                        CSet cs = new CSet(dt_cs.cls)
                        {
                            g_id = node.id,
                            C_itemset = node.itemset,
                            Sup = (double)node.total / dt_cs.row,
                            Dev = node.dev,
                            Chi = node.chi
                        };
                        for (ushort x = 0; x < dt_cs.cls; x++)
                        {
                            cs.Pro[x] = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                        }
                        CSs.Add(cs);
                    }
                }
            }
            else // only large contrast sets
            {
                // contrast sets of size 1
                foreach (Node node in LargeCandidates_1)
                {
                    CSet cs = new CSet(dt_cs.cls)
                    {
                        C_itemset = node.itemset,
                        Sup = (double)node.total / dt_cs.row,
                        Dev = node.dev,
                        Chi = node.chi
                    };
                    for (ushort x = 0; x < dt_cs.cls; x++)
                    {
                        cs.Pro[x] = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                    }
                    CSs.Add(cs);
                }
                // contrast sets of size k > 1
                foreach (Node node in LargeCandidates_k)
                {
                    CSet cs = new CSet(dt_cs.cls)
                    {
                        C_itemset = node.itemset,
                        Sup = (double)node.total / dt_cs.row,
                        Dev = node.dev,
                        Chi = node.chi
                    };
                    for (ushort x = 0; x < dt_cs.cls; x++)
                    {
                        cs.Pro[x] = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                    }
                    CSs.Add(cs);
                }
            }
            #endregion

            #region classification
            int[] t_class = new int[test.row]; // true classes
            int[] p_class = new int[test.row]; // predicted classes
            for (int i = 0; i < test.row; i++)
            {
                int[] test_case = test.data[i];
                int true_class = test_case[test.col - 1];
                int predict_class = assignClass(CSs, test_case, test.cls, test.row_cls);
                t_class[i] = true_class;
                p_class[i] = predict_class;
                if (predict_class == true_class)
                {
                    acc[run] += 1;
                }
            }
            acc[run] = Math.Round(acc[run] / test.row, 4);
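            // ConfusionMatrix is assumed (from its use here) to derive an F-score and Cohen's
            // kappa from the true/predicted label arrays. Kappa corrects accuracy for chance
            // agreement, kappa = (p_o - p_e) / (1 - p_e), where p_o is the observed agreement
            // (plain accuracy) and p_e the agreement expected from the marginal label
            // frequencies; F1 is the harmonic mean 2PR / (P + R) of precision P and recall R.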
            ConfusionMatrix cm = new ConfusionMatrix(t_class, p_class);
            fsc[run] = Math.Round(cm.FScore, 4);
            kap[run] = Math.Round(cm.Kappa, 4);
            #endregion
        }
        sw.Stop();
        long timeMining = sw.ElapsedMilliseconds;
        tnode.Nodes.Add("Training time: " + timeMining / 1000.0 + " (s)");
        tnode.Nodes.Add("Default class: " + DEFAULT_CLASS
            + ". Accuracy: " + acc.Average()
            + ". F1-score: " + fsc.Average()
            + ". Kappa: " + kap.Average());
    }
    else // only mine exceptional csets
    {
        // reset values
        nNode = 0;
        nIdenticalSup = 0;
        nDecreasingSig = 0;
        nIndependence = 0;
        LargeCandidates_1 = new List<Node>();
        LargeCandidates_k = new List<Node>();
        CSs = new List<CSet>();
        Lattice = new Dictionary<int, Node>();
        DEFAULT_CLASS = 0;

        // use dt when only mining exceptional csets (train is used for classification)
        Dataset dt_cs = dt;

        // obtain candidates that contain 1-itemsets
        findCandidates(dt_cs, minDev);
        List<Node> Lr = Lattice.Values.ToList();

        // create lattice
        CREATE_LATTICE(Lr, minDev, dt_cs.row, dt_cs.cls, dt_cs.row_cls);

        // find large and non-redundant csets
        foreach (Node node in Lattice.Values)
        {
            FIND_LARGE_CS(node, minDev, dt_cs.row, dt_cs.cls, dt_cs.row_cls);
        }
        int nLarge = LargeCandidates_1.Count + LargeCandidates_k.Count;

        #region generate significant csets using FDR method
        if (chkSignificant.Checked)
        {
            // contrast sets of size 1
            List<int> CSs_1 = new List<int>();
            foreach (Node node in LargeCandidates_1)
            {
                if (node.p_value <= alpha)
                {
                    CSet cs = new CSet(dt_cs.cls)
                    {
                        g_id = node.id,
                        C_itemset = node.itemset,
                        Sup = (double)node.total / dt_cs.row,
                        Dev = node.dev,
                        Chi = node.chi
                    };
                    for (ushort x = 0; x < dt_cs.cls; x++)
                    {
                        cs.Pro[x] = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                    }
                    CSs.Add(cs);
                    CSs_1.Add(node.id);
                }
            }

            // contrast sets of size k > 1
            int nLarge_k = LargeCandidates_k.Count;
            // sort large candidates in ascending order of p-value (much faster than a hand-rolled sort)
            List<Node> sortedLargeCandidates_k = LargeCandidates_k.OrderBy(x => x.p_value).ToList();
            int kk = 0;
            for (kk = 0; kk < nLarge_k; kk++)
            {
                Node node = sortedLargeCandidates_k[kk];
                double fdr = (double)(kk + 1) * alpha / nLarge_k;
                if (node.p_value > fdr)
                {
                    break;
                }
            }
            for (int j = 0; j < kk; j++)
            {
                Node node = sortedLargeCandidates_k[j];
                if (chkExceptional.Checked) // exceptional contrast sets
                {
                    // g_node_id is not the node's id on the lattice;
                    // it records the group (size-1 parent) this cset belongs to
                    int g_node_id = -1;
                    // check whether this node's dominant group differs from that of each size-1 parent
                    foreach (Item item in node.itemset)
                    {
                        if (CSs_1.Contains(item.id))
                        {
                            Node node_it = Lattice[item.id];
                            if (node.max_group != node_it.max_group)
                            {
                                g_node_id = item.id;
                            }
                            else if (node.chi < node_it.chi)
                            {
                                g_node_id = -1;
                                break;
                            }
                        }
                    }
                    if (g_node_id != -1)
                    {
                        CSet cs = new CSet(dt_cs.cls)
                        {
                            g_id = g_node_id,
                            C_itemset = node.itemset,
                            Sup = (double)node.total / dt_cs.row,
                            Dev = node.dev,
                            Chi = node.chi
                        };
                        for (ushort x = 0; x < dt_cs.cls; x++)
                        {
                            cs.Pro[x] = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                        }
                        CSs.Add(cs);
                    }
                }
                else // only significant contrast sets
                {
                    CSet cs = new CSet(dt_cs.cls)
                    {
                        g_id = node.id,
                        C_itemset = node.itemset,
                        Sup = (double)node.total / dt_cs.row,
                        Dev = node.dev,
                        Chi = node.chi
                    };
                    for (ushort x = 0; x < dt_cs.cls; x++)
                    {
                        cs.Pro[x] = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                    }
                    CSs.Add(cs);
                }
            }
        }
        else // only large contrast sets
        {
            // contrast sets of size 1
            foreach (Node node in LargeCandidates_1)
            {
                CSet cs = new CSet(dt_cs.cls)
                {
                    C_itemset = node.itemset,
                    Sup = (double)node.total / dt_cs.row,
                    Dev = node.dev,
                    Chi = node.chi
                };
                for (ushort x = 0; x < dt_cs.cls; x++)
                {
                    cs.Pro[x] = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                }
                CSs.Add(cs);
            }
            // contrast sets of size k > 1
            foreach (Node node in LargeCandidates_k)
            {
                CSet cs = new CSet(dt_cs.cls)
                {
                    C_itemset = node.itemset,
                    Sup = (double)node.total / dt_cs.row,
                    Dev = node.dev,
                    Chi = node.chi
                };
                for (ushort x = 0; x < dt_cs.cls; x++)
                {
                    cs.Pro[x] = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                }
                CSs.Add(cs);
            }
        }
        #endregion
        sw.Stop();
        long timeMining = sw.ElapsedMilliseconds;

        #region print results
        tnode.Nodes.Add("Training time: " + timeMining / 1000.0 + " (s)");
        tnode.Nodes.Add("Identical: " + nIdenticalSup + ". Decrease: " + nDecreasingSig + ". Independence: " + nIndependence);
        if (chkSignificant.Checked)
        {
            if (chkExceptional.Checked)
            {
                tnode.Nodes.Add("Large: " + nLarge + ". Exceptional: " + CSs.Count);
            }
            else
            {
                tnode.Nodes.Add("Large: " + nLarge + ". Significant: " + CSs.Count);
            }
        }
        else // large csets
        {
            tnode.Nodes.Add("Large: " + nLarge);
        }
        if (chkOutput.Checked)
        {
            string _sFileDict = txtDict.Text.Trim();
            Dictionary<int, string[]> dict = readDict(_sFileDict);
            if (CSs.Count > 0)
            {
                // sort contrast sets by group id
                List<CSet> sortedCSs = CSs.OrderBy(x => x.g_id).ToList();
                string file_path = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
                if (chkSignificant.Checked)
                {
                    if (chkExceptional.Checked)
                    {
                        file_path += @"\ECSM_Exceptional (" + alpha + ", " + txtMinDev.Text + "%)";
                    }
                    else
                    {
                        file_path += @"\ECSM_Significant (" + alpha + ", " + txtMinDev.Text + "%)";
                    }
                }
                else
                {
                    file_path += @"\ECSM_Large (" + txtMinDev.Text + "%)";
                }
                writeCS(file_path + ".csv", dict, sortedCSs, dt_cs.cls);
            }
            MessageBox.Show("Finished!");
        }
        #endregion
    }
}
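
// The two branches above run the same pipeline (reset, findCandidates, CREATE_LATTICE,
// FIND_LARGE_CS, FDR selection) on different datasets. Below is a minimal sketch of how
// the duplicated FDR cutoff could be shared, assuming nothing beyond the Node.p_value
// field used above; the helper name FdrCutoff is hypothetical and not called by this code.
private static int FdrCutoff(List<Node> sortedByPValue, double alpha)
{
    int m = sortedByPValue.Count;
    for (int i = 0; i < m; i++)
    {
        // the (i + 1)-th smallest p-value must stay below its threshold (i + 1) * alpha / m
        if (sortedByPValue[i].p_value > (double)(i + 1) * alpha / m)
        {
            return i; // keep only the first i candidates
        }
    }
    return m; // every candidate passes its threshold
}
// Either branch could then replace its cutoff loop with:
// int kk = FdrCutoff(sortedLargeCandidates_k, alpha);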
void findCandidates(Dataset dt, double minDev)
{
    // scratch array of candidate 1-itemset nodes for the current attribute
    // (an attribute has at most dt.row distinct values)
    Node[] items = new Node[dt.row];
    // number of distinct values found so far for the current attribute
    int nN = 0;
    int k = 0;
    // for each attribute: i
    for (int i = 0; i < dt.col - 1; i++)
    {
        nN = 0;
        // for each transaction: j
        for (int j = 0; j < dt.row; j++)
        {
            // check old items
            for (k = 0; k < nN; k++)
            {
                if (items[k].itemset[0].val == dt.data[j][i])
                {
                    // add transaction j to this item's Obidset under its class
                    items[k].Obidset[dt.data[j][dt.col - 1]].Add(j);
                    break;
                }
            }
            // new item
            if (k == nN)
            {
                Item item = new Item();
                item.att = i;
                item.val = dt.data[j][i];
                items[nN] = new Node(dt.cls);
                items[nN].itemset.Add(item);
                // add transaction j to the new item's Obidset under its class
                items[nN].Obidset[dt.data[j][dt.col - 1]].Add(j);
                nN++;
            }
        }
        for (k = 0; k < nN; k++)
        {
            Node node = items[k];
            // compute dev
            double min_pro = (double)node.Obidset[0].Count / dt.row_cls[0];
            double max_pro = min_pro;
            for (ushort x = 0; x < dt.cls; x++)
            {
                double pro = (double)node.Obidset[x].Count / dt.row_cls[x];
                if (pro > max_pro)
                {
                    max_pro = pro;
                    node.max_group = x;
                }
                if (pro < min_pro)
                {
                    min_pro = pro;
                }
                // compute pos (the class with the most supporting transactions)
                if (node.Obidset[x].Count > node.Obidset[node.pos].Count)
                {
                    node.pos = x;
                }
                // compute total
                node.total += node.Obidset[x].Count;
            }
            // check if this node is large
            if (max_pro >= minDev)
            {
                node.id = nNode;
                node.itemset[0].id = nNode;
                // increase # nodes on the lattice
                nNode++;
                node.max_pro = max_pro;
                // compute support difference
                node.dev = max_pro - min_pro;
                // encode att as a one-bit-per-attribute mask
                node.att = 1L << i;
                Lattice.Add(node.id, node);
            }
        }
    }
}
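
// The bit encoding set in findCandidates (node.att = 1L << i) gives every attribute its
// own bit, so higher lattice levels can test in O(1) whether two nodes draw on disjoint
// attributes before joining them. A minimal sketch of that test, assuming CREATE_LATTICE
// (not shown here) combines nodes this way; the helper name CanJoin is hypothetical and
// is not called by this code.
private static bool CanJoin(Node a, Node b)
{
    // disjoint attribute masks mean the two itemsets can merge into a (k+1)-itemset;
    // overlapping masks would repeat an attribute, so such pairs are skipped
    return (a.att & b.att) == 0;
}
// The joined node would carry the union of the parents' masks: joined.att = a.att | b.att;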