Example 1
0
        /// <summary>
        /// Handler for the "Mining" button. Reads alpha / minDev from the UI, loads the
        /// dataset, then either (a) runs 10 repeated random 90/10 train/test splits and
        /// classifies with the mined contrast sets, or (b) mines contrast sets once over
        /// the full dataset and reports/exports them. Results are appended under a new
        /// node in <c>treeResult</c>.
        /// NOTE(review): the two branches duplicate the entire mining pipeline
        /// (candidate generation, lattice build, FDR filtering) — candidate for
        /// extraction into a shared helper.
        /// </summary>
        private void btMining_Click(object sender, EventArgs e)
        {
            string   _sFileData = txtDatabase.Text.Trim();
            double   alpha      = double.Parse(txtAlpha.Text.Trim());
            // minDev is entered as a percentage in the UI; convert to a fraction.
            double   minDev     = double.Parse(txtMinDev.Text.Trim()) / 100;
            TreeNode tnode      = treeResult.Nodes.Add("alpha = " + txtAlpha.Text + ", minDev = " + txtMinDev.Text + "%");
            Dataset  dt         = readData(_sFileData);
            // NOTE(review): labeled "10-fold cross validation" but this is actually
            // 10 repetitions of a random 90/10 holdout split (Monte Carlo CV) —
            // folds are not disjoint across runs.
            int       tr_row = (int)Math.Round(dt.row * 0.9);
            int       te_row = dt.row - tr_row;
            Stopwatch sw     = Stopwatch.StartNew();

            if (chkClassification.Checked) // classification
            {
                int      n_run = 10;       // use n_run=1 if only mine exceptional csets
                double[] acc   = new double[n_run]; // per-run accuracy
                double[] fsc   = new double[n_run]; // per-run F1-score
                double[] kap   = new double[n_run]; // per-run Cohen's kappa
                for (int run = 0; run < n_run; run++)
                {
                    // reset mining state (class-level fields) before each run
                    nNode             = 0;
                    nIdenticalSup     = 0; nDecreasingSig = 0; nIndependence = 0;
                    LargeCandidates_1 = new List <Node>();
                    LargeCandidates_k = new List <Node>();
                    CSs           = new List <CSet>();
                    Lattice       = new Dictionary <int, Node>();
                    DEFAULT_CLASS = 0;

                    #region generate training and test
                    Dataset train = new Dataset(tr_row, dt.col);
                    train.cls     = dt.cls;
                    train.row_cls = new int[train.cls];
                    Dataset test = new Dataset(te_row, dt.col);
                    test.cls     = dt.cls;
                    test.row_cls = new int[test.cls];
                    // seed with the run index so each run's split is reproducible
                    Random rnd = new Random(run);
                    // pick tr_row random row indices (random shuffle, take first tr_row)
                    int[]  train_idx = Enumerable.Range(0, dt.row).OrderBy(x => rnd.Next()).Take(train.row).ToArray();
                    int    tr_id = 0; int te_id = 0;
                    for (int i = 0; i < dt.row; i++)
                    {
                        if (train_idx.Contains(i))
                        {
                            // NOTE(review): rows are shared by reference with dt.data
                            train.data[tr_id] = dt.data[i];
                            tr_id++;
                        }
                        else
                        {
                            test.data[te_id] = dt.data[i];
                            te_id++;
                        }
                    }
                    // get # transactions in each class
                    // (class label is stored in the last column of each row)
                    for (int i = 0; i < train.row; i++)
                    {
                        for (ushort x = 0; x < train.cls; x++)
                        {
                            if (x == train.data[i][train.col - 1])
                            {
                                train.row_cls[x]++;
                            }
                        }
                    }
                    for (int i = 0; i < test.row; i++)
                    {
                        for (ushort x = 0; x < test.cls; x++)
                        {
                            if (x == test.data[i][test.col - 1])
                            {
                                test.row_cls[x]++;
                            }
                        }
                    }
                    #endregion

                    // use dt if only mine exceptional csets
                    // use train for classification
                    Dataset dt_cs = train;
                    // obtain candidates that contain 1-itemsets
                    findCandidates(dt_cs, minDev);
                    List <Node> Lr = Lattice.Values.ToList();
                    // create lattice
                    CREATE_LATTICE(Lr, minDev, dt_cs.row, dt_cs.cls, dt_cs.row_cls);
                    // find large and non-redundant csets
                    foreach (Node node in Lattice.Values)
                    {
                        FIND_LARGE_CS(node, minDev, dt_cs.row, dt_cs.cls, dt_cs.row_cls);
                    }
                    int nLarge = LargeCandidates_1.Count + LargeCandidates_k.Count;

                    #region generate significant csets using FDR method
                    if (chkSignificant.Checked)
                    {
                        // contrast sets of size 1: simple per-node alpha test (no FDR)
                        List <int> CSs_1 = new List <int>();
                        foreach (Node node in LargeCandidates_1)
                        {
                            if (node.p_value <= alpha)
                            {
                                CSet cs = new CSet(dt_cs.cls)
                                {
                                    g_id      = node.id,
                                    C_itemset = node.itemset,
                                    Sup       = (double)node.total / dt_cs.row,
                                    Dev       = node.dev,
                                    Chi       = node.chi
                                };

                                for (ushort x = 0; x < dt_cs.cls; x++)
                                {
                                    double pro = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                                    cs.Pro[x] = pro;
                                }
                                CSs.Add(cs);
                                CSs_1.Add(node.id);
                            }
                        }
                        // contrast sets of size k>1: Benjamini-Hochberg FDR procedure
                        int nLarge_k = LargeCandidates_k.Count;
                        // sort large candidates in ascending order of p-value (much faster than sorting in custom code)
                        List <Node> sortedLargeCandidates_k = LargeCandidates_k.OrderBy(x => x.p_value).ToList();
                        int         kk = 0;
                        // find the cutoff index: largest kk with p(kk) <= (kk+1)*alpha/m
                        for (kk = 0; kk < nLarge_k; kk++)
                        {
                            Node   node = sortedLargeCandidates_k[kk];
                            double fdr  = (double)(kk + 1) * alpha / nLarge_k;
                            if (node.p_value > fdr)
                            {
                                break;
                            }
                        }
                        // keep the first kk candidates (those passing the FDR threshold)
                        for (int j = 0; j < kk; j++)
                        {
                            Node node = sortedLargeCandidates_k[j];
                            if (chkExceptional.Checked) // exceptional contrast sets
                            {
                                // this id is not the node id on lattice
                                // it is used for group id
                                int g_node_id = -1;
                                // check dominant group of this node whether it is different than its parent of size 1
                                foreach (Item item in node.itemset)
                                {
                                    if (CSs_1.Contains(item.id))
                                    {
                                        Node node_it = Lattice[item.id];
                                        if (node.max_group != node_it.max_group)
                                        {
                                            g_node_id = item.id;
                                        }
                                        else
                                        {
                                            // same dominant group but weaker chi than the
                                            // 1-itemset parent: not exceptional, reject
                                            if (node.chi < node_it.chi)
                                            {
                                                g_node_id = -1;
                                                break;
                                            }
                                        }
                                    }
                                }
                                if (g_node_id != -1)
                                {
                                    CSet cs = new CSet(dt_cs.cls)
                                    {
                                        g_id      = g_node_id,
                                        C_itemset = node.itemset,
                                        Sup       = (double)node.total / dt_cs.row,
                                        Dev       = node.dev,
                                        Chi       = node.chi
                                    };
                                    for (ushort x = 0; x < dt_cs.cls; x++)
                                    {
                                        double pro = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                                        cs.Pro[x] = pro;
                                    }
                                    CSs.Add(cs);
                                }
                            }
                            else // only significant contrast sets
                            {
                                CSet cs = new CSet(dt_cs.cls)
                                {
                                    g_id      = node.id,
                                    C_itemset = node.itemset,
                                    Sup       = (double)node.total / dt_cs.row,
                                    Dev       = node.dev,
                                    Chi       = node.chi
                                };
                                for (ushort x = 0; x < dt_cs.cls; x++)
                                {
                                    double pro = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                                    cs.Pro[x] = pro;
                                }
                                CSs.Add(cs);
                            }
                        }
                    }
                    else // only large contrast sets
                    {
                        // contrast sets of size 1
                        foreach (Node node in LargeCandidates_1)
                        {
                            CSet cs = new CSet(dt_cs.cls)
                            {
                                C_itemset = node.itemset,
                                Sup       = (double)node.total / dt_cs.row,
                                Dev       = node.dev,
                                Chi       = node.chi
                            };

                            for (ushort x = 0; x < dt_cs.cls; x++)
                            {
                                double pro = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                                cs.Pro[x] = pro;
                            }
                            CSs.Add(cs);
                        }
                        // contrast sets of size k>1
                        foreach (Node node in LargeCandidates_k)
                        {
                            CSet cs = new CSet(dt_cs.cls)
                            {
                                C_itemset = node.itemset,
                                Sup       = (double)node.total / dt_cs.row,
                                Dev       = node.dev,
                                Chi       = node.chi
                            };

                            for (ushort x = 0; x < dt_cs.cls; x++)
                            {
                                double pro = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                                cs.Pro[x] = pro;
                            }
                            CSs.Add(cs);
                        }
                    }
                    #endregion

                    #region classification
                    int[] t_class = new int[test.row]; // true class
                    int[] p_class = new int[test.row]; // predicted class
                    for (int i = 0; i < test.row; i++)
                    {
                        int[] test_case     = test.data[i];
                        int   true_class    = test_case[test.col - 1];
                        int   predict_class = assignClass(CSs, test_case, test.cls, test.row_cls);
                        t_class[i] = true_class;
                        p_class[i] = predict_class;
                        if (predict_class == true_class)
                        {
                            acc[run] += 1;
                        }
                    }
                    // convert hit count to accuracy fraction for this run
                    acc[run] = Math.Round(acc[run] / test.row, 4);
                    ConfusionMatrix cm = new ConfusionMatrix(t_class, p_class);
                    fsc[run] = Math.Round(cm.FScore, 4);
                    kap[run] = Math.Round(cm.Kappa, 4);
                    #endregion
                }
                sw.Stop();
                // NOTE(review): timing covers all runs including test-set evaluation,
                // not training alone, despite the label below.
                long timeMining = sw.ElapsedMilliseconds;
                tnode.Nodes.Add("Training time: " + timeMining / 1000.0 + " (s)");
                // report metrics averaged over the n_run splits
                tnode.Nodes.Add("Default class: " + DEFAULT_CLASS + ". Accuracy: " + acc.Sum() / n_run +
                                ". F1-score: " + fsc.Sum() / n_run + ". Kappa: " + kap.Sum() / n_run);
            }
            else // only mine exceptional csets
            {
                // reset mining state (class-level fields)
                nNode             = 0;
                nIdenticalSup     = 0; nDecreasingSig = 0; nIndependence = 0;
                LargeCandidates_1 = new List <Node>();
                LargeCandidates_k = new List <Node>();
                CSs           = new List <CSet>();
                Lattice       = new Dictionary <int, Node>();
                DEFAULT_CLASS = 0;

                // use dt if only mine exceptional csets
                // use train for classification
                Dataset dt_cs = dt;
                // obtain candidates that contain 1-itemsets
                findCandidates(dt_cs, minDev);
                List <Node> Lr = Lattice.Values.ToList();
                // create lattice
                CREATE_LATTICE(Lr, minDev, dt_cs.row, dt_cs.cls, dt_cs.row_cls);
                // find large and non-redundant csets
                foreach (Node node in Lattice.Values)
                {
                    FIND_LARGE_CS(node, minDev, dt_cs.row, dt_cs.cls, dt_cs.row_cls);
                }
                int nLarge = LargeCandidates_1.Count + LargeCandidates_k.Count;

                #region generate significant csets using FDR method
                if (chkSignificant.Checked)
                {
                    // contrast sets of size 1: simple per-node alpha test (no FDR)
                    List <int> CSs_1 = new List <int>();
                    foreach (Node node in LargeCandidates_1)
                    {
                        if (node.p_value <= alpha)
                        {
                            CSet cs = new CSet(dt_cs.cls)
                            {
                                g_id      = node.id,
                                C_itemset = node.itemset,
                                Sup       = (double)node.total / dt_cs.row,
                                Dev       = node.dev,
                                Chi       = node.chi
                            };

                            for (ushort x = 0; x < dt_cs.cls; x++)
                            {
                                double pro = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                                cs.Pro[x] = pro;
                            }
                            CSs.Add(cs);
                            CSs_1.Add(node.id);
                        }
                    }
                    // contrast sets of size k>1: Benjamini-Hochberg FDR procedure
                    int nLarge_k = LargeCandidates_k.Count;
                    // sort large candidates in ascending order of p-value (much faster than sorting in custom code)
                    List <Node> sortedLargeCandidates_k = LargeCandidates_k.OrderBy(x => x.p_value).ToList();
                    int         kk = 0;
                    // find the cutoff index: largest kk with p(kk) <= (kk+1)*alpha/m
                    for (kk = 0; kk < nLarge_k; kk++)
                    {
                        Node   node = sortedLargeCandidates_k[kk];
                        double fdr  = (double)(kk + 1) * alpha / nLarge_k;
                        if (node.p_value > fdr)
                        {
                            break;
                        }
                    }
                    // keep the first kk candidates (those passing the FDR threshold)
                    for (int j = 0; j < kk; j++)
                    {
                        Node node = sortedLargeCandidates_k[j];
                        if (chkExceptional.Checked) // exceptional contrast sets
                        {
                            // this id is not the node id on lattice
                            // it is used for group id
                            int g_node_id = -1;
                            // check dominant group of this node whether it is different than its parent of size 1
                            foreach (Item item in node.itemset)
                            {
                                if (CSs_1.Contains(item.id))
                                {
                                    Node node_it = Lattice[item.id];
                                    if (node.max_group != node_it.max_group)
                                    {
                                        g_node_id = item.id;
                                    }
                                    else
                                    {
                                        // same dominant group but weaker chi than the
                                        // 1-itemset parent: not exceptional, reject
                                        if (node.chi < node_it.chi)
                                        {
                                            g_node_id = -1;
                                            break;
                                        }
                                    }
                                }
                            }
                            if (g_node_id != -1)
                            {
                                CSet cs = new CSet(dt_cs.cls)
                                {
                                    g_id      = g_node_id,
                                    C_itemset = node.itemset,
                                    Sup       = (double)node.total / dt_cs.row,
                                    Dev       = node.dev,
                                    Chi       = node.chi
                                };
                                for (ushort x = 0; x < dt_cs.cls; x++)
                                {
                                    double pro = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                                    cs.Pro[x] = pro;
                                }
                                CSs.Add(cs);
                            }
                        }
                        else // only significant contrast sets
                        {
                            CSet cs = new CSet(dt_cs.cls)
                            {
                                g_id      = node.id,
                                C_itemset = node.itemset,
                                Sup       = (double)node.total / dt_cs.row,
                                Dev       = node.dev,
                                Chi       = node.chi
                            };
                            for (ushort x = 0; x < dt_cs.cls; x++)
                            {
                                double pro = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                                cs.Pro[x] = pro;
                            }
                            CSs.Add(cs);
                        }
                    }
                }
                else // only large contrast sets
                {
                    // contrast sets of size 1
                    foreach (Node node in LargeCandidates_1)
                    {
                        CSet cs = new CSet(dt_cs.cls)
                        {
                            C_itemset = node.itemset,
                            Sup       = (double)node.total / dt_cs.row,
                            Dev       = node.dev,
                            Chi       = node.chi
                        };

                        for (ushort x = 0; x < dt_cs.cls; x++)
                        {
                            double pro = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                            cs.Pro[x] = pro;
                        }
                        CSs.Add(cs);
                    }
                    // contrast sets of size k>1
                    foreach (Node node in LargeCandidates_k)
                    {
                        CSet cs = new CSet(dt_cs.cls)
                        {
                            C_itemset = node.itemset,
                            Sup       = (double)node.total / dt_cs.row,
                            Dev       = node.dev,
                            Chi       = node.chi
                        };

                        for (ushort x = 0; x < dt_cs.cls; x++)
                        {
                            double pro = (double)node.Obidset[x].Count / dt_cs.row_cls[x];
                            cs.Pro[x] = pro;
                        }
                        CSs.Add(cs);
                    }
                }
                #endregion

                sw.Stop();
                long timeMining = sw.ElapsedMilliseconds;

                #region print results
                tnode.Nodes.Add("Training time: " + timeMining / 1000.0 + " (s)");
                tnode.Nodes.Add("Identical: " + nIdenticalSup + ". Decrease: " + nDecreasingSig + ". Independence: " + nIndependence);
                if (chkSignificant.Checked)
                {
                    if (chkExceptional.Checked)
                    {
                        tnode.Nodes.Add("Large: " + nLarge + ". Exceptional: " + CSs.Count());
                    }
                    else
                    {
                        tnode.Nodes.Add("Large: " + nLarge + ". Significant: " + CSs.Count());
                    }
                }
                else // large csets
                {
                    tnode.Nodes.Add("Large: " + nLarge);
                }

                // optionally export the mined contrast sets to a CSV on the Desktop
                if (chkOutput.Checked)
                {
                    string _sFileDict = txtDict.Text.Trim();
                    Dictionary <int, string[]> dict = readDict(_sFileDict);
                    if (CSs.Count > 0)
                    {
                        // sort contrast sets based on group id
                        List <CSet> sortedCSs = CSs.OrderBy(x => x.g_id).ToList();
                        string      file_path = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
                        if (chkSignificant.Checked)
                        {
                            if (chkExceptional.Checked)
                            {
                                file_path += @"\ECSM_Exceptional (" + alpha + ", " + txtMinDev.Text + "%)";
                            }
                            else
                            {
                                file_path += @"\ECSM_Significant (" + alpha + ", " + txtMinDev.Text + "%)";
                            }
                        }
                        else
                        {
                            file_path += @"\ECSM_Large (" + txtMinDev.Text + "%)";
                        }
                        writeCS(file_path + ".csv", dict, sortedCSs, dt_cs.cls);
                    }
                    MessageBox.Show("Finish!");
                }
                #endregion
            }
        }
Example 2
0
        /// <summary>
        /// Scans every (attribute, value) pair in <paramref name="dt"/> to build the
        /// 1-itemset candidate nodes, collects each node's per-class Obidset (lists of
        /// transaction ids), and adds to <c>Lattice</c> every node whose maximum
        /// per-class proportion reaches <paramref name="minDev"/>. Increments the
        /// class-level counter <c>nNode</c> for each node added.
        /// </summary>
        /// <param name="dt">Dataset to scan; the last column of each row is the class label.</param>
        /// <param name="minDev">Minimum proportion (fraction, not percent) for a node to be kept.</param>
        void findCandidates(Dataset dt, double minDev)
        {
            // array contains items (distinct values of one attribute; at most dt.row of them)
            Node[] items = new Node[dt.row];
            // real # items
            int nN = 0;
            int k  = 0;

            // for each attribute: i (last column is the class, so it is skipped)
            for (int i = 0; i < dt.col - 1; i++)
            {
                nN = 0;
                // for each transaction: j
                for (int j = 0; j < dt.row; j++)
                {
                    // check old items
                    for (k = 0; k < nN; k++)
                    {
                        if (items[k].itemset[0].val == dt.data[j][i])
                        {
                            // find Obidset_i: record this transaction under its class
                            items[k].Obidset[dt.data[j][dt.col - 1]].Add(j);
                            break;
                        }
                    }
                    // new item (inner loop ran to completion without a match)
                    if (k == nN)
                    {
                        Item item = new Item();
                        item.att  = i;
                        item.val  = dt.data[j][i];
                        items[nN] = new Node(dt.cls);
                        items[nN].itemset.Add(item);
                        // find Obidset_i
                        items[nN].Obidset[dt.data[j][dt.col - 1]].Add(j);
                        nN++;
                    }
                }

                for (k = 0; k < nN; k++)
                {
                    Node node = items[k];
                    // compute dev: spread between the largest and smallest
                    // per-class proportion; both start from class 0
                    double min_pro = (double)node.Obidset[0].Count / dt.row_cls[0];
                    double max_pro = (double)node.Obidset[0].Count / dt.row_cls[0];
                    for (ushort x = 0; x < dt.cls; x++)
                    {
                        double pro = (double)node.Obidset[x].Count / dt.row_cls[x];
                        if (pro > max_pro)
                        {
                            max_pro        = pro;
                            node.max_group = x;
                        }
                        if (pro < min_pro)
                        {
                            min_pro = pro;
                        }
                        // compute pos: class with the largest absolute Obidset
                        // (use the Count property, not the LINQ Count() extension)
                        if (node.Obidset[x].Count > node.Obidset[node.pos].Count)
                        {
                            node.pos = x;
                        }
                        // compute total
                        node.total += node.Obidset[x].Count;
                    }
                    // check if this node is large
                    if (max_pro >= minDev)
                    {
                        node.id            = nNode;
                        node.itemset[0].id = nNode;
                        // increase # nodes on lattice
                        nNode++;
                        node.max_pro = max_pro;
                        // compute support diff
                        node.dev = max_pro - min_pro;
                        // convert att to bit representation
                        // (integer shift: exact for all i < 63, unlike the former
                        // (long)Math.Pow(2, i) which loses precision beyond 2^52)
                        node.att = 1L << i;
                        Lattice.Add(node.id, node);
                    }
                }
            }
        }