Пример #1
0
        /// <summary>
        /// Builds a new cluster holding the sequences of <paramref name="C1"/> followed by those of
        /// <paramref name="C2"/>, and fills in its class proportions, purity and majority class.
        /// </summary>
        /// <param name="C1">First cluster to merge.</param>
        /// <param name="C2">Second cluster to merge.</param>
        /// <param name="arr_label">Label array handed to the class-proportion and majority-class computations.</param>
        /// <param name="distanceThreshold">The merge is refused when the inter-cluster distance is greater than or equal to this value.</param>
        /// <param name="arr_input_file">Forwarded unchanged to <c>InterClusterDistance</c>.</param>
        /// <param name="Dmatrix">Forwarded unchanged to <c>InterClusterDistance</c>.</param>
        /// <param name="distance_type">Forwarded unchanged to <c>InterClusterDistance</c>.</param>
        /// <returns>The merged cluster, or <c>null</c> when either input is <c>null</c> or the pair is too far apart.</returns>
        public Cluster Merge(Cluster C1, Cluster C2, string[] arr_label, double distanceThreshold, string[] arr_input_file, double[,] Dmatrix, string distance_type)
        {
            // Short-circuit evaluation guarantees the distance is only computed for two non-null clusters.
            bool missingInput = C1 == null || C2 == null;
            if (missingInput || InterClusterDistance(C1, C2, arr_input_file, Dmatrix, distance_type) >= distanceThreshold)
            {
                return null;
            }

            Cluster merged = new Cluster();

            // Copy the members of C1 first, then those of C2, keeping the order within each cluster.
            foreach (Cluster part in new[] { C1, C2 })
            {
                foreach (Sequence member in part.ListOfSeq)
                {
                    merged.ListOfSeq.Add(member);
                }
            }

            // Purity is derived from the class proportions, so the proportions have to be set first.
            merged.arr_class_proportion = merged.computeClassProportion(merged.ListOfSeq, arr_label);
            merged.purity = merged.computePurity(merged.arr_class_proportion);
            merged.majority_class = merged.getMajorityClass(arr_label);

            return merged;
        }
Пример #2
0
        /// <summary>
        /// Returns every cluster of <paramref name="ListOfClusters"/> that is a Gabriel neighbor of
        /// <paramref name="C"/>. A candidate is rejected as soon as some third cluster X satisfies
        /// d(C, candidate)^2 &gt;= d(C, X)^2 + d(X, candidate)^2.
        /// </summary>
        /// <param name="C">Cluster whose neighbors are wanted; clusters with the same clusterID are skipped.</param>
        /// <param name="ListOfClusters">Clusters used both as candidates and as potential blockers.</param>
        /// <param name="arr_input_file">Forwarded unchanged to <c>InterClusterDistance</c>.</param>
        /// <param name="Dmatrix">Forwarded unchanged to <c>InterClusterDistance</c>.</param>
        /// <param name="distance_type">Forwarded unchanged to <c>InterClusterDistance</c>.</param>
        /// <returns>The neighbors, in the order they appear in <paramref name="ListOfClusters"/>.</returns>
        public List<Cluster> GabrielNeighbor(Cluster C, List<Cluster> ListOfClusters, string[] arr_input_file, double[,] Dmatrix, string distance_type)
        {
            List<Cluster> neighbors = new List<Cluster>();

            for (int candidateIndex = 0; candidateIndex < ListOfClusters.Count; candidateIndex++)
            {
                Cluster candidate = ListOfClusters[candidateIndex];
                if (candidate.clusterID == C.clusterID)
                {
                    continue;
                }

                double squaredToCandidate = Math.Pow(InterClusterDistance(C, candidate, arr_input_file, Dmatrix, distance_type), 2);

                // Look for a third cluster that blocks the edge C -- candidate; stop at the first one found.
                bool blocked = false;
                for (int witnessIndex = 0; witnessIndex < ListOfClusters.Count && !blocked; witnessIndex++)
                {
                    Cluster witness = ListOfClusters[witnessIndex];
                    if (witness.clusterID == candidate.clusterID || witness.clusterID == C.clusterID)
                    {
                        continue;
                    }

                    double toWitness = InterClusterDistance(C, witness, arr_input_file, Dmatrix, distance_type);
                    double witnessToCandidate = InterClusterDistance(witness, candidate, arr_input_file, Dmatrix, distance_type);

                    blocked = squaredToCandidate >= Math.Pow(toWitness, 2) + Math.Pow(witnessToCandidate, 2);
                }

                if (!blocked)
                {
                    neighbors.Add(candidate);
                }
            }

            return neighbors;
        }
Пример #3
0
        /// <summary>
        /// Entry point. Reads a labelled dataset and its distance matrix, builds a binary merge tree by
        /// repeatedly merging the cluster/neighbor pair whose merged purity is highest, and writes four
        /// files: the dendrogram table, the Newick string, the list of nodes kept after purity-based
        /// pruning, and the per-depth purity table.
        /// </summary>
        /// <param name="args">
        /// args[0]: path to the input dataset (one comma-separated line per object, class label in the last field);
        /// args[1]: path to the comma-separated distance matrix;
        /// args[2]: path of the dendrogram output file.
        /// Missing arguments fall back to the hard-coded paths below.
        /// </param>
        static void Main(string[] args)
        {
            // All inputs and outputs are comma-separated files, so numbers must be parsed and written with
            // '.' as decimal separator whatever the regional settings of the machine are.
            System.Threading.Thread.CurrentThread.CurrentCulture = System.Globalization.CultureInfo.InvariantCulture;

            /*Step1--------------BEGIN Import Distance Matrix and Dataset-------------------------------------*/

            //--------------PRUNING purity threshold (percent) ---------
            double thetha = 85;
            //----------------------------------------------------------

            int iteration;
            System.Console.WriteLine(DateTime.Now.ToString("hh:mm:ss tt"));
            string distance_type = "AVERAGELINK";
            string graphType = "GabrielGraph"; //or set as "OneNN"

            string str_dendogram = "iterationID,merdeCluster iterationID,clusterID,purity,child1ID,child2ID,d(child1;Parent),d(child2;Parent),intraD(mergedcluster)";

            int cluster_count = 0;

            string input_file_path = args.Length > 0
                ? args[0]
                : "C:\\Users\\chong\\Desktop\\PhD\\inputData\\BFH\\m_BFH_file.csv";
            string distance_matix_path = args.Length > 1
                ? args[1]
                : "C:\\Users\\chong\\Desktop\\PhD\\inputData\\BFH\\m_BFH.csv";
            string ouput_file_path = args.Length > 2
                ? args[2]
                : "C:\\Users\\chong\\Desktop\\PhD\\inputData\\BFH\\m_BFH_output.csv";

            string pruned_tree_path = "C:\\Users\\chong\\Desktop\\PhD\\inputData\\BFH\\str_editedTree_" + thetha + "3.csv";
            string newick_path = "C:\\Users\\chong\\Desktop\\PhD\\inputData\\BFH\\newick_" + thetha + "3.csv";
            string depth_purity_path = "C:\\Users\\chong\\Desktop\\PhD\\inputData\\BFH\\depth_purity_" + thetha + "3.csv";

            /* Import dataset */
            string[] arr_input_file = System.IO.File.ReadAllLines(input_file_path);
            string[] arr_distance_matrix = System.IO.File.ReadAllLines(distance_matix_path);
            int root_Data_size = arr_input_file.Length;
            int distance_matrix_size = arr_distance_matrix.Length;
            double distanceThreshold = 1.0; //default set distance threshold equal to one

            //double checking if input is correct
            if (root_Data_size != distance_matrix_size)
            {
                ExitWithError("ERROR 001: Input file and distance matrix do not match in size");
                return;
            }
            if (root_Data_size == 0)
            {
                // Without this check the array sizes computed below would be negative.
                ExitWithError("ERROR 002: Input file is empty");
                return;
            }

            Sequence[] matrixSequence = new Sequence[root_Data_size]; //used as temporary variable for convenience
            Cluster[] arr_mergedCluster = new Cluster[arr_input_file.Length - 1];//contains the merged clusters only
            Cluster[] tree = new Cluster[2 * arr_input_file.Length - 1];//ALL clusters bottom up: initial clusters first, then merged clusters
            /*---------------END Import Distance Matrix and Dataset----------------------------------------------*/

            /*STEP2:-----------Fill matrices---------------------------------------------------------------------*/
            string[] arr_line_label = new string[arr_input_file.Length]; //class label of each line
            string[] arr_species = new string[arr_input_file.Length];    //"<first field> / <label>" of each line
            string str = "";                                             //distinct labels joined with ';'
            for (int i = 0; i < arr_input_file.Length; i++)
            {
                string[] arr_temp = arr_input_file[i].Split(new[] { ',' });
                string line_label = arr_temp[arr_temp.Length - 1];

                if (str == "")
                {
                    str = line_label;
                }
                else if (Array.IndexOf(arr_line_label, line_label, 0, i) < 0)
                {
                    // Only the labels of the previous lines are filled in, so only those are searched.
                    str = str + ";" + line_label;
                }

                arr_species[i] = arr_temp[0] + " / " + line_label;
                arr_line_label[i] = line_label;
            }

            //get label vector. Distinct label values
            string[] arr_label = str.Split(new[] { ';' });

            double[,] Dmatrix = new double[distance_matrix_size, distance_matrix_size];
            for (int i = 0; i < distance_matrix_size; i++)
            {
                string[] arr_temp = arr_distance_matrix[i].Split(new[] { ',' });
                if (arr_temp.Length > distance_matrix_size)
                {
                    ExitWithError("ERROR 003: Distance matrix row " + (i + 1) + " has more columns than the matrix has rows");
                    return;
                }
                for (int j = 0; j < arr_temp.Length; j++)
                {
                    Dmatrix[i, j] = Convert.ToDouble(arr_temp[j], System.Globalization.CultureInfo.InvariantCulture);
                }
            }


            /* STEP3:-----CREATE OBJECTS---------------------------------------------*/
            //- create sequences and initial sequence matrix
            for (int i = 0; i < arr_input_file.Length; i++)
            {
                Sequence s = new Sequence();
                s.sequencePos = i;
                s.sequenceValue = arr_input_file[i];//including the class label. need to separate them later
                s.label = arr_line_label[i];
                matrixSequence[i] = s;
            }


            iteration = -1;
            //- Create initial Clusters-Start with a clustering X of one-object clusters
            ClusterOperation O = new ClusterOperation();
            List<Cluster> ListOfClusters = new List<Cluster>();
            for (int i = 0; i < matrixSequence.Length; i++)
            {
                Cluster c = new Cluster();
                List<Sequence> ListOfSeq = new List<Sequence>();
                ListOfSeq.Add(matrixSequence[i]);//list of one element initially

                c.ListOfSeq.Add(matrixSequence[i]);
                cluster_count = cluster_count + 1;
                c.clusterID = cluster_count; // IDs are 1-based, so cluster k is stored at tree[k - 1]
                c.intraClusterDistance = 0.0;
                c.listOfMedoid.Add(matrixSequence[i]);
                c.arr_class_proportion = c.computeClassProportion(ListOfSeq, arr_label);
                c.purity = c.computePurity(c.arr_class_proportion);
                c.child1ID = -1;
                c.child2ID = -1;
                c.label = arr_line_label[i];
                c.majority_class = arr_line_label[i];
                ListOfClusters.Add(c);
                iteration = iteration + 1;
                tree[iteration] = c;

                //for flat file only
                str_dendogram = str_dendogram + "\r\n" + cluster_count + "," + "0" + "," + c.clusterID + "," + c.purity + "," + c.child1ID + "," + c.child2ID + ",-1" + ",-1";
            }

            /*-----------------------------------------------------------------*/

            //at this point we have all leaf-clusters. Each cluster has only one element.
            //Next: find the neighbors of each one-element cluster.
            for (int i = 0; i < ListOfClusters.Count; i++)
            {
                if (graphType == "OneNN")
                {
                    ListOfClusters[i].OneNN = O.OneNN(ListOfClusters[i], ListOfClusters, arr_input_file, Dmatrix, distance_type);//contains the 1NN of cluster
                }
                else if (graphType == "GabrielGraph")
                {
                    ListOfClusters[i].GN = O.GabrielNeighbor(ListOfClusters[i], ListOfClusters, arr_input_file, Dmatrix, distance_type);//contains the gabriel graph cluster
                }
            }

            foreach (Cluster cluster in ListOfClusters)
            {
                System.Console.WriteLine("\r\n \r\n clusterID: " + cluster.clusterID);
                System.Console.WriteLine("===One Nearest Neighbors List====>");

                for (int j = 0; j < cluster.GN.Count; j++)
                    System.Console.Write("cluster.GN[j]:" + cluster.GN[j].clusterID + ":");
            }
            System.Console.WriteLine("");
            Console.Read();


            /* Agglomeration: each pass merges exactly one pair, so n inputs need n - 1 passes. */
            int mergedC_iteration = -1;

            while (ListOfClusters.Count > 1)
            {
                Cluster[] best_GN_cluster = new Cluster[ListOfClusters.Count];
                double[] best_GN_mergedPurity = new double[ListOfClusters.Count];

                /* For every cluster, try merging it with each of its neighbors and remember the
                 * neighbor that gives the purest merged cluster. */
                for (int i = 0; i < ListOfClusters.Count; i++)
                {
                    double temp_max_purity = -1.0;
                    foreach (Cluster g_neighor in ListOfClusters[i].GN)
                    {
                        Cluster C = O.Merge(ListOfClusters[i], g_neighor, arr_label, distanceThreshold, arr_input_file, Dmatrix, distance_type);//this operation creates a new cluster
                        if (C != null && (temp_max_purity < C.purity))
                        {
                            best_GN_mergedPurity[i] = C.purity;
                            best_GN_cluster[i] = g_neighor;
                            temp_max_purity = C.purity;
                        }
                    }
                }

                //now select the best cluster to merge; clusters without any mergeable neighbor are not eligible
                double max_purity = -1.0;
                int best_i = -1;
                for (int i = 0; i < ListOfClusters.Count; i++)
                {
                    if (best_GN_cluster[i] != null && max_purity < best_GN_mergedPurity[i])
                    {
                        max_purity = best_GN_mergedPurity[i];
                        best_i = i;
                    }
                }

                Cluster first_child = best_i >= 0 ? ListOfClusters[best_i] : null;
                Cluster second_child = best_i >= 0 ? best_GN_cluster[best_i] : null;
                Cluster best_merged_Cluster = O.Merge(first_child, second_child, arr_label, distanceThreshold, arr_input_file, Dmatrix, distance_type);//this operation creates a new cluster
                if (best_merged_Cluster == null)
                {
                    // Nothing can be merged any more; the tree cannot be completed.
                    ExitWithError("ERROR 004: No pair of neighboring clusters can be merged below the distance threshold");
                    return;
                }

                cluster_count = cluster_count + 1;
                best_merged_Cluster.clusterID = cluster_count;
                best_merged_Cluster.child1ID = first_child.clusterID;
                best_merged_Cluster.child2ID = second_child.clusterID;

                best_merged_Cluster.arr_class_proportion = best_merged_Cluster.computeClassProportion(best_merged_Cluster.ListOfSeq, arr_label);
                best_merged_Cluster.purity = best_merged_Cluster.computePurity(best_merged_Cluster.arr_class_proportion);

                ListOfClusters.Remove(first_child);
                ListOfClusters.Remove(second_child);
                if (best_merged_Cluster.GN.Count == 0)
                {
                    if (graphType == "OneNN")
                    {
                        best_merged_Cluster.OneNN = O.OneNN(best_merged_Cluster, ListOfClusters, arr_input_file, Dmatrix, distance_type);
                    }
                    else if (graphType == "GabrielGraph")
                    {
                        best_merged_Cluster.GN = O.GabrielGraph(best_merged_Cluster, ListOfClusters, arr_input_file, Dmatrix, distance_type);
                    }
                }

                //in the remaining clusters, neighbor entries pointing at either child now point at the merged cluster
                for (int i = 0; i < ListOfClusters.Count; i++)
                {
                    for (int j = 0; j < ListOfClusters[i].GN.Count; j++)
                    {
                        if (ListOfClusters[i].GN[j].clusterID == first_child.clusterID || ListOfClusters[i].GN[j].clusterID == second_child.clusterID)
                            ListOfClusters[i].GN[j] = best_merged_Cluster;
                    }
                }

                ListOfClusters.Add(best_merged_Cluster);

                // add this to tree
                iteration = iteration + 1;
                tree[iteration] = best_merged_Cluster;
                mergedC_iteration = mergedC_iteration + 1;
                arr_mergedCluster[mergedC_iteration] = best_merged_Cluster; // merged clusters only, no singletons
            }

            System.Console.WriteLine(DateTime.Now.ToString("hh:mm:ss tt"));
            System.Console.WriteLine("");
            System.Console.WriteLine("--------------END-----------------");
            Console.Read();

            /* Newick text of every node, indexed by clusterID - 1: leaves first, then merged clusters. */
            string[] arr_Newick = new string[arr_line_label.Length + arr_mergedCluster.Length];
            for (int i = 0; i < arr_line_label.Length; i++)
            {
                arr_Newick[i] = arr_species[i];
            }

            int int_count = arr_line_label.Length - 1;

            for (int iter = 0; iter < arr_mergedCluster.Length; iter++)
            {
                Cluster merged = arr_mergedCluster[iter];
                double intra_d = 0.0;//average intra cluster distance (not computed; written as 0)

                // Children always have smaller IDs than their parent, so their text is already available.
                string child1 = arr_Newick[merged.child1ID - 1];
                double d1 = O.InterClusterDistance(merged, tree[merged.child1ID - 1], arr_input_file, Dmatrix, distance_type);
                d1 = Math.Round(d1, 3);

                string child2 = arr_Newick[merged.child2ID - 1];
                double d2 = O.InterClusterDistance(merged, tree[merged.child2ID - 1], arr_input_file, Dmatrix, distance_type);
                d2 = Math.Round(d2, 3);

                int_count = int_count + 1;
                string thisClusterPurity = (Math.Round(merged.purity, 3)).ToString();

                arr_Newick[int_count] = "(" + child1 + ":" + d1 + "," + child2 + ":" + d2 + ")P=" + thisClusterPurity;
                str_dendogram = str_dendogram + "\r\n" + (iter + arr_line_label.Length + 1) + "," + merged.clusterID + "," + merged.clusterID + "," + merged.purity + "," + merged.child1ID + "," + merged.child2ID + "," + d1 + "," + d2 + "," + intra_d;
            }
            // Written once, after the table is complete, instead of once per merged cluster.
            System.IO.File.WriteAllText(ouput_file_path, str_dendogram);

            // The last merged cluster is the root; its text is the whole tree.
            string str_Newick = "";
            if (arr_mergedCluster.Length > 0)
            {
                str_Newick = "\"" + arr_Newick[arr_Newick.Length - 1] + ";\"";
            }
            System.IO.File.WriteAllText(newick_path, str_Newick);



            //postprocessing; pruning, based on the purity threshold thetha. Works on a copy of the tree.
            Cluster[] tree_temp = new Cluster[2 * arr_input_file.Length - 1];
            for (int iter = 0; iter < tree_temp.Length; iter++)
            {
                Cluster new_c = CopyClusterFields(tree[iter]);
                new_c.in_reduced_tree = "N";
                tree_temp[iter] = new_c;
            }

            // Only merged clusters (stored after the leaves) are examined.
            for (int iter = arr_line_label.Length; iter < tree_temp.Length; iter++)
            {
                Cluster node = tree_temp[iter];
                if (100 * node.purity < thetha && node.in_reduced_tree != "Y")
                {
                    Cluster firstChild = tree_temp[node.child1ID - 1];
                    Cluster secondChild = tree_temp[node.child2ID - 1];

                    node.in_reduced_tree = "Y";
                    System.Console.WriteLine("tree_temp[iter].purity: " + node.purity);
                    System.Console.WriteLine("tree_temp[tree_temp[iter].child1ID - 1]: " + firstChild.purity + " nodeid: " + firstChild.clusterID);
                    System.Console.WriteLine("tree_temp[tree_temp[iter].child2ID - 1]: " + secondChild.purity + " nodeid: " + secondChild.clusterID);

                    // An impure node with at least one sufficiently pure child keeps both children, marked as leaves.
                    if (100 * firstChild.purity >= thetha || 100 * secondChild.purity >= thetha)
                    {
                        firstChild.node_type = "L";
                        secondChild.node_type = "L";
                        firstChild.in_reduced_tree = "Y";
                        secondChild.in_reduced_tree = "Y";
                    }
                }
            }

            // One kept clusterID per line.
            string str_reduced_node = "";
            bool first_time = true;
            for (int iter = 0; iter < tree_temp.Length; iter++)
            {
                if (tree_temp[iter].in_reduced_tree == "Y")
                {
                    if (first_time)
                    {
                        str_reduced_node = tree_temp[iter].clusterID.ToString();
                        first_time = false;
                    }
                    else
                        str_reduced_node = str_reduced_node + "\r\n" + tree_temp[iter].clusterID.ToString();
                }
            }

            System.IO.File.WriteAllText(pruned_tree_path, str_reduced_node);
            /*------------END----------------*/



            /*  ------Purity of each depth-----------------------*/

            // Copy of the tree in which every node also knows its parent.
            List<Cluster> treeClone11 = new List<Cluster>();
            for (int iter = 0; iter < tree.Length; iter++)
            {
                Cluster new_c = CopyClusterFields(tree[iter]);
                for (int i = 0; i < tree_temp.Length; i++)
                {
                    if (tree[iter].clusterID == tree_temp[i].child1ID || tree[iter].clusterID == tree_temp[i].child2ID)
                    {
                        new_c.parentID = tree_temp[i].clusterID;
                        break;
                    }
                }

                treeClone11.Add(new_c);
            }

            for (int iter = 0; iter < treeClone11.Count; iter++)
            {
                System.Console.WriteLine("\r\n treeClone11[i]: " + treeClone11[iter].clusterID + " parent :" + treeClone11[iter].parentID + " cluster purity :" + treeClone11[iter].purity);
            }

            bool bln_continue = true;
            List<Cluster> depth_list = new List<Cluster>();      // nodes of the depth being reported
            List<Cluster> depth_listClone = new List<Cluster>(); // holds the root until its two children are found
            double d_purity = -1;

            // The root is the last cluster that was created.
            depth_list.Add(treeClone11[treeClone11.Count - 1]);
            depth_listClone.Add(treeClone11[treeClone11.Count - 1]);

            int depth_count = 0;
            string str_depth_purity = "Depth#,NodeIDs,%Node, Purity" + "\r\n";
            bool leaf_node = true;
            int subFamilyCount = 0; // running total over all depths, never reset

            while (bln_continue)
            {
                bln_continue = false;
                List<Cluster> this_depth = new List<Cluster>(); // children of the nodes in depth_list

                foreach (Cluster parent in depth_list)
                {
                    int k = 0;
                    leaf_node = true;
                    for (int i = treeClone11.Count - 1; i >= 0; i--) //nodeID
                    {
                        if (treeClone11[i].parentID == parent.clusterID)
                        {
                            leaf_node = false;
                            k = k + 1;
                            this_depth.Add(treeClone11[i]);
                            if ((treeClone11[i].majority_class).Trim() != (parent.majority_class).Trim())
                            {
                                subFamilyCount = subFamilyCount + 1;
                            }

                            if (k == 2)
                            {
                                k = 0;
                                if (depth_listClone.Count > 0)
                                    depth_listClone.Remove(parent);

                                // Both children found: no need to scan the rest of the tree for this parent.
                                if (depth_listClone.Count == 0)
                                    break;
                            }
                        }
                    }
                }

                // Report the current depth when it has children, or when it is a non-empty depth ending in a leaf.
                if ((this_depth.Count > 0 && depth_listClone.Count == 0) || (leaf_node && depth_list.Count > 0))
                {
                    bln_continue = true;
                    d_purity = 0;
                    string str_d = "";
                    int f = 0;
                    foreach (Cluster d in depth_list)
                    {
                        if (f == 0)
                            str_d = (d.clusterID).ToString();
                        else
                            str_d = str_d + ";" + d.clusterID;

                        d_purity = d_purity + d.purity;
                        f = f + 1;
                    }
                    d_purity = d_purity / depth_list.Count;
                    str_d = depth_count.ToString() + "," + str_d + "," + Math.Round((100 * depth_list.Count / Math.Pow(2, depth_count)), 2).ToString() + "," + Math.Round(d_purity, 2).ToString() + ", sub-family: " + subFamilyCount + "\r\n";
                    str_depth_purity = str_depth_purity + str_d;

                    //prepare for next depth
                    depth_list.Clear();
                    foreach (Cluster d in this_depth)
                        depth_list.Add(d);

                    depth_count = depth_count + 1;
                }
            }
            System.IO.File.WriteAllText(depth_purity_path, str_depth_purity);
            /*-----------------------------END-------------------------*/
        }

        /// <summary>
        /// Prints <paramref name="message"/>, waits for a key so the message can be read, and ends the
        /// process with a non-zero exit code.
        /// </summary>
        private static void ExitWithError(string message)
        {
            System.Console.WriteLine(message);
            Console.Read();
            System.Environment.Exit(1);
        }

        /// <summary>
        /// Returns a new cluster whose tree-related fields are copied from <paramref name="source"/>.
        /// List fields are shared with the source, not duplicated.
        /// </summary>
        private static Cluster CopyClusterFields(Cluster source)
        {
            Cluster copy = new Cluster();
            copy.child1ID = source.child1ID;
            copy.child2ID = source.child2ID;
            copy.ListOfSeq = source.ListOfSeq;
            copy.clusterID = source.clusterID;
            copy.intraClusterDistance = source.intraClusterDistance;
            copy.label = source.label;
            copy.listOfMedoid = source.listOfMedoid;
            copy.node_type = source.node_type;
            copy.GN = source.GN;
            copy.purity = source.purity;
            copy.arr_class_proportion = source.arr_class_proportion;
            copy.majority_class = source.majority_class;
            return copy;
        }
Пример #4
0
        /// <summary>
        /// Returns the Gabriel-graph neighbors of <paramref name="C"/> among <paramref name="ListOfClusters"/>.
        /// A cluster N is a neighbor unless some third cluster X satisfies
        /// d(C, N)^2 &gt;= d(C, X)^2 + d(X, N)^2, the same criterion <c>GabrielNeighbor</c> applies.
        /// </summary>
        /// <remarks>
        /// Every other cluster is tested as a possible blocker for every candidate. The previous
        /// incremental version compared candidates only against the neighbors found so far and rejected
        /// a candidate whenever it failed to block one of them, which dropped genuine neighbors
        /// (for C=(0,0), A=(1,0), B=(0,1) it returned only A).
        /// </remarks>
        /// <param name="C">Cluster whose neighbors are wanted; clusters with the same clusterID are skipped.</param>
        /// <param name="ListOfClusters">Clusters used both as candidates and as potential blockers.</param>
        /// <param name="arr_input_file">Forwarded unchanged to <c>InterClusterDistance</c>.</param>
        /// <param name="Dmatrix">Forwarded unchanged to <c>InterClusterDistance</c>.</param>
        /// <param name="distance_type">Forwarded unchanged to <c>InterClusterDistance</c>.</param>
        /// <returns>The neighbors, in the order they appear in <paramref name="ListOfClusters"/>; empty when there are none.</returns>
        public List<Cluster> GabrielGraph(Cluster C, List<Cluster> ListOfClusters, string[] arr_input_file, double[,] Dmatrix, string distance_type)
        {
            List<Cluster> gabrielGpList = new List<Cluster>();

            // Every cluster other than C, with its squared distance to C computed once up front.
            List<Cluster> others = new List<Cluster>();
            List<double> squaredDistanceToC = new List<double>();
            foreach (Cluster Clust in ListOfClusters)
            {
                if (C.clusterID == Clust.clusterID)
                {
                    continue;
                }

                double distance = InterClusterDistance(C, Clust, arr_input_file, Dmatrix, distance_type);
                others.Add(Clust);
                squaredDistanceToC.Add(Math.Pow(distance, 2));
            }

            for (int candidate = 0; candidate < others.Count; candidate++)
            {
                bool isNeighbor = true;

                for (int witness = 0; witness < others.Count; witness++)
                {
                    if (others[witness].clusterID == others[candidate].clusterID)
                    {
                        continue;
                    }

                    double witnessToCandidate = InterClusterDistance(others[witness], others[candidate], arr_input_file, Dmatrix, distance_type);

                    // The witness blocks the edge C -- candidate; one blocker is enough to reject.
                    if (squaredDistanceToC[candidate] >= squaredDistanceToC[witness] + Math.Pow(witnessToCandidate, 2))
                    {
                        isNeighbor = false;
                        break;
                    }
                }

                if (isNeighbor)
                {
                    gabrielGpList.Add(others[candidate]);
                }
            }

            return gabrielGpList;
        }
Пример #5
0
        /// <summary>
        /// Computes a distance summary over the pairs of sequences inside a single cluster.
        /// </summary>
        /// <param name="C">Cluster to measure. MEDOIDLINK reads <c>C.listOfMedoid</c>; every other linkage reads <c>C.ListOfSeq</c>.</param>
        /// <param name="arr_input_file">Forwarded unchanged to <c>Sequence.fetchDistance</c>.</param>
        /// <param name="Dmatrix">Forwarded unchanged to <c>Sequence.fetchDistance</c>.</param>
        /// <param name="distance_type">
        /// Linkage name, case-insensitive and trimmed: singleLink (smallest pair distance),
        /// completeLink (largest), averageLink (mean, rounded to 3 decimals) or medoidLink
        /// (largest distance between medoids). Null or unrecognised names fall back to averageLink.
        /// medianLink is accepted but not implemented and yields 0.0.
        /// </param>
        /// <returns>
        /// The requested distance, or 0.0 when the cluster contains no pair of sequences with
        /// different <c>sequenceValue</c>.
        /// </returns>
        public double IntraClusterDistance(Cluster C, string[] arr_input_file, double[,] Dmatrix, string distance_type)
        {
            // Invariant upper-casing keeps the comparison below independent of the current culture
            // (e.g. the Turkish dotted/dotless 'i'); null is treated like any other unknown name.
            string linkage = (distance_type ?? string.Empty).Trim().ToUpperInvariant();

            if (linkage != "SINGLELINK" && linkage != "COMPLETELINK" && linkage != "AVERAGELINK" && linkage != "MEDOIDLINK" && linkage != "MEDIANLINK")
            {
                linkage = "AVERAGELINK";
            }

            // MEDIANLINK passes validation but has never had an implementation; keep returning 0.0.
            if (linkage == "MEDIANLINK")
            {
                return 0.0;
            }

            Sequence OP = new Sequence();

            System.Collections.IEnumerable members;
            if (linkage == "MEDOIDLINK")
            {
                // There can be several medoids; the two farthest apart define the distance.
                members = C.listOfMedoid;
            }
            else
            {
                members = C.ListOfSeq;
            }

            double sum = 0.0;
            double smallest = double.MaxValue; // a true minimum needs a high starting point, not 0.0
            double largest = 0.0;
            int pairCount = 0;

            foreach (Sequence seq1 in members)
            {
                foreach (Sequence seq2 in members)
                {
                    // Sequences are told apart by value, so equal-valued entries never form a pair.
                    if (seq1.sequenceValue != seq2.sequenceValue)
                    {
                        double d = OP.fetchDistance(seq1, seq2, arr_input_file, Dmatrix);

                        sum = sum + d;
                        pairCount = pairCount + 1;

                        if (d < smallest)
                        {
                            smallest = d;
                        }
                        if (d > largest)
                        {
                            largest = d;
                        }
                    }
                }
            }

            // Singleton or all-identical cluster: nothing to measure (also avoids 0/0 = NaN for the average).
            if (pairCount == 0)
            {
                return 0.0;
            }

            switch (linkage)
            {
                case "AVERAGELINK":
                    return Math.Round(sum / pairCount, 3);

                case "SINGLELINK":
                    return smallest;

                default: // COMPLETELINK and MEDOIDLINK
                    return largest;
            }
        }
Пример #6
0
        /// <summary>
        /// Finds the medoid(s) of a cluster: the sequence(s) whose summed distance to the other
        /// sequences of the cluster is smallest.
        /// </summary>
        /// <param name="C">Cluster whose <c>ListOfSeq</c> is examined.</param>
        /// <param name="arr_input_file">Forwarded unchanged to <c>Sequence.fetchDistance</c>.</param>
        /// <param name="Dmatrix">Forwarded unchanged to <c>Sequence.fetchDistance</c>.</param>
        /// <returns>
        /// Every sequence tied for the minimum summed distance, in cluster order; empty when the
        /// cluster has no sequences.
        /// </returns>
        /// <remarks>
        /// Side effect: each sequence's <c>contributionToIntraClusterDistance</c> is overwritten with
        /// its summed distance. Sequences with equal <c>sequenceValue</c> are not measured against each other.
        /// </remarks>
        public List<Sequence> ComputeMedoid(Cluster C, string[] arr_input_file, double[,] Dmatrix)
        {
            List<Sequence> medoids = new List<Sequence>();
            Sequence OP = new Sequence();

            double minDistance = 0.0;
            bool haveMinimum = false;

            // Pass 1: record every sequence's summed distance and track the smallest one.
            foreach (Sequence seq1 in C.ListOfSeq)
            {
                double total = 0.0;
                foreach (Sequence seq2 in C.ListOfSeq)
                {
                    if (seq1.sequenceValue != seq2.sequenceValue)
                    {
                        total = total + OP.fetchDistance(seq1, seq2, arr_input_file, Dmatrix);
                    }
                }

                seq1.contributionToIntraClusterDistance = total;

                // Only the very first sequence seeds the minimum; afterwards it may only decrease.
                if (!haveMinimum || total < minDistance)
                {
                    minDistance = total;
                    haveMinimum = true;
                }
            }

            // Pass 2: collect every sequence tied for the minimum. Exact equality is safe here
            // because minDistance is a copy of one of the stored totals.
            foreach (Sequence seq in C.ListOfSeq)
            {
                if (seq.contributionToIntraClusterDistance == minDistance)
                {
                    medoids.Add(seq);
                }
            }

            return medoids;
        }
Пример #7
0
        /// <summary>
        /// Returns the nearest neighbor(s) of <paramref name="C"/> among <paramref name="ListOfClusters"/>.
        /// </summary>
        /// <param name="C">Reference cluster; entries sharing its clusterID are never returned.</param>
        /// <param name="ListOfClusters">Candidate clusters.</param>
        /// <param name="arr_input_file">Forwarded unchanged to InterClusterDistance.</param>
        /// <param name="Dmatrix">Forwarded unchanged to InterClusterDistance.</param>
        /// <param name="distance_type">Forwarded unchanged to InterClusterDistance.</param>
        /// <returns>
        /// Every cluster tied for the smallest distance to <paramref name="C"/>, in list order;
        /// empty when the list holds no cluster other than <paramref name="C"/>.
        /// </returns>
        public List<Cluster> OneNN(Cluster C, List<Cluster> ListOfClusters, string[] arr_input_file, double[,] Dmatrix, string distance_type)
        {
            List<Cluster> ListOfNN = new List<Cluster>();
            double minDistance = 0.0;
            bool haveMinimum = false;

            // Single pass: each distance is computed once, and ties are collected as they appear.
            foreach (Cluster Clust in ListOfClusters)
            {
                // C must never be reported as its own nearest neighbor.
                if (C.clusterID == Clust.clusterID)
                {
                    continue;
                }

                double d = InterClusterDistance(C, Clust, arr_input_file, Dmatrix, distance_type);

                // A NaN distance cannot be ordered; skipping it keeps it from poisoning the minimum.
                if (double.IsNaN(d))
                {
                    continue;
                }

                if (!haveMinimum || d < minDistance)
                {
                    // Strictly closer than anything seen so far: restart the tie list.
                    minDistance = d;
                    haveMinimum = true;
                    ListOfNN.Clear();
                    ListOfNN.Add(Clust);
                }
                else if (d == minDistance)
                {
                    ListOfNN.Add(Clust);
                }
            }

            return ListOfNN;
        }