public Cluster Merge(Cluster C1, Cluster C2, string[] arr_label, double distanceThreshold, string[] arr_input_file, double[,] Dmatrix, string distance_type)
{
    // Refuse the merge when either cluster is missing or the two clusters are at least distanceThreshold apart.
    if (C1 == null || C2 == null) { return null; }
    if (InterClusterDistance(C1, C2, arr_input_file, Dmatrix, distance_type) >= distanceThreshold) { return null; }
    // The merged cluster takes over the sequences of both children.
    Cluster C = new Cluster();
    foreach (Sequence seq in C1.ListOfSeq) { C.ListOfSeq.Add(seq); }
    foreach (Sequence seq in C2.ListOfSeq) { C.ListOfSeq.Add(seq); }
    // Recompute class proportions, purity and majority class for the merged cluster.
    C.arr_class_proportion = C.computeClassProportion(C.ListOfSeq, arr_label);
    C.purity = C.computePurity(C.arr_class_proportion);
    C.majority_class = C.getMajorityClass(arr_label);
    //System.Console.WriteLine("MergedclusterID" + C.clusterID + "\r\n -->cluster 1ID: " + C1.clusterID + "-->cluster 2ID:" + C2.clusterID + " \r\n merged majority class-->" + C.majority_class);
    //Console.Read();
    return C;
}
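// Illustration only (not part of the original pipeline): a minimal sketch of how merged-cluster
// purity behaves, assuming purity is the largest class proportion in the cluster. The helper
// name and the counts in the usage note below are hypothetical and exist purely for explanation.
public static double PurityFromCounts(int[] classCounts)
{
    int total = 0;
    int max = 0;
    foreach (int count in classCounts)
    {
        total += count;
        if (count > max) max = count;
    }
    // An empty cluster has no defined purity; report 0 so callers can detect it.
    return total == 0 ? 0.0 : (double)max / total;
}
// Usage note: merging a pure cluster {A,A,A} with {A,B} gives class counts {4,1},
// so PurityFromCounts(new[] { 4, 1 }) == 0.8, i.e. purity drops from 1.0 to 0.8.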
public List<Cluster> GabrielNeighbor(Cluster C, List<Cluster> ListOfClusters, string[] arr_input_file, double[,] Dmatrix, string distance_type)
{
    /* Given a cluster C, this function returns all Gabriel neighbors of C:
       Cnbr is a Gabriel neighbor of C if no third cluster Ci satisfies
       d(C,Cnbr)^2 >= d(C,Ci)^2 + d(Ci,Cnbr)^2. */
    List<Cluster> gabrielGpList = new List<Cluster>();
    foreach (Cluster Cnbr in ListOfClusters)
    {
        if (C.clusterID == Cnbr.clusterID) continue;
        bool isNeighbor = true;
        double distance_C_Cnbr = InterClusterDistance(C, Cnbr, arr_input_file, Dmatrix, distance_type);
        foreach (Cluster Ci in ListOfClusters)
        {
            if (Cnbr.clusterID != Ci.clusterID && C.clusterID != Ci.clusterID)
            {
                double distance_C_Ci = InterClusterDistance(C, Ci, arr_input_file, Dmatrix, distance_type);
                double distance_Ci_Cnbr = InterClusterDistance(Ci, Cnbr, arr_input_file, Dmatrix, distance_type);
                // Ci lies inside the diametral sphere of (C, Cnbr): the pair is not a Gabriel edge.
                if (Math.Pow(distance_C_Cnbr, 2) >= Math.Pow(distance_C_Ci, 2) + Math.Pow(distance_Ci_Cnbr, 2))
                {
                    isNeighbor = false;
                    break;
                }
            }
        }
        if (isNeighbor)
        {
            gabrielGpList.Add(Cnbr);
        }
    }
    return gabrielGpList;
}
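// Illustration only (not part of the original pipeline): the Gabriel condition applied to three
// plain 2-D points, assuming Euclidean distance. The names below are hypothetical and only
// demonstrate the squared-distance test used by GabrielNeighbor above.
public static bool IsGabrielEdge(double[] p, double[] q, double[] witness)
{
    double dpq = SquaredDistance(p, q);
    double dpw = SquaredDistance(p, witness);
    double dwq = SquaredDistance(witness, q);
    // p and q form a Gabriel edge unless the witness lies inside their diametral sphere.
    return dpq < dpw + dwq;
}
private static double SquaredDistance(double[] a, double[] b)
{
    return (a[0] - b[0]) * (a[0] - b[0]) + (a[1] - b[1]) * (a[1] - b[1]);
}
// Usage note: IsGabrielEdge(new[]{0.0,0.0}, new[]{2.0,0.0}, new[]{1.0,0.1}) is false,
// because (1, 0.1) lies inside the circle whose diameter is the segment (0,0)-(2,0).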
static void Main(string[] args) { /*Step1--------------BEGIN Import Distance Matrix and Dataset-------------------------------------*/ //args[0]: Path to input dataset //args[1]: Path to similarity Matrix //--------------PRUNNING purity threshold ------------------- double thetha = 85; //---------------------------------------------------------- string input_file_path; string pruned_tree_path; string newick_path; string ouput_file_path; string distance_matix_path; int iteration; System.Console.WriteLine(DateTime.Now.ToString("hh:mm:ss tt")); string distance_type = "AVERAGELINK"; string graphType = "GabrielGraph"; //or set as "OneNN" string str_dendogram = "iterationID,merdeCluster iterationID,clusterID,purity,child1ID,child2ID,d(child1;Parent),d(child2;Parent),intraD(mergedcluster)"; int cluster_count = 0; if (args.Length > 0) input_file_path = args[0]; else { //input_file_path = "C:\\Users\\paul\\Desktop\\STAXAC\\inputData\\TEST\\test_file1.csv"; // input_file_path = "C:\\Users\\paul\\Desktop\\NJ\\inputData\\BFE\\m_BFE_file.csv"; input_file_path = "C:\\Users\\chong\\Desktop\\PhD\\inputData\\BFH\\m_BFH_file.csv"; } if (args.Length > 1) distance_matix_path = args[1]; else { //distance_matix_path = "C:\\Users\\paul\\Desktop\\STAXAC\\inputData\\TEST\\m_test1.csv"; //distance_matix_path = "C:\\Users\\paul\\Desktop\\NJ\\inputData\\BFE\\m_BFE.csv"; distance_matix_path = "C:\\Users\\chong\\Desktop\\PhD\\inputData\\BFH\\m_BFH.csv"; } if (args.Length > 2) ouput_file_path = args[2]; else { //ouput_file_path = "C:\\Users\\paul\\Desktop\\STAXAC\\inputData\\TEST\\m_test1_output3.csv"; ouput_file_path = "C:\\Users\\chong\\Desktop\\PhD\\inputData\\BFH\\m_BFH_output.csv"; } // pruned_tree_path = "C:\\Users\\paul\\Desktop\\STAXAC\\inputData\\TEST\\str_editedTree_" + thetha + "3.csv"; pruned_tree_path = "C:\\Users\\chong\\Desktop\\PhD\\inputData\\BFH\\str_editedTree_" + thetha + "3.csv"; newick_path = "C:\\Users\\chong\\Desktop\\PhD\\inputData\\BFH\\newick_" + thetha + "3.csv"; string depth_purity_path = "C:\\Users\\chong\\Desktop\\PhD\\inputData\\BFH\\depth_purity_" + thetha + "3.csv"; /* Import dataset */ string[] arr_input_file = System.IO.File.ReadAllLines(input_file_path); string[] arr_distance_matrix = System.IO.File.ReadAllLines(distance_matix_path); int root_Data_size = arr_input_file.Length; int distance_matrix_size = arr_distance_matrix.Length; double distanceThreshold = 1.0; //default set distance threshold equal to one Sequence[] matrixSequence = new Sequence[root_Data_size]; //used as temporary variable for convenience Cluster[] arr_mergedCluster = new Cluster[arr_input_file.Length - 1];//contains the merged cluster Cluster[] tree = new Cluster[2 * arr_input_file.Length - 1];//contains ALL the clusters bottom up; initial clusters + merged clusters; this means contain the tree //double checking if input is correct string error_msg; error_msg = "Run successfully"; if (root_Data_size != distance_matrix_size) { error_msg = "ERROR 001: Input file and distance matrix do not match in size"; System.Console.WriteLine(error_msg); Console.Read(); System.Environment.Exit(0); } /*---------------END Import Distance Matrix and Dataset----------------------------------------------*/ /*STEP2:-----------Fill matrices---------------------------------------------------------------------*/ //label vector for each line string[] arr_line_label = new string[arr_input_file.Length]; string[] arr_species = new string[arr_input_file.Length]; string str = ""; for (int i = 0; i < arr_input_file.Length; i++) { string[] 
arr_temp = arr_input_file[i].Split(new[] { ',' }); if (str == "") { str = arr_temp[arr_temp.Length - 1]; } else { Boolean add_to_list = true; foreach (string label in arr_line_label) { if (label == arr_temp[arr_temp.Length - 1]) add_to_list = false; } if (add_to_list) str = str + ";" + arr_temp[arr_temp.Length - 1]; } arr_species[i] = arr_temp[0] + " / " + arr_temp[arr_temp.Length - 1]; arr_line_label[i] = arr_temp[arr_temp.Length - 1]; } //get label vector. Distinct label values string[] arr_label = str.Split(new[] { ';' }); double[,] Dmatrix = new double[distance_matrix_size, distance_matrix_size]; for (int i = 0; i < distance_matrix_size; i++) { string[] arr_temp = arr_distance_matrix[i].Split(new[] { ',' }); for (int j = 0; j < arr_temp.Length; j++) { Dmatrix[i, j] = Convert.ToDouble(arr_temp[j]); } } /* STEP3:-----CREATE OBJECTS---------------------------------------------*/ //- create sequences and initial sequence matrix for (int i = 0; i < arr_input_file.Length; i++) { //arr_input_file[i] Sequence s = new Sequence(); s.sequencePos = i; s.sequenceValue = arr_input_file[i];//including the class label. need to separate them later s.label = arr_line_label[i]; if (s != null) { matrixSequence[i] = s; } } iteration = -1; //- Create initial Clusters-Start with a clustering X of one-object clusters ClusterOperation O = new ClusterOperation(); List<Cluster> ListOfClusters = new List<Cluster>(); for (int i = 0; i < matrixSequence.Length; i++) { Cluster c = new Cluster(); List<Sequence> ListOfSeq = new List<Sequence>(); ListOfSeq.Add(matrixSequence[i]);//list of one element initially c.ListOfSeq.Add(matrixSequence[i]); cluster_count = cluster_count + 1; c.clusterID = cluster_count; c.intraClusterDistance = 0.0; c.listOfMedoid.Add(matrixSequence[i]); c.arr_class_proportion = c.computeClassProportion(ListOfSeq, arr_label); c.purity = c.computePurity(c.arr_class_proportion); c.child1ID = -1; c.child2ID = -1; c.label = arr_line_label[i]; c.majority_class = arr_line_label[i]; ListOfClusters.Add(c); iteration = iteration + 1; tree[iteration] = c; // add to tree //for flat file only str_dendogram = str_dendogram + "\r\n" + cluster_count + "," + "0" + "," + c.clusterID + "," + c.purity + "," + c.child1ID + "," + c.child2ID + ",-1" + ",-1"; } /*-----------------------------------------------------------------*/ //at this point we have all leaf-cluster. 
Each cluster has only one element /*Step 2: next we need to find the Gabriel Neighbors of each 1-exmple cluster */ for (int i = 0; i < ListOfClusters.Count; i++) { // System.Console.WriteLine("xxx before: " + DateTime.Now.ToString("hh:mm:ss tt") + " " + i); if (graphType == "OneNN") { ListOfClusters[i].OneNN = O.OneNN(ListOfClusters[i], ListOfClusters, arr_input_file, Dmatrix, distance_type);//contains the 1NN of cluster } else if (graphType == "GabrielGraph") { ListOfClusters[i].GN = O.GabrielNeighbor(ListOfClusters[i], ListOfClusters, arr_input_file, Dmatrix, distance_type);//contains the gabriel graph cluster } } foreach (Cluster cluster in ListOfClusters) { System.Console.WriteLine("\r\n \r\n clusterID: " + cluster.clusterID); System.Console.WriteLine("===One Nearest Neighbors List====>"); for (int j = 0; j < cluster.GN.Count; j++) System.Console.Write("cluster.GN[j]:" + cluster.GN[j].clusterID + ":"); } System.Console.WriteLine(""); Console.Read(); /* * Merge-Candidate(C’,C)that is (1NNX(C’)=C or 1NNX(C )=C’) */ int mergedC_iteration = -1; //System.Console.WriteLine("ListOfClusters.Count -->" + ListOfClusters.Count); int NUM_OF_STEPS = ListOfClusters.Count; while (ListOfClusters.Count > 1) { Cluster[] best_GN_cluster = new Cluster[ListOfClusters.Count]; double[] best_GN_mergedPurity = new double[ListOfClusters.Count]; double[] best_GN_distance = new double[ListOfClusters.Count]; /* merge a cluster and its closest G-neighbor and compute impurity growth. * Retain the merge that yields lowest impurity growth */ for (int i = 0; i < ListOfClusters.Count; i++) { double temp_max_purity = -1.0; foreach (Cluster g_neighor in ListOfClusters[i].GN) { Cluster C = O.Merge(ListOfClusters[i], g_neighor, arr_label, distanceThreshold, arr_input_file, Dmatrix, distance_type);//this operation creates a new cluster if (C != null && (temp_max_purity < C.purity)) { best_GN_mergedPurity[i] = C.purity; best_GN_cluster[i] = g_neighor;//the best 1NN is here temp_max_purity = C.purity; //best_1NN_distance[i] = O.InterClusterDistance(ListOfClusters[i], oneNNClist, arr_input_file, Dmatrix, distance_type); } //System.Console.WriteLine(ListOfClusters[i].clusterID + ":" + oneNNClist.clusterID + " : Merge purity: " + C.purity); } } //now we need to select the best cluster to merge double max_purity = -1.0; int best_i = -1; for (int i = 0; i < ListOfClusters.Count; i++) { if (max_purity < best_GN_mergedPurity[i]) { max_purity = best_GN_mergedPurity[i]; best_i = i; } } //we need to construct a new ListOfClusters excluding ListOfClusters[best_i], and best_GN_cluster[best_i] List<Cluster> ListOfClustersClone = new List<Cluster>(); for (int i = 0; i < ListOfClusters.Count; i++) { ListOfClustersClone.Add(ListOfClusters[i]); } Cluster best_merged_Cluster = new Cluster(); //best_merged_Cluster = ListOfClusters[best_i]; best_merged_Cluster = O.Merge(ListOfClustersClone[best_i], best_GN_cluster[best_i], arr_label, distanceThreshold, arr_input_file, Dmatrix, distance_type);//this operation create a new cluster cluster_count = cluster_count + 1; best_merged_Cluster.clusterID = cluster_count; best_merged_Cluster.child1ID = ListOfClustersClone[best_i].clusterID; best_merged_Cluster.child2ID = best_GN_cluster[best_i].clusterID; best_merged_Cluster.arr_class_proportion = best_merged_Cluster.computeClassProportion(best_merged_Cluster.ListOfSeq, arr_label); best_merged_Cluster.purity = best_merged_Cluster.computePurity(best_merged_Cluster.arr_class_proportion); ListOfClusters.Remove(ListOfClustersClone[best_i]); 
ListOfClusters.Remove(best_GN_cluster[best_i]); if (best_merged_Cluster.GN.Count == 0) { if (graphType == "OneNN") { best_merged_Cluster.OneNN = O.OneNN(best_merged_Cluster, ListOfClusters, arr_input_file, Dmatrix, distance_type); } else if (graphType == "GabrielGraph") { best_merged_Cluster.GN = O.GabrielGraph(best_merged_Cluster, ListOfClusters, arr_input_file, Dmatrix, distance_type); } } //update 1NN list for (int i = 0; i < ListOfClusters.Count; i++) { for (int j = 0; j < ListOfClusters[i].GN.Count; j++) { if (ListOfClusters[i].GN[j].clusterID == ListOfClustersClone[best_i].clusterID || ListOfClusters[i].GN[j].clusterID == best_GN_cluster[best_i].clusterID) ListOfClusters[i].GN[j] = best_merged_Cluster; } } ListOfClusters.Add(best_merged_Cluster); // add this to tree iteration = iteration + 1; tree[iteration] = best_merged_Cluster; mergedC_iteration = mergedC_iteration + 1; arr_mergedCluster[mergedC_iteration] = best_merged_Cluster; // this does not contain the singleton clusters. It only contain the merged clusters } System.Console.WriteLine(DateTime.Now.ToString("hh:mm:ss tt")); System.Console.WriteLine(""); System.Console.WriteLine("--------------END-----------------"); Console.Read(); string[] arr_Newick = new string[arr_line_label.Length + arr_mergedCluster.Length]; for (int i = 0; i < arr_line_label.Length; i++) { arr_Newick[i] = arr_species[i]; } int int_count = arr_line_label.Length - 1; for (int iter = 0; iter < arr_mergedCluster.Length; iter++) { //temp_count = temp_count + 1; string child1 = ""; string purity1 = ""; double d1 = 0.0; double intra_d = 0.0;//average intra cluster distance int child_clustr_indice = 0; if (arr_mergedCluster[iter].child1ID < arr_line_label.Length) { // child1=(arr_line_label[arr_mergedCluster[iter].child1ID -1]).ToString(); child1 = (arr_species[arr_mergedCluster[iter].child1ID - 1]).ToString(); purity1 = "1.0"; } else { child1 = arr_Newick[arr_mergedCluster[iter].child1ID - 1]; child_clustr_indice = arr_mergedCluster[iter].child1ID - 1; purity1 = (Math.Round(tree[child_clustr_indice].purity, 3)).ToString(); } //ClusterOperation O = new ClusterOperation(); //intra cluster distance // intra_d = O.IntraClusterDistance(arr_mergedCluster[iter], arr_input_file, Dmatrix, distance_type); d1 = O.InterClusterDistance(arr_mergedCluster[iter], tree[arr_mergedCluster[iter].child1ID - 1], arr_input_file, Dmatrix, distance_type); d1 = Math.Round(d1, 3); string child2 = ""; string purity2 = ""; double d2 = 0.0; if (arr_mergedCluster[iter].child2ID < arr_line_label.Length) { // child2 = (arr_line_label[arr_mergedCluster[iter].child2ID-1]).ToString(); child2 = (arr_species[arr_mergedCluster[iter].child2ID - 1]).ToString(); purity2 = "1.0"; } else { child2 = arr_Newick[arr_mergedCluster[iter].child2ID - 1]; child_clustr_indice = arr_mergedCluster[iter].child2ID - 1; purity2 = (Math.Round(tree[child_clustr_indice].purity, 3)).ToString(); } d2 = O.InterClusterDistance(arr_mergedCluster[iter], tree[arr_mergedCluster[iter].child2ID - 1], arr_input_file, Dmatrix, distance_type); d2 = Math.Round(d2, 3); int_count = int_count + 1; string thisClusterPurity = ""; thisClusterPurity = (Math.Round(arr_mergedCluster[iter].purity, 3)).ToString(); arr_Newick[int_count] = "(" + child1 + ":" + d1 + "," + child2 + ":" + d2 + ")P=" + thisClusterPurity; str_dendogram = str_dendogram + "\r\n" + (iter + arr_line_label.Length + 1) + "," + arr_mergedCluster[iter].clusterID + "," + arr_mergedCluster[iter].clusterID + "," + arr_mergedCluster[iter].purity + "," + 
arr_mergedCluster[iter].child1ID + "," + arr_mergedCluster[iter].child2ID + "," + d1 + "," + d2 + "," + intra_d; System.IO.File.WriteAllText(ouput_file_path, str_dendogram); } string str_Newick = ""; string str_Newick_temp = ""; bool first_time = true; for (int iter = arr_line_label.Length; iter < arr_Newick.Length; iter++) { if (first_time) { str_Newick = str_Newick + arr_Newick[iter]; str_Newick_temp = "\"" + arr_Newick[iter] + ";\""; first_time = false; } else { if (iter == (arr_Newick.Length - 1)) str_Newick = "\"" + arr_Newick[iter] + ";\""; else str_Newick_temp = "\"" + arr_Newick[iter] + ";\""; } } System.IO.File.WriteAllText(newick_path, str_Newick); //postprocessing; pruning. This is based on user-defined purity threshold thetha. CLone the tree Cluster[] tree_temp = new Cluster[2 * arr_input_file.Length - 1];//contains ALL the clusters bottom up; initial clusters + merged clusters; this means contain the tree for (int iter = 0; iter < tree_temp.Length; iter++) { Cluster new_c = new Cluster(); new_c.child1ID = tree[iter].child1ID; new_c.child2ID = tree[iter].child2ID; new_c.ListOfSeq = tree[iter].ListOfSeq; new_c.clusterID = tree[iter].clusterID; new_c.intraClusterDistance = tree[iter].intraClusterDistance; new_c.label = tree[iter].label; new_c.listOfMedoid = tree[iter].listOfMedoid; new_c.node_type = tree[iter].node_type; new_c.GN = tree[iter].GN; new_c.purity = tree[iter].purity; new_c.arr_class_proportion = tree[iter].arr_class_proportion; new_c.majority_class = tree[iter].majority_class; new_c.in_reduced_tree = "N"; tree_temp[iter] = new_c; //make a copy } for (int iter = arr_line_label.Length; iter < tree_temp.Length; iter++) { // the leaf-nodes have child1ID=-1 and child2ID=-1 if (100 * tree_temp[iter].purity < thetha)//then the children are leaf-nodes. 
{ if (tree_temp[iter].in_reduced_tree != "Y") { tree_temp[iter].in_reduced_tree = "Y"; System.Console.WriteLine("tree_temp[iter].purity: " + tree_temp[iter].purity); System.Console.WriteLine("tree_temp[tree_temp[iter].child1ID - 1]: " + tree_temp[tree_temp[iter].child1ID - 1].purity + " nodeid: " + tree_temp[tree_temp[iter].child1ID - 1].clusterID); System.Console.WriteLine("tree_temp[tree_temp[iter].child2ID - 1]: " + tree_temp[tree_temp[iter].child2ID - 1].purity + " nodeid: " + tree_temp[tree_temp[iter].child2ID - 1].clusterID); if (100 * tree_temp[tree_temp[iter].child1ID - 1].purity >= thetha || 100 * tree_temp[tree_temp[iter].child2ID - 1].purity >= thetha) { tree_temp[tree_temp[iter].child1ID - 1].node_type = "L"; tree_temp[tree_temp[iter].child2ID - 1].node_type = "L"; // tree_temp[tree_temp[iter].child1ID - 1].child1ID = -1; // tree_temp[tree_temp[iter].child1ID - 1].child2ID = -1; tree_temp[tree_temp[iter].child1ID - 1].in_reduced_tree = "Y"; tree_temp[tree_temp[iter].child2ID - 1].in_reduced_tree = "Y"; } } } } int count_reduced_node = 0; string str_reduced_node = ""; first_time = true; for (int iter = 0; iter < tree_temp.Length; iter++) { if (tree_temp[iter].in_reduced_tree == "Y") { count_reduced_node = count_reduced_node + 1; if (first_time) { str_reduced_node = tree_temp[iter].clusterID.ToString(); first_time = false; } else str_reduced_node = str_reduced_node + "\r\n" + tree_temp[iter].clusterID.ToString(); } } System.IO.File.WriteAllText(pruned_tree_path, str_reduced_node); /*------------END----------------*/ /* ------Purity of each depth-----------------------*/ List<Cluster> treeClone11 = new List<Cluster>(); List<Cluster> treeClone12 = new List<Cluster>(); // foreach (Node node in tree) for (int iter = 0; iter < tree.Length; iter++) { Cluster new_c = new Cluster(); new_c.child1ID = tree[iter].child1ID; new_c.child2ID = tree[iter].child2ID; new_c.ListOfSeq = tree[iter].ListOfSeq; new_c.clusterID = tree[iter].clusterID; new_c.intraClusterDistance = tree[iter].intraClusterDistance; new_c.label = tree[iter].label; new_c.listOfMedoid = tree[iter].listOfMedoid; new_c.node_type = tree[iter].node_type; new_c.GN = tree[iter].GN; new_c.purity = tree[iter].purity; new_c.arr_class_proportion = tree[iter].arr_class_proportion; new_c.majority_class = tree[iter].majority_class; for (int i = 0; i < tree_temp.Length; i++) { if (tree[iter].clusterID == tree_temp[i].child1ID || tree[iter].clusterID == tree_temp[i].child2ID) { new_c.parentID = tree_temp[i].clusterID; break; } } treeClone11.Add(new_c); //make a copy } string str_debug = "nodeID,parentID,Purity" + "\r\n"; for (int iter = 0; iter < treeClone11.Count; iter++) { treeClone12.Add(tree[iter]); System.Console.WriteLine("\r\n treeClone11[i]: " + treeClone11[iter].clusterID + " parent :" + treeClone11[iter].parentID + " cluster purity :" + treeClone11[iter].purity); str_debug = str_debug + treeClone11[iter].clusterID + "," + treeClone11[iter].parentID + "," + treeClone11[iter].purity + "\r\n"; } bool bln_continue = true; List<Cluster> depth_list = new List<Cluster>(); List<Cluster> depth_listClone = new List<Cluster>(); List<double> depth_purity = new List<Double>(); double d_purity = -1; int test = treeClone11[treeClone11.Count - 1].clusterID; int testP = treeClone11[treeClone11.Count - 1].parentID; depth_list.Add(treeClone11[treeClone11.Count - 1]); depth_listClone.Add(treeClone11[treeClone11.Count - 1]); depth_purity.Add(treeClone11[treeClone11.Count - 2].purity); int depth_count = 0; string str_depth_purity = 
"Depth#,NodeIDs,%Node, Purity" + "\r\n"; bool leaf_node = true; int subFamilyCount = 0; while (bln_continue) { bln_continue = false; List<Cluster> this_depth = new List<Cluster>(); foreach (Cluster parent in depth_list) { int k = 0; bool bln_break = false; leaf_node = true; for (int i = treeClone11.Count - 1; i >= 0; i--) //nodeID { if (treeClone11[i].parentID == parent.clusterID) { leaf_node = false; k = k + 1; this_depth.Add(treeClone11[i]); if ((treeClone11[i].majority_class).Trim() != (parent.majority_class).Trim()) { subFamilyCount = subFamilyCount + 1; } if (k == 2) { k = 0; if (depth_listClone.Count > 0) depth_listClone.Remove(parent); if (depth_listClone.Count == 0) { bln_break = true; break; } } } if (bln_break) break; } } if ((this_depth.Count > 0 && depth_listClone.Count == 0) || (leaf_node = true && depth_list.Count > 0)) { bln_continue = true; d_purity = 0; string str_d = ""; int f = 0; foreach (Cluster d in depth_list) { if (f == 0) str_d = (d.clusterID).ToString(); else str_d = str_d + ";" + d.clusterID; d_purity = d_purity + d.purity; f = f + 1; } d_purity = d_purity / depth_list.Count; str_d = depth_count.ToString() + "," + str_d + "," + Math.Round((100 * depth_list.Count / Math.Pow(2, depth_count)), 2).ToString() + "," + Math.Round(d_purity, 2).ToString() + ", sub-family: " + subFamilyCount + "\r\n"; str_depth_purity = str_depth_purity + str_d; //prepare for next depth depth_list.Clear(); foreach (Cluster d in this_depth) depth_list.Add(d); depth_count = depth_count + 1; } } System.IO.File.WriteAllText(depth_purity_path, str_depth_purity); /*-----------------------------END-------------------------*/ }
public List<Cluster> GabrielGraph(Cluster C, List<Cluster> ListOfClusters, string[] arr_input_file, double[,] Dmatrix, string distance_type)
{
    List<Cluster> gabrielGpList = new List<Cluster>();
    // Key: candidate neighbor cluster; Value: its distance to C.
    // c_NeighborsDict holds the Gabriel-graph neighbors of cluster C found so far.
    Dictionary<Cluster, double> c_NeighborsDict = new Dictionary<Cluster, double>();
    bool isFirstTime = true;
    foreach (Cluster Clust in ListOfClusters)
    {
        if (C.clusterID == Clust.clusterID) { continue; }
        bool isNeighbor = true;
        double distance = InterClusterDistance(C, Clust, arr_input_file, Dmatrix, distance_type);
        if (isFirstTime)
        {
            c_NeighborsDict.Add(Clust, distance);
            isFirstTime = false;
            continue;
        }
        // Iterate over a copy so entries can be removed from c_NeighborsDict inside the loop.
        Dictionary<Cluster, double> tempc_NeighborsDict = new Dictionary<Cluster, double>();
        foreach (KeyValuePair<Cluster, double> entry in c_NeighborsDict)
        {
            tempc_NeighborsDict.Add(entry.Key, entry.Value);
        }
        foreach (KeyValuePair<Cluster, double> entry in tempc_NeighborsDict)
        {
            if (entry.Key.clusterID == Clust.clusterID) { continue; }
            double candidateNeigborsDistance = InterClusterDistance(Clust, entry.Key, arr_input_file, Dmatrix, distance_type);
            // Same squared-distance test as in GabrielNeighbor: entry.Key lies inside the
            // diametral sphere of (C, Clust), so Clust is not a Gabriel neighbor of C.
            if (Math.Pow(distance, 2) >= Math.Pow(entry.Value, 2) + Math.Pow(candidateNeigborsDistance, 2))
            {
                isNeighbor = false;
            }
            // Conversely, Clust lies inside the diametral sphere of (C, entry.Key),
            // so entry.Key is no longer a Gabriel neighbor of C and is dropped.
            if (Math.Pow(entry.Value, 2) > Math.Pow(distance, 2) + Math.Pow(candidateNeigborsDistance, 2))
            {
                c_NeighborsDict.Remove(entry.Key);
            }
        }
        if (isNeighbor)
        {
            c_NeighborsDict.Add(Clust, distance);
        }
    }
    foreach (KeyValuePair<Cluster, double> entry in c_NeighborsDict)
    {
        gabrielGpList.Add(entry.Key);
    }
    return gabrielGpList;
}
public double IntraClusterDistance(Cluster C, string[] arr_input_file, double[,] Dmatrix, string distance_type)
{
    //type of distance measure: single link: SingleLink, complete link: CompleteLink, average link: AverageLink, median link: MedianLink, or MedoidLink
    distance_type = distance_type.Trim();
    //use UPPERCASE
    distance_type = distance_type.ToUpper();
    if (distance_type != "SINGLELINK" && distance_type != "COMPLETELINK" && distance_type != "AVERAGELINK" && distance_type != "MEDOIDLINK" && distance_type != "MEDIANLINK")
        distance_type = "AVERAGELINK";
    Sequence OP = new Sequence();
    /* For each pair of distinct objects in cluster C, fetch their pairwise distance and aggregate it according to the selected linkage. */
    double intra_d = 0.0;
    switch (distance_type)
    {
        case "AVERAGELINK":
            {
                // Average of all pairwise distances (each unordered pair is visited twice, which does not change the mean).
                int totalPairwiseDistances = 0;
                foreach (Sequence seq1 in C.ListOfSeq)
                {
                    foreach (Sequence seq2 in C.ListOfSeq)
                    {
                        if (seq1.sequenceValue != seq2.sequenceValue)
                        {
                            double d = OP.fetchDistance(seq1, seq2, arr_input_file, Dmatrix);
                            intra_d = intra_d + d;
                            totalPairwiseDistances = totalPairwiseDistances + 1;
                        }
                    }
                }
                // A singleton cluster has no pairs; its intra-cluster distance is defined as 0.
                if (totalPairwiseDistances == 0) return 0.0;
                return Math.Round(intra_d / totalPairwiseDistances, 3);
            }
        case "SINGLELINK":
            {
                // Smallest pairwise distance (initialized from the first pair rather than 0, otherwise the minimum would always be 0).
                bool first_pair = true;
                foreach (Sequence seq1 in C.ListOfSeq)
                {
                    foreach (Sequence seq2 in C.ListOfSeq)
                    {
                        if (seq1.sequenceValue != seq2.sequenceValue)
                        {
                            double d = OP.fetchDistance(seq1, seq2, arr_input_file, Dmatrix);
                            if (first_pair || d < intra_d) { intra_d = d; first_pair = false; }
                        }
                    }
                }
                return intra_d;
            }
        case "COMPLETELINK":
            {
                // Largest pairwise distance.
                foreach (Sequence seq1 in C.ListOfSeq)
                {
                    foreach (Sequence seq2 in C.ListOfSeq)
                    {
                        if (seq1.sequenceValue != seq2.sequenceValue)
                        {
                            double d = OP.fetchDistance(seq1, seq2, arr_input_file, Dmatrix);
                            if (d > intra_d) intra_d = d;
                        }
                    }
                }
                return intra_d;
            }
        case "MEDOIDLINK":
            {
                // A cluster can have several medoids; use the largest distance between any two of them.
                foreach (Sequence seq1 in C.listOfMedoid)
                {
                    foreach (Sequence seq2 in C.listOfMedoid)
                    {
                        if (seq1.sequenceValue != seq2.sequenceValue)
                        {
                            double d = OP.fetchDistance(seq1, seq2, arr_input_file, Dmatrix);
                            if (d > intra_d) intra_d = d;
                        }
                    }
                }
                return intra_d;
            }
        default:
            // MEDIANLINK passes the validation above but is not implemented here; fall back to 0.
            return intra_d;
    }//switch
}
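/* Worked example (illustrative values, not taken from the data set): for a cluster with three
   sequences whose pairwise distances are d(1,2)=0.2, d(1,3)=0.4 and d(2,3)=0.6, the nested
   loops above visit each unordered pair twice, so with AVERAGELINK the result is
       (2 * (0.2 + 0.4 + 0.6)) / 6 = 0.4,
   while SINGLELINK returns 0.2 and COMPLETELINK returns 0.6. */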
public List<Sequence> ComputeMedoid(Cluster C, string[] arr_input_file, double[,] Dmatrix)
{
    List<Sequence> temp_medoid = new List<Sequence>();
    Sequence OP = new Sequence();
    Hashtable h_intraClusterDistance = new Hashtable();//hosts the contribution of each sequence to the intra-cluster distance; the key is the sequence value
    /* For each object in this cluster, compute the sum of its distances to all other objects; the object(s) with the smallest sum are the medoid(s). */
    double intraCluster = 0.0;
    foreach (Sequence seq1 in C.ListOfSeq)
    {
        intraCluster = 0;
        foreach (Sequence seq2 in C.ListOfSeq)
        {
            if (seq1.sequenceValue != seq2.sequenceValue)
            {
                double d = OP.fetchDistance(seq1, seq2, arr_input_file, Dmatrix);
                intraCluster = intraCluster + d;
            }
        }
        //the contribution of each sequence is kept in the hash
        seq1.contributionToIntraClusterDistance = intraCluster;
        h_intraClusterDistance.Add(seq1.sequenceValue, seq1);
    }
    // The sequence with the minimum summed distance is the medoid.
    double minDistance = -1.0;
    bool first_time = true;
    foreach (DictionaryEntry entry in h_intraClusterDistance)
    {
        Sequence temp = (Sequence)(entry.Value);
        if (first_time)
        {
            minDistance = temp.contributionToIntraClusterDistance;
            first_time = false; // without this, the running minimum would be reset on every iteration
        }
        if (temp.contributionToIntraClusterDistance < minDistance)
            minDistance = temp.contributionToIntraClusterDistance;
    }
    // The minimum is now known; collect every sequence that attains it (there may be ties).
    foreach (DictionaryEntry entry in h_intraClusterDistance)
    {
        Sequence temp = (Sequence)(entry.Value);
        if (temp.contributionToIntraClusterDistance == minDistance)
            temp_medoid.Add(temp);
    }
    return temp_medoid;
}
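/* Worked example (illustrative values): with the distances d(1,2)=0.2, d(1,3)=0.4 and d(2,3)=0.6,
   the summed distances are 0.6 for sequence 1, 0.8 for sequence 2 and 1.0 for sequence 3,
   so ComputeMedoid returns sequence 1 as the single medoid. */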
public List<Cluster> OneNN(Cluster C, List<Cluster> ListOfClusters, string[] arr_input_file, double[,] Dmatrix, string distance_type)
{
    List<Cluster> ListOfNN = new List<Cluster>();
    double minDistance = 0.0;
    Boolean first_time = true;
    // First pass: find the smallest inter-cluster distance from C to any other cluster.
    foreach (Cluster Clust in ListOfClusters)
    {
        if (C.clusterID != Clust.clusterID)
        {
            double d = InterClusterDistance(C, Clust, arr_input_file, Dmatrix, distance_type);
            if (first_time) { first_time = false; minDistance = d; }
            else if (d < minDistance) { minDistance = d; }
        }
    }
    /* Second pass: collect every cluster Ci (other than C itself) with d(C,Ci) == minDistance, so ties are kept. */
    foreach (Cluster Clust in ListOfClusters)
    {
        if (C.clusterID == Clust.clusterID) continue;
        double d = InterClusterDistance(C, Clust, arr_input_file, Dmatrix, distance_type);
        if (d == minDistance) ListOfNN.Add(Clust);
    }
    return ListOfNN;
}
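// Illustration only (not part of the original pipeline): the symmetric merge-candidate test
// mentioned in Main, i.e. C and Cprime are merge candidates when either one appears among the
// other's nearest neighbors. The helper name is hypothetical; it only re-uses OneNN above.
public bool AreMergeCandidates(Cluster C, Cluster Cprime, List<Cluster> ListOfClusters, string[] arr_input_file, double[,] Dmatrix, string distance_type)
{
    List<Cluster> nnOfC = OneNN(C, ListOfClusters, arr_input_file, Dmatrix, distance_type);
    List<Cluster> nnOfCprime = OneNN(Cprime, ListOfClusters, arr_input_file, Dmatrix, distance_type);
    // Compare by clusterID because the lists hold references to Cluster objects.
    foreach (Cluster nn in nnOfC) { if (nn.clusterID == Cprime.clusterID) return true; }
    foreach (Cluster nn in nnOfCprime) { if (nn.clusterID == C.clusterID) return true; }
    return false;
}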