/// <summary>
/// Helper function for ModelDeviation. Groups segments by their assigned cluster,
/// computes the weighted median and weighted IQR of the segment distances in each
/// cluster, and returns the summed median distance of the clusters that lie within
/// 1.5 x <paramref name="tempDeviation"/>.
/// </summary>
/// <param name="segments">Segments to evaluate. Segments whose cluster is -1 or whose MAF is -1 are ignored.</param>
/// <param name="numClusters">Number of clusters; cluster IDs are 1-based (1..numClusters).</param>
/// <param name="tempDeviation">Reference deviation; a cluster whose median distance is not below 1.5 times this value is treated as heterogeneous.</param>
/// <param name="heterogeneousClusters">Number of clusters flagged as heterogeneous. Always 0 unless <paramref name="bestModel"/> is true.</param>
/// <param name="heterogeneityIndex">Result of ComputeHeterogeneityIndex when <paramref name="bestModel"/> is true; otherwise 0.</param>
/// <param name="bestModel">When true, heterogeneous clusters are recorded, their segment signatures are added to HeterogeneousSegmentsSignature, and the heterogeneity index is computed.</param>
/// <param name="debugPathClusterInfo">Optional path; when not null, one tab-separated line (median distance, distance IQR) per cluster is written to this file.</param>
/// <returns>Sum of the median distances of all clusters whose median distance is below 1.5 x <paramref name="tempDeviation"/>.</returns>
protected double ClusterDeviation(List<SegmentInfo> segments, int numClusters, double tempDeviation, out int heterogeneousClusters, out double heterogeneityIndex, bool bestModel, string debugPathClusterInfo = null)
{
    // clusters whose median distance reaches this multiple of tempDeviation are treated as heterogeneous
    const double HeterogeneityThresholdFactor = 1.5;

    // create one (initially empty) model per cluster; cluster IDs are 1-based
    List<ClusterModel> clusterDeviations = new List<ClusterModel>();
    for (int clusterIndex = 0; clusterIndex < numClusters; clusterIndex++)
    {
        ClusterModel clusterInfo = new ClusterModel();
        clusterInfo.ClusterID = clusterIndex + 1;
        clusterDeviations.Add(clusterInfo);
    }

    // populate clusters with (distance, weight) pairs of their member segments
    foreach (SegmentInfo info in segments)
    {
        if (info.Cluster.Value != -1 && info.MAF != -1)
        {
            // cluster IDs are 1-based, list positions are 0-based
            clusterDeviations[info.Cluster.Value - 1].ClusterDistances.Add(
                Tuple.Create(Convert.ToSingle(info.Distance), Convert.ToSingle(info.Weight)));
        }
    }

    // compute the weighted median and weighted IQR of the distances in each cluster
    foreach (ClusterModel clusterInfo in clusterDeviations)
    {
        clusterInfo.ClusterMedianDistance = Convert.ToDouble(CanvasCommon.Utilities.WeightedMedian(clusterInfo.ClusterDistances));
        clusterInfo.ClusterDistanceIQR = Convert.ToDouble(CanvasCommon.Utilities.WeightedIQR(clusterInfo.ClusterDistances));
    }

    // exclude clusters with deviation larger than 1.5 of average deviation:
    // these clusters locate far from expected model centroids and most likely
    // represent segments coming from heterogeneous variants.
    // Each cluster is examined exactly once, so its median contributes once to the
    // total and only the clusters that actually fail the threshold are flagged.
    double deviationThreshold = tempDeviation * HeterogeneityThresholdFactor;
    double clusterDeviation = 0;
    HashSet<int> heterogeneousClusterIDs = new HashSet<int>();
    for (int clusterIndex = 0; clusterIndex < numClusters; clusterIndex++)
    {
        ClusterModel clusterInfo = clusterDeviations[clusterIndex];
        if (clusterInfo.ClusterMedianDistance < deviationThreshold)
        {
            clusterDeviation += clusterInfo.ClusterMedianDistance;
        }
        else if (bestModel)
        {
            // same 1-based ID that was assigned to this cluster above
            heterogeneousClusterIDs.Add(clusterIndex + 1);
        }
    }

    // store signatures of potential heterogeneous variants
    if (heterogeneousClusterIDs.Count > 0 && bestModel)
    {
        foreach (SegmentInfo info in segments)
        {
            if (heterogeneousClusterIDs.Contains(info.Cluster.Value))
            {
                this.HeterogeneousSegmentsSignature.Add(info.Segment.Begin + info.Segment.End + info.Segment.Counts.Count);
            }
        }
    }
    heterogeneousClusters = heterogeneousClusterIDs.Count;

    // derive heterogeneity index (only meaningful for the best model)
    if (bestModel)
    {
        heterogeneityIndex = ComputeHeterogeneityIndex(clusterDeviations, tempDeviation);
    }
    else
    {
        heterogeneityIndex = 0;
    }

    // write cluster deviations
    if (debugPathClusterInfo != null)
    {
        using (StreamWriter debugWriter = new StreamWriter(debugPathClusterInfo))
        {
            foreach (ClusterModel clusterInfo in clusterDeviations)
            {
                debugWriter.WriteLine("{0:F3}\t{1:F3}", clusterInfo.ClusterMedianDistance, clusterInfo.ClusterDistanceIQR);
            }
        }
    }

    return clusterDeviation;
}