Stores information about clustering of CanvasSegment objects
Пример #1
0
        /// <summary>
        /// Helper function for ModelDeviation. Summarizes, for every cluster, the weighted median
        /// and weighted IQR of its segment distances, and returns the summed median distance of the
        /// clusters whose median lies below 1.5 x <paramref name="tempDeviation"/>.
        /// </summary>
        /// <param name="segments">Segments with 1-based cluster assignments; segments whose cluster or MAF equals -1 are skipped.</param>
        /// <param name="numClusters">Number of clusters; cluster IDs are expected to lie in 1..numClusters.</param>
        /// <param name="tempDeviation">Reference deviation; a cluster whose median distance is not below 1.5 times this value is treated as heterogeneous.</param>
        /// <param name="heterogeneousClusters">Receives the number of clusters flagged as heterogeneous (0 unless <paramref name="bestModel"/> is true).</param>
        /// <param name="heterogeneityIndex">Receives the result of ComputeHeterogeneityIndex when <paramref name="bestModel"/> is true, otherwise 0.</param>
        /// <param name="bestModel">When true, heterogeneous clusters are recorded and the heterogeneity index is computed.</param>
        /// <param name="debugPathClusterInfo">Optional output path; when non-null, one tab-separated "median, IQR" line per cluster is written to it.</param>
        /// <returns>Sum of the median distances of all clusters that are below the heterogeneity threshold.</returns>
        protected double ClusterDeviation(List<SegmentInfo> segments, int numClusters, double tempDeviation, out int heterogeneousClusters, out double heterogeneityIndex, bool bestModel, string debugPathClusterInfo = null)
        {
            // one summary record per cluster; record at index k describes cluster ID k + 1
            List<ClusterModel> clusterDeviations = new List<ClusterModel>(numClusters);
            for (int clusterID = 0; clusterID < numClusters; clusterID++)
            {
                ClusterModel clusterInfo = new ClusterModel();
                clusterInfo.ClusterID = clusterID + 1;
                clusterDeviations.Add(clusterInfo);
            }

            // populate clusters with (distance, weight) pairs of their member segments
            foreach (SegmentInfo info in segments)
            {
                if (info.Cluster.Value != -1 && info.MAF != -1)
                {
                    clusterDeviations[info.Cluster.Value - 1].ClusterDistances.Add(Tuple.Create(Convert.ToSingle(info.Distance), Convert.ToSingle(info.Weight)));
                }
            }

            // compute weighted median and weighted IQR of the distances in each cluster
            foreach (ClusterModel clusterInfo in clusterDeviations)
            {
                clusterInfo.ClusterMedianDistance = Convert.ToDouble(CanvasCommon.Utilities.WeightedMedian(clusterInfo.ClusterDistances));
                clusterInfo.ClusterDistanceIQR = Convert.ToDouble(CanvasCommon.Utilities.WeightedIQR(clusterInfo.ClusterDistances));
            }

            // exclude clusters with deviation larger than 1.5 of the reference deviation:
            // these clusters locate far from expected model centroids and most likely represent segments coming from heterogeneous variants.
            // Each cluster is visited exactly once so that only the offending cluster's own ID is flagged
            // and its median contributes to the sum a single time.
            double clusterDeviation = 0;
            HashSet<int> heterogeneousClusterID = new HashSet<int>();
            for (int i = 0; i < numClusters; i++)
            {
                double medianDistance = clusterDeviations[i].ClusterMedianDistance;
                if (medianDistance < tempDeviation * 1.5f)
                {
                    clusterDeviation += medianDistance;
                }
                else if (bestModel)
                {
                    heterogeneousClusterID.Add(i + 1);
                }
            }

            // store signatures of potential heterogeneous variants
            if (heterogeneousClusterID.Count > 0 && bestModel)
            {
                foreach (SegmentInfo info in segments)
                {
                    if (heterogeneousClusterID.Contains(info.Cluster.Value))
                    {
                        this.HeterogeneousSegmentsSignature.Add(info.Segment.Begin + info.Segment.End + info.Segment.Counts.Count);
                    }
                }
            }
            heterogeneousClusters = heterogeneousClusterID.Count;

            // derive heterogeneity index (only meaningful for the best model)
            if (bestModel)
            {
                heterogeneityIndex = ComputeHeterogeneityIndex(clusterDeviations, tempDeviation);
            }
            else
            {
                heterogeneityIndex = 0;
            }

            // write cluster deviations: one "median<TAB>IQR" line per cluster
            if (debugPathClusterInfo != null)
            {
                using (StreamWriter debugWriter = new StreamWriter(debugPathClusterInfo))
                {
                    foreach (ClusterModel clusterInfo in clusterDeviations)
                    {
                        debugWriter.WriteLine("{0:F3}\t{1:F3}", clusterInfo.ClusterMedianDistance, clusterInfo.ClusterDistanceIQR);
                    }
                }
            }
            return clusterDeviation;
        }