/// <summary>
/// Processes all currently stored sequences as a single cluster: calculates the cluster
/// metric, classifies the cluster as good or bad, updates the running statistics and
/// writes metric/BAM output where enabled. Afterwards, if the handler has been aborted,
/// the handler is marked complete.
/// </summary>
public void ProcessSequences()
{
    // Work is only done when sequences are present and the handler is not yet complete
    bool hasWork = allSequences != null && allSequences.Count > 0 && !isComplete;

    if (hasWork)
    {
        clusterCount++;
        ClusterMetric clusterMetric = new ClusterMetric(expectedPloidy, numSamples);

        // Prepare the metric and bam output file/s
        InitMetricOutputFiles();
        InitBamOutputFiles();

        // Core metric calculation followed by the filter-based classification
        clusterMetric.Calculate(allSequences);
        bool passesFilters = GoodOrBad(clusterMetric);

        // Haplotype information is only gathered when enabled and the expected ploidy is 2;
        // this step may downgrade the classification
        if (haplotypingEnabled && expectedPloidy == 2)
        {
            GetHaplotypeInfo(ref clusterMetric, ref passesFilters);
        }

        if (passesFilters)
        {
            ++goodCount;
        }

        string metricText = clusterMetric.ToString();
        string verdictText = passesFilters
            ? Properties.Resources.GOOD_CLUSTER
            : Properties.Resources.BAD_CLUSTER;
        Console.WriteLine(metricText + "\t" + verdictText);

        // Fold this cluster into the summary arrays and overview statistics
        CreateSummaryArrays(clusterMetric, passesFilters);
        SetOverviewStats(clusterMetric, passesFilters);

        // Emit the cluster to the metric file/s and/or queue it for the filtered bam file
        WriteToMetricOutputFiles(clusterMetric, passesFilters);
        AddToOutputBamQueueOrDispose(clusterMetric, passesFilters);

        // When bam output is enabled, no write is currently flagged as running and the queue
        // holds sequences, start WriteToBam asynchronously through the delegate.
        // NOTE(review): BeginInvoke is never paired with EndInvoke here - confirm this is intended.
        bool shouldStartBamWriter = writeToFilteredBam && canWriteToBam && bamOutputQueue.Count > 0;
        if (shouldStartBamWriter)
        {
            canWriteToBam = false;
            ClusterDelegate bamWriter = new ClusterDelegate(WriteToBam);
            bamWriter.BeginInvoke(null, null);
        }
    }

    // All processing for the current cluster is done; if the handler has been aborted,
    // perform any final file outputs
    if (aborted)
    {
        SetComplete(false);
    }
}
/// <summary>
/// Writes the PHASE input text for a metric to "genotypes.inp" under fileName. When
/// writeGenotypesFile is set, the same text is also written to the "genotypes.txt" stream,
/// which is created on first use.
/// </summary>
/// <param name="metric">Metric supplying the sample count, locus types and PHASE data</param>
private void WritePhaseGenotypeInput(ClusterMetric metric)
{
    int sampleCount = metric.CountSamples;

    // One character per locus: S for a biallelic (SNP) locus; M for microsatellite, or other
    // multi-allelic locus (eg tri-allelic SNP, or HLA allele)
    string loci = metric.PhaseLoci;
    int locusCount = loci.Length;
    string phaseData = metric.PhaseData;

    // Layout: sample count, locus count, locus types, then the data. The optional position
    // line (e.g. "P 300 1313 1500 2023 5635") is not written.
    string content = string.Concat(
        sampleCount, "\r\n",
        locusCount, "\r\n",
        loci, "\r\n",
        phaseData);

    // While the files cannot be deleted, either the previous phase.exe process is still
    // writing to/reading from one of these files, or the user has one open
    while (true)
    {
        if (DeletePhaseFiles())
        {
            break;
        }

        Console.WriteLine(Properties.Resources.PHASE_ERROR_DELETE);
        Thread.Sleep(20000); // wait 20 seconds before retrying
    }

    using (StreamWriter inputWriter = new System.IO.StreamWriter(fileName + "\\genotypes.inp"))
    {
        inputWriter.WriteLine(content);

        if (writeGenotypesFile)
        {
            // The genotypes stream is opened lazily and kept for later calls
            if (genotypesStream == null)
            {
                genotypesStream = new System.IO.StreamWriter(fileName + "\\genotypes.txt");
            }

            genotypesStream.WriteLine(content);
        }
    }
}
/// <summary>
/// Writes the metric to the enabled per-cluster metric files: the original file takes
/// every cluster, the filtered file takes good clusters only.
/// </summary>
/// <param name="metric">Metric of the cluster to write</param>
/// <param name="isGood">Whether the cluster was classified as good</param>
private void WriteToMetricOutputFiles(ClusterMetric metric, bool isGood)
{
    // Unfiltered output: written regardless of the classification
    if (writeClusterMetricOriginal)
    {
        formatterOriginalFile.Write(metric);
    }

    // Filtered output: written only for good clusters
    bool writeFiltered = writeClusterMetricFiltered && isGood;
    if (writeFiltered)
    {
        formatterFilteredFile.Write(metric);
    }
}
/// <summary>
/// Updates the overview statistics (maxima, totals and running averages) with the values
/// of one cluster metric. Totals are kept so that averages can be obtained without
/// iterating through lists each time.
/// </summary>
/// <param name="metric">Metric of the cluster being added</param>
/// <param name="isGood">Whether the cluster was classified as good</param>
private void SetOverviewStats(ClusterMetric metric, bool isGood)
{
    // Greatest number of samples found so far in any one cluster
    if (metric.CountSamples > maxSampleCount)
    {
        maxSampleCount = metric.CountSamples;
    }

    // Maximum quality values found so far in any one cluster
    if (metric.AlignmentQuality > maxMapQuality)
    {
        maxMapQuality = metric.AlignmentQuality;
    }

    if (metric.ReadQuality > maxReadQuality)
    {
        maxReadQuality = metric.ReadQuality;
    }

    // Totals over all clusters
    readCountTotal += metric.CountAll;
    readCountDistinctTotal += metric.CountDistinct;
    totalDirt += metric.Dirt;
    totalMapQ += metric.AlignmentQuality;
    totalReadQ += metric.ReadQuality;

    // Totals over good clusters only
    if (isGood)
    {
        readCountGood += metric.CountAll;
        readCountDistinctGood += metric.CountDistinct;
        totalDirtGood = totalDirtGood + metric.Dirt;
        totalMapQGood += metric.AlignmentQuality;
        totalReadQGood += metric.ReadQuality;
    }

    // Running averages for all clusters, rounded to two decimal places
    double parsedClusters = (double)numberClustersParsed;
    averageDirt = Math.Round(totalDirt / parsedClusters, 2);
    averageMapQ = Math.Round(totalMapQ / parsedClusters, 2);
    averageReadQ = Math.Round(totalReadQ / parsedClusters, 2);

    // Running averages for good clusters; reported as 0 while no good cluster exists
    if (goodCount > 0)
    {
        double goodClusters = (double)goodCount;
        averageDirtGood = Math.Round(totalDirtGood / goodClusters, 2);
        averageMapQGood = Math.Round(totalMapQGood / goodClusters, 2);
        averageReadQGood = Math.Round(totalReadQGood / goodClusters, 2);
    }
    else
    {
        averageDirtGood = 0;
        averageMapQGood = 0;
        averageReadQGood = 0;
    }
}
/// <summary>
/// Classifies a populated and calculated metric against the handler's filter cutoffs,
/// stores the result in the metric's Good property and returns it.
/// </summary>
/// <param name="tempMetric">Metric to classify; its Good property is set by this call</param>
/// <returns>True when the cluster is good, false when it is bad</returns>
private bool GoodOrBad(ClusterMetric tempMetric)
{
    // A cluster is bad as soon as any single cutoff is violated
    bool violatesCutoff =
        tempMetric.Dirt > dirtCutoff
        || tempMetric.AlignmentQuality < alignQualCutoff
        || tempMetric.ReadQuality < readQualCutoff
        || tempMetric.PopulationPercentage < populationPercentageCutoff
        || tempMetric.PloidyDisagreement > ploidyDisagreementCutoff;

    tempMetric.Good = !violatesCutoff;
    return tempMetric.Good;
}
/// <summary>
/// Obtains the haplotype count for a cluster and stores it in the metric. The count is
/// calculated unless onlyHaplotypeGood is set and the cluster is already bad, in which
/// case -1 is stored instead. A count above hapMaxCutoff marks the cluster as bad.
/// </summary>
/// <param name="metric">Metric that receives the haplotype count and, possibly, a bad flag</param>
/// <param name="isGood">Current classification; set to false when the count exceeds the cutoff</param>
private void GetHaplotypeInfo(ref ClusterMetric metric, ref bool isGood)
{
    // Haplotyping restricted to good clusters and this one is bad: record -1 and stop
    if (onlyHaplotypeGood && !isGood)
    {
        metric.NumberOfHaplotypes = -1;
        return;
    }

    // Write the PHASE input for this cluster, then obtain the haplotype count
    WritePhaseGenotypeInput(metric);
    int haplotypeCount = CalculateClusterHaplotypes();

    if (haplotypeCount > hapMaxCutoff)
    {
        Console.WriteLine(Properties.Resources.HAPLOTYPE_COUNT_BAD + haplotypeCount);
        isGood = false;
        metric.Good = false;
    }

    metric.NumberOfHaplotypes = haplotypeCount;
}
/// <summary>
/// Adds the values of one cluster metric to the summary collections kept for all clusters
/// and, when the cluster is good, to the matching collections kept for good clusters.
/// </summary>
/// <param name="metric">Metric of the cluster being added</param>
/// <param name="isGood">Whether the cluster was classified as good</param>
private void CreateSummaryArrays(ClusterMetric metric, bool isGood)
{
    // Sequence frequencies of this cluster
    clustSeqFrequencies.Add(metric.ClusterSequenceFrequencies);
    if (isGood)
    {
        clustSeqFrequenciesGood.Add(metric.ClusterSequenceFrequencies);
    }

    // Sequence frequency overview
    AddFxAverages(clusterSequenceFrequenciesOverview, metric.ClusterSequenceFrequencies);
    if (isGood)
    {
        AddFxAverages(clusterSequenceFrequenciesOverviewGood, metric.ClusterSequenceFrequencies);
    }

    // Total read count
    var totalReads = metric.CountAll;
    SetDictValueCounts(graphDataAllReads, totalReads);
    if (isGood)
    {
        SetDictValueCounts(graphDataAllReadsGood, totalReads);
    }

    // Distinct read count
    var distinctReads = metric.CountDistinct;
    SetDictValueCounts(graphDataDistinctReads, distinctReads);
    if (isGood)
    {
        SetDictValueCounts(graphDataDistinctReadsGood, distinctReads);
    }

    // Sample count
    var sampleCount = metric.CountSamples;
    SetDictValueCounts(graphDataIndividualsCounts, sampleCount);
    if (isGood)
    {
        SetDictValueCounts(graphDataIndividualsCountsGood, sampleCount);
    }

    // Average distinct read count per sample. The average is rounded to one decimal place
    // and then truncated by the int cast.
    int distinctAverageKey = (int)Math.Round(metric.SampleReadCountsDistinct.Average(), 1);
    SetDictValueCounts(graphDataIndividualsDistinctReadcounts, distinctAverageKey);
    if (isGood)
    {
        SetDictValueCounts(graphDataIndividualsDistinctReadcountsGood, distinctAverageKey);
    }

    // Average total read count per sample, keyed the same way
    int totalAverageKey = (int)Math.Round(metric.SampleReadCountsAll.Average(), 1);
    SetDictValueCounts(graphDataIndividualsTotalReadcounts, totalAverageKey);
    if (isGood)
    {
        SetDictValueCounts(graphDataIndividualsTotalReadcountsGood, totalAverageKey);
    }
}
/// <summary>
/// When filtered BAM output is enabled and the cluster is good, adds the stored sequences
/// to the BAM output queue and to the output header; otherwise resets the metric.
/// </summary>
/// <param name="metric">Metric of the current cluster; reset when the cluster is not queued</param>
/// <param name="isGood">Whether the cluster was classified as good</param>
private void AddToOutputBamQueueOrDispose(ClusterMetric metric, bool isGood)
{
    // Nothing to queue: reset the metric and leave. The parameter is passed by value,
    // so clearing the local reference would not affect the caller.
    if (!(writeToFilteredBam && isGood))
    {
        metric.Reset();
        return;
    }

    // If the output queue has too many sequences in it, block this thread until the
    // queue has drained to below half its size limit
    if (bamOutputQueue.Count >= OUTPUT_QUEUE_SIZE)
    {
        Console.WriteLine(Properties.Resources.PROCESSING_THREAD_PAUSED);
        while (bamOutputQueue.Count >= OUTPUT_QUEUE_SIZE / 2)
        {
            Thread.Sleep(10000); // re-check every 10 seconds
        }
    }

    // Add sequences to the output file queue and the first sequence to the output file header
    bamOutputQueue.Enqueue(allSequences);
    AddToHeader(allSequences[0]);
}