public SAMAlignedSequence ReadSAMAlignedSequence()
{
    if (_parser.IsEOF())
    {
        return null;
    }
    return _parser.GetAlignedSequence(true);
}
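A minimal sketch of how ReadSAMAlignedSequence might be consumed, assuming it lives in a class that initializes the _parser field elsewhere; the ReadAllAlignments wrapper below is illustrative and not part of the original code:

// Hedged usage sketch: drain all alignments through ReadSAMAlignedSequence.
// Relies only on the contract above: the method returns null once the parser reports EOF.
public IEnumerable<SAMAlignedSequence> ReadAllAlignments()
{
    SAMAlignedSequence alignment;
    while ((alignment = ReadSAMAlignedSequence()) != null)
    {
        yield return alignment;
    }
}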
// Search the BAM file for the next valid read aligned against the current contig.
// Update read/base pair statistics.
private void Search_Reads(BAMParser parser, ref SAMAlignedSequence next_alignment, string contig_name,
    ref long number_of_aligned_reads, ref long number_of_aligned_base_pairs,
    ref long number_of_used_reads, ref long number_of_used_base_pairs,
    Queue<Padded_Read> read_queue, long current_position)
{
    while (next_alignment != null && !next_alignment.IsDummyRead && next_alignment.RName == contig_name &&
           (next_alignment.Pos - 1) == current_position)
    {
        // The next alignment overlaps with the current position, so continue.
        number_of_aligned_reads++;
        number_of_aligned_base_pairs += next_alignment.QuerySequence.Count;

        // Maybe the minimum alignment quality should be a parameter.
        // We currently leave it to the user to pre-filter the BAM file.
        if (next_alignment.MapQ > 0)
        {
            number_of_used_reads++;
            number_of_used_base_pairs += next_alignment.QuerySequence.Count;
            read_queue.Enqueue(new Padded_Read(next_alignment));
        }

        #region Parse BAM file until next alignment is found
        if (!parser.IsEOF())
        {
            next_alignment = parser.GetAlignedSequence(true);
            while ((next_alignment == null || next_alignment.RName == "*" || next_alignment.IsDummyRead) &&
                   !parser.IsEOF())
            {
                next_alignment = parser.GetAlignedSequence(true);
            }
        }
        else
        {
            next_alignment = null;
        }
        #endregion Parse BAM file until next alignment is found
    }
}
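The #region that advances the parser past unmapped ("*") and dummy reads reappears in RunFile further down. As a hedged sketch only, the same logic could be factored into a shared helper; the name NextValidAlignment is hypothetical and its contract (return null at end of file) is slightly stricter than the inline loop above:

// Hypothetical helper (not in the original source): advance the parser to the next
// alignment that is mapped (RName != "*") and not a dummy read.
// Returns null when the end of the BAM file is reached before a valid read is found.
private static SAMAlignedSequence NextValidAlignment(BAMParser parser)
{
    while (!parser.IsEOF())
    {
        SAMAlignedSequence alignment = parser.GetAlignedSequence(true);
        if (alignment != null && alignment.RName != "*" && !alignment.IsDummyRead)
        {
            return alignment;
        }
    }
    return null;
}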
/// <summary>
/// Gets aligned sequences in the specified BAM file.
/// </summary>
/// <param name="bamStream">BAM file stream.</param>
private IEnumerable<SAMAlignedSequence> GetAlignedSequence(Stream bamStream)
{
    bool isFilterRequired = IsFilterApplied();
    bool display = true;
    while (!bamparser.IsEOF())
    {
        SAMAlignedSequence alignedSequence = bamparser.GetAlignedSequence(false);
        if (isFilterRequired)
        {
            display = Filter(alignedSequence);
        }
        if (display)
        {
            yield return alignedSequence;
        }
    }
}
/// <summary>
/// Gets aligned sequences in the specified BAM file.
/// </summary>
/// <param name="bamStream">BAM file stream.</param>
private IEnumerable<SAMAlignedSequence> GetAlignedSequence(Stream bamStream)
{
    bool isFilterRequired = IsFilterApplied();
    bool display = true;
    while (!bamparser.IsEOF())
    {
        SAMAlignedSequence alignedSequence = bamparser.GetAlignedSequence(false);

        // TODO: The parser should probably never return a null sequence.
        // This may be a band-aid over a lurking problem; fix in the future.
        if (alignedSequence != null)
        {
            if (isFilterRequired)
            {
                display = Filter(alignedSequence);
            }
            if (display)
            {
                yield return alignedSequence;
            }
        }
    }
}
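The filtering hooks used above (IsFilterApplied, Filter) are defined elsewhere in the tool and their real criteria are not shown here. As an illustration only, a predicate of the following general shape would plug into the loop; the minMapQ field is invented for the example and is not the tool's actual configuration:

// Illustrative only: one possible shape for the filter hooks referenced above.
// The real filter criteria live elsewhere; minMapQ is a made-up field for this sketch.
private int minMapQ = 0;

private bool IsFilterApplied()
{
    // Filtering is considered "applied" only when a threshold has been configured.
    return minMapQ > 0;
}

private bool Filter(SAMAlignedSequence alignedSequence)
{
    // Keep the read only if its mapping quality meets the configured threshold.
    return alignedSequence.MapQ >= minMapQ;
}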
/// <summary>
/// Run ConPADE on each contig of the input BAM file.
/// </summary>
/// <param name="bamName">Name of the input BAM file.</param>
public void RunFile(string bamName)
{
    // Current implementation requires that minimum ploidy be 1.
    int min_ploidy = 1;
    int number_of_ploidies = max_ploidy - min_ploidy + 1;

    // Set nucleotide proportions (genotypes).
    double[][][] nuc_props = Nuc_Props(min_ploidy, number_of_ploidies);

    // Set dosage probabilities.
    double SNP_density = (double)1 / snpDens;
    double no_SNP_prob = Math.Log((1 - SNP_density) / 2);
    double[][] dose_probs = Dose_Probs(min_ploidy, number_of_ploidies, SNP_density, no_SNP_prob);

    // Set HiSeq error model.
    double[, , , ,] log_probs = Error_Probs();

    // Set substitution model.
    double[, ,] log_subst_probs = Subst_Probs();

    // Set SNP calling probability.
    double log_SNP_thres = SNPthres * Math.Log(10) / -10;

    Stopwatch clock = new Stopwatch();
    Console.WriteLine("Program started at {0}\n", DateTime.Now);

    Stream bam_stream = new FileStream(bamName, FileMode.Open, FileAccess.Read);
    BAMParser parser = new BAMParser();
    SAMAlignmentHeader header = parser.GetHeader(bam_stream);
    string temp = Path.GetFileNameWithoutExtension(bamName);

    // Find first valid alignment in BAM file.
    SAMAlignedSequence next_alignment = parser.GetAlignedSequence(true);
    while (next_alignment == null || next_alignment.RName == "*" || next_alignment.IsDummyRead)
    {
        next_alignment = parser.GetAlignedSequence(true);
    }

    TextWriter writer_log_like = null;
    TextWriter writer_SNP = null;
    TextWriter writer_ploidy = null;
    TextWriter writer_reads = null;

    // Create global output files and write headers.
    if (!splitContigs)
    {
        string SNP_file = temp + "_SNP.txt";
        writer_SNP = new StreamWriter(SNP_file);
        writer_SNP.WriteLine("Contig\tPosition\tAlleles\tCounts\tDosage\tPhredQuality");

        string ploidy_file = temp + "_ploidy.txt";
        writer_ploidy = new StreamWriter(ploidy_file);
        writer_ploidy.Write("Contig\tBestPloidy");
        for (int i = 0; i < number_of_ploidies; i++)
        {
            writer_ploidy.Write("\tlogLike_M{0}", i + min_ploidy);
        }
        writer_ploidy.WriteLine("");

        string reads_file = temp + "_readStats.txt";
        writer_reads = new StreamWriter(reads_file);
        writer_reads.WriteLine("Contig\tAlignedReads\tAlignedBases\tUsedReads\tUsedBases");
    }

    // Run over each contig in input BAM file.
    int contig_ind = -1;
    while (next_alignment != null && next_alignment.RName != "*" && !next_alignment.IsDummyRead)
    {
        string contig_name = next_alignment.RName;
        Console.WriteLine("Started contig {0} at {1}", contig_name, DateTime.Now);
        clock.Restart();

        #region Variables and file handles for current contig
        long number_of_aligned_reads = 0;
        long number_of_aligned_base_pairs = 0;
        long number_of_used_reads = 0;
        long number_of_used_base_pairs = 0;

        // Create individual output files for the current contig.
        if (splitContigs)
        {
            string name = temp + "_" + contig_name;
            string log_like_file = name + "_log_likelihoods.txt";
            writer_log_like = new StreamWriter(log_like_file);
            string SNP_file = name + "_SNP.txt";
            writer_SNP = new StreamWriter(SNP_file);
            string ploidy_file = name + "_ploidy.txt";
            writer_ploidy = new StreamWriter(ploidy_file);
            string reads_file = name + "_readStats.txt";
            writer_reads = new StreamWriter(reads_file);
        }

        double[] global_log_like = new double[number_of_ploidies];

        while (header.ReferenceSequences[++contig_ind].Name != contig_name)
        {
            // Advance to the header entry for the current contig.
        }
        long contig_length = header.ReferenceSequences[contig_ind].Length;

        // Create a queue to include all reads that overlap with a given position.
        Queue<Padded_Read> read_queue = new Queue<Padded_Read>();
        // Create a queue to include best doses for each tested position.
        Queue<Best_Dose> dose_queue = new Queue<Best_Dose>((int)contig_length);
        #endregion Variables and file handles for current contig

        int positions_to_compute = 0;
        long current_position = 0;

        #region Run over every position in contig
        while (current_position < contig_length)
        {
            if ((current_position % 1000000) == 0 && current_position != 0)
            {
                Console.WriteLine("At position {0} of {1}", current_position + 1, contig_length);
            }

            // Search for reads starting at current position.
            Search_Reads(parser, ref next_alignment, contig_name, ref number_of_aligned_reads,
                ref number_of_aligned_base_pairs, ref number_of_used_reads, ref number_of_used_base_pairs,
                read_queue, current_position);

            if (read_queue.Count > 0)
            {
                positions_to_compute++;

                // Extract information from each read in queue.
                byte[] obs_nucs;
                byte[] is_GG;
                bool[] reverse;
                int[] quality_scores;
                int[] neigh_quality_scores;
                int[] scores;
                int[] counts;
                int k;
                Extract_Read_Info(read_queue, current_position, out obs_nucs, out is_GG, out reverse,
                    out quality_scores, out neigh_quality_scores, out scores, out counts, out k);

                // Find two most abundant nucleotides for this position.
                byte nuc_one;
                byte nuc_two;
                Get_Two_Nucs(scores, out nuc_one, out nuc_two);

                // Calculate Pr(obs|allele1) and Pr(obs|allele2).
                double[][] log_nuc_probs = Obs_Probs(log_probs, log_subst_probs, obs_nucs, is_GG, reverse,
                    quality_scores, neigh_quality_scores, counts, k, nuc_one, nuc_two);

                // Calculate log_likelihoods of genotypes for current position.
                double[][] log_likelihoods = Log_Likelihoods(min_ploidy, max_ploidy, log_nuc_probs, nuc_props);

                // Calculate log_likelihood of each ploidy and keep most likely allele dosage.
                Global_Likelihood_Keep_Dose(min_ploidy, number_of_ploidies, dose_probs, global_log_like,
                    dose_queue, current_position, counts, nuc_one, nuc_two, log_likelihoods);
            }

            // Remove finished reads from queue. Finished reads no longer overlap with current position.
            Padded_Read read_to_remove;
            if (read_queue.Count > 0)
            {
                read_to_remove = read_queue.First();
            }
            else
            {
                read_to_remove = null;
            }
            while (read_to_remove != null &&
                   (read_to_remove.alignment.Pos + read_to_remove.alignment_length - 2) < current_position)
            {
                read_queue.Dequeue();
                if (read_queue.Count > 0)
                {
                    read_to_remove = read_queue.First();
                }
                else
                {
                    read_to_remove = null;
                }
            }

            ++current_position;
        }
        #endregion Run over every position in contig

        // Output log_likelihoods.
        int best_log_like = 0;
        for (int i = 0; i < number_of_ploidies; i++)
        {
            if (global_log_like[i] > global_log_like[best_log_like])
            {
                best_log_like = i;
            }
            if (splitContigs)
            {
                writer_log_like.WriteLine("Ploidy {0} - log_likelihood {1}", i + min_ploidy, global_log_like[i]);
            }
        }

        // Output most likely ploidy.
        int best_ploidy = best_log_like + min_ploidy;
        if (splitContigs)
        {
            writer_ploidy.WriteLine(best_ploidy);
        }
        else
        {
            writer_ploidy.Write("{0}\t{1}", contig_name, best_ploidy);
            for (int i = 0; i < number_of_ploidies; i++)
            {
                writer_ploidy.Write("\t{0}", global_log_like[i]);
            }
            writer_ploidy.WriteLine("");
        }

        // Output SNPs.
        if (splitContigs)
        {
            writer_SNP.WriteLine("Position\tAlleles\tCounts\tDosage\tPhredQuality");
        }
        char[] nuc_chars = new char[4] { 'A', 'C', 'G', 'T' };
        foreach (Best_Dose cur_doses in dose_queue)
        {
            double cur_SNP_posterior = cur_doses.SNP_posterior[best_log_like];
            if (cur_SNP_posterior <= log_SNP_thres)
            {
                int cur_best_dose = cur_doses.best_dose[best_log_like];
                if (cur_best_dose != best_ploidy && cur_best_dose != 0)
                {
                    if (splitContigs)
                    {
                        writer_SNP.WriteLine("{0}\t{1}|{2}\t{3}|{4}\t{5}\t{6}", cur_doses.position + 1,
                            nuc_chars[cur_doses.nuc_one], nuc_chars[cur_doses.nuc_two],
                            cur_doses.count_one, cur_doses.count_two, cur_best_dose,
                            -10 * cur_SNP_posterior / Math.Log(10));
                    }
                    else
                    {
                        writer_SNP.WriteLine("{0}\t{1}\t{2}|{3}\t{4}|{5}\t{6}\t{7}", contig_name,
                            cur_doses.position + 1, nuc_chars[cur_doses.nuc_one], nuc_chars[cur_doses.nuc_two],
                            cur_doses.count_one, cur_doses.count_two, cur_best_dose,
                            -10 * cur_SNP_posterior / Math.Log(10));
                    }
                }
            }
        }

        // Output read statistics.
        if (splitContigs)
        {
            writer_reads.WriteLine("\nNumber of aligned reads: {0}", number_of_aligned_reads);
            writer_reads.WriteLine("Number of aligned base pairs: {0}", number_of_aligned_base_pairs);
            writer_reads.WriteLine("\nNumber of used reads: {0}", number_of_used_reads);
            writer_reads.WriteLine("Number of used base pairs: {0}", number_of_used_base_pairs);
        }
        else
        {
            writer_reads.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", contig_name, number_of_aligned_reads,
                number_of_aligned_base_pairs, number_of_used_reads, number_of_used_base_pairs);
        }

        if (splitContigs)
        {
            writer_log_like.Close();
            writer_SNP.Close();
            writer_ploidy.Close();
            writer_reads.Close();
        }

        clock.Stop();
        Console.WriteLine("Time to run contig: {0} s\n", (double)clock.ElapsedMilliseconds / 1000);
    }

    if (!splitContigs)
    {
        writer_SNP.Close();
        writer_ploidy.Close();
        writer_reads.Close();
    }

    parser.Dispose();
    Console.WriteLine("Finished at {0}\n", DateTime.Now);
}
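A minimal, hedged sketch of driving RunFile from a console entry point. Only RunFile(string bamName) is taken from the code above; the ConPADE class name used here and the absence of any option parsing (max_ploidy, snpDens, SNPthres, splitContigs are configured elsewhere in the tool) are assumptions for illustration:

// Hedged usage sketch: invoke RunFile on a BAM file passed on the command line.
// The ConPADE class name is assumed; output files (*_SNP.txt, *_ploidy.txt, *_readStats.txt)
// are named after the input BAM file, as in the method above.
public static void Main(string[] args)
{
    if (args.Length < 1)
    {
        Console.WriteLine("Usage: conpade <input.bam>");
        return;
    }

    ConPADE conpade = new ConPADE();
    conpade.RunFile(args[0]);
}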