/// <summary>
/// Converts the input BAM to SAM file format.
/// </summary>
/// <remarks>
/// Opens the file at InputFilePath read-only, passes its header to WriteHeader and,
/// unless HeaderOnly is set, passes every sequence returned by GetAlignedSequence
/// to WriteAlignedSequence.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// Reading the header failed; the original failure is attached as the inner exception.
/// </exception>
private void ConvertFromBAMToSAM()
{
    using (Stream bamStream = new FileStream(InputFilePath, FileMode.Open, FileAccess.Read))
    {
        SAMAlignmentHeader bamHeader;
        try
        {
            bamHeader = bamparser.GetHeader(bamStream);
        }
        catch (Exception readFailure)
        {
            throw new InvalidOperationException(Resources.InvalidBAMFile, readFailure);
        }

        WriteHeader(bamHeader);

        // Nothing more to do when only the header was requested.
        if (HeaderOnly)
        {
            return;
        }

        // The "RG" header records are only collected when Library is non-empty.
        if (!string.IsNullOrEmpty(Library))
        {
            rgRecFields = bamHeader.RecordFields
                .Where(field => field.Typecode.ToUpper().Equals("RG"))
                .ToList();
        }

        foreach (SAMAlignedSequence read in GetAlignedSequence(bamStream))
        {
            WriteAlignedSequence(bamHeader, read);
        }
    }
}
/// <summary>
/// Read input file header and set progress bar based on number of sequences in input file.
/// </summary>
/// <param name="filename">Path of the file to open (read-only, shared for reading).</param>
/// <param name="parser">Parser whose GetHeader is used to read the header.</param>
/// <param name="numClustersInInputFile">
/// Passed by value. The incoming value is never read; the reference-sequence count
/// taken from the header is used instead, and the caller's variable is not updated.
/// </param>
private void ReadHeader(string filename, BAMParser parser, int numClustersInInputFile)
{
    int clusterCount;

    using (Stream input = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        metricHandler.InputHeader = parser.GetHeader(input);
        clusterCount = metricHandler.InputHeader.ReferenceSequences.Count;
        Console.WriteLine(Properties.Resources.CLUSTER_COUNT_DISPLAY + clusterCount);
    }

    // Hand the count to SetProgressBar through the dispatcher.
    Dispatcher.BeginInvoke(
        System.Windows.Threading.DispatcherPriority.Normal,
        new IntDelegate(SetProgressBar),
        clusterCount);
}
/// <summary>
/// Reads the header from the underlying stream and renders it as text lines.
/// </summary>
/// <returns>
/// One "@TYPE&lt;TAB&gt;TAG:VALUE..." line per record field, one "@SQ" line per reference
/// sequence, followed by the header comments. The pieces are combined with a set
/// union, so identical lines appear only once.
/// </returns>
public List<string> ReadHeaders()
{
    var header = _parser.GetHeader(_stream);

    // "@<typecode>\t<tag>:<value>\t<tag>:<value>..." for every record field.
    var recordLines = header.RecordFields
        .Select(record => string.Format(
            "@{0}\t{1}",
            record.Typecode,
            record.Tags.Select(tag => string.Format("{0}:{1}", tag.Tag, tag.Value)).Merge("\t")))
        .ToList();

    // "@SQ\tSN:<name>\tLN:<length>" for every reference sequence.
    var sequenceLines = header.ReferenceSequences
        .Select(reference => string.Format("@SQ\tSN:{0}\tLN:{1}", reference.Name, reference.Length))
        .ToList();

    // NOTE(review): Union drops duplicates - presumably so an @SQ line present in both
    // lists is emitted once; confirm before replacing it with a plain concatenation.
    IEnumerable<string> lines = recordLines.Union(sequenceLines);
    lines = lines.Union(header.Comments);
    return lines.ToList();
}
/// <summary>
/// Checks that InputFilename exists, then reads its header to record the read-group
/// IDs (sampleNames), NumberOfReadGroups and NumerOfSequences, and prints a summary line.
/// </summary>
/// <exception cref="FileNotFoundException">InputFilename does not exist.</exception>
/// <exception cref="InvalidOperationException">
/// Thrown by First() when an "RG" record has no "ID" tag.
/// </exception>
void validateInputFileAndLoadSampleNames()
{
    if (!File.Exists(InputFilename))
    {
        throw new FileNotFoundException("Could not find file: " + InputFilename);
    }

    using (Stream input = new FileStream(InputFilename, FileMode.Open, FileAccess.Read))
    {
        BAMParser headerParser = new BAMParser();
        header = headerParser.GetHeader(input);

        // One sample name per "RG" record: the value of its first "ID" tag.
        var readGroups = header.RecordFields
            .Where(record => record.Typecode == "RG")
            .ToList();
        sampleNames = readGroups
            .Select(readGroup => readGroup.Tags.Where(tag => tag.Tag == "ID").First().Value)
            .ToList();

        NumberOfReadGroups = sampleNames.Count;
        NumerOfSequences = header.ReferenceSequences.Count;
    }

    string summary = "Processing file with " + sampleNames.Count.ToString() + " samples and " + NumerOfSequences.ToString() + " reference sequences.";
    Console.WriteLine(summary);
}
/// <summary>
/// Parse the CCS reads in a PacBio CCS BAM File.
/// </summary>
/// <remarks>
/// The null check on <paramref name="stream"/> runs as soon as this method is called.
/// Reading and validating the header, rewinding the stream and parsing the reads all
/// happen lazily, when the returned sequence is enumerated.
/// </remarks>
/// <returns>The CCS reads.</returns>
/// <param name="stream">A stream to parse. It is rewound with Seek before the reads are parsed.</param>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
/// <exception cref="ArgumentException">
/// Thrown during enumeration when the header has no 'PG' record, the 'PG' record has no
/// 'CL' tag, or the 'CL' value does not start with "ccs".
/// </exception>
public static IEnumerable<PacBioCCSRead> Parse(Stream stream)
{
    // Validate here rather than inside the iterator: an iterator body does not run
    // until the first MoveNext, which would delay this exception past the call site.
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    return ParseCcsReads(stream);
}

/// <summary>
/// Iterator behind <see cref="Parse"/>: validates the header, then yields one read per parsed alignment.
/// </summary>
private static IEnumerable<PacBioCCSRead> ParseCcsReads(Stream stream)
{
    BAMParser bp = new BAMParser();
    var header = bp.GetHeader(stream);

    var pg = header.RecordFields.FirstOrDefault(p => p.Typecode == "PG");
    if (pg == null)
    {
        throw new ArgumentException("BAM file did not contain a 'PG' tag in header");
    }

    var cl = pg.Tags.FirstOrDefault(z => z.Tag == "CL");
    if (cl == null)
    {
        throw new ArgumentException("BAM file did not contain a 'CL' tag within the 'PG' group in header.");
    }

    // The command line is machine-generated text, so compare ordinally; a missing
    // value is reported the same way as a non-ccs command.
    var cmd = cl.Value;
    if (cmd == null || !cmd.StartsWith("ccs", StringComparison.Ordinal))
    {
        throw new ArgumentException("This is not a BAM file produced by the ccs command.");
    }

    // The header read advanced the stream; start over before parsing the reads.
    stream.Seek(0, SeekOrigin.Begin);
    foreach (var s in bp.Parse(stream))
    {
        yield return new PacBioCCSRead(s as SAMAlignedSequence);
    }
}
/// <summary>
/// Extract/print all or sub alignments in SAM or BAM format.
/// By default, this command assumes the file on the command line is in
/// BAM format and it prints the alignments in SAM.
/// SAMUtil.exe view in.bam
/// </summary>
/// <remarks>
/// Close() is always called on exit, whether or not an exception was thrown.
/// The uncompressed/compressed BAM output steps are performed only on the SAM-input path.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// No input file was specified, the input header could not be read (the original
/// failure is attached as the inner exception), or a SAM input has no header.
/// </exception>
public void ViewResult()
{
    try
    {
        if (string.IsNullOrEmpty(InputFilePath))
        {
            throw new InvalidOperationException("Input File Not specified");
        }

        if (!string.IsNullOrEmpty(Region))
        {
            StringToRegionConverter();
        }

        Initialize();

        SAMAlignmentHeader header = null;
        if (!SAMInput)
        {
            // Dispose the input stream deterministically, even when reading or writing fails.
            using (Stream stream = new FileStream(InputFilePath, FileMode.Open, FileAccess.Read))
            {
                try
                {
                    header = bamparser.GetHeader(stream);
                }
                catch (Exception ex)
                {
                    // Keep the root cause for diagnostics.
                    throw new InvalidOperationException(Resources.InvalidBAMFile, ex);
                }

                WriteHeader(header);

                if (!HeaderOnly)
                {
                    if (!string.IsNullOrEmpty(Library))
                    {
                        rgRecFields = header.RecordFields.Where(R => R.Typecode.ToUpper().Equals("RG")).ToList();
                    }

                    foreach (SAMAlignedSequence alignedSequence in GetAlignedSequence(stream))
                    {
                        WriteAlignedSequence(header, alignedSequence);
                    }
                }
            }
        }
        else
        {
            try
            {
                header = SAMParser.ParseSAMHeader(InputFilePath);
            }
            catch (Exception ex)
            {
                // Keep the root cause for diagnostics.
                throw new InvalidOperationException(Resources.InvalidSAMFile, ex);
            }

            if (header == null)
            {
                throw new InvalidOperationException("SAM file doesn't contian header");
            }

            WriteHeader(header);

            if (!HeaderOnly)
            {
                if (!string.IsNullOrEmpty(Library))
                {
                    rgRecFields = header.RecordFields.Where(R => R.Typecode.ToUpper().Equals("RG")).ToList();
                }

                using (StreamReader textReader = new StreamReader(InputFilePath))
                {
                    foreach (SAMAlignedSequence alignedSeq in GetAlignedSequence(textReader))
                    {
                        WriteAlignedSequence(header, alignedSeq);
                    }
                }
            }

            if (UnCompressedBAM)
            {
                bamUncompressedOutStream.Flush();
                if (writer != null)
                {
                    DisplayBAMContent(bamUncompressedOutStream);
                }
            }

            if (BAMOutput && !UnCompressedBAM)
            {
                // Rewind the uncompressed data before compressing it into the output stream.
                bamUncompressedOutStream.Flush();
                bamUncompressedOutStream.Seek(0, SeekOrigin.Begin);
                bamformatter.CompressBAMFile(bamUncompressedOutStream, bamCompressedOutStream);
                bamCompressedOutStream.Flush();
                if (writer != null)
                {
                    DisplayBAMContent(bamCompressedOutStream);
                }
            }
        }
    }
    finally
    {
        Close();
    }
}
/// <summary>
/// Run ConPADE on each contig of the input BAM file.
/// </summary>
/// <remarks>
/// Output files are named after the input file name without its extension and are
/// created with relative paths. When splitContigs is false, one "_SNP.txt", one
/// "_ploidy.txt" and one "_readStats.txt" file cover all contigs. When it is true, each
/// contig gets its own "_log_likelihoods.txt", "_SNP.txt", "_ploidy.txt" and
/// "_readStats.txt" files. The BAM stream, the parser and any open writers are released
/// even when processing fails part-way.
/// </remarks>
/// <param name="bamName">Name of the input BAM file.</param>
public void RunFile(string bamName)
{
    // Current implementation requires that minimum ploidy be 1
    int min_ploidy = 1;
    int number_of_ploidies = max_ploidy - min_ploidy + 1;

    // Set nucleotide proportions (genotypes)
    double[][][] nuc_props = Nuc_Props(min_ploidy, number_of_ploidies);

    // Set dosage probabilities
    double SNP_density = (double)1 / snpDens;
    double no_SNP_prob = Math.Log((1 - SNP_density) / 2);
    double[][] dose_probs = Dose_Probs(min_ploidy, number_of_ploidies, SNP_density, no_SNP_prob);

    // Set HiSeq error model
    double[, , , ,] log_probs = Error_Probs();

    // Set substitution model
    double[, ,] log_subst_probs = Subst_Probs();

    // Set SNP calling probability
    double log_SNP_thres = SNPthres * Math.Log(10) / -10;

    Stopwatch clock = new Stopwatch();
    Console.WriteLine("Program started at {0}\n", DateTime.Now);

    // Everything that owns a file handle is declared up front so the finally block
    // can release whatever was opened, regardless of where a failure occurs.
    Stream bam_stream = null;
    BAMParser parser = null;
    TextWriter writer_log_like = null;
    TextWriter writer_SNP = null;
    TextWriter writer_ploidy = null;
    TextWriter writer_reads = null;

    try
    {
        bam_stream = new FileStream(bamName, FileMode.Open, FileAccess.Read);
        parser = new BAMParser();
        SAMAlignmentHeader header = parser.GetHeader(bam_stream);
        string temp = Path.GetFileNameWithoutExtension(bamName);

        // Find first valid alignment in BAM file
        // NOTE(review): if GetAlignedSequence returns null at end of file, this loop would
        // not terminate for a file with no valid alignment - confirm the parser's contract.
        SAMAlignedSequence next_alignment = parser.GetAlignedSequence(true);
        while (next_alignment == null || next_alignment.RName == "*" || next_alignment.IsDummyRead)
        {
            next_alignment = parser.GetAlignedSequence(true);
        }

        // Create global output files and write headers.
        if (!splitContigs)
        {
            string SNP_file = temp + "_SNP.txt";
            writer_SNP = new StreamWriter(SNP_file);
            writer_SNP.WriteLine("Contig\tPosition\tAlleles\tCounts\tDosage\tPhredQuality");

            string ploidy_file = temp + "_ploidy.txt";
            writer_ploidy = new StreamWriter(ploidy_file);
            writer_ploidy.Write("Contig\tBestPloidy");
            for (int i = 0; i < number_of_ploidies; i++)
            {
                writer_ploidy.Write("\tlogLike_M{0}", i + min_ploidy);
            }
            writer_ploidy.WriteLine("");

            string reads_file = temp + "_readStats.txt";
            writer_reads = new StreamWriter(reads_file);
            writer_reads.WriteLine("Contig\tAlignedReads\tAlignedBases\tUsedReads\tUsedBases");
        }

        // Run over each contig in input BAM file.
        int contig_ind = -1;
        while (next_alignment != null && next_alignment.RName != "*" && !next_alignment.IsDummyRead)
        {
            string contig_name = next_alignment.RName;
            Console.WriteLine("Started contig {0} at {1}", contig_name, DateTime.Now);
            clock.Restart();

            #region Variables and file handles for current contig
            long number_of_aligned_reads = 0;
            long number_of_aligned_base_pairs = 0;
            long number_of_used_reads = 0;
            long number_of_used_base_pairs = 0;

            // Create individual output files for the current contig.
            if (splitContigs)
            {
                string name = temp + "_" + contig_name;

                string log_like_file = name + "_log_likelihoods.txt";
                writer_log_like = new StreamWriter(log_like_file);

                string SNP_file = name + "_SNP.txt";
                writer_SNP = new StreamWriter(SNP_file);

                string ploidy_file = name + "_ploidy.txt";
                writer_ploidy = new StreamWriter(ploidy_file);

                string reads_file = name + "_readStats.txt";
                writer_reads = new StreamWriter(reads_file);
            }

            double[] global_log_like = new double[number_of_ploidies];

            // Advance to the header entry of this contig. The index only moves forward.
            // NOTE(review): presumably relies on contigs appearing in header order; a name
            // missing from the header would run past the end of the list - confirm.
            while (header.ReferenceSequences[++contig_ind].Name != contig_name)
            {
                ;
            }
            long contig_length = header.ReferenceSequences[contig_ind].Length;

            // Create a queue to include all reads that overlap with a given position.
            Queue<Padded_Read> read_queue = new Queue<Padded_Read>();

            // Create a queue to include best doses for each tested position.
            Queue<Best_Dose> dose_queue = new Queue<Best_Dose>((int)contig_length);
            #endregion Variables and file handles for current contig

            int positions_to_compute = 0;
            long current_position = 0;

            #region Run over every position in contig
            while (current_position < contig_length)
            {
                if ((current_position % 1000000) == 0 && current_position != 0)
                {
                    Console.WriteLine("At position {0} of {1}", current_position + 1, contig_length);
                }

                // Search for reads starting at current position.
                Search_Reads(parser, ref next_alignment, contig_name, ref number_of_aligned_reads, ref number_of_aligned_base_pairs,
                             ref number_of_used_reads, ref number_of_used_base_pairs, read_queue, current_position);

                if (read_queue.Count > 0)
                {
                    positions_to_compute++;

                    // Extract information from each read in queue.
                    byte[] obs_nucs;
                    byte[] is_GG;
                    bool[] reverse;
                    int[] quality_scores;
                    int[] neigh_quality_scores;
                    int[] scores;
                    int[] counts;
                    int k;
                    Extract_Read_Info(read_queue, current_position, out obs_nucs, out is_GG, out reverse, out quality_scores,
                                      out neigh_quality_scores, out scores, out counts, out k);

                    // Find two most abundant nucleotides for this position.
                    byte nuc_one;
                    byte nuc_two;
                    Get_Two_Nucs(scores, out nuc_one, out nuc_two);

                    // Calculate Pr(obs|allele1) and Pr(obs|allele2).
                    double[][] log_nuc_probs = Obs_Probs(log_probs, log_subst_probs, obs_nucs, is_GG, reverse, quality_scores,
                                                         neigh_quality_scores, counts, k, nuc_one, nuc_two);

                    // Calculate log_likelihoods of genotypes for current position.
                    double[][] log_likelihoods = Log_Likelihoods(min_ploidy, max_ploidy, log_nuc_probs, nuc_props);

                    // Calculate log_likelihood of each ploidy and keep most likely allele dosage.
                    Global_Likelihood_Keep_Dose(min_ploidy, number_of_ploidies, dose_probs, global_log_like, dose_queue,
                                                current_position, counts, nuc_one, nuc_two, log_likelihoods);
                }

                // Remove finished reads from queue. Finished reads no longer overlap with current position.
                Padded_Read read_to_remove;
                if (read_queue.Count > 0)
                {
                    read_to_remove = read_queue.First();
                }
                else
                {
                    read_to_remove = null;
                }
                while (read_to_remove != null && (read_to_remove.alignment.Pos + read_to_remove.alignment_length - 2) < current_position)
                {
                    read_queue.Dequeue();
                    if (read_queue.Count > 0)
                    {
                        read_to_remove = read_queue.First();
                    }
                    else
                    {
                        read_to_remove = null;
                    }
                }

                ++current_position;
            }
            #endregion Run over every position in contig

            // Output log_likelihoods.
            int best_log_like = 0;
            for (int i = 0; i < number_of_ploidies; i++)
            {
                if (global_log_like[i] > global_log_like[best_log_like])
                {
                    best_log_like = i;
                }
                if (splitContigs)
                {
                    writer_log_like.WriteLine("Ploidy {0} - log_likelihood {1}", i + min_ploidy, global_log_like[i]);
                }
            }

            // Output most likely ploidy.
            int best_ploidy = best_log_like + min_ploidy;
            if (splitContigs)
            {
                writer_ploidy.WriteLine(best_ploidy);
            }
            else
            {
                writer_ploidy.Write("{0}\t{1}", contig_name, best_ploidy);
                for (int i = 0; i < number_of_ploidies; i++)
                {
                    writer_ploidy.Write("\t{0}", global_log_like[i]);
                }
                writer_ploidy.WriteLine("");
            }

            // Output SNPs.
            if (splitContigs)
            {
                writer_SNP.WriteLine("Position\tAlleles\tCounts\tDosage\tPhredQuality");
            }
            char[] nuc_chars = new char[4] { 'A', 'C', 'G', 'T' };
            foreach (Best_Dose cur_doses in dose_queue)
            {
                double cur_SNP_posterior = cur_doses.SNP_posterior[best_log_like];
                if (cur_SNP_posterior <= log_SNP_thres)
                {
                    int cur_best_dose = cur_doses.best_dose[best_log_like];

                    // Positions whose best dose is 0 or equal to the chosen ploidy are not reported.
                    if (cur_best_dose != best_ploidy && cur_best_dose != 0)
                    {
                        if (splitContigs)
                        {
                            writer_SNP.WriteLine("{0}\t{1}|{2}\t{3}|{4}\t{5}\t{6}", cur_doses.position + 1,
                                                 nuc_chars[cur_doses.nuc_one], nuc_chars[cur_doses.nuc_two],
                                                 cur_doses.count_one, cur_doses.count_two, cur_best_dose,
                                                 -10 * cur_SNP_posterior / Math.Log(10));
                        }
                        else
                        {
                            writer_SNP.WriteLine("{0}\t{1}\t{2}|{3}\t{4}|{5}\t{6}\t{7}", contig_name, cur_doses.position + 1,
                                                 nuc_chars[cur_doses.nuc_one], nuc_chars[cur_doses.nuc_two],
                                                 cur_doses.count_one, cur_doses.count_two, cur_best_dose,
                                                 -10 * cur_SNP_posterior / Math.Log(10));
                        }
                    }
                }
            }

            // Output read statistics.
            if (splitContigs)
            {
                writer_reads.WriteLine("\nNumber of aligned reads: {0}", number_of_aligned_reads);
                writer_reads.WriteLine("Number of aligned base pairs: {0}", number_of_aligned_base_pairs);
                writer_reads.WriteLine("\nNumber of used reads: {0}", number_of_used_reads);
                writer_reads.WriteLine("Number of used base pairs: {0}", number_of_used_base_pairs);
            }
            else
            {
                writer_reads.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", contig_name, number_of_aligned_reads,
                                       number_of_aligned_base_pairs, number_of_used_reads, number_of_used_base_pairs);
            }

            // Per-contig files are finished once the contig is done.
            if (splitContigs)
            {
                writer_log_like.Close();
                writer_SNP.Close();
                writer_ploidy.Close();
                writer_reads.Close();
            }

            clock.Stop();
            Console.WriteLine("Time to run contig: {0} s\n", (double)clock.ElapsedMilliseconds / 1000);
        }
    }
    finally
    {
        // Closing a writer that was already closed at the end of a contig is harmless,
        // so every non-null writer is released here on both success and failure paths.
        if (writer_log_like != null)
        {
            writer_log_like.Dispose();
        }
        if (writer_SNP != null)
        {
            writer_SNP.Dispose();
        }
        if (writer_ploidy != null)
        {
            writer_ploidy.Dispose();
        }
        if (writer_reads != null)
        {
            writer_reads.Dispose();
        }
        if (parser != null)
        {
            parser.Dispose();
        }
        if (bam_stream != null)
        {
            bam_stream.Dispose();
        }
    }

    Console.WriteLine("Finished at {0}\n", DateTime.Now);
}