Ejemplo n.º 1
0
        /// <summary>
        /// Converts the input BAM to SAM file format: the header is written first and,
        /// unless <c>HeaderOnly</c> is set, every aligned sequence read from the same
        /// stream is written after it.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the header cannot be read from the input file; the original
        /// failure is preserved as the inner exception.
        /// </exception>
        private void ConvertFromBAMToSAM()
        {
            using (Stream stream = new FileStream(InputFilePath, FileMode.Open, FileAccess.Read))
            {
                SAMAlignmentHeader header;
                try
                {
                    header = bamparser.GetHeader(stream);
                }
                catch (Exception ex)
                {
                    throw new InvalidOperationException(Resources.InvalidBAMFile, ex);
                }

                WriteHeader(header);

                if (HeaderOnly)
                {
                    return;
                }

                if (!string.IsNullOrEmpty(Library))
                {
                    // "RG" is a fixed record type code, not user-facing text, so it is
                    // compared ordinally (culture-independent and safe for a null type code).
                    rgRecFields = header.RecordFields
                        .Where(field => string.Equals(field.Typecode, "RG", StringComparison.OrdinalIgnoreCase))
                        .ToList();
                }

                // The alignments are read from the same stream the header came from.
                foreach (SAMAlignedSequence alignedSequence in GetAlignedSequence(stream))
                {
                    WriteAlignedSequence(header, alignedSequence);
                }
            }
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Reads the header of the input file, stores it on the metric handler, prints the
 /// number of reference sequences it lists and queues a progress-bar update with that count.
 /// </summary>
 /// <param name="filename">Path of the file to open (read-only, shared for reading).</param>
 /// <param name="parser">Parser used to read the header from the opened stream.</param>
 /// <param name="numClustersInInputFile">
 /// Overwritten with the reference-sequence count. The parameter is passed by value,
 /// so the caller's variable is not modified.
 /// </param>
 private void ReadHeader(string filename, BAMParser parser, int numClustersInInputFile)
 {
     using (Stream input = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
     {
         metricHandler.InputHeader = parser.GetHeader(input);
         numClustersInInputFile = metricHandler.InputHeader.ReferenceSequences.Count;

         string countMessage = Properties.Resources.CLUSTER_COUNT_DISPLAY + numClustersInInputFile;
         Console.WriteLine(countMessage);
     }

     // The file is already closed here; only the count is handed to the dispatcher.
     IntDelegate progressUpdate = new IntDelegate(SetProgressBar);
     Dispatcher.BeginInvoke(
         System.Windows.Threading.DispatcherPriority.Normal,
         progressUpdate,
         numClustersInInputFile);
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Reads the header from the underlying stream and renders it as text lines:
        /// one "@TYPE" line per record field (tab-separated "TAG:VALUE" pairs), one
        /// "@SQ" line per reference sequence, followed by the header comments.
        /// </summary>
        /// <returns>The header lines; identical lines appear only once.</returns>
        public List <string> ReadHeaders()
        {
            var header = _parser.GetHeader(_stream);

            var recordLines = header.RecordFields
                .Select(field => string.Format(
                    "@{0}\t{1}",
                    field.Typecode,
                    field.Tags.Select(tag => string.Format("{0}:{1}", tag.Tag, tag.Value)).Merge("\t")))
                .ToList();

            var sequenceLines = header.ReferenceSequences
                .Select(sequence => string.Format("@SQ\tSN:{0}\tLN:{1}", sequence.Name, sequence.Length))
                .ToList();

            // Union (rather than Concat) drops duplicate lines while keeping first-seen order.
            var allLines = recordLines.Union(sequenceLines).Union(header.Comments);
            return allLines.ToList();
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Checks that the input file exists, reads its header and caches the "ID" value of
 /// every "RG" record as a sample name, together with the read-group and
 /// reference-sequence counts. A one-line summary is printed afterwards.
 /// </summary>
 /// <exception cref="FileNotFoundException">Thrown when the input file does not exist.</exception>
 /// <exception cref="InvalidOperationException">Thrown when an "RG" record has no "ID" tag.</exception>
 void validateInputFileAndLoadSampleNames()
 {
     if (!File.Exists(InputFilename))
     {
         throw new FileNotFoundException("Could not find file: " + InputFilename);
     }

     using (Stream input = new FileStream(InputFilename, FileMode.Open, FileAccess.Read))
     {
         BAMParser headerParser = new BAMParser();
         header = headerParser.GetHeader(input);

         var readGroups = header.RecordFields.Where(field => field.Typecode == "RG").ToList();

         // First() is deliberate: a read group without an ID tag is an error.
         sampleNames = readGroups
             .Select(rg => rg.Tags.Where(tag => tag.Tag == "ID").First().Value)
             .ToList();

         NumberOfReadGroups = sampleNames.Count;
         NumerOfSequences   = header.ReferenceSequences.Count;
     }

     string summary = "Processing file with " + sampleNames.Count.ToString() + " samples and "
                      + NumerOfSequences.ToString() + " reference sequences.";
     Console.WriteLine(summary);
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Parse the CCS reads in a PacBio CCS BAM File.
        /// </summary>
        /// <returns>The CCS reads, produced lazily while the result is enumerated.</returns>
        /// <param name="stream">
        /// A stream to parse. It must support seeking: after the header has been
        /// checked the stream is rewound to its beginning before the reads are parsed.
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// Thrown immediately (not on first enumeration) when <paramref name="stream"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// Thrown during enumeration when the header has no 'PG' record, the 'PG' record
        /// has no 'CL' tag, or the 'CL' value does not start with "ccs".
        /// </exception>
        public static IEnumerable <PacBioCCSRead> Parse(Stream stream)
        {
            // Validate here, outside the iterator: code inside an iterator body does not
            // run until the first MoveNext, which would delay the null check.
            if (stream == null)
            {
                throw new ArgumentNullException("stream");
            }

            return ParseCcsReads(stream);
        }

        /// <summary>
        /// Iterator behind <see cref="Parse"/>: verifies that the header identifies the
        /// file as ccs output, then yields one read per parsed alignment.
        /// </summary>
        private static IEnumerable <PacBioCCSRead> ParseCcsReads(Stream stream)
        {
            BAMParser bp     = new BAMParser();
            var       header = bp.GetHeader(stream);

            var pg = header.RecordFields.FirstOrDefault(p => p.Typecode == "PG");

            if (pg == null)
            {
                throw new ArgumentException("BAM file did not contain a 'PG' tag in header");
            }

            var cl = pg.Tags.FirstOrDefault(z => z.Tag == "CL");

            if (cl == null)
            {
                throw new ArgumentException("BAM file did not contain a 'CL' tag within the 'PG' group in header.");
            }

            var cmd = cl.Value;

            // Ordinal comparison: the command name is a fixed token, not localized text.
            // A missing value is reported the same way as a non-ccs command line.
            if (cmd == null || !cmd.StartsWith("ccs", StringComparison.Ordinal))
            {
                throw new ArgumentException("This is not a BAM file produced by the ccs command.");
            }

            // Reading the header advanced the stream; start over for the full parse.
            stream.Seek(0, SeekOrigin.Begin);
            foreach (var s in bp.Parse(stream))
            {
                // NOTE(review): 'as' yields null if an item is not a SAMAlignedSequence;
                // confirm PacBioCCSRead's constructor is never handed null here.
                yield return(new PacBioCCSRead(s as SAMAlignedSequence));
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Extract/print all or sub alignments in SAM or BAM format.
        /// By default, this command assumes the file on the command line is in
        /// BAM format and it prints the alignments in SAM.
        /// SAMUtil.exe view in.bam
        /// </summary>
        /// <remarks>
        /// The header is always written; alignments are written unless <c>HeaderOnly</c>
        /// is set. <c>Close()</c> is called on every exit path.
        /// </remarks>
        /// <exception cref="InvalidOperationException">
        /// Thrown when no input file is specified, when the header of the input file
        /// cannot be read (the cause is kept as the inner exception), or when a SAM
        /// input yields no header.
        /// </exception>
        public void ViewResult()
        {
            try
            {
                if (string.IsNullOrEmpty(InputFilePath))
                {
                    throw new InvalidOperationException("Input File Not specified");
                }

                if (!string.IsNullOrEmpty(Region))
                {
                    StringToRegionConverter();
                }

                Initialize();
                SAMAlignmentHeader header = null;

                if (!SAMInput)
                {
                    // The stream is needed for both the header and the alignments, and
                    // must be released even when reading or writing fails.
                    using (Stream stream = new FileStream(InputFilePath, FileMode.Open, FileAccess.Read))
                    {
                        try
                        {
                            header = bamparser.GetHeader(stream);
                        }
                        catch (Exception ex)
                        {
                            throw new InvalidOperationException(Resources.InvalidBAMFile, ex);
                        }

                        WriteHeader(header);

                        if (!HeaderOnly)
                        {
                            if (!string.IsNullOrEmpty(Library))
                            {
                                // "RG" is a fixed record type code: compare ordinally.
                                rgRecFields = header.RecordFields
                                    .Where(R => string.Equals(R.Typecode, "RG", StringComparison.OrdinalIgnoreCase))
                                    .ToList();
                            }

                            foreach (SAMAlignedSequence alignedSequence in GetAlignedSequence(stream))
                            {
                                WriteAlignedSequence(header, alignedSequence);
                            }
                        }
                    }
                }
                else
                {
                    try
                    {
                        header = SAMParser.ParseSAMHeader(InputFilePath);
                    }
                    catch (Exception ex)
                    {
                        throw new InvalidOperationException(Resources.InvalidSAMFile, ex);
                    }

                    if (header == null)
                    {
                        throw new InvalidOperationException("SAM file doesn't contian header");
                    }

                    WriteHeader(header);

                    if (!HeaderOnly)
                    {
                        if (!string.IsNullOrEmpty(Library))
                        {
                            // "RG" is a fixed record type code: compare ordinally.
                            rgRecFields = header.RecordFields
                                .Where(R => string.Equals(R.Typecode, "RG", StringComparison.OrdinalIgnoreCase))
                                .ToList();
                        }

                        using (StreamReader textReader = new StreamReader(InputFilePath))
                        {
                            foreach (SAMAlignedSequence alignedSeq in GetAlignedSequence(textReader))
                            {
                                WriteAlignedSequence(header, alignedSeq);
                            }
                        }
                    }

                    // NOTE(review): the BAM output finalization below runs only for SAM
                    // input (it sits inside this else branch) - confirm that is intended.
                    if (UnCompressedBAM)
                    {
                        bamUncompressedOutStream.Flush();
                        if (writer != null)
                        {
                            DisplayBAMContent(bamUncompressedOutStream);
                        }
                    }

                    if (BAMOutput && !UnCompressedBAM)
                    {
                        // Rewind the uncompressed data so the formatter compresses it from the start.
                        bamUncompressedOutStream.Flush();
                        bamUncompressedOutStream.Seek(0, SeekOrigin.Begin);
                        bamformatter.CompressBAMFile(bamUncompressedOutStream, bamCompressedOutStream);
                        bamCompressedOutStream.Flush();
                        if (writer != null)
                        {
                            DisplayBAMContent(bamCompressedOutStream);
                        }
                    }
                }
            }
            finally
            {
                Close();
            }
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Run ConPADE on each contig of the input BAM file.
        /// </summary>
        /// <remarks>
        /// Output file names are built from the BAM file name without directory or
        /// extension, so the files are created relative to the current working directory.
        /// When <c>splitContigs</c> is false, one SNP, one ploidy and one read-statistics
        /// file cover all contigs; otherwise a separate set of files (plus a
        /// log-likelihood file) is written for every contig.
        /// The BAM stream, the parser and any open output writers are released even
        /// when processing fails part-way through.
        /// </remarks>
        /// <param name="bamName">Name of the input BAM file.</param>
        public void RunFile(string bamName)
        {
            // Current implementation requires that minimum ploidy be 1
            int min_ploidy         = 1;
            int number_of_ploidies = max_ploidy - min_ploidy + 1;

            // Set nucleotide proportions (genotypes)
            double[][][] nuc_props = Nuc_Props(min_ploidy, number_of_ploidies);

            // Set dosage probabilities
            double SNP_density = (double)1 / snpDens;
            double no_SNP_prob = Math.Log((1 - SNP_density) / 2);

            double[][] dose_probs = Dose_Probs(min_ploidy, number_of_ploidies, SNP_density, no_SNP_prob);

            // Set HiSeq error model
            double[, , , ,] log_probs = Error_Probs();

            // Set substitution model
            double[, ,] log_subst_probs = Subst_Probs();

            // Set SNP calling probability: converts the Phred-scaled threshold
            // (Q = -10 * log10(p)) into a natural-log probability.
            double log_SNP_thres = SNPthres * Math.Log(10) / -10;

            Stopwatch clock = new Stopwatch();

            Console.WriteLine("Program started at {0}\n", DateTime.Now);

            Stream    bam_stream = new FileStream(bamName, FileMode.Open, FileAccess.Read);
            BAMParser parser     = null;

            // Declared outside the try block so the finally block can release them.
            TextWriter writer_log_like = null;
            TextWriter writer_SNP      = null;
            TextWriter writer_ploidy   = null;
            TextWriter writer_reads    = null;

            try
            {
                parser = new BAMParser();
                SAMAlignmentHeader header = parser.GetHeader(bam_stream);
                string             temp   = Path.GetFileNameWithoutExtension(bamName);

                // Find first valid alignment in BAM file
                // NOTE(review): if the file holds no valid alignment and the parser keeps
                // returning null at end of file, this loop never ends - confirm the
                // parser's end-of-file behaviour.
                SAMAlignedSequence next_alignment = parser.GetAlignedSequence(true);

                while (next_alignment == null || next_alignment.RName == "*" || next_alignment.IsDummyRead)
                {
                    next_alignment = parser.GetAlignedSequence(true);
                }

                // Create global output files and write headers.
                if (!splitContigs)
                {
                    string SNP_file = temp + "_SNP.txt";
                    writer_SNP = new StreamWriter(SNP_file);
                    writer_SNP.WriteLine("Contig\tPosition\tAlleles\tCounts\tDosage\tPhredQuality");

                    string ploidy_file = temp + "_ploidy.txt";
                    writer_ploidy = new StreamWriter(ploidy_file);
                    writer_ploidy.Write("Contig\tBestPloidy");
                    for (int i = 0; i < number_of_ploidies; i++)
                    {
                        writer_ploidy.Write("\tlogLike_M{0}", i + min_ploidy);
                    }
                    writer_ploidy.WriteLine("");

                    string reads_file = temp + "_readStats.txt";
                    writer_reads = new StreamWriter(reads_file);
                    writer_reads.WriteLine("Contig\tAlignedReads\tAlignedBases\tUsedReads\tUsedBases");
                }

                // Run over each contig in input BAM file.
                int contig_ind = -1;

                while (next_alignment != null && next_alignment.RName != "*" && !next_alignment.IsDummyRead)
                {
                    string contig_name = next_alignment.RName;

                    Console.WriteLine("Started contig {0} at {1}",
                                      contig_name, DateTime.Now);

                    clock.Restart();

                    #region Variables and file handles for current contig
                    long number_of_aligned_reads      = 0;
                    long number_of_aligned_base_pairs = 0;
                    long number_of_used_reads         = 0;
                    long number_of_used_base_pairs    = 0;

                    // Create individual output files for the current contig.
                    if (splitContigs)
                    {
                        string name          = temp + "_" + contig_name;
                        string log_like_file = name + "_log_likelihoods.txt";
                        writer_log_like = new StreamWriter(log_like_file);

                        string SNP_file = name + "_SNP.txt";
                        writer_SNP = new StreamWriter(SNP_file);

                        string ploidy_file = name + "_ploidy.txt";
                        writer_ploidy = new StreamWriter(ploidy_file);

                        string reads_file = name + "_readStats.txt";
                        writer_reads = new StreamWriter(reads_file);
                    }

                    double[] global_log_like = new double[number_of_ploidies];

                    // Advance to the header entry of this contig. The index is never
                    // reset, so contigs must appear in the alignments in header order.
                    // NOTE(review): a contig name that is not found at a later index
                    // runs the indexer past the end of the list - confirm inputs are sorted.
                    while (header.ReferenceSequences[++contig_ind].Name != contig_name)
                    {
                        ;
                    }
                    long contig_length = header.ReferenceSequences[contig_ind].Length;

                    // Create a queue to include all reads that overlap with a given position.
                    Queue <Padded_Read> read_queue = new Queue <Padded_Read>();

                    // Create a queue to include best doses for each tested position.
                    Queue <Best_Dose> dose_queue = new Queue <Best_Dose>((int)contig_length);
                    #endregion Variables and file handles  for current contig

                    int  positions_to_compute = 0;
                    long current_position     = 0;

                    #region Run over every position in contig
                    while (current_position < contig_length)
                    {
                        if ((current_position % 1000000) == 0 && current_position != 0)
                        {
                            Console.WriteLine("At position {0} of {1}", current_position + 1, contig_length);
                        }

                        // Search for reads starting at current position.
                        Search_Reads(parser, ref next_alignment, contig_name, ref number_of_aligned_reads,
                                     ref number_of_aligned_base_pairs, ref number_of_used_reads, ref number_of_used_base_pairs,
                                     read_queue, current_position);

                        if (read_queue.Count > 0)
                        {
                            positions_to_compute++;

                            // Extract information from each read in queue.
                            byte[] obs_nucs;
                            byte[] is_GG;
                            bool[] reverse;
                            int[]  quality_scores;
                            int[]  neigh_quality_scores;
                            int[]  scores;
                            int[]  counts;
                            int    k;

                            Extract_Read_Info(read_queue, current_position, out obs_nucs, out is_GG, out reverse,
                                              out quality_scores, out neigh_quality_scores, out scores, out counts, out k);

                            // Find two most abundant nucleotides for this position.
                            byte nuc_one;
                            byte nuc_two;
                            Get_Two_Nucs(scores, out nuc_one, out nuc_two);

                            // Calculate Pr(obs|allele1) and Pr(obs|allele2).
                            double[][] log_nuc_probs = Obs_Probs(log_probs, log_subst_probs, obs_nucs, is_GG, reverse,
                                                                 quality_scores, neigh_quality_scores, counts, k, nuc_one, nuc_two);

                            // Calculate log_likelihoods of genotypes for current position.
                            double[][] log_likelihoods = Log_Likelihoods(min_ploidy, max_ploidy, log_nuc_probs, nuc_props);

                            // Calculate log_likelihood of each ploidy and keep most likely allele dosage.
                            Global_Likelihood_Keep_Dose(min_ploidy, number_of_ploidies, dose_probs, global_log_like,
                                                        dose_queue, current_position, counts, nuc_one, nuc_two, log_likelihoods);
                        }

                        // Remove finished reads from queue. Finished reads no longer overlap with current position.
                        Padded_Read read_to_remove;
                        if (read_queue.Count > 0)
                        {
                            read_to_remove = read_queue.First();
                        }
                        else
                        {
                            read_to_remove = null;
                        }

                        while (read_to_remove != null &&
                               (read_to_remove.alignment.Pos + read_to_remove.alignment_length - 2) < current_position)
                        {
                            read_queue.Dequeue();
                            if (read_queue.Count > 0)
                            {
                                read_to_remove = read_queue.First();
                            }
                            else
                            {
                                read_to_remove = null;
                            }
                        }

                        ++current_position;
                    }
                    #endregion Run over every position in contig

                    // Output log_likelihoods.
                    int best_log_like = 0;
                    for (int i = 0; i < number_of_ploidies; i++)
                    {
                        if (global_log_like[i] > global_log_like[best_log_like])
                        {
                            best_log_like = i;
                        }

                        if (splitContigs)
                        {
                            writer_log_like.WriteLine("Ploidy {0} - log_likelihood {1}", i + min_ploidy, global_log_like[i]);
                        }
                    }

                    // Output most likely ploidy.
                    int best_ploidy = best_log_like + min_ploidy;
                    if (splitContigs)
                    {
                        writer_ploidy.WriteLine(best_ploidy);
                    }
                    else
                    {
                        writer_ploidy.Write("{0}\t{1}", contig_name, best_ploidy);
                        for (int i = 0; i < number_of_ploidies; i++)
                        {
                            writer_ploidy.Write("\t{0}", global_log_like[i]);
                        }
                        writer_ploidy.WriteLine("");
                    }

                    // Output SNPs.
                    if (splitContigs)
                    {
                        writer_SNP.WriteLine("Position\tAlleles\tCounts\tDosage\tPhredQuality");
                    }
                    char[] nuc_chars = new char[4] {
                        'A', 'C', 'G', 'T'
                    };
                    foreach (Best_Dose cur_doses in dose_queue)
                    {
                        double cur_SNP_posterior = cur_doses.SNP_posterior[best_log_like];
                        if (cur_SNP_posterior <= log_SNP_thres)
                        {
                            // Only doses that differ from both 0 and the full ploidy are reported.
                            int cur_best_dose = cur_doses.best_dose[best_log_like];
                            if (cur_best_dose != best_ploidy && cur_best_dose != 0)
                            {
                                if (splitContigs)
                                {
                                    writer_SNP.WriteLine("{0}\t{1}|{2}\t{3}|{4}\t{5}\t{6}", cur_doses.position + 1,
                                                         nuc_chars[cur_doses.nuc_one], nuc_chars[cur_doses.nuc_two], cur_doses.count_one,
                                                         cur_doses.count_two, cur_best_dose, -10 * cur_SNP_posterior / Math.Log(10));
                                }
                                else
                                {
                                    writer_SNP.WriteLine("{0}\t{1}\t{2}|{3}\t{4}|{5}\t{6}\t{7}", contig_name,
                                                         cur_doses.position + 1, nuc_chars[cur_doses.nuc_one], nuc_chars[cur_doses.nuc_two],
                                                         cur_doses.count_one, cur_doses.count_two, cur_best_dose,
                                                         -10 * cur_SNP_posterior / Math.Log(10));
                                }
                            }
                        }
                    }

                    // Output read statistics.
                    if (splitContigs)
                    {
                        writer_reads.WriteLine("\nNumber of aligned reads: {0}", number_of_aligned_reads);
                        writer_reads.WriteLine("Number of aligned base pairs: {0}", number_of_aligned_base_pairs);
                        writer_reads.WriteLine("\nNumber of used reads: {0}", number_of_used_reads);
                        writer_reads.WriteLine("Number of used base pairs: {0}", number_of_used_base_pairs);
                    }
                    else
                    {
                        writer_reads.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", contig_name, number_of_aligned_reads,
                                               number_of_aligned_base_pairs, number_of_used_reads, number_of_used_base_pairs);
                    }

                    // Per-contig files are finished here; the next contig opens new ones.
                    if (splitContigs)
                    {
                        writer_log_like.Close();
                        writer_SNP.Close();
                        writer_ploidy.Close();
                        writer_reads.Close();
                    }

                    clock.Stop();
                    Console.WriteLine("Time to run contig: {0} s\n", (double)clock.ElapsedMilliseconds / 1000);
                }
            }
            finally
            {
                // Releases whatever is still open, on success and on failure alike.
                // Closing a writer that was already closed per contig is harmless.
                if (writer_log_like != null)
                {
                    writer_log_like.Close();
                }
                if (writer_SNP != null)
                {
                    writer_SNP.Close();
                }
                if (writer_ploidy != null)
                {
                    writer_ploidy.Close();
                }
                if (writer_reads != null)
                {
                    writer_reads.Close();
                }
                if (parser != null)
                {
                    parser.Dispose();
                }
                bam_stream.Dispose();
            }

            Console.WriteLine("Finished at {0}\n", DateTime.Now);
        }