Example 1
 public SAMAlignedSequence ReadSAMAlignedSequence()
 {
     if (_parser.IsEOF())
     {
         return(null);
     }
     return(_parser.GetAlignedSequence(true));
 }
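
A minimal caller sketch for this wrapper, assuming _parser is a BAMParser already opened on a BAM stream and that reader is an instance of the enclosing class; the caller below is illustrative and not part of the original example:

 // Hypothetical caller: drain the reader until it signals end of file with null.
 SAMAlignedSequence read;
 while ((read = reader.ReadSAMAlignedSequence()) != null)
 {
     Console.WriteLine("{0}\t{1}", read.RName, read.Pos);
 }
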
Example 2
        // Search the BAM file for the next valid read aligned against the current contig.
        // Update read and base-pair statistics.
        private void Search_Reads(BAMParser parser, ref SAMAlignedSequence next_alignment, string contig_name,
                                  ref long number_of_aligned_reads, ref long number_of_aligned_base_pairs, ref long number_of_used_reads,
                                  ref long number_of_used_base_pairs, Queue <Padded_Read> read_queue, long current_position)
        {
            while (next_alignment != null &&
                   !next_alignment.IsDummyRead &&
                   next_alignment.RName == contig_name &&
                   (next_alignment.Pos - 1) == current_position)
            {
                // The next alignment overlaps with current position, so continue.
                number_of_aligned_reads++;
                number_of_aligned_base_pairs += next_alignment.QuerySequence.Count;

                // Maybe we should let the minimum alignment quality be a parameter.
                // We currently leave it for the user to pre-filter the BAM file.
                if (next_alignment.MapQ > 0)
                {
                    number_of_used_reads++;
                    number_of_used_base_pairs += next_alignment.QuerySequence.Count;
                    read_queue.Enqueue(new Padded_Read(next_alignment));
                }

                #region Parse BAM file until next alignment is found
                if (!parser.IsEOF())
                {
                    next_alignment = parser.GetAlignedSequence(true);

                    while ((next_alignment == null || next_alignment.RName == "*" || next_alignment.IsDummyRead) && !parser.IsEOF())
                    {
                        next_alignment = parser.GetAlignedSequence(true);
                    }
                }
                else
                {
                    next_alignment = null;
                }
                #endregion Parse BAM file until next alignment is found
            }
        }
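
The region above (skip unmapped or dummy records until a usable alignment is found, or stop at end of file) is the same pattern Example 5 uses when it looks for the first valid alignment. A standalone sketch of that pattern, written as a hypothetical helper that is not part of the original code:

        // Hypothetical helper (not in the original code): return the next mapped,
        // non-dummy alignment, or null once the parser reaches end of file.
        private static SAMAlignedSequence NextValidAlignment(BAMParser parser)
        {
            while (!parser.IsEOF())
            {
                SAMAlignedSequence alignment = parser.GetAlignedSequence(true);
                if (alignment != null && alignment.RName != "*" && !alignment.IsDummyRead)
                {
                    return alignment;
                }
            }
            return null;
        }
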
Example 3
        /// <summary>
        /// Gets aligned sequences in the specified BAM file.
        /// </summary>
        /// <param name="bamStream">BAM file stream.</param>
        private IEnumerable <SAMAlignedSequence> GetAlignedSequence(Stream bamStream)
        {
            bool isFilterRequired = IsFilterApplied();
            bool display          = true;

            while (!bamparser.IsEOF())
            {
                SAMAlignedSequence alignedSequence = bamparser.GetAlignedSequence(false);
                if (isFilterRequired)
                {
                    display = Filter(alignedSequence);
                }

                if (display)
                {
                    yield return(alignedSequence);
                }
            }
        }
Example 4
        /// <summary>
        /// Gets aligned sequences in the specified BAM file.
        /// </summary>
        /// <param name="bamStream">BAM file stream.</param>
        private IEnumerable <SAMAlignedSequence> GetAlignedSequence(Stream bamStream)
        {
            bool isFilterRequired = IsFilterApplied();
            bool display          = true;

            while (!bamparser.IsEOF())
            {
                SAMAlignedSequence alignedSequence = bamparser.GetAlignedSequence(false);
                // TODO: The parser should probably never return a null sequence;
                // this may be a band-aid over a lurking problem, fix in future.
                if (alignedSequence != null)
                {
                    if (isFilterRequired)
                    {
                        display = Filter(alignedSequence);
                    }

                    if (display)
                    {
                        yield return(alignedSequence);
                    }
                }
            }
        }
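
Because the method uses yield return, alignments are produced lazily and filtering happens as the caller enumerates. A usage sketch, assuming a caller inside the same class with bamparser already initialized for the stream (variable names are illustrative):

        // Hypothetical caller inside the same class: stream matching alignments
        // one at a time without holding the whole BAM file in memory.
        foreach (SAMAlignedSequence alignedSequence in GetAlignedSequence(bamStream))
        {
            Console.WriteLine("{0}\t{1}", alignedSequence.RName, alignedSequence.Pos);
        }
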
Example 5
        /// <summary>
        /// Run ConPADE on each contig of the input BAM file.
        /// </summary>
        /// <param name="bamName">Name of the input BAM file.</param>
        public void RunFile(string bamName)
        {
            // Current implementation requires that minimum ploidy be 1
            int min_ploidy         = 1;
            int number_of_ploidies = max_ploidy - min_ploidy + 1;

            // Set nucleotide proportions (genotypes)
            double[][][] nuc_props = Nuc_Props(min_ploidy, number_of_ploidies);

            // Set dosage probabilities
            double SNP_density = (double)1 / snpDens;
            double no_SNP_prob = Math.Log((1 - SNP_density) / 2);

            double[][] dose_probs = Dose_Probs(min_ploidy, number_of_ploidies, SNP_density, no_SNP_prob);

            // Set HiSeq error model
            double[, , , ,] log_probs = Error_Probs();

            // Set substitution model
            double[, ,] log_subst_probs = Subst_Probs();

            // Set SNP calling probability
            double log_SNP_thres = SNPthres * Math.Log(10) / -10;

            Stopwatch clock = new Stopwatch();

            Console.WriteLine("Program started at {0}\n", DateTime.Now);

            Stream             bam_stream = new FileStream(bamName, FileMode.Open, FileAccess.Read);
            BAMParser          parser     = new BAMParser();
            SAMAlignmentHeader header     = parser.GetHeader(bam_stream);
            string             temp       = Path.GetFileNameWithoutExtension(bamName);

            // Find first valid alignment in BAM file
            SAMAlignedSequence next_alignment = parser.GetAlignedSequence(true);

            while (next_alignment == null || next_alignment.RName == "*" || next_alignment.IsDummyRead)
            {
                next_alignment = parser.GetAlignedSequence(true);
            }

            TextWriter writer_log_like = null;
            TextWriter writer_SNP      = null;
            TextWriter writer_ploidy   = null;
            TextWriter writer_reads    = null;

            // Create global output files and write headers.
            if (!splitContigs)
            {
                string SNP_file = temp + "_SNP.txt";
                writer_SNP = new StreamWriter(SNP_file);
                writer_SNP.WriteLine("Contig\tPosition\tAlleles\tCounts\tDosage\tPhredQuality");

                string ploidy_file = temp + "_ploidy.txt";
                writer_ploidy = new StreamWriter(ploidy_file);
                writer_ploidy.Write("Contig\tBestPloidy");
                for (int i = 0; i < number_of_ploidies; i++)
                {
                    writer_ploidy.Write("\tlogLike_M{0}", i + min_ploidy);
                }
                writer_ploidy.WriteLine("");

                string reads_file = temp + "_readStats.txt";
                writer_reads = new StreamWriter(reads_file);
                writer_reads.WriteLine("Contig\tAlignedReads\tAlignedBases\tUsedReads\tUsedBases");
            }

            // Run over each contig in input BAM file.
            int contig_ind = -1;

            while (next_alignment != null && next_alignment.RName != "*" && !next_alignment.IsDummyRead)
            {
                string contig_name = next_alignment.RName;

                Console.WriteLine("Started contig {0} at {1}",
                                  contig_name, DateTime.Now);

                clock.Restart();

                #region Variables and file handles for current contig
                long number_of_aligned_reads      = 0;
                long number_of_aligned_base_pairs = 0;
                long number_of_used_reads         = 0;
                long number_of_used_base_pairs    = 0;

                // Create individual output files for the current contig.
                if (splitContigs)
                {
                    string name          = temp + "_" + contig_name;
                    string log_like_file = name + "_log_likelihoods.txt";
                    writer_log_like = new StreamWriter(log_like_file);

                    string SNP_file = name + "_SNP.txt";
                    writer_SNP = new StreamWriter(SNP_file);

                    string ploidy_file = name + "_ploidy.txt";
                    writer_ploidy = new StreamWriter(ploidy_file);

                    string reads_file = name + "_readStats.txt";
                    writer_reads = new StreamWriter(reads_file);
                }

                double[] global_log_like = new double[number_of_ploidies];

                while (header.ReferenceSequences[++contig_ind].Name != contig_name)
                {
                    ;
                }
                long contig_length = header.ReferenceSequences[contig_ind].Length;

                // Create a queue to include all reads that overlap with a given position.
                Queue <Padded_Read> read_queue = new Queue <Padded_Read>();

                // Create a queue to include best doses for each tested position.
                Queue <Best_Dose> dose_queue = new Queue <Best_Dose>((int)contig_length);
                #endregion Variables and file handles for current contig

                int  positions_to_compute = 0;
                long current_position     = 0;

                #region Run over every position in contig
                while (current_position < contig_length)
                {
                    if ((current_position % 1000000) == 0 && current_position != 0)
                    {
                        Console.WriteLine("At position {0} of {1}", current_position + 1, contig_length);
                    }

                    // Search for reads starting at current position.
                    Search_Reads(parser, ref next_alignment, contig_name, ref number_of_aligned_reads,
                                 ref number_of_aligned_base_pairs, ref number_of_used_reads, ref number_of_used_base_pairs,
                                 read_queue, current_position);

                    if (read_queue.Count > 0)
                    {
                        positions_to_compute++;

                        // Extract information from each read in queue.
                        byte[] obs_nucs;
                        byte[] is_GG;
                        bool[] reverse;
                        int[]  quality_scores;
                        int[]  neigh_quality_scores;
                        int[]  scores;
                        int[]  counts;
                        int    k;

                        Extract_Read_Info(read_queue, current_position, out obs_nucs, out is_GG, out reverse,
                                          out quality_scores, out neigh_quality_scores, out scores, out counts, out k);

                        // Find two most abundant nucleotides for this position.
                        byte nuc_one;
                        byte nuc_two;
                        Get_Two_Nucs(scores, out nuc_one, out nuc_two);

                        // Calculate Pr(obs|allele1) and Pr(obs|allele2).
                        double[][] log_nuc_probs = Obs_Probs(log_probs, log_subst_probs, obs_nucs, is_GG, reverse,
                                                             quality_scores, neigh_quality_scores, counts, k, nuc_one, nuc_two);

                        // Calculate log_likelihoods of genotypes for current position.
                        double[][] log_likelihoods = Log_Likelihoods(min_ploidy, max_ploidy, log_nuc_probs, nuc_props);

                        // Calculate log_likelihood of each ploidy and keep most likely allele dosage.
                        Global_Likelihood_Keep_Dose(min_ploidy, number_of_ploidies, dose_probs, global_log_like,
                                                    dose_queue, current_position, counts, nuc_one, nuc_two, log_likelihoods);
                    }

                    // Remove finished reads from queue. Finished reads no longer overlap with current position.
                    Padded_Read read_to_remove;
                    if (read_queue.Count > 0)
                    {
                        read_to_remove = read_queue.First();
                    }
                    else
                    {
                        read_to_remove = null;
                    }

                    while (read_to_remove != null &&
                           (read_to_remove.alignment.Pos + read_to_remove.alignment_length - 2) < current_position)
                    {
                        read_queue.Dequeue();
                        if (read_queue.Count > 0)
                        {
                            read_to_remove = read_queue.First();
                        }
                        else
                        {
                            read_to_remove = null;
                        }
                    }

                    ++current_position;
                }
                #endregion Run over every position in contig

                // Output log_likelihoods.
                int best_log_like = 0;
                for (int i = 0; i < number_of_ploidies; i++)
                {
                    if (global_log_like[i] > global_log_like[best_log_like])
                    {
                        best_log_like = i;
                    }

                    if (splitContigs)
                    {
                        writer_log_like.WriteLine("Ploidy {0} - log_likelihood {1}", i + min_ploidy, global_log_like[i]);
                    }
                }

                // Output most likely ploidy.
                int best_ploidy = best_log_like + min_ploidy;
                if (splitContigs)
                {
                    writer_ploidy.WriteLine(best_ploidy);
                }
                else
                {
                    writer_ploidy.Write("{0}\t{1}", contig_name, best_ploidy);
                    for (int i = 0; i < number_of_ploidies; i++)
                    {
                        writer_ploidy.Write("\t{0}", global_log_like[i]);
                    }
                    writer_ploidy.WriteLine("");
                }

                // Output SNPs.
                if (splitContigs)
                {
                    writer_SNP.WriteLine("Position\tAlleles\tCounts\tDosage\tPhredQuality");
                }
                char[] nuc_chars = new char[4] {
                    'A', 'C', 'G', 'T'
                };
                foreach (Best_Dose cur_doses in dose_queue)
                {
                    double cur_SNP_posterior = cur_doses.SNP_posterior[best_log_like];
                    if (cur_SNP_posterior <= log_SNP_thres)
                    {
                        int cur_best_dose = cur_doses.best_dose[best_log_like];
                        if (cur_best_dose != best_ploidy && cur_best_dose != 0)
                        {
                            if (splitContigs)
                            {
                                writer_SNP.WriteLine("{0}\t{1}|{2}\t{3}|{4}\t{5}\t{6}", cur_doses.position + 1,
                                                     nuc_chars[cur_doses.nuc_one], nuc_chars[cur_doses.nuc_two], cur_doses.count_one,
                                                     cur_doses.count_two, cur_best_dose, -10 * cur_SNP_posterior / Math.Log(10));
                            }
                            else
                            {
                                writer_SNP.WriteLine("{0}\t{1}\t{2}|{3}\t{4}|{5}\t{6}\t{7}", contig_name,
                                                     cur_doses.position + 1, nuc_chars[cur_doses.nuc_one], nuc_chars[cur_doses.nuc_two],
                                                     cur_doses.count_one, cur_doses.count_two, cur_best_dose,
                                                     -10 * cur_SNP_posterior / Math.Log(10));
                            }
                        }
                    }
                }

                // Output read statistics.
                if (splitContigs)
                {
                    writer_reads.WriteLine("\nNumber of aligned reads: {0}", number_of_aligned_reads);
                    writer_reads.WriteLine("Number of aligned base pairs: {0}", number_of_aligned_base_pairs);
                    writer_reads.WriteLine("\nNumber of used reads: {0}", number_of_used_reads);
                    writer_reads.WriteLine("Number of used base pairs: {0}", number_of_used_base_pairs);
                }
                else
                {
                    writer_reads.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", contig_name, number_of_aligned_reads,
                                           number_of_aligned_base_pairs, number_of_used_reads, number_of_used_base_pairs);
                }

                if (splitContigs)
                {
                    writer_log_like.Close();
                    writer_SNP.Close();
                    writer_ploidy.Close();
                    writer_reads.Close();
                }

                clock.Stop();
                Console.WriteLine("Time to run contig: {0} s\n", (double)clock.ElapsedMilliseconds / 1000);
            }

            if (!splitContigs)
            {
                writer_SNP.Close();
                writer_ploidy.Close();
                writer_reads.Close();
            }

            parser.Dispose();
            Console.WriteLine("Finished at {0}\n", DateTime.Now);
        }
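
A minimal driver sketch for Example 5. The class name ConPADE is taken from the summary comment; its constructor and how max_ploidy, snpDens, SNPthres and splitContigs get configured are assumptions, so treat this as illustrative only:

        // Hypothetical driver (not in the original source): run the analysis on
        // a single BAM file whose path is passed on the command line. The
        // ConPADE class name and its parameterless constructor are assumptions.
        public static void Main(string[] args)
        {
            ConPADE conpade = new ConPADE();
            conpade.RunFile(args[0]);
        }
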