Пример #1
0
        /// <summary>
        /// Reads the alignments of one chromosome from an indexed bam file and records the start position
        /// of every read that passes the filters below in <paramref name="observed"/>.
        /// </summary>
        /// <param name="bamFile">Path of the bam file to read alignments from; its index must exist at <c>bamFile + ".bai"</c>.</param>
        /// <param name="isPairedEnd">When true, reads that are not flagged as a proper pair are skipped.</param>
        /// <param name="chromosome">Name of the reference sequence whose reads are loaded.</param>
        /// <param name="coverageMode">
        /// Selects how a hit is recorded: Binary writes 1 into <c>observed.Data</c>, every other mode calls
        /// <c>observed.Set</c>. GCContentWeighted additionally fills <paramref name="fragmentLengths"/>.
        /// </param>
        /// <param name="observed">Hit array, indexed by alignment start position, that receives the hits.</param>
        /// <param name="fragmentLengths">
        /// Fragment length per alignment start position, clamped to [0, Int16.MaxValue]. Only written when
        /// <paramref name="coverageMode"/> is GCContentWeighted.
        /// </param>
        /// <exception cref="FileNotFoundException">The bam index file does not exist.</exception>
        /// <exception cref="ApplicationException">The reference index of <paramref name="chromosome"/> could not be retrieved from the bam file.</exception>
        static void LoadObservedAlignmentsBAM(string bamFile, bool isPairedEnd, string chromosome, CanvasCoverageMode coverageMode, HitArray observed, Int16[] fragmentLengths)
        {
            // Sanity check: The .bai file must exist, in order for us to seek to our target chromosome!
            string indexPath = bamFile + ".bai";
            if (!File.Exists(indexPath))
            {
                throw new FileNotFoundException(string.Format("Fatal error: Bam index not found at {0}", indexPath), indexPath);
            }

            using (BamReader reader = new BamReader(bamFile))
            {
                int desiredRefIndex = reader.GetReferenceIndex(chromosome);
                if (desiredRefIndex == -1)
                {
                    throw new ApplicationException(
                        string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", chromosome,
                        bamFile));
                }
                if (!reader.Jump(desiredRefIndex, 0))
                {
                    // Note: This is not necessarily an error, it just means that there *are* no reads for this chromosome in this 
                    // .bam file.  That is not uncommon e.g. for truseq amplicon.
                    return;
                }
                int readCount = 0;
                int keptReadCount = 0;
                // NOTE(review): the returned header text was never used. The call itself is kept because this
                // block cannot see whether GetHeader() has side effects - confirm and remove if it has none.
                reader.GetHeader();
                BamAlignment alignment = new BamAlignment();
                while (reader.GetNextAlignment(ref alignment, true))
                {
                    // Quit as soon as a read belongs to a different reference than the desired one. This is
                    // checked before the flag filters so that filtered reads of the following chromosomes are
                    // neither read nor counted. It also covers reads without a reference (RefID == -1),
                    // because desiredRefIndex is never -1 here.
                    if (alignment.RefID != desiredRefIndex)
                    {
                        break;
                    }

                    readCount++;

                    // Flag check - Require reads to be aligned, passing filter, non-duplicate, forward-strand
                    // main alignments:
                    if (!alignment.IsMapped()) continue;
                    if (alignment.IsFailedQC()) continue;
                    if (alignment.IsDuplicate()) continue;
                    if (alignment.IsReverseStrand()) continue;
                    if (!alignment.IsMainAlignment()) continue;

                    // Require the alignment to start with 35 bases of non-indel:
                    // NOTE(review): assumes every mapped read carries at least one CIGAR operation - confirm.
                    if (alignment.CigarData[0].Type != 'M' || alignment.CigarData[0].Length < 35) continue;

                    if (isPairedEnd && !alignment.IsProperPair()) continue;

                    keptReadCount++;
                    if (coverageMode == CanvasCoverageMode.Binary)
                    {
                        observed.Data[alignment.Position] = 1;
                    }
                    else
                    {
                        observed.Set(alignment.Position);
                    }
                    // store fragment size, make sure it's within Int16 range and is positive (simplification for now)
                    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                    {
                        fragmentLengths[alignment.Position] = Convert.ToInt16(Math.Max(Math.Min(Int16.MaxValue, alignment.FragmentLength), 0));
                    }
                }
                Console.WriteLine("Kept {0} of {1} total reads", keptReadCount, readCount);
            }
        }
Пример #2
0
        /// <summary>
        /// Step 2: Get the ref and variant allele frequencies for the variants of interest, in the tumor bam file.
        /// Streams the reads of <c>this.Chromosome</c> and passes every read that may reach the next pending
        /// variant to <c>ProcessReadBases</c>.
        /// </summary>
        /// <param name="bamPath">Path of the bam file to read.</param>
        /// <exception cref="ArgumentException">The bam file has no reference index for <c>this.Chromosome</c>.</exception>
        protected void ProcessBamFile(string bamPath)
        {
            // A progress line is written every time this many records have been seen.
            const int progressInterval = 1000000;
            // A read starting more than this many positions before the next variant is not examined.
            const int readReach = 1000;

            Console.WriteLine("{0} Looping over bam records from {1}", DateTime.Now, bamPath);
            int recordsSeen = 0;
            int variantCursor = 0;
            using (BamReader reader = new BamReader(bamPath))
            {
                int targetRefId = reader.GetReferenceIndex(this.Chromosome);
                if (targetRefId < 0)
                {
                    throw new ArgumentException(string.Format("Error: Chromosome name '{0}' does not match bam file at '{1}'", this.Chromosome, bamPath));
                }
                Console.WriteLine("Jump to refid {0} {1}", targetRefId, this.Chromosome);
                // The result of the jump is not inspected; reads of earlier references are skipped in the loop.
                reader.Jump(targetRefId, 0);

                BamAlignment read = new BamAlignment();
                while (reader.GetNextAlignment(ref read, false))
                {
                    // Stop once we have run past the chromosome of interest.
                    if (!read.HasPosition() || read.RefID > targetRefId)
                    {
                        break;
                    }
                    // Ignore anything that still belongs to an earlier chromosome.
                    if (read.RefID < targetRefId)
                    {
                        continue;
                    }

                    recordsSeen++;
                    if (recordsSeen % progressInterval == 0)
                    {
                        Console.WriteLine("Record {0} at {1}...", recordsSeen, read.Position);
                    }

                    // Only primary, mapped, non-duplicate reads above the mapping quality cutoff are counted.
                    if (!read.IsPrimaryAlignment() || !read.IsMapped() || read.IsDuplicate() || read.MapQuality <= MinimumMapQ)
                    {
                        continue;
                    }

                    // Advance the variant cursor past all variants that lie before the start of this read.
                    var readStart = read.Position;
                    while (variantCursor < this.Variants.Count && this.Variants[variantCursor].ReferencePosition < readStart)
                    {
                        variantCursor++;
                    }
                    // No variants left: nothing more to count.
                    if (variantCursor >= this.Variants.Count)
                    {
                        break;
                    }

                    // Skip reads that start too far upstream to have a reasonable chance of touching the next variant.
                    if (readStart + readReach < this.Variants[variantCursor].ReferencePosition)
                    {
                        continue;
                    }

                    // This read potentially overlaps the next variant (and further variants).
                    ProcessReadBases(read, variantCursor);
                }
            }
            Console.WriteLine("Looped over {0} bam records in all", recordsSeen);
        }
Пример #3
0
            /// <summary>
            /// Bins fragments: seeks to <c>Chromosome</c> in the indexed BAM file and passes each of its
            /// alignments to <c>BinOneAlignment</c>. <c>usableFragmentCount</c> is reset to 0 first.
            /// Returns without binning anything when the reader cannot jump to the chromosome.
            /// </summary>
            /// <exception cref="System.IO.FileNotFoundException">The BAM index file does not exist.</exception>
            /// <exception cref="ApplicationException">
            /// The reference index of the chromosome cannot be retrieved, an alignment starts before its
            /// predecessor (BAM not sorted), or no paired alignment was seen on the chromosome.
            /// </exception>
            private void binFragments()
            {
                // Sanity check: The BAM index file must exist, in order for us to seek to our target chromosome!
                if (!Bam.Index.Exists)
                {
                    throw new System.IO.FileNotFoundException(
                        string.Format("Fatal error: Bam index not found at {0}", Bam.Index.FullName));
                }

                long pairedAlignmentCount = 0; // keep track of paired alignments
                usableFragmentCount = 0;
                using (BamReader reader = new BamReader(Bam.BamFile.FullName))
                {
                    int desiredRefIndex = reader.GetReferenceIndex(Chromosome);
                    if (desiredRefIndex == -1)
                    {
                        throw new ApplicationException(
                            string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", Chromosome, Bam.BamFile.FullName));
                    }
                    if (!reader.Jump(desiredRefIndex, 0))
                    {
                        // Note: This is not necessarily an error, it just means that there *are* no reads for this chromosome in this
                        // .bam file.  That is not uncommon e.g. for truseq amplicon.
                        return;
                    }

                    Dictionary<string, int> readNameToBinIndex = new Dictionary<string, int>();
                    HashSet<string> samePositionReadNames = new HashSet<string>();
                    int binIndexStart = 0;
                    int prevPosition = -1;
                    BamAlignment alignment = new BamAlignment();
                    while (reader.GetNextAlignment(ref alignment, true))
                    {
                        // Quit if the current reference index is different from the desired reference index.
                        // This also covers reads without a reference (RefID == -1), because desiredRefIndex
                        // is never -1 here; a separate check for -1 would be unreachable.
                        if (alignment.RefID != desiredRefIndex)
                        {
                            break;
                        }

                        if (alignment.Position < prevPosition) // Make sure the BAM is properly sorted
                        {
                            throw new ApplicationException(
                                string.Format("The alignment on {0} are not properly sorted in {1}: {2}", Chromosome, Bam.BamFile.FullName, alignment.Name));
                        }
                        prevPosition = alignment.Position;

                        if (alignment.IsPaired()) { pairedAlignmentCount++; }

                        BinOneAlignment(alignment, FragmentBinnerConstants.MappingQualityThreshold, readNameToBinIndex,
                            samePositionReadNames, ref usableFragmentCount, Bins, ref binIndexStart);
                    }
                }
                // Reached only when the jump succeeded: a chromosome with reads but no paired ones is an error.
                if (pairedAlignmentCount == 0)
                {
                    throw new ApplicationException(string.Format("No paired alignments found for {0} in {1}", Chromosome, Bam.BamFile.FullName));
                }
            }