Пример #1
0
        /// <summary>
        /// Loads one chromosome of the reference and registers its per-position tracking arrays in the three
        /// supplied dictionaries, keyed by chromosome name.
        /// </summary>
        /// <param name="fastaFile">Fasta file containing uniquemer-marked reference genome.</param>
        /// <param name="chromosome">Name of the sequence to load; also used as the dictionary key.</param>
        /// <param name="coverageMode">Only GCContentWeighted gets a full-length fragment length array.</param>
        /// <param name="possibleAlignments">Receives a BitArray with one bit per base, set where the base is upper-case.</param>
        /// <param name="observedAlignments">Receives a new HitArray constructed with the sequence length.</param>
        /// <param name="fragmentLengths">Receives an Int16 array: one slot per base in GC-weighted mode, otherwise empty.</param>
        static void InitializeAlignmentArrays(string fastaFile, string chromosome, CanvasCoverageMode coverageMode, IDictionary <string, BitArray> possibleAlignments, IDictionary <string, HitArray> observedAlignments, IDictionary <string, Int16[]> fragmentLengths)
        {
            string sequence = FastaLoader.LoadFastaSequence(fastaFile, chromosome);
            int sequenceLength = sequence.Length;

            BitArray uniquePositions = new BitArray(sequenceLength);
            possibleAlignments[chromosome] = uniquePositions;
            observedAlignments[chromosome] = new HitArray(sequenceLength);

            // Fragment lengths are only tracked per position when weighting by read GC content.
            bool tracksFragmentLengths = coverageMode == CanvasCoverageMode.GCContentWeighted;
            fragmentLengths[chromosome] = new Int16[tracksFragmentLengths ? sequenceLength : 0];

            // Upper-case letters in the fasta mark the unique k-mers.
            int index = 0;
            foreach (char nucleotide in sequence)
            {
                if (char.IsUpper(nucleotide))
                {
                    uniquePositions.Set(index, true);
                }
                index++;
            }
        }
Пример #2
0
        /// <summary>
        /// Calculates how many possible alignments corresponds to the desired number of observed alignments per bin.
        /// For every autosome (restricted to the manifest regions when a manifest is given) the ratio of observed
        /// to possible alignment positions is computed by a task handed to DoWorkParallelThreads; the median of
        /// those ratios converts the desired count into a number of possible positions.
        /// </summary>
        /// <param name="countsPerBin">Desired number of observed alignments per bin.</param>
        /// <param name="possibleAlignments">BitArrays of possible alignments (unique mers).</param>
        /// <param name="observedAlignments">HitArrays storing the observed alignments; looked up by the same chromosome keys.</param>
        /// <param name="manifest">Optional manifest; when supplied, chromosomes without manifest regions are skipped.</param>
        /// <returns><paramref name="countsPerBin"/> divided by the median observed/possible rate, truncated to an int.</returns>
        static int CalculateNumberOfPossibleAlignmentsPerBin(int countsPerBin, Dictionary <string, BitArray> possibleAlignments,
                                                             Dictionary <string, HitArray> observedAlignments, NexteraManifest manifest = null)
        {
            List <double> rates = new List <double>();

            Dictionary <string, List <NexteraManifest.ManifestRegion> > manifestRegionsByChrom = null;

            if (manifest != null)
            {
                manifestRegionsByChrom = manifest.GetManifestRegionsByChromosome();
            }

            List <ThreadStart> tasks = new List <ThreadStart>();

            foreach (KeyValuePair <string, BitArray> entry in possibleAlignments)
            {
                string chr = entry.Key;

                // We don't want to include the sex chromosomes because they may not be copy number 2
                if (!GenomeMetadata.SequenceMetadata.IsAutosome(chr))
                {
                    continue;
                }
                HitArray observed = observedAlignments[chr];
                BitArray possible = entry.Value;
                List <NexteraManifest.ManifestRegion> regions = null;

                // With a manifest, only chromosomes that have manifest regions contribute a rate.
                if (manifestRegionsByChrom != null && !manifestRegionsByChrom.TryGetValue(chr, out regions))
                {
                    continue;
                }
                tasks.Add(new ThreadStart(() =>
                {
                    int numberObserved = observed.CountSetBits(regions);
                    int numberPossible = CountSetBits(possible, regions);

                    // The rate is undefined without any possible position; dividing would yield NaN or
                    // Infinity and corrupt the median, so such chromosomes contribute nothing.
                    if (numberPossible == 0)
                    {
                        return;
                    }

                    double rate = numberObserved / (double)numberPossible;

                    // rates is shared between all tasks
                    lock (rates)
                    {
                        rates.Add(rate);
                    }
                }));
            }

            Console.WriteLine("Launch CalculateNumberOfPossibleAlignmentsPerBin jobs...");
            Console.Out.WriteLine();
            //Parallel.ForEach(tasks, t => { t.Invoke(); }); //todo allow controling degree of parallelism
            Isas.Shared.Utilities.DoWorkParallelThreads(tasks);
            Console.WriteLine("CalculateNumberOfPossibleAlignmentsPerBin jobs complete.");
            Console.Out.WriteLine();
            double medianRate = CanvasCommon.Utilities.Median(rates);

            // NOTE(review): a median rate of 0 (no observed alignments) makes the quotient infinite and the
            // int conversion meaningless - confirm how callers want that case reported.
            return((int)(countsPerBin / medianRate));
        }
Пример #3
0
 /// <summary>
 /// Hands the stored alignment data back as dictionaries keyed the same way as the stored collections.
 /// </summary>
 /// <param name="possibleAlignments">Result of Convert applied to the stored possible alignments and their last-byte bit counts.</param>
 /// <param name="observedAlignments">A new HitArray per key, each constructed from the stored observed-alignment entry.</param>
 /// <param name="fragmentLengths">The stored fragment length arrays; the array instances are shared, not copied.</param>
 public void GetData(out Dictionary <string, BitArray> possibleAlignments, out Dictionary <string, HitArray> observedAlignments, out Dictionary <string, Int16[]> fragmentLengths)
 {
     possibleAlignments = Convert(PossibleAlignments, BitsInLastBytePossibleAlignments);

     // Wrap each stored observed-alignment entry in a HitArray.
     observedAlignments = new Dictionary <string, HitArray>();
     foreach (string chromosome in ObservedAlignments.Keys)
     {
         HitArray hits = new HitArray(ObservedAlignments[chromosome]);
         observedAlignments.Add(chromosome, hits);
     }

     // Fragment length arrays are passed through by reference.
     fragmentLengths = new Dictionary <string, Int16[]>();
     foreach (string chromosome in FragmentLengths.Keys)
     {
         Int16[] lengths = FragmentLengths[chromosome];
         fragmentLengths.Add(chromosome, lengths);
     }
 }
Пример #4
0
 /// <summary>
 /// Clears every observed entry whose position is not flagged as a possible alignment.
 /// Chromosomes that have no entry in <paramref name="observedAlignments"/> are skipped.
 /// </summary>
 /// <param name="observedAlignments">HitArrays of observed alignment positions; their Data is modified in place.</param>
 /// <param name="possibleAlignments">BitArrays of possible alignment positions.</param>
 static void ScreenObservedTags(IDictionary <string, HitArray> observedAlignments, IDictionary <string, BitArray> possibleAlignments)
 {
     foreach (string chromosome in possibleAlignments.Keys)
     {
         HitArray hits;
         if (!observedAlignments.TryGetValue(chromosome, out hits))
         {
             continue;
         }

         BitArray allowed = possibleAlignments[chromosome];
         int position = 0;
         while (position < allowed.Length)
         {
             // Anything observed at a position that is not possible is reset to 0.
             if (!allowed[position])
             {
                 hits.Data[position] = 0;
             }
             position++;
         }
     }
 }
Пример #5
0
        /// <summary>
        /// Populate the list of GenomicBin objects for this chromosome.
        /// When <c>arguments.Bins</c> already contains bins, their GC and Count values are filled in place;
        /// otherwise a new bin is appended each time <c>arguments.BinSize</c> possible alignment positions
        /// have been seen.
        /// </summary>
        /// <param name="arguments">
        /// Per-chromosome inputs: the fasta entry, possible/observed alignments, coverage mode, bin size and,
        /// for GCContentWeighted mode, the read GC content and observed-vs-expected GC lookup.
        /// </param>
        static void BinCountsForChromosome(BinTaskArguments arguments)
        {
            List <GenomicBin> bins = arguments.Bins;
            bool usePredefinedBins = bins.Any();
            int predefinedBinIndex = 0;
            GenericRead fastaEntry = arguments.FastaEntry;
            BinState currentBin = new BinState();
            string chr = arguments.Chromosome;
            BitArray possibleAlignments = arguments.PossibleAlignments;
            HitArray observedAlignments = arguments.ObservedAlignments;
            CanvasCoverageMode coverageMode = arguments.CoverageMode;
            int pos = usePredefinedBins ? bins[predefinedBinIndex].Start : 0;

            // Skip past leading Ns. The bounds check keeps a sequence made up entirely of 'n'
            // from running off the end of the bases.
            while (pos < fastaEntry.Bases.Length && fastaEntry.Bases[pos].Equals('n'))
            {
                pos++;
            }
            List <float> binPositions = new List <float>();
            List <int> binObservations = new List <int>();

            for (; pos < fastaEntry.Bases.Length; pos++)
            {
                // Sets the start of the bin
                if (currentBin.StartPosition == -1)
                {
                    currentBin.StartPosition = pos;
                }

                // Compare char to char: comparing against the string "n" is always unequal,
                // which counted every 'n' as a nucleotide and diluted the GC percentage.
                if (fastaEntry.Bases[pos] != 'n')
                {
                    currentBin.NucleotideCount++;
                }

                switch (fastaEntry.Bases[pos])
                {
                case 'C':
                case 'c':
                case 'G':
                case 'g':
                    currentBin.GCCount++;
                    break;
                }

                if (possibleAlignments[pos])
                {
                    currentBin.PossibleCount++;
                    currentBin.ObservedCount += observedAlignments.Data[pos];
                    binObservations.Add(observedAlignments.Data[pos]);
                    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                    {
                        // Weight looked up by the read GC content recorded at this position.
                        binPositions.Add(arguments.ObservedVsExpectedGC[arguments.ReadGCContent[pos]]);
                    }
                }

                // We've seen the desired number of possible alignment positions.
                if ((!usePredefinedBins && currentBin.PossibleCount == arguments.BinSize) ||
                    (usePredefinedBins && pos == bins[predefinedBinIndex].Stop - 1))
                {
                    if (coverageMode == CanvasCoverageMode.TruncatedDynamicRange) // Truncated dynamic range
                    {
                        // Recompute the count with each position capped at 10.
                        currentBin.ObservedCount = 0;
                        foreach (int Value in binObservations)
                        {
                            currentBin.ObservedCount += Math.Min(10, Value);
                        }
                    }
                    if (coverageMode == CanvasCoverageMode.GCContentWeighted) // read GC content weighted
                    {
                        // Recompute the count from per-position observations divided by their weight, capped at 10.
                        currentBin.ObservedCount = 0;
                        float tmpObservedCount = 0;
                        for (int i = 0; i < binObservations.Count; i++)
                        {
                            tmpObservedCount += Math.Min(10, (float)binObservations[i] / binPositions[i]);
                        }
                        currentBin.ObservedCount = (int)Math.Round(tmpObservedCount);
                    }

                    // A bin that holds only 'n' bases has no nucleotides; report 0% GC instead of dividing by zero.
                    int gc = currentBin.NucleotideCount == 0
                        ? 0
                        : (int)(100 * currentBin.GCCount / currentBin.NucleotideCount);

                    if (usePredefinedBins)
                    {
                        bins[predefinedBinIndex].GC = gc;
                        bins[predefinedBinIndex].Count = currentBin.ObservedCount;
                        predefinedBinIndex++;
                        if (predefinedBinIndex >= bins.Count)
                        {
                            break; // we have processed all the bins
                        }
                        pos = bins[predefinedBinIndex].Start - 1; // jump to right before the next predefined bin
                    }
                    else
                    {
                        // Note the pos + 1 to make the first three conform to bed specification
                        GenomicBin bin = new GenomicBin(chr, currentBin.StartPosition, pos + 1, gc, currentBin.ObservedCount);
                        bins.Add(bin);
                    }

                    // Reset all relevant variables
                    currentBin.Reset();
                    binObservations.Clear();
                    binPositions.Clear();
                }
            }
        }
Пример #6
0
        /// <summary>
        /// Reads the alignments for one chromosome from an indexed bam file and records the position of every
        /// read that passes the filters in <paramref name="observed"/>.
        /// </summary>
        /// <param name="bamFile">bam file to read alignments from; an index at bamFile + ".bai" must exist.</param>
        /// <param name="isPairedEnd">When true, reads that are not flagged as a proper pair are skipped.</param>
        /// <param name="chromosome">Name of the reference sequence whose reads are loaded.</param>
        /// <param name="coverageMode">Binary stores 1 at each hit position; other modes call Set on the HitArray. GCContentWeighted additionally stores fragment lengths.</param>
        /// <param name="observed">HitArray for this chromosome, updated in place.</param>
        /// <param name="fragmentLengths">Per-position fragment lengths, written only in GCContentWeighted mode.</param>
        /// <exception cref="FileNotFoundException">The .bai index file does not exist.</exception>
        /// <exception cref="ApplicationException">The reader returns -1 as the reference index of <paramref name="chromosome"/>.</exception>
        static void LoadObservedAlignmentsBAM(string bamFile, bool isPairedEnd, string chromosome, CanvasCoverageMode coverageMode, HitArray observed, Int16[] fragmentLengths)
        {
            // Sanity check: The .bai file must exist, in order for us to seek to our target chromosome!
            string indexPath = bamFile + ".bai";

            if (!File.Exists(indexPath))
            {
                throw new FileNotFoundException(string.Format("Fatal error: Bam index not found at {0}", indexPath), indexPath);
            }

            using (BamReader reader = new BamReader(bamFile))
            {
                int desiredRefIndex = reader.GetReferenceIndex(chromosome);
                if (desiredRefIndex == -1)
                {
                    throw new ApplicationException(
                              string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", chromosome,
                                            bamFile));
                }
                bool result = reader.Jump(desiredRefIndex, 0);
                if (!result)
                {
                    // Note: This is not necessarily an error, it just means that there *are* no reads for this chromosome in this
                    // .bam file.  That is not uncommon e.g. for truseq amplicon.
                    return;
                }
                int readCount = 0;
                int keptReadCount = 0;
                // NOTE(review): the header text is not used here; the call is kept in case the reader
                // relies on it - confirm and remove.
                reader.GetHeader();
                BamAlignment alignment = new BamAlignment();
                while (reader.GetNextAlignment(ref alignment, true))
                {
                    readCount++;

                    // Flag check - Require reads to be aligned, passing filter, non-duplicate,
                    // forward-strand main alignments:
                    if (!alignment.IsMapped())
                    {
                        continue;
                    }
                    if (alignment.IsFailedQC())
                    {
                        continue;
                    }
                    if (alignment.IsDuplicate())
                    {
                        continue;
                    }
                    if (alignment.IsReverseStrand())
                    {
                        continue;
                    }
                    if (!alignment.IsMainAlignment())
                    {
                        continue;
                    }

                    // Require the alignment to start with 35 bases of non-indel:
                    // NOTE(review): assumes CigarData holds at least one operation for a mapped read - confirm.
                    if (alignment.CigarData[0].Type != 'M' || alignment.CigarData[0].Length < 35)
                    {
                        continue;
                    }

                    if (isPairedEnd && !alignment.IsProperPair())
                    {
                        continue;
                    }

                    // Quit if the current reference index is different from the desired reference index.
                    // desiredRefIndex is never -1 at this point, so no separate check for an unset RefID is needed.
                    // NOTE(review): reads on another reference that fail the filters above are skipped rather
                    // than ending the scan; testing RefID first would stop sooner - confirm before reordering.
                    if (alignment.RefID != desiredRefIndex)
                    {
                        break;
                    }

                    keptReadCount++;
                    if (coverageMode == CanvasCoverageMode.Binary)
                    {
                        observed.Data[alignment.Position] = 1;
                    }
                    else
                    {
                        observed.Set(alignment.Position);
                    }
                    // store fragment size, make sure it's within Int16 range and is positive (simplification for now)
                    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                    {
                        fragmentLengths[alignment.Position] = Convert.ToInt16(Math.Max(Math.Min(Int16.MaxValue, alignment.FragmentLength), 0));
                    }
                }
                Console.WriteLine("Kept {0} of {1} total reads", keptReadCount, readCount);
            }
        }
Пример #7
0
 /// <summary>
 /// Hands the stored alignment data back as dictionaries keyed the same way as the stored collections.
 /// </summary>
 /// <param name="possibleAlignments">Result of Convert applied to the stored possible alignments and their last-byte bit counts.</param>
 /// <param name="observedAlignments">A new HitArray per key, each constructed from the stored observed-alignment entry.</param>
 /// <param name="fragmentLengths">The stored fragment length arrays; the array instances are shared, not copied.</param>
 public void GetData(out Dictionary<string, BitArray> possibleAlignments, out Dictionary<string, HitArray> observedAlignments, out Dictionary<string, Int16[]> fragmentLengths)
 {
     possibleAlignments = Convert(PossibleAlignments, BitsInLastBytePossibleAlignments);

     // Wrap each stored observed-alignment entry in a HitArray.
     observedAlignments = new Dictionary<string, HitArray>();
     foreach (string chromosome in ObservedAlignments.Keys)
     {
         HitArray hits = new HitArray(ObservedAlignments[chromosome]);
         observedAlignments.Add(chromosome, hits);
     }

     // Fragment length arrays are passed through by reference.
     fragmentLengths = new Dictionary<string, Int16[]>();
     foreach (string chromosome in FragmentLengths.Keys)
     {
         Int16[] lengths = FragmentLengths[chromosome];
         fragmentLengths.Add(chromosome, lengths);
     }
 }
Пример #8
0
        /// <summary>
        /// Reads the alignments for one chromosome from an indexed bam file and records the position of every
        /// read that passes the filters in <paramref name="observed"/>.
        /// </summary>
        /// <param name="bamFile">bam file to read alignments from; an index at bamFile + ".bai" must exist.</param>
        /// <param name="isPairedEnd">When true, reads that are not flagged as a proper pair are skipped.</param>
        /// <param name="chromosome">Name of the reference sequence whose reads are loaded.</param>
        /// <param name="coverageMode">Binary stores 1 at each hit position; other modes call Set on the HitArray. GCContentWeighted additionally stores fragment lengths.</param>
        /// <param name="observed">HitArray for this chromosome, updated in place.</param>
        /// <param name="fragmentLengths">Per-position fragment lengths, written only in GCContentWeighted mode.</param>
        /// <exception cref="FileNotFoundException">The .bai index file does not exist.</exception>
        /// <exception cref="ApplicationException">The reader returns -1 as the reference index of <paramref name="chromosome"/>.</exception>
        static void LoadObservedAlignmentsBAM(string bamFile, bool isPairedEnd, string chromosome, CanvasCoverageMode coverageMode, HitArray observed, Int16[] fragmentLengths)
        {
            // Sanity check: The .bai file must exist, in order for us to seek to our target chromosome!
            string indexPath = bamFile + ".bai";
            if (!File.Exists(indexPath))
            {
                throw new FileNotFoundException(string.Format("Fatal error: Bam index not found at {0}", indexPath), indexPath);
            }

            using (BamReader reader = new BamReader(bamFile))
            {
                int desiredRefIndex = reader.GetReferenceIndex(chromosome);
                if (desiredRefIndex == -1)
                {
                    throw new ApplicationException(
                        string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", chromosome,
                        bamFile));
                }
                bool result = reader.Jump(desiredRefIndex, 0);
                if (!result)
                {
                    // Note: This is not necessarily an error, it just means that there *are* no reads for this chromosome in this
                    // .bam file.  That is not uncommon e.g. for truseq amplicon.
                    return;
                }
                int readCount = 0;
                int keptReadCount = 0;
                // NOTE(review): the header text is not used here; the call is kept in case the reader
                // relies on it - confirm and remove.
                reader.GetHeader();
                BamAlignment alignment = new BamAlignment();
                while (reader.GetNextAlignment(ref alignment, true))
                {
                    readCount++;

                    // Flag check - Require reads to be aligned, passing filter, non-duplicate,
                    // forward-strand main alignments:
                    if (!alignment.IsMapped()) continue;
                    if (alignment.IsFailedQC()) continue;
                    if (alignment.IsDuplicate()) continue;
                    if (alignment.IsReverseStrand()) continue;
                    if (!alignment.IsMainAlignment()) continue;

                    // Require the alignment to start with 35 bases of non-indel:
                    // NOTE(review): assumes CigarData holds at least one operation for a mapped read - confirm.
                    if (alignment.CigarData[0].Type != 'M' || alignment.CigarData[0].Length < 35) continue;

                    if (isPairedEnd && !alignment.IsProperPair()) continue;

                    // Quit if the current reference index is different from the desired reference index.
                    // desiredRefIndex is never -1 at this point, so no separate check for an unset RefID is needed.
                    // NOTE(review): reads on another reference that fail the filters above are skipped rather
                    // than ending the scan; testing RefID first would stop sooner - confirm before reordering.
                    if (alignment.RefID != desiredRefIndex)
                    {
                        break;
                    }

                    keptReadCount++;
                    if (coverageMode == CanvasCoverageMode.Binary)
                    {
                        observed.Data[alignment.Position] = 1;
                    }
                    else
                    {
                        observed.Set(alignment.Position);
                    }
                    // store fragment size, make sure it's within Int16 range and is positive (simplification for now)
                    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                    {
                        fragmentLengths[alignment.Position] = Convert.ToInt16(Math.Max(Math.Min(Int16.MaxValue, alignment.FragmentLength), 0));
                    }
                }
                Console.WriteLine("Kept {0} of {1} total reads", keptReadCount, readCount);
            }
        }
Пример #9
0
        /// <summary>
        /// Loads one chromosome of the reference and stores its per-position tracking arrays in the three
        /// supplied dictionaries under the chromosome name.
        /// </summary>
        /// <param name="fastaFile">Fasta file containing uniquemer-marked reference genome.</param>
        /// <param name="chromosome">Name of the sequence to load; also used as the dictionary key.</param>
        /// <param name="coverageMode">Only GCContentWeighted gets a full-length fragment length array.</param>
        /// <param name="possibleAlignments">Receives a BitArray with one bit per base, set where the base is upper-case.</param>
        /// <param name="observedAlignments">Receives a new HitArray constructed with the sequence length.</param>
        /// <param name="fragmentLengths">Receives an Int16 array: one slot per base in GC-weighted mode, otherwise empty.</param>
        static void InitializeAlignmentArrays(string fastaFile, string chromosome, CanvasCoverageMode coverageMode, IDictionary<string, BitArray> possibleAlignments, IDictionary<string, HitArray> observedAlignments, IDictionary<string, Int16[]> fragmentLengths)
        {
            string bases = FastaLoader.LoadFastaSequence(fastaFile, chromosome);
            int baseCount = bases.Length;

            BitArray uniqueFlags = new BitArray(baseCount);
            possibleAlignments[chromosome] = uniqueFlags;
            observedAlignments[chromosome] = new HitArray(baseCount);

            // Per-position fragment lengths are only needed when weighting by read GC content.
            int fragmentSlots = (coverageMode == CanvasCoverageMode.GCContentWeighted) ? baseCount : 0;
            fragmentLengths[chromosome] = new Int16[fragmentSlots];

            // Upper-case letters in the fasta mark the unique k-mers.
            for (int offset = 0; offset < baseCount; offset++)
            {
                if (!char.IsUpper(bases[offset]))
                {
                    continue;
                }
                uniqueFlags.Set(offset, true);
            }
        }