Пример #1
0
        /// <summary>
        /// Reads per-bin counts from a tab-delimited file opened through GzipReader and records which
        /// bins overlap a region of the manifest.
        /// </summary>
        /// <param name="binnedPath">Path handed to GzipReader; each line carries chromosome, start, stop and count columns.</param>
        /// <param name="manifest">Manifest whose per-chromosome regions decide what counts as on target.</param>
        /// <param name="binCounts">Receives the count column of every bin, in file order.</param>
        /// <param name="onTargetIndices">Receives the file-order indices of the bins that overlap a manifest region.</param>
        private static void LoadBinCounts(string binnedPath, NexteraManifest manifest, out List <double> binCounts,
                                          out List <int> onTargetIndices)
        {
            binCounts       = new List <double>();
            onTargetIndices = new List <int>();

            var manifestRegions = manifest.GetManifestRegionsByChromosome();
            List <NexteraManifest.ManifestRegion> chromRegions = null; // 1-based regions
            string activeChrom = null;
            int    cursor      = -1; // first region not yet known to end before the current bin

            using (GzipReader reader = new GzipReader(binnedPath))
            {
                int binNumber = 0;
                for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
                {
                    string[] fields   = row.Split('\t');
                    string   chrom    = fields[0];
                    int      binStart = int.Parse(fields[1]); // 0-based, inclusive
                    int      binStop  = int.Parse(fields[2]); // 0-based, exclusive

                    if (activeChrom != chrom)
                    {
                        // New chromosome: switch to its region list (null when the manifest has none)
                        // and rewind the cursor.
                        activeChrom = chrom;
                        if (manifestRegions.TryGetValue(chrom, out chromRegions))
                        {
                            cursor = 0;
                        }
                    }

                    if (chromRegions != null)
                    {
                        // Step past regions that end before the bin's first base (binStart + 1 in 1-based terms).
                        // The cursor never moves backwards, so this presumably relies on bins and regions
                        // being position-sorted within a chromosome -- confirm with the producers.
                        while (cursor < chromRegions.Count && chromRegions[cursor].End < binStart + 1)
                        {
                            cursor++;
                        }

                        // The region under the cursor overlaps when it starts no later than the bin's end.
                        if (cursor < chromRegions.Count && chromRegions[cursor].Start <= binStop)
                        {
                            onTargetIndices.Add(binNumber);
                        }
                    }

                    binCounts.Add(double.Parse(fields[3]));
                    binNumber++;
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Calculates how many possible alignments corresponds to the desired number of observed alignments per bin.
        /// </summary>
        /// <param name="countsPerBin">Desired number of observed alignments per bin.</param>
        /// <param name="possibleAlignments">BitArrays of possible alignments (unique mers).</param>
        /// <param name="observedAlignments">HitArrays storing the observed alignments.</param>
        /// <param name="manifest">Optional manifest. When supplied, counting is restricted to its regions and
        /// chromosomes without any manifest region are left out.</param>
        /// <returns>
        /// <paramref name="countsPerBin"/> divided by the median observed/possible alignment rate of the
        /// autosomes, truncated to an integer.
        /// </returns>
        static int CalculateNumberOfPossibleAlignmentsPerBin(int countsPerBin, Dictionary <string, BitArray> possibleAlignments,
                                                             Dictionary <string, HitArray> observedAlignments, NexteraManifest manifest = null)
        {
            List <double> rates = new List <double>();

            Dictionary <string, List <NexteraManifest.ManifestRegion> > manifestRegionsByChrom = null;

            if (manifest != null)
            {
                manifestRegionsByChrom = manifest.GetManifestRegionsByChromosome();
            }

            List <ThreadStart> tasks = new List <ThreadStart>();

            foreach (string chr in possibleAlignments.Keys)
            {
                // We don't want to include the sex chromosomes because they may not be copy number 2
                if (!GenomeMetadata.SequenceMetadata.IsAutosome(chr))
                {
                    continue;
                }
                HitArray observed = observedAlignments[chr];
                BitArray possible = possibleAlignments[chr];
                List <NexteraManifest.ManifestRegion> regions = null;
                if (manifestRegionsByChrom != null)
                {
                    if (!manifestRegionsByChrom.ContainsKey(chr))
                    {
                        continue;
                    }
                    regions = manifestRegionsByChrom[chr];
                }
                // One job per chromosome; each job contributes at most one rate.
                tasks.Add(new ThreadStart(() =>
                {
                    int numberObserved = observed.CountSetBits(regions);
                    int numberPossible = CountSetBits(possible, regions);

                    // Without any possible alignment the rate is undefined: the division below would
                    // produce NaN or Infinity and corrupt the median, so leave this chromosome out.
                    if (numberPossible == 0)
                    {
                        return;
                    }

                    double rate = numberObserved / (double)numberPossible;

                    // The jobs run on several threads; List<T> is not safe for concurrent writes.
                    lock (rates)
                    {
                        rates.Add(rate);
                    }
                }));
            }

            Console.WriteLine("Launch CalculateNumberOfPossibleAlignmentsPerBin jobs...");
            Console.Out.WriteLine();
            //Parallel.ForEach(tasks, t => { t.Invoke(); }); //todo allow controling degree of parallelism
            Isas.Shared.Utilities.DoWorkParallelThreads(tasks);
            Console.WriteLine("CalculateNumberOfPossibleAlignmentsPerBin jobs complete.");
            Console.Out.WriteLine();
            double medianRate = CanvasCommon.Utilities.Median(rates);

            return((int)(countsPerBin / medianRate));
        }
Пример #3
0
        /// <summary>
        /// Collects the count of every bin and the positions (in enumeration order) of the bins that
        /// overlap a region of the manifest.
        /// </summary>
        /// <param name="bins">Bins to scan; their Start/Stop are compared against 1-based manifest regions.</param>
        /// <param name="manifest">Manifest whose per-chromosome regions decide what counts as on target.</param>
        /// <param name="binCounts">Receives the Count of every bin, in enumeration order.</param>
        /// <param name="onTargetIndices">Receives the enumeration-order indices of the on-target bins.</param>
        private static void LoadBinCounts(IEnumerable <SampleGenomicBin> bins, NexteraManifest manifest,
                                          out List <double> binCounts, out List <int> onTargetIndices)
        {
            binCounts       = new List <double>();
            onTargetIndices = new List <int>();

            var manifestRegions = manifest.GetManifestRegionsByChromosome();
            List <NexteraManifest.ManifestRegion> chromRegions = null; // 1-based regions
            string activeChrom = null;
            int    cursor      = -1; // first region not yet known to end before the current bin
            int    binNumber   = 0;

            foreach (var bin in bins)
            {
                string chrom = bin.GenomicBin.Chromosome;
                if (activeChrom != chrom)
                {
                    // New chromosome: switch to its region list (null when the manifest has none)
                    // and rewind the cursor.
                    activeChrom = chrom;
                    if (manifestRegions.TryGetValue(chrom, out chromRegions))
                    {
                        cursor = 0;
                    }
                }

                if (chromRegions != null)
                {
                    // Step past regions that end before the bin's first base (Start + 1 in 1-based terms).
                    while (cursor < chromRegions.Count && chromRegions[cursor].End < bin.Start + 1)
                    {
                        cursor++;
                    }

                    // The region under the cursor overlaps when it starts no later than the bin's end.
                    if (cursor < chromRegions.Count && chromRegions[cursor].Start <= bin.Stop)
                    {
                        onTargetIndices.Add(binNumber);
                    }
                }

                binCounts.Add(bin.Count);
                binNumber++;
            }
        }
Пример #4
0
        /// <summary>
        /// Lazily filters the bins down to those that overlap a region of the manifest.
        /// </summary>
        /// <param name="bins">Bins to filter (0-based coordinates).</param>
        /// <param name="manifest">Manifest supplying the 1-based target regions per chromosome.</param>
        /// <returns>The on-target bins, in their original order.</returns>
        public static IEnumerable <SampleGenomicBin> GetOnTargetBins(IEnumerable <SampleGenomicBin> bins, NexteraManifest manifest)
        {
            var manifestRegions = manifest.GetManifestRegionsByChromosome();
            List <NexteraManifest.ManifestRegion> chromRegions = null; // 1-based regions
            string activeChrom = null;
            int    cursor      = -1; // first region not yet known to end before the current bin

            foreach (SampleGenomicBin bin in bins) // 0-based bins
            {
                string chrom = bin.GenomicBin.Chromosome;
                if (activeChrom != chrom)
                {
                    // New chromosome: switch to its region list (null when the manifest has none)
                    // and rewind the cursor.
                    activeChrom = chrom;
                    if (manifestRegions.TryGetValue(chrom, out chromRegions))
                    {
                        cursor = 0;
                    }
                }

                if (chromRegions == null)
                {
                    continue; // nothing targeted on this chromosome
                }

                // Step past regions that end before the bin's first base (Start + 1 in 1-based terms).
                while (cursor < chromRegions.Count && chromRegions[cursor].End < bin.Start + 1)
                {
                    cursor++;
                }

                // Emit the bin only when the region under the cursor starts no later than the bin's end.
                if (cursor < chromRegions.Count && chromRegions[cursor].Start <= bin.Stop)
                {
                    yield return(bin);
                }
            }
        }
Пример #5
0
        /// <summary>
        /// Lazily filters the bins down to those that overlap a region of the manifest.
        /// </summary>
        /// <param name="bins">Bins to filter (0-based coordinates).</param>
        /// <param name="manifest">Manifest supplying the 1-based target regions per chromosome.</param>
        /// <returns>The on-target bins, in their original order.</returns>
        public static IEnumerable<GenomicBin> GetOnTargetBins(IEnumerable<GenomicBin> bins, NexteraManifest manifest)
        {
            var manifestRegions = manifest.GetManifestRegionsByChromosome();
            List<NexteraManifest.ManifestRegion> chromRegions = null; // 1-based regions
            string activeChrom = null;
            int cursor = -1; // first region not yet known to end before the current bin

            foreach (GenomicBin bin in bins) // 0-based bins
            {
                string chrom = bin.Chromosome;
                if (activeChrom != chrom)
                {
                    // New chromosome: switch to its region list (null when the manifest has none)
                    // and rewind the cursor.
                    activeChrom = chrom;
                    if (manifestRegions.TryGetValue(chrom, out chromRegions))
                    {
                        cursor = 0;
                    }
                }

                if (chromRegions == null)
                {
                    continue; // nothing targeted on this chromosome
                }

                // Step past regions that end before the bin's first base (Start + 1 in 1-based terms).
                while (cursor < chromRegions.Count && chromRegions[cursor].End < bin.Start + 1)
                {
                    cursor++;
                }

                // Emit the bin only when the region under the cursor starts no later than the bin's end.
                if (cursor < chromRegions.Count && chromRegions[cursor].Start <= bin.Stop)
                {
                    yield return bin;
                }
            }
        }
Пример #6
0
        /// <summary>
        /// Computes fragment-based GC normalization correction factor
        /// </summary>
        /// <param name="observedAlignments">Per-chromosome hit arrays; Data[i] is added to the observed tally of position i's GC bin.</param>
        /// <param name="readGCContent">Per-chromosome byte arrays whose value at each position selects the GC bin.</param>
        /// <param name="manifest">When null every position is tallied; otherwise only positions inside manifest regions.</param>
        /// <param name="debugGC">When true, the per-bin tallies and ratios are written to outFile + ".gcstat".</param>
        /// <param name="outFile">Path prefix of the debug output.</param>
        /// <returns>An array of observed vs expected GC counts.</returns>
        static float[] ComputeObservedVsExpectedGC(Dictionary <string, HitArray> observedAlignments,
                                                   Dictionary <string, byte[]> readGCContent, NexteraManifest manifest,
                                                   bool debugGC, string outFile)
        {
            Dictionary <string, List <NexteraManifest.ManifestRegion> > targetsByChrom =
                manifest != null ? manifest.GetManifestRegionsByChromosome() : null;

            long[] expectedByGC = new long[numberOfGCbins];
            long[] observedByGC = new long[numberOfGCbins];

            foreach (KeyValuePair <string, byte[]> entry in readGCContent)
            {
                string chr          = entry.Key;
                byte[] gcByPosition = entry.Value;

                HitArray hits;
                if (!observedAlignments.TryGetValue(chr, out hits))
                {
                    continue;
                }

                if (manifest == null) // look at the entire genome
                {
                    for (int pos = 0; pos < gcByPosition.Length; pos++)
                    {
                        expectedByGC[gcByPosition[pos]]++;
                        observedByGC[gcByPosition[pos]] += hits.Data[pos];
                    }
                    continue;
                }

                // look at only the targeted regions
                List <NexteraManifest.ManifestRegion> targets;
                if (!targetsByChrom.TryGetValue(chr, out targets))
                {
                    continue;
                }

                int cursor = -1; // 0-based position; manifest coordinates are 1-based
                foreach (var region in targets)
                {
                    // Only jump forward: when regions overlap, the cursor is already past
                    // region.Start and the shared positions must not be tallied twice.
                    if (cursor < region.Start)
                    {
                        cursor = region.Start - 1;
                    }
                    while (cursor < gcByPosition.Length && cursor < region.End)
                    {
                        expectedByGC[gcByPosition[cursor]]++;
                        observedByGC[gcByPosition[cursor]] += hits.Data[cursor];
                        cursor++;
                    }
                }
            }

            // Totals are taken before empty bins are padded below.
            long sumObserved = 0;
            long sumExpected = 0;
            for (int gc = 0; gc < numberOfGCbins; gc++)
            {
                sumObserved += observedByGC[gc];
                sumExpected += expectedByGC[gc];
            }

            // calculate ratio of observed to expected read counts for each read GC bin
            float[] observedVsExpectedGC = new float[numberOfGCbins];
            for (int gc = 0; gc < numberOfGCbins; gc++)
            {
                // Empty tallies are replaced by 1 (in place, so the debug dump shows the padded values).
                if (expectedByGC[gc] == 0)
                {
                    expectedByGC[gc] = 1;
                }
                if (observedByGC[gc] == 0)
                {
                    observedByGC[gc] = 1;
                }
                observedVsExpectedGC[gc] = ((float)observedByGC[gc] / (float)expectedByGC[gc]) * ((float)sumExpected / (float)sumObserved);
            }

            if (!debugGC)
            {
                return(observedVsExpectedGC);
            }

            using (GzipWriter writer = new GzipWriter(outFile + ".gcstat"))
            {
                for (int gc = 0; gc < numberOfGCbins; gc++)
                {
                    writer.WriteLine(string.Format("{0}\t{1}\t{2}", expectedByGC[gc], observedByGC[gc], observedVsExpectedGC[gc]));
                }
            }
            return(observedVsExpectedGC);
        }
Пример #7
0
        /// <summary>
        /// Reads per-bin counts from a tab-delimited file opened through GzipReader and records which
        /// bins overlap a region of the manifest.
        /// </summary>
        /// <param name="binnedPath">Path handed to GzipReader; each line carries chromosome, start, stop and count columns.</param>
        /// <param name="manifest">Manifest whose per-chromosome regions decide what counts as on target.</param>
        /// <param name="binCounts">Receives the count column of every bin, in file order.</param>
        /// <param name="onTargetIndices">Receives the file-order indices of the bins that overlap a manifest region.</param>
        private static void LoadBinCounts(string binnedPath, NexteraManifest manifest, out List<double> binCounts,
            out List<int> onTargetIndices)
        {
            binCounts = new List<double>();
            onTargetIndices = new List<int>();

            var manifestRegions = manifest.GetManifestRegionsByChromosome();
            List<NexteraManifest.ManifestRegion> chromRegions = null; // 1-based regions
            string activeChrom = null;
            int cursor = -1; // first region not yet known to end before the current bin

            using (GzipReader reader = new GzipReader(binnedPath))
            {
                int binNumber = 0;
                for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
                {
                    string[] fields = row.Split('\t');
                    string chrom = fields[0];
                    int binStart = int.Parse(fields[1]); // 0-based, inclusive
                    int binStop = int.Parse(fields[2]); // 0-based, exclusive

                    if (activeChrom != chrom)
                    {
                        // New chromosome: switch to its region list (null when the manifest has none)
                        // and rewind the cursor.
                        activeChrom = chrom;
                        if (manifestRegions.TryGetValue(chrom, out chromRegions))
                        {
                            cursor = 0;
                        }
                    }

                    if (chromRegions != null)
                    {
                        // Step past regions that end before the bin's first base (binStart + 1 in 1-based terms).
                        while (cursor < chromRegions.Count && chromRegions[cursor].End < binStart + 1)
                        {
                            cursor++;
                        }

                        // The region under the cursor overlaps when it starts no later than the bin's end.
                        if (cursor < chromRegions.Count && chromRegions[cursor].Start <= binStop)
                        {
                            onTargetIndices.Add(binNumber);
                        }
                    }

                    binCounts.Add(double.Parse(fields[3]));
                    binNumber++;
                }
            }
        }
Пример #8
0
        /// <summary>
        /// Computes fragment-based GC normalization correction factor
        /// </summary>
        /// <param name="observedAlignments">Per-chromosome hit arrays; Data[i] is added to the observed tally of position i's GC bin.</param>
        /// <param name="readGCContent">Per-chromosome byte arrays whose value at each position selects the GC bin.</param>
        /// <param name="manifest">When null every position is tallied; otherwise only positions inside manifest regions.</param>
        /// <param name="debugGC">When true, the per-bin tallies and ratios are written to outFile + ".gcstat".</param>
        /// <param name="outFile">Path prefix of the debug output.</param>
        /// <returns>An array of observed vs expected GC counts.</returns>
        static float[] ComputeObservedVsExpectedGC(Dictionary<string, HitArray> observedAlignments,
            Dictionary<string, byte[]> readGCContent, NexteraManifest manifest,
            bool debugGC, string outFile)
        {
            Dictionary<string, List<NexteraManifest.ManifestRegion>> targetsByChrom =
                manifest != null ? manifest.GetManifestRegionsByChromosome() : null;

            long[] expectedByGC = new long[numberOfGCbins];
            long[] observedByGC = new long[numberOfGCbins];

            foreach (KeyValuePair<string, byte[]> entry in readGCContent)
            {
                string chr = entry.Key;
                byte[] gcByPosition = entry.Value;

                HitArray hits;
                if (!observedAlignments.TryGetValue(chr, out hits))
                {
                    continue;
                }

                if (manifest == null) // look at the entire genome
                {
                    for (int pos = 0; pos < gcByPosition.Length; pos++)
                    {
                        expectedByGC[gcByPosition[pos]]++;
                        observedByGC[gcByPosition[pos]] += hits.Data[pos];
                    }
                    continue;
                }

                // look at only the targeted regions
                List<NexteraManifest.ManifestRegion> targets;
                if (!targetsByChrom.TryGetValue(chr, out targets))
                {
                    continue;
                }

                int cursor = -1; // 0-based position; manifest coordinates are 1-based
                foreach (var region in targets)
                {
                    // Only jump forward: when regions overlap, the cursor is already past
                    // region.Start and the shared positions must not be tallied twice.
                    if (cursor < region.Start)
                    {
                        cursor = region.Start - 1;
                    }
                    while (cursor < gcByPosition.Length && cursor < region.End)
                    {
                        expectedByGC[gcByPosition[cursor]]++;
                        observedByGC[gcByPosition[cursor]] += hits.Data[cursor];
                        cursor++;
                    }
                }
            }

            // Totals are taken before empty bins are padded below.
            long sumObserved = 0;
            long sumExpected = 0;
            for (int gc = 0; gc < numberOfGCbins; gc++)
            {
                sumObserved += observedByGC[gc];
                sumExpected += expectedByGC[gc];
            }

            // calculate ratio of observed to expected read counts for each read GC bin
            float[] observedVsExpectedGC = new float[numberOfGCbins];
            for (int gc = 0; gc < numberOfGCbins; gc++)
            {
                // Empty tallies are replaced by 1 (in place, so the debug dump shows the padded values).
                if (expectedByGC[gc] == 0)
                {
                    expectedByGC[gc] = 1;
                }
                if (observedByGC[gc] == 0)
                {
                    observedByGC[gc] = 1;
                }
                observedVsExpectedGC[gc] = ((float)observedByGC[gc] / (float)expectedByGC[gc]) * ((float)sumExpected / (float)sumObserved);
            }

            if (!debugGC)
            {
                return observedVsExpectedGC;
            }

            using (GzipWriter writer = new GzipWriter(outFile + ".gcstat"))
            {
                for (int gc = 0; gc < numberOfGCbins; gc++)
                {
                    writer.WriteLine(string.Format("{0}\t{1}\t{2}", expectedByGC[gc], observedByGC[gc], observedVsExpectedGC[gc]));
                }
            }
            return observedVsExpectedGC;
        }
Пример #9
0
        /// <summary>
        /// Calculates how many possible alignments corresponds to the desired number of observed alignments per bin.
        /// </summary>
        /// <param name="countsPerBin">Desired number of observed alignments per bin.</param>
        /// <param name="possibleAlignments">BitArrays of possible alignments (unique mers).</param>
        /// <param name="observedAlignments">HitArrays storing the observed alignments.</param>
        /// <param name="manifest">Optional manifest. When supplied, counting is restricted to its regions and
        /// chromosomes without any manifest region are left out.</param>
        /// <returns>
        /// <paramref name="countsPerBin"/> divided by the median observed/possible alignment rate of the
        /// autosomes, truncated to an integer.
        /// </returns>
        static int CalculateNumberOfPossibleAlignmentsPerBin(int countsPerBin, Dictionary<string, BitArray> possibleAlignments,
            Dictionary<string, HitArray> observedAlignments, NexteraManifest manifest = null)
        {
            List<double> rates = new List<double>();

            Dictionary<string, List<NexteraManifest.ManifestRegion>> manifestRegionsByChrom = null;
            if (manifest != null)
            {
                manifestRegionsByChrom = manifest.GetManifestRegionsByChromosome();
            }

            List<ThreadStart> tasks = new List<ThreadStart>();
            foreach (string chr in possibleAlignments.Keys)
            {
                // We don't want to include the sex chromosomes because they may not be copy number 2
                if (!GenomeMetadata.SequenceMetadata.IsAutosome(chr))
                {
                    continue;
                }
                HitArray observed = observedAlignments[chr];
                BitArray possible = possibleAlignments[chr];
                List<NexteraManifest.ManifestRegion> regions = null;
                if (manifestRegionsByChrom != null)
                {
                    if (!manifestRegionsByChrom.ContainsKey(chr)) { continue; }
                    regions = manifestRegionsByChrom[chr];
                }
                // One job per chromosome; each job contributes at most one rate.
                tasks.Add(new ThreadStart(() =>
                {
                    int numberObserved = observed.CountSetBits(regions);
                    int numberPossible = CountSetBits(possible, regions);

                    // Without any possible alignment the rate is undefined: the division below would
                    // produce NaN or Infinity and corrupt the median, so leave this chromosome out.
                    if (numberPossible == 0)
                    {
                        return;
                    }

                    double rate = numberObserved / (double)numberPossible;

                    // The jobs run on several threads; List<T> is not safe for concurrent writes.
                    lock (rates)
                    {
                        rates.Add(rate);
                    }
                }));
            }

            Console.WriteLine("Launch CalculateNumberOfPossibleAlignmentsPerBin jobs...");
            Console.Out.WriteLine();
            //Parallel.ForEach(tasks, t => { t.Invoke(); }); //todo allow controling degree of parallelism
            Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(tasks);
            Console.WriteLine("CalculateNumberOfPossibleAlignmentsPerBin jobs complete.");
            Console.Out.WriteLine();
            double medianRate = CanvasCommon.Utilities.Median(rates);
            return (int)(countsPerBin / medianRate);
        }