示例#1
0
        /// <summary>
        /// Allocates the alignment bookkeeping structures for a single chromosome and flags every
        /// reference position whose base is upper-case (i.e. marked as a unique k-mer start in the fasta).
        /// </summary>
        /// <param name="fastaFile">Fasta file containing uniquemer-marked reference genome.</param>
        /// <param name="chromosome">Chromosome to load from the fasta file; also the key written into each dictionary.</param>
        /// <param name="coverageMode">Coverage mode. A full-length fragment length array is allocated only for
        /// <c>GCContentWeighted</c>; every other mode gets an empty array.</param>
        /// <param name="possibleAlignments">Receives a BitArray with one bit per reference base, set where the base is upper-case.</param>
        /// <param name="observedAlignments">Receives an empty HitArray sized to the chromosome.</param>
        /// <param name="fragmentLengths">Receives the fragment length (Int16) array for the chromosome.</param>
        static void InitializeAlignmentArrays(string fastaFile, string chromosome, CanvasCoverageMode coverageMode, IDictionary <string, BitArray> possibleAlignments, IDictionary <string, HitArray> observedAlignments, IDictionary <string, Int16[]> fragmentLengths)
        {
            string sequence = FastaLoader.LoadFastaSequence(fastaFile, chromosome);
            int sequenceLength = sequence.Length;

            BitArray uniquePositions = new BitArray(sequenceLength);
            possibleAlignments[chromosome] = uniquePositions;
            observedAlignments[chromosome] = new HitArray(sequenceLength);

            // Fragment lengths are only tracked per position when GC-content weighting is requested.
            bool trackFragments = coverageMode == CanvasCoverageMode.GCContentWeighted;
            fragmentLengths[chromosome] = new Int16[trackFragments ? sequenceLength : 0];

            // Unique k-mers are indicated by upper-case letters in the fasta file.
            int index = 0;
            foreach (char nucleotide in sequence)
            {
                if (char.IsUpper(nucleotide))
                {
                    uniquePositions[index] = true;
                }
                index++;
            }
        }
示例#2
0
        /// <summary>
        /// Builds a <see cref="CanvasRunner"/> wired with a tabix-backed, bgzf-compressed B-allele
        /// bedgraph writer and the folder of the assembly that contains <see cref="CanvasRunner"/>.
        /// </summary>
        /// <param name="isSomatic">Forwarded unchanged to the runner.</param>
        /// <param name="coverageMode">Forwarded unchanged to the runner.</param>
        /// <param name="countsPerBin">Forwarded unchanged to the runner.</param>
        /// <param name="customParameters">Forwarded unchanged to the runner.</param>
        /// <returns>The newly constructed runner.</returns>
        public CanvasRunner Create(bool isSomatic, CanvasCoverageMode coverageMode,
                                   int countsPerBin, Dictionary <string, string> customParameters)
        {
            var configSettings = IsasConfigurationSettings.GetConfigSettings();
            var assemblyFolder = Isas.Framework.Utilities.Utilities.GetAssemblyFolder(typeof(CanvasRunner));
            var canvasFolder   = new DirectoryLocation(assemblyFolder);

            // Command execution stack used by the tabix wrapper.
            var processor      = new ExecutableProcessor(configSettings, _logger, canvasFolder);
            var commandManager = new CommandManager(processor);
            var tabixWrapper   = TabixWrapperFactory.GetTabixWrapper(_logger, _workDoer, commandManager);

            // Writer chain, innermost first: facade -> rounding -> bgzf/tabix -> B-allele.
            // NOTE(review): the literal 4 is presumably the number of decimal places kept - confirm.
            var plainWriter           = new BedGraphWriterFacade();
            var roundingWriter        = new RoundingBedGraphWriter(plainWriter, 4);
            var compressedWriter      = new BgzfBedGraphWriter(roundingWriter, tabixWrapper);
            var bAlleleBedGraphWriter = new BAlleleBedGraphWriter(compressedWriter);

            return new CanvasRunner(
                _logger,
                _workDoer,
                _checkpointRunner,
                _runtimeExecutable,
                _runtimeCommandPrefix,
                isSomatic,
                coverageMode,
                countsPerBin,
                bAlleleBedGraphWriter,
                customParameters,
                canvasFolder.FullName);
        }
示例#3
0
 /// <summary>
 /// Creates a runner that locates the Canvas executables in the folder of the assembly
 /// containing <see cref="CanvasRunner"/>.
 /// </summary>
 /// <param name="logger">Logger stored for later use.</param>
 /// <param name="workManager">Work manager stored for later use.</param>
 /// <param name="checkpointRunner">Checkpoint runner stored for later use.</param>
 /// <param name="isSomatic">Stored as-is.</param>
 /// <param name="coverageMode">Stored as-is.</param>
 /// <param name="countsPerBin">Stored as-is.</param>
 /// <param name="customParameters">Optional parameters; when non-null the same dictionary instance
 /// is kept (no copy is made), otherwise the field keeps its existing value.</param>
 public CanvasRunner(ILogger logger, IWorkManager workManager, ICheckpointRunner checkpointRunner, bool isSomatic, CanvasCoverageMode coverageMode,
     int countsPerBin, Dictionary<string, string> customParameters = null)
 {
     _logger = logger;
     _workManager = workManager;
     _checkpointRunner = checkpointRunner;
     _isSomatic = isSomatic;

     string assemblyFolder = Utilities.GetAssemblyFolder(typeof(CanvasRunner));
     _canvasFolder = Path.Combine(assemblyFolder);

     _coverageMode = coverageMode;
     _countsPerBin = countsPerBin;

     if (customParameters == null)
     {
         return;
     }
     _customParameters = customParameters;
 }
示例#4
0
 /// <summary>
 /// Creates a runner that locates the Canvas executables in the folder of the assembly
 /// containing <see cref="CanvasRunner"/>.
 /// </summary>
 /// <param name="logger">Logger stored for later use.</param>
 /// <param name="workManager">Work manager stored for later use.</param>
 /// <param name="checkpointRunner">Asynchronous checkpoint runner stored for later use.</param>
 /// <param name="isSomatic">Stored as-is.</param>
 /// <param name="coverageMode">Stored as-is.</param>
 /// <param name="countsPerBin">Stored as-is.</param>
 /// <param name="customParameters">Optional parameters; when non-null they are copied into a new
 /// dictionary whose keys compare case-insensitively under the invariant culture.</param>
 public CanvasRunner(ILogger logger, IWorkManager workManager, ICheckpointRunnerAsync checkpointRunner, bool isSomatic, CanvasCoverageMode coverageMode,
                     int countsPerBin, Dictionary <string, string> customParameters = null)
 {
     _logger = logger;
     _workManager = workManager;
     _checkpointRunner = checkpointRunner;
     _isSomatic = isSomatic;

     string assemblyFolder = Utilities.GetAssemblyFolder(typeof(CanvasRunner));
     _canvasFolder = Path.Combine(assemblyFolder);

     _coverageMode = coverageMode;
     _countsPerBin = countsPerBin;

     if (customParameters == null)
     {
         return;
     }

     // Copy so later lookups ignore key casing and do not alias the caller's dictionary.
     _customParameters = new Dictionary <string, string>(customParameters, StringComparer.InvariantCultureIgnoreCase);
 }
示例#5
0
        /// <summary>
        /// Checks that GetExecutablePath returns the runtime executable path and appends the
        /// component dll path (followed by a space) to the supplied command line builder.
        /// </summary>
        public void GetExecutablePath_test()
        {
            // Arrange: substitutes for every collaborator, fixed Windows-style paths.
            ILogger logger = Substitute.For <ILogger>();
            IWorkDoer workDoer = Substitute.For <IWorkDoer>();
            ICheckpointRunner checkpointRunner = Substitute.For <ICheckpointRunner>();
            Func <string, ICommandFactory> runtimePrefix = component => Substitute.For <ICommandFactory>();
            FileLocation runtimeExecutable = new FileLocation(@"C:\path\to\dotnet.exe");
            CanvasCoverageMode coverageMode = new CanvasCoverageMode();
            IBAlleleBedGraphWriter bAlleleBedGraphWriter = Substitute.For <IBAlleleBedGraphWriter>();

            CanvasRunner canvasRunner = new CanvasRunner(
                logger,
                workDoer,
                checkpointRunner,
                runtimeExecutable,
                runtimePrefix,
                true,
                coverageMode,
                0,
                bAlleleBedGraphWriter,
                null,
                @"C:\path\to\Canvas\");

            StringBuilder commandLineBuilder = new StringBuilder("something before ");

            // Act
            string fullName = canvasRunner.GetExecutablePath("CanvasBin", commandLineBuilder);

            // Assert
            Assert.Equal(@"C:\path\to\dotnet.exe", fullName);
            Assert.Equal(@"something before C:\path\to\Canvas\CanvasBin\CanvasBin.dll ", commandLineBuilder.ToString());
        }
示例#6
0
        /// <summary>
        /// Deserializes one intermediate CanvasBin file and merges its per-chromosome data into the
        /// shared dictionaries. Intended to be called from several threads at once: deserialization
        /// runs outside the lock, only the merge is serialized on <paramref name="semaphore"/>.
        /// </summary>
        /// <param name="inputFile">inputFile with per-chromosome CanvasBin objects.</param>
        /// <param name="possibleAlignments">Stores which alignments are possible (perfect and unique).</param>
        /// <param name="observedAlignments">Stores observed alignments from a sample.</param>
        /// <param name="fragmentLengths">Stores fragment lengths (Int16); only filled for <c>GCContentWeighted</c>.</param>
        /// <param name="semaphore">Object locked while the shared dictionaries are modified.</param>
        /// <param name="coverageMode">Coverage mode; decides whether fragment lengths are merged.</param>
        public static void DeserializeCanvasData(string inputFile, Dictionary <string, BitArray> possibleAlignments,
                                                 Dictionary <string, HitArray> observedAlignments, Dictionary <string, Int16[]> fragmentLengths,
                                                 Object semaphore, CanvasCoverageMode coverageMode)
        {
            IntermediateData payload = null;

            using (var input = new FileStream(inputFile, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                var timer = new Stopwatch();
                timer.Start();
                payload = ProtoBuf.Serializer.Deserialize <IntermediateData>(input);
                timer.Stop();
                Console.WriteLine("File: {0}", inputFile);
                Console.WriteLine("Time elapsed: {0}", timer.Elapsed);
            }

            Dictionary <string, BitArray> loadedPossible;
            Dictionary <string, HitArray> loadedObserved;
            Dictionary <string, Int16[]>  loadedFragments;
            payload.GetData(out loadedPossible, out loadedObserved, out loadedFragments);

            bool mergeFragments = coverageMode == CanvasCoverageMode.GCContentWeighted;

            lock (semaphore)
            {
                // Add (not the indexer) is used, so a chromosome that is already present throws.
                foreach (var entry in loadedPossible)
                {
                    possibleAlignments.Add(entry.Key, entry.Value);
                }
                foreach (var entry in loadedObserved)
                {
                    observedAlignments.Add(entry.Key, entry.Value);
                }
                if (!mergeFragments)
                {
                    return;
                }
                foreach (var entry in loadedFragments)
                {
                    fragmentLengths.Add(entry.Key, entry.Value);
                }
            }
        }
示例#7
0
        /// <summary>
        /// Loads a serialized per-chromosome CanvasBin file and adds its contents to the shared
        /// dictionaries. Safe to call concurrently from multiple threads provided they all pass the
        /// same <paramref name="semaphore"/>: the file is read without holding the lock and only the
        /// dictionary updates are performed under it.
        /// </summary>
        /// <param name="inputFile">inputFile with per-chromosome CanvasBin objects.</param>
        /// <param name="possibleAlignments">Stores which alignments are possible (perfect and unique).</param>
        /// <param name="observedAlignments">Stores observed alignments from a sample.</param>
        /// <param name="fragmentLengths">Stores fragment lengths (Int16); only filled for <c>GCContentWeighted</c>.</param>
        /// <param name="semaphore">Object locked while the shared dictionaries are modified.</param>
        /// <param name="coverageMode">Coverage mode; decides whether fragment lengths are merged.</param>
        public static void DeserializeCanvasData(string inputFile, Dictionary<string, BitArray> possibleAlignments,
            Dictionary<string, HitArray> observedAlignments, Dictionary<string, Int16[]> fragmentLengths,
            Object semaphore, CanvasCoverageMode coverageMode)
        {
            IntermediateData deserialized = null;
            using (FileStream source = new FileStream(inputFile, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                Stopwatch stopwatch = new Stopwatch();
                stopwatch.Start();
                deserialized = ProtoBuf.Serializer.Deserialize<IntermediateData>(source);
                stopwatch.Stop();
                Console.WriteLine("File: {0}", inputFile);
                Console.WriteLine("Time elapsed: {0}", stopwatch.Elapsed);
            }

            Dictionary<string, BitArray> filePossible;
            Dictionary<string, HitArray> fileObserved;
            Dictionary<string, Int16[]> fileFragments;
            deserialized.GetData(out filePossible, out fileObserved, out fileFragments);

            lock (semaphore)
            {
                // Dictionary.Add throws if a chromosome key already exists in the shared dictionary.
                foreach (string chromosome in filePossible.Keys)
                {
                    possibleAlignments.Add(chromosome, filePossible[chromosome]);
                }
                foreach (string chromosome in fileObserved.Keys)
                {
                    observedAlignments.Add(chromosome, fileObserved[chromosome]);
                }
                if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                {
                    foreach (string chromosome in fileFragments.Keys)
                    {
                        fragmentLengths.Add(chromosome, fileFragments[chromosome]);
                    }
                }
            }
        }
示例#8
0
            /// <summary>
            /// Builds the serializable snapshot of the per-chromosome alignment data.
            /// </summary>
            /// <param name="possibleAlignments">Per-chromosome bit arrays; each is packed into a byte array and the
            /// value <c>Length % 8</c> is recorded alongside it.</param>
            /// <param name="observedAlignments">Per-chromosome hit arrays; the underlying <c>Data</c> is stored by reference.</param>
            /// <param name="fragmentLengths">Per-chromosome fragment lengths; stored (by reference) only for <c>GCContentWeighted</c>.</param>
            /// <param name="coverageMode">Coverage mode controlling whether fragment lengths are kept.</param>
            public IntermediateData(Dictionary <string, BitArray> possibleAlignments, Dictionary <string, HitArray> observedAlignments, Dictionary <string, Int16[]> fragmentLengths, CanvasCoverageMode coverageMode)
            {
                foreach (var entry in possibleAlignments)
                {
                    BitArray bits = entry.Value;
                    int trailingBits = bits.Length % 8;

                    // One extra byte is needed when the bit count is not a multiple of 8.
                    int byteCount = bits.Length / 8;
                    if (trailingBits != 0)
                    {
                        byteCount += 1;
                    }

                    byte[] packed = new byte[byteCount];
                    bits.CopyTo(packed, 0);
                    this.PossibleAlignments[entry.Key] = packed;
                    BitsInLastBytePossibleAlignments[entry.Key] = trailingBits;
                }

                foreach (var entry in observedAlignments)
                {
                    this.ObservedAlignments[entry.Key] = entry.Value.Data;
                }

                if (coverageMode != CanvasCoverageMode.GCContentWeighted)
                {
                    return;
                }

                foreach (var entry in fragmentLengths)
                {
                    this.FragmentLengths[entry.Key] = entry.Value;
                }
            }
示例#9
0
        /// <summary>
        /// Populate the list of GenomicBin objects for this chromosome.
        /// </summary>
        /// <remarks>
        /// Two modes are supported, selected by whether <c>arguments.Bins</c> already has entries:
        /// with predefined bins, each bin's GC and Count are filled in place and the scan jumps from one
        /// bin to the next; otherwise a new bin is emitted each time <c>arguments.BinSize</c> possible
        /// alignment positions have been seen. Positions left over after the last complete bin are not emitted.
        /// The GC percentage of a bin is computed over its non-'n' bases; a bin made only of 'n' reports 0.
        /// </remarks>
        /// <param name="arguments">Per-chromosome inputs (reference bases, possible/observed alignments,
        /// coverage mode, bin size and GC correction tables) and the bin list that is filled or updated.</param>
        static void BinCountsForChromosome(BinTaskArguments arguments)
        {
            List <GenomicBin> bins = arguments.Bins;
            bool               usePredefinedBins  = bins.Any();
            int                predefinedBinIndex = 0;
            GenericRead        fastaEntry         = arguments.FastaEntry;
            BinState           currentBin         = new BinState();
            string             chr = arguments.Chromosome;
            BitArray           possibleAlignments = arguments.PossibleAlignments;
            HitArray           observedAlignments = arguments.ObservedAlignments;
            CanvasCoverageMode coverageMode       = arguments.CoverageMode;
            int                pos = usePredefinedBins ? bins[predefinedBinIndex].Start : 0;

            // Skip past leading Ns. The bounds check keeps an all-'n' sequence from running off the end.
            while (pos < fastaEntry.Bases.Length && fastaEntry.Bases[pos] == 'n')
            {
                pos++;
            }

            // Per-bin scratch lists, index-aligned with each other in GCContentWeighted mode:
            // binObservations holds the hit count of each possible position in the current bin,
            // binPositions holds the observed-vs-expected GC factor looked up for that same position.
            List <float> binPositions    = new List <float>();
            List <int>   binObservations = new List <int>();

            for (; pos < fastaEntry.Bases.Length; pos++)
            {
                // Sets the start of the bin.
                // NOTE(review): relies on BinState using -1 as its "not started" value - confirm in BinState.
                if (currentBin.StartPosition == -1)
                {
                    currentBin.StartPosition = pos;
                }

                // Count only real nucleotides. This must be a char comparison: comparing the char
                // against the string "n" with Equals is always false and would count every base.
                if (fastaEntry.Bases[pos] != 'n')
                {
                    currentBin.NucleotideCount++;
                }

                switch (fastaEntry.Bases[pos])
                {
                case 'C':
                case 'c':
                case 'G':
                case 'g':
                    currentBin.GCCount++;
                    break;
                }

                if (possibleAlignments[pos])
                {
                    currentBin.PossibleCount++;
                    currentBin.ObservedCount += observedAlignments.Data[pos];
                    binObservations.Add(observedAlignments.Data[pos]);
                    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                    {
                        binPositions.Add(arguments.ObservedVsExpectedGC[arguments.ReadGCContent[pos]]);
                    }
                }

                // The bin is complete: either the desired number of possible alignment positions has
                // been seen, or the last position of the current predefined bin has been reached.
                if ((!usePredefinedBins && currentBin.PossibleCount == arguments.BinSize) ||
                    (usePredefinedBins && pos == bins[predefinedBinIndex].Stop - 1))
                {
                    if (coverageMode == CanvasCoverageMode.TruncatedDynamicRange) // Truncated dynamic range
                    {
                        // Recompute the count with each position's contribution capped at 10.
                        currentBin.ObservedCount = 0;
                        foreach (int observation in binObservations)
                        {
                            currentBin.ObservedCount += Math.Min(10, observation);
                        }
                    }
                    if (coverageMode == CanvasCoverageMode.GCContentWeighted) // read GC content weighted
                    {
                        // Recompute the count as the sum of GC-corrected observations, each capped at 10.
                        currentBin.ObservedCount = 0;
                        float tmpObservedCount = 0;
                        for (int i = 0; i < binObservations.Count; i++)
                        {
                            tmpObservedCount += Math.Min(10, (float)binObservations[i] / binPositions[i]);
                        }
                        currentBin.ObservedCount = (int)Math.Round(tmpObservedCount);
                    }

                    // A bin containing only 'n' has no nucleotides; report 0% GC instead of dividing by zero.
                    int gc = currentBin.NucleotideCount == 0
                        ? 0
                        : (int)(100 * currentBin.GCCount / currentBin.NucleotideCount);

                    if (usePredefinedBins)
                    {
                        bins[predefinedBinIndex].GC    = gc;
                        bins[predefinedBinIndex].Count = currentBin.ObservedCount;
                        predefinedBinIndex++;
                        if (predefinedBinIndex >= bins.Count)
                        {
                            break; // we have processed all the bins
                        }
                        pos = bins[predefinedBinIndex].Start - 1; // jump to right before the next predefined bin
                    }
                    else
                    {
                        // Note the pos + 1 to make the first three conform to bed specification
                        GenomicBin bin = new GenomicBin(chr, currentBin.StartPosition, pos + 1, gc, currentBin.ObservedCount);
                        bins.Add(bin);
                    }

                    // Reset all relevant variables
                    currentBin.Reset();
                    binObservations.Clear();
                    binPositions.Clear();
                }
            }
        }
示例#10
0
        /// <summary>
        /// Bin alignments.
        /// </summary>
        /// <remarks>
        /// Steps: (1) load every fasta entry into memory, remembering the file order; (2) in
        /// GCContentWeighted mode only, compute a per-position GC value on one thread per chromosome
        /// and derive the observed-vs-expected GC table; (3) run BinCountsForChromosome on one thread
        /// per chromosome; (4) concatenate the per-chromosome bins in fasta order.
        /// </remarks>
        /// <param name="referenceFile">Reference fasta file.</param>
        /// <param name="binSize">Desired number of alignments per bin.</param>
        /// <param name="coverageMode">Coverage mode; the GC-content steps only run for GCContentWeighted.</param>
        /// <param name="manifest">Passed through to ComputeObservedVsExpectedGC.</param>
        /// <param name="possibleAlignments">BitArrays of possible alignments.</param>
        /// <param name="observedAlignments">HitArrays of observed alignments.</param>
        /// <param name="fragmentLengths">Per-chromosome, per-position fragment lengths; only read in GCContentWeighted mode.</param>
        /// <param name="predefinedBins">Pre-defined bins. null if not available.</param>
        /// <param name="outFile">Passed through to ComputeObservedVsExpectedGC.</param>
        /// <returns>A list of bins.</returns>
        static List <GenomicBin> BinCounts(string referenceFile, int binSize, CanvasCoverageMode coverageMode, NexteraManifest manifest,
                                           Dictionary <string, BitArray> possibleAlignments,
                                           Dictionary <string, HitArray> observedAlignments,
                                           Dictionary <string, Int16[]> fragmentLengths,
                                           Dictionary <string, List <GenomicBin> > predefinedBins,
                                           string outFile)
        {
            bool debugGCCorrection = false; // write value of GC bins and correction factor
            Dictionary <string, GenericRead> fastaEntries = new Dictionary <string, GenericRead>();
            List <string> chromosomes        = new List <string>();
            Int16         meanFragmentSize   = 0;
            // Fragments longer than meanFragmentSize * meanFragmentCutoff are clamped to that length below.
            Int16         meanFragmentCutoff = 3;

            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                meanFragmentSize = MeanFragmentSize(fragmentLengths);
            }

            using (FastaReader reader = new FastaReader(referenceFile))
            {
                GenericRead fastaEntry = new GenericRead();

                // Loop through each chromosome in the reference.
                while (reader.GetNextEntry(ref fastaEntry))
                {
                    chromosomes.Add(fastaEntry.Name);
                    fastaEntries[fastaEntry.Name] = fastaEntry;
                    // Fresh instance so the entry just stored is not overwritten by the next read.
                    fastaEntry = new GenericRead();
                }
            }

            // calculate GC content of the forward read at every position along the genome
            Dictionary <string, byte[]> readGCContent = new Dictionary <string, byte[]>();

            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                // numberOfGCbins is defined outside this method; it is used here as the upper cap of the stored GC value.
                byte gcCap = (byte)numberOfGCbins;
                List <ThreadStart> normalizationTasks = new List <ThreadStart>();
                foreach (KeyValuePair <string, Int16[]> fragmentLengthsKVP in fragmentLengths)
                {
                    // Indexer lookup: throws if fragmentLengths has a chromosome that is absent from the fasta.
                    string      chr        = fragmentLengthsKVP.Key;
                    GenericRead fastaEntry = fastaEntries[chr];

                    normalizationTasks.Add(new ThreadStart(() =>
                    {
                        // contains GC content of the forward read at every position for current chr
                        byte[] gcContent = new byte[fastaEntry.Bases.Length];

                        uint gcCounter = 0;

                        // Iteratively calculate GC content of "reads" using fasta genome reference
                        // The loop stops early enough that the longest allowed fragment still fits inside the sequence;
                        // positions past that bound keep the default value 0.
                        for (int pos = 0; pos < fastaEntry.Bases.Length - meanFragmentSize * meanFragmentCutoff - 1; pos++)
                        {
                            Int16 currentFragment = 0;

                            // No fragment recorded at this position: fall back to the mean fragment size.
                            if (fragmentLengthsKVP.Value[pos] == 0)
                            {
                                currentFragment = meanFragmentSize;
                            }
                            else
                            {
                                currentFragment = Convert.ToInt16(Math.Min(fragmentLengthsKVP.Value[pos], meanFragmentSize * meanFragmentCutoff));
                            }
                            // Count G/C bases in the window [pos, pos + currentFragment).
                            for (int i = pos; i < pos + currentFragment; i++)
                            {
                                switch (fastaEntry.Bases[i])
                                {
                                case 'C':
                                case 'c':
                                case 'G':
                                case 'g':
                                    gcCounter++;
                                    break;

                                default:
                                    break;
                                }
                            }
                            // NOTE(review): integer division - if currentFragment is 0 (e.g. meanFragmentSize is 0)
                            // this would divide by zero; confirm MeanFragmentSize can never return 0 here.
                            gcContent[pos] = (byte)Math.Min(100 * gcCounter / currentFragment, gcCap);
                            gcCounter      = 0;
                        }
                        // The result dictionary is shared by all normalization tasks.
                        lock (readGCContent)
                        {
                            readGCContent[chr] = gcContent;
                        }
                    }));
                }

                Console.WriteLine("{0} Launching normalization tasks.", DateTime.Now);
                Console.Out.Flush();
                Isas.Shared.Utilities.DoWorkParallelThreads(normalizationTasks);
                Console.WriteLine("{0} Normalization tasks complete.", DateTime.Now);
                Console.Out.Flush();
            }

            // populate observed and expected read GC bin vectors
            float[] observedVsExpectedGC = new float[0];
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                observedVsExpectedGC = ComputeObservedVsExpectedGC(observedAlignments, readGCContent, manifest, debugGCCorrection, outFile);
            }

            Dictionary <string, List <GenomicBin> > perChromosomeBins = new Dictionary <string, List <GenomicBin> >();
            List <ThreadStart> binningTasks = new List <ThreadStart>();

            foreach (KeyValuePair <string, GenericRead> fastaEntryKVP in fastaEntries)
            {
                string chr = fastaEntryKVP.Key;
                // Skip chromosomes without alignment data, and (when bins are predefined) chromosomes without bins.
                if (!possibleAlignments.ContainsKey(chr))
                {
                    continue;
                }
                if (predefinedBins != null && !predefinedBins.ContainsKey(chr))
                {
                    continue;
                }

                BinTaskArguments args = new BinTaskArguments();
                args.FastaEntry         = fastaEntryKVP.Value;
                args.Chromosome         = chr;
                args.PossibleAlignments = possibleAlignments[chr];
                args.ObservedAlignments = observedAlignments[chr];
                args.CoverageMode       = coverageMode;
                // With predefined bins the caller's list is reused, so the binning task updates it in place.
                perChromosomeBins[chr]  = predefinedBins == null ? new List <GenomicBin>() : predefinedBins[chr];
                args.Bins    = perChromosomeBins[chr];
                args.BinSize = binSize;
                if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                {
                    // Indexer lookup: throws if this chromosome had no entry in fragmentLengths above.
                    args.ReadGCContent = readGCContent[chr];
                }
                else
                {
                    args.ReadGCContent = null;
                }
                args.ObservedVsExpectedGC = observedVsExpectedGC;
                // args is declared inside the loop body, so each task captures its own instance.
                binningTasks.Add(new ThreadStart(() => { BinCountsForChromosome(args); }));
            }
            Console.WriteLine("{0} Launch BinCountsForChromosome jobs...", DateTime.Now);
            Console.Out.WriteLine();
            //Parallel.ForEach(binningTasks, t => { t.Invoke(); });
            Isas.Shared.Utilities.DoWorkParallelThreads(binningTasks);
            Console.WriteLine("{0} Completed BinCountsForChromosome jobs.", DateTime.Now);
            Console.Out.WriteLine();

            List <GenomicBin> finalBins = new List <GenomicBin>();

            // Concatenate in the order the chromosomes appeared in the fasta file.
            foreach (string chr in chromosomes)
            {
                if (!perChromosomeBins.ContainsKey(chr))
                {
                    continue;
                }
                finalBins.AddRange(perChromosomeBins[chr]);
            }
            return(finalBins);
        }
示例#11
0
        /// <summary>
        /// Reads the alignments of one chromosome from a bam file and records where qualifying
        /// alignments start.
        /// </summary>
        /// <remarks>
        /// A read is kept only if it is mapped, passes QC, is not a duplicate, is on the forward strand,
        /// is the main alignment, starts with a match ('M') CIGAR operation of at least 35 bases and,
        /// for paired-end data, is a proper pair. Reading stops at the first kept-quality read that
        /// belongs to a different reference sequence.
        /// </remarks>
        /// <param name="bamFile">bam file to read alignments from; an index named <c>bamFile + ".bai"</c> must exist.</param>
        /// <param name="isPairedEnd">When true, reads that are not part of a proper pair are skipped.</param>
        /// <param name="chromosome">Reference sequence whose alignments are loaded.</param>
        /// <param name="coverageMode">Binary sets the hit value to 1; other modes call <c>observed.Set</c>.
        /// GCContentWeighted additionally stores the fragment length.</param>
        /// <param name="observed">Hit array for this chromosome, indexed by alignment start position.</param>
        /// <param name="fragmentLengths">Per-position fragment lengths for this chromosome; only written in GCContentWeighted mode.</param>
        /// <exception cref="FileNotFoundException">The bam index file does not exist.</exception>
        /// <exception cref="ApplicationException">The chromosome is not a reference sequence of the bam file.</exception>
        static void LoadObservedAlignmentsBAM(string bamFile, bool isPairedEnd, string chromosome, CanvasCoverageMode coverageMode, HitArray observed, Int16[] fragmentLengths)
        {
            // Minimum length of the leading match operation required to keep a read.
            const int minimumLeadingMatchLength = 35;

            // Sanity check: The .bai file must exist, in order for us to seek to our target chromosome!
            string indexPath = bamFile + ".bai";

            if (!File.Exists(indexPath))
            {
                // FileNotFoundException derives from Exception, so existing catch (Exception) handlers still apply.
                throw new FileNotFoundException(string.Format("Fatal error: Bam index not found at {0}", indexPath), indexPath);
            }

            using (BamReader reader = new BamReader(bamFile))
            {
                int desiredRefIndex = reader.GetReferenceIndex(chromosome);
                if (desiredRefIndex == -1)
                {
                    throw new ApplicationException(
                              string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", chromosome,
                                            bamFile));
                }

                if (!reader.Jump(desiredRefIndex, 0))
                {
                    // Note: This is not necessarily an error, it just means that there *are* no reads for this chromosome in this
                    // .bam file.  That is not uncommon e.g. for truseq amplicon.
                    return;
                }

                int readCount     = 0;
                int keptReadCount = 0;
                // NOTE(review): the header is never used below; the call is kept in case GetHeader has
                // side effects on the reader - confirm and remove if it is a plain getter.
                string       header    = reader.GetHeader();
                BamAlignment alignment = new BamAlignment();
                while (reader.GetNextAlignment(ref alignment, true))
                {
                    readCount++;

                    // Flag check - Require reads to be aligned, passing filter, non-duplicate,
                    // forward-strand main alignments:
                    if (!alignment.IsMapped() ||
                        alignment.IsFailedQC() ||
                        alignment.IsDuplicate() ||
                        alignment.IsReverseStrand() ||
                        !alignment.IsMainAlignment())
                    {
                        continue;
                    }

                    // Require the alignment to start with 35 bases of non-indel:
                    // NOTE(review): assumes a mapped read always has at least one CIGAR operation - confirm.
                    if (alignment.CigarData[0].Type != 'M' || alignment.CigarData[0].Length < minimumLeadingMatchLength)
                    {
                        continue;
                    }

                    if (isPairedEnd && !alignment.IsProperPair())
                    {
                        continue;
                    }

                    // Quit if the current reference index is different from the desired reference index.
                    // desiredRefIndex is never -1 at this point, so this also covers reads with RefID -1.
                    if (alignment.RefID != desiredRefIndex)
                    {
                        break;
                    }

                    keptReadCount++;
                    if (coverageMode == CanvasCoverageMode.Binary)
                    {
                        observed.Data[alignment.Position] = 1;
                    }
                    else
                    {
                        observed.Set(alignment.Position);
                    }
                    // store fragment size, make sure it's within Int16 range and is positive (simplification for now)
                    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                    {
                        fragmentLengths[alignment.Position] = Convert.ToInt16(Math.Max(Math.Min(Int16.MaxValue, alignment.FragmentLength), 0));
                    }
                }
                Console.WriteLine("Kept {0} of {1} total reads", keptReadCount, readCount);
            }
        }
示例#12
0
            /// <summary>
            /// Captures the per-chromosome alignment data in a form suitable for serialization.
            /// </summary>
            /// <param name="possibleAlignments">Per-chromosome bit arrays; each is packed into a byte array and the
            /// value <c>Length % 8</c> is recorded alongside it.</param>
            /// <param name="observedAlignments">Per-chromosome hit arrays; the underlying <c>Data</c> is stored by reference.</param>
            /// <param name="fragmentLengths">Per-chromosome fragment lengths; stored (by reference) only for <c>GCContentWeighted</c>.</param>
            /// <param name="coverageMode">Coverage mode controlling whether fragment lengths are kept.</param>
            public IntermediateData(Dictionary<string, BitArray> possibleAlignments, Dictionary<string, HitArray> observedAlignments, Dictionary<string, Int16[]> fragmentLengths, CanvasCoverageMode coverageMode)
            {
                foreach (string chromosome in possibleAlignments.Keys)
                {
                    BitArray flags = possibleAlignments[chromosome];
                    int leftoverBits = flags.Length % 8;

                    // Whole bytes, plus one more to hold any leftover bits.
                    int wholeBytes = flags.Length / 8;
                    int totalBytes = leftoverBits == 0 ? wholeBytes : wholeBytes + 1;

                    byte[] buffer = new byte[totalBytes];
                    flags.CopyTo(buffer, 0);
                    this.PossibleAlignments[chromosome] = buffer;
                    BitsInLastBytePossibleAlignments[chromosome] = leftoverBits;
                }

                foreach (string chromosome in observedAlignments.Keys)
                {
                    this.ObservedAlignments[chromosome] = observedAlignments[chromosome].Data;
                }

                bool keepFragmentLengths = coverageMode == CanvasCoverageMode.GCContentWeighted;
                if (keepFragmentLengths)
                {
                    foreach (string chromosome in fragmentLengths.Keys)
                    {
                        this.FragmentLengths[chromosome] = fragmentLengths[chromosome];
                    }
                }
            }
示例#13
0
        /// <summary>
        /// Bin alignments. Loads every entry of the reference fasta, optionally computes a per-position
        /// GC content track and an observed-vs-expected GC vector (GCContentWeighted mode only), then
        /// runs BinCountsForChromosome for each chromosome in parallel and concatenates the per-chromosome
        /// bins in reference order.
        /// </summary>
        /// <param name="referenceFile">Reference fasta file.</param>
        /// <param name="binSize">Desired number of alignments per bin.</param>
        /// <param name="coverageMode">Coverage mode; the GC content steps only run for GCContentWeighted.</param>
        /// <param name="manifest">Manifest forwarded to ComputeObservedVsExpectedGC.</param>
        /// <param name="possibleAlignments">BitArrays of possible alignments, keyed by chromosome.</param>
        /// <param name="observedAlignments">HitArrays of observed alignments, keyed by chromosome.</param>
        /// <param name="fragmentLengths">Fragment length per position, keyed by chromosome. Only read in GCContentWeighted mode.</param>
        /// <param name="predefinedBins">Pre-defined bins. null if not available.</param>
        /// <param name="outFile">Output path forwarded to ComputeObservedVsExpectedGC.</param>
        /// <returns>A list of bins.</returns>
        static List<GenomicBin> BinCounts(string referenceFile, int binSize, CanvasCoverageMode coverageMode, NexteraManifest manifest,
            Dictionary<string, BitArray> possibleAlignments,
            Dictionary<string, HitArray> observedAlignments,
            Dictionary<string, Int16[]> fragmentLengths,
            Dictionary<string, List<GenomicBin>> predefinedBins,
            string outFile)
        {
            bool debugGCCorrection = false; // write value of GC bins and correction factor
            bool gcWeighted = coverageMode == CanvasCoverageMode.GCContentWeighted;
            Dictionary<string, GenericRead> fastaEntries = new Dictionary<string, GenericRead>();
            List<string> chromosomes = new List<string>();
            Int16 meanFragmentSize = 0;
            Int16 meanFragmentCutoff = 3;
            if (gcWeighted)
                meanFragmentSize = MeanFragmentSize(fragmentLengths);
            // Fragments longer than this multiple of the mean are truncated when counting GC bases.
            int maxFragmentSize = meanFragmentSize * meanFragmentCutoff;

            using (FastaReader reader = new FastaReader(referenceFile))
            {
                GenericRead fastaEntry = new GenericRead();

                // Loop through each chromosome in the reference.
                while (reader.GetNextEntry(ref fastaEntry))
                {
                    chromosomes.Add(fastaEntry.Name);
                    fastaEntries[fastaEntry.Name] = fastaEntry;
                    // Fresh instance per entry so the stored entries do not alias each other.
                    fastaEntry = new GenericRead();
                }
            }

            // calculate GC content of the forward read at every position along the genome
            Dictionary<string, byte[]> readGCContent = new Dictionary<string, byte[]>();
            if (gcWeighted)
            {
                byte gcCap = (byte)numberOfGCbins;
                List<ThreadStart> normalizationTasks = new List<ThreadStart>();
                foreach (KeyValuePair<string, Int16[]> fragmentLengthsKVP in fragmentLengths)
                {
                    // Copy everything the worker needs into per-iteration locals; capturing the foreach
                    // variable itself is shared across iterations on pre-C# 5 compilers.
                    string chr = fragmentLengthsKVP.Key;
                    Int16[] chrFragmentLengths = fragmentLengthsKVP.Value;
                    GenericRead fastaEntry = fastaEntries[chr];

                    normalizationTasks.Add(new ThreadStart(() =>
                    {
                        // contains GC content of the forward read at every position for current chr
                        byte[] gcContent = new byte[fastaEntry.Bases.Length];

                        // Stop early enough that the longest allowed fragment never runs off the end of the chromosome.
                        int endPosition = fastaEntry.Bases.Length - maxFragmentSize - 1;

                        // Calculate GC content of "reads" using fasta genome reference
                        for (int pos = 0; pos < endPosition; pos++)
                        {
                            Int16 currentFragment;
                            if (chrFragmentLengths[pos] == 0)
                                currentFragment = meanFragmentSize; // no fragment observed here: fall back to the mean
                            else
                                currentFragment = Convert.ToInt16(Math.Min(chrFragmentLengths[pos], maxFragmentSize));

                            // A zero-length fragment (mean fragment size of 0) would divide by zero below;
                            // there are no bases to count, so the position keeps its default GC content of 0.
                            if (currentFragment <= 0)
                                continue;

                            int gcCounter = 0;
                            for (int i = pos; i < pos + currentFragment; i++)
                            {
                                switch (fastaEntry.Bases[i])
                                {
                                    case 'C':
                                    case 'c':
                                    case 'G':
                                    case 'g':
                                        gcCounter++;
                                        break;
                                    default:
                                        break;
                                }
                            }
                            // Integer GC percentage, capped at gcCap.
                            gcContent[pos] = (byte)Math.Min(100 * gcCounter / currentFragment, gcCap);
                        }
                        // readGCContent is shared by all worker threads.
                        lock (readGCContent)
                        {
                            readGCContent[chr] = gcContent;
                        }
                    }));
                }

                Console.WriteLine("{0} Launching normalization tasks.", DateTime.Now);
                Console.Out.Flush();
                Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(normalizationTasks);
                Console.WriteLine("{0} Normalization tasks complete.", DateTime.Now);
                Console.Out.Flush();
            }

            // populate observed and expected read GC bin vectors
            float[] observedVsExpectedGC = new float[0];
            if (gcWeighted)
                observedVsExpectedGC = ComputeObservedVsExpectedGC(observedAlignments, readGCContent, manifest, debugGCCorrection, outFile);

            Dictionary<string, List<GenomicBin>> perChromosomeBins = new Dictionary<string, List<GenomicBin>>();
            List<ThreadStart> binningTasks = new List<ThreadStart>();
            foreach (KeyValuePair<string, GenericRead> fastaEntryKVP in fastaEntries)
            {
                string chr = fastaEntryKVP.Key;

                // Skip chromosomes without possible-alignment data.
                BitArray possible;
                if (!possibleAlignments.TryGetValue(chr, out possible)) continue;

                // With pre-defined bins, skip chromosomes that have none; otherwise start from an empty list.
                List<GenomicBin> bins;
                if (predefinedBins == null)
                    bins = new List<GenomicBin>();
                else if (!predefinedBins.TryGetValue(chr, out bins))
                    continue;
                perChromosomeBins[chr] = bins;

                BinTaskArguments args = new BinTaskArguments();
                args.FastaEntry = fastaEntryKVP.Value;
                args.Chromosome = chr;
                args.PossibleAlignments = possible;
                args.ObservedAlignments = observedAlignments[chr];
                args.CoverageMode = coverageMode;
                args.Bins = bins;
                args.BinSize = binSize;
                args.ReadGCContent = gcWeighted ? readGCContent[chr] : null;
                args.ObservedVsExpectedGC = observedVsExpectedGC;
                binningTasks.Add(new ThreadStart(() => { BinCountsForChromosome(args); }));
            }
            Console.WriteLine("{0} Launch BinCountsForChromosome jobs...", DateTime.Now);
            // NOTE(review): this writes an empty line; the normalization step above calls Flush() here — confirm which is intended.
            Console.Out.WriteLine();
            Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(binningTasks);
            Console.WriteLine("{0} Completed BinCountsForChromosome jobs.", DateTime.Now);
            Console.Out.WriteLine();

            // Concatenate per-chromosome bins in the order the chromosomes appear in the reference.
            List<GenomicBin> finalBins = new List<GenomicBin>();
            foreach (string chr in chromosomes)
            {
                List<GenomicBin> bins;
                if (perChromosomeBins.TryGetValue(chr, out bins))
                    finalBins.AddRange(bins);
            }
            return finalBins;
        }
示例#14
0
        /// <summary>
        /// Reads the alignments of one chromosome from an indexed bam file and marks the start position
        /// of every kept alignment in <paramref name="observed"/>.
        /// </summary>
        /// <param name="bamFile">Bam file to read alignments from. A ".bai" index must exist at bamFile + ".bai".</param>
        /// <param name="isPairedEnd">When true, reads that are not part of a proper pair are skipped.</param>
        /// <param name="chromosome">Name of the reference sequence whose alignments are loaded.</param>
        /// <param name="coverageMode">Binary mode writes 1 into observed.Data; other modes call observed.Set. GCContentWeighted additionally records fragment lengths.</param>
        /// <param name="observed">Hit array for this chromosome, indexed by alignment start position.</param>
        /// <param name="fragmentLengths">Receives the fragment length, clamped to [0, Int16.MaxValue], at each kept alignment's position. Only written in GCContentWeighted mode.</param>
        static void LoadObservedAlignmentsBAM(string bamFile, bool isPairedEnd, string chromosome, CanvasCoverageMode coverageMode, HitArray observed, Int16[] fragmentLengths)
        {
            // Sanity check: The .bai file must exist, in order for us to seek to our target chromosome!
            string indexPath = bamFile + ".bai";
            if (!File.Exists(indexPath))
            {
                throw new Exception(string.Format("Fatal error: Bam index not found at {0}", indexPath));
            }

            using (BamReader reader = new BamReader(bamFile))
            {
                int desiredRefIndex = reader.GetReferenceIndex(chromosome);
                if (desiredRefIndex == -1)
                {
                    throw new ApplicationException(
                        string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", chromosome,
                        bamFile));
                }
                bool result = reader.Jump(desiredRefIndex, 0);
                if (!result)
                {
                    // Note: This is not necessarily an error, it just means that there *are* no reads for this chromosome in this 
                    // .bam file.  That is not uncommon e.g. for truseq amplicon.
                    return;
                }
                int readCount = 0;
                int keptReadCount = 0;
                // NOTE(review): header is never used; kept in case GetHeader() affects reader state — confirm and remove.
                string header = reader.GetHeader();
                BamAlignment alignment = new BamAlignment();
                while (reader.GetNextAlignment(ref alignment, true))
                {
                    // Quit as soon as the reader leaves the desired reference. This is checked before the
                    // read filters so that filtered reads belonging to later references (or trailing
                    // unplaced reads) are neither scanned nor counted. The bam is indexed, so it is
                    // expected to be coordinate-sorted and the desired reference cannot reappear.
                    if (alignment.RefID != desiredRefIndex)
                        break;

                    readCount++;

                    // Flag check - Require reads to be aligned, passing filter, non-duplicate,
                    // forward-strand, main alignments:
                    if (!alignment.IsMapped()) continue;
                    if (alignment.IsFailedQC()) continue;
                    if (alignment.IsDuplicate()) continue;
                    if (alignment.IsReverseStrand()) continue;
                    if (!alignment.IsMainAlignment()) continue;

                    // Require the alignment to start with 35 bases of non-indel:
                    if (alignment.CigarData[0].Type != 'M' || alignment.CigarData[0].Length < 35) continue;

                    if (isPairedEnd && !alignment.IsProperPair()) continue;

                    keptReadCount++;
                    if (coverageMode == CanvasCoverageMode.Binary)
                    {
                        observed.Data[alignment.Position] = 1;
                    }
                    else
                    {
                        observed.Set(alignment.Position);
                    }
                    // store fragment size, make sure it's within Int16 range and is positive (simplification for now)
                    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                        fragmentLengths[alignment.Position] = Convert.ToInt16(Math.Max(Math.Min(Int16.MaxValue, alignment.FragmentLength), 0));
                }
                Console.WriteLine("Kept {0} of {1} total reads", keptReadCount, readCount);
            }
        }
示例#15
0
        /// <summary>
        /// Allocates the per-chromosome arrays for one chromosome and stores them in the supplied dictionaries:
        /// a BitArray of possible alignments, a HitArray of observed alignments and an Int16 fragment length array.
        /// The possible-alignment bit is set at every position whose reference base is upper case.
        /// </summary>
        /// <param name="fastaFile">Fasta file containing uniquemer-marked reference genome.</param>
        /// <param name="chromosome">Chromosome to load from the fasta file; also the key used in all three dictionaries.</param>
        /// <param name="coverageMode">For GCContentWeighted the fragment length array has one entry per base; otherwise it is empty.</param>
        /// <param name="possibleAlignments">Receives the BitArray of possible (perfect and unique) alignments.</param>
        /// <param name="observedAlignments">Receives an empty HitArray for observed alignments from a sample.</param>
        /// <param name="fragmentLengths">Receives the fragment length array (Int16).</param>
        static void InitializeAlignmentArrays(string fastaFile, string chromosome, CanvasCoverageMode coverageMode, IDictionary<string, BitArray> possibleAlignments, IDictionary<string, HitArray> observedAlignments, IDictionary<string, Int16[]> fragmentLengths)
        {
            string referenceBases = FastaLoader.LoadFastaSequence(fastaFile, chromosome);
            int baseCount = referenceBases.Length;

            BitArray uniqueMask = new BitArray(baseCount);
            possibleAlignments[chromosome] = uniqueMask;
            observedAlignments[chromosome] = new HitArray(baseCount);

            // Fragment lengths are only tracked per base in GC-content-weighted mode.
            bool trackFragments = coverageMode == CanvasCoverageMode.GCContentWeighted;
            fragmentLengths[chromosome] = new Int16[trackFragments ? baseCount : 0];

            // Unique k-mers are indicated by upper-case letters in the fasta file.
            int position = 0;
            foreach (char nucleotide in referenceBases)
            {
                if (char.IsUpper(nucleotide))
                {
                    uniqueMask.Set(position, true);
                }
                position++;
            }
        }