/// <summary>
/// Loads one chromosome from a uniquemer-marked fasta file and registers freshly allocated
/// per-position arrays for it in the three supplied dictionaries, keyed by chromosome name.
/// </summary>
/// <param name="fastaFile">Fasta file containing uniquemer-marked reference genome.</param>
/// <param name="chromosome">Chromosome to load; also the key written into each dictionary.</param>
/// <param name="coverageMode">Only GCContentWeighted gets a full-length fragment length array; every other mode gets an empty one.</param>
/// <param name="possibleAlignments">Receives a BitArray with one bit per base, set where the base is upper-case.</param>
/// <param name="observedAlignments">Receives a new HitArray constructed with the sequence length.</param>
/// <param name="fragmentLengths">Receives the fragment length array (Int16).</param>
static void InitializeAlignmentArrays(string fastaFile, string chromosome, CanvasCoverageMode coverageMode,
    IDictionary<string, BitArray> possibleAlignments, IDictionary<string, HitArray> observedAlignments,
    IDictionary<string, Int16[]> fragmentLengths)
{
    string sequence = FastaLoader.LoadFastaSequence(fastaFile, chromosome);
    int sequenceLength = sequence.Length;

    BitArray uniquePositions = new BitArray(sequenceLength);
    possibleAlignments[chromosome] = uniquePositions;
    observedAlignments[chromosome] = new HitArray(sequenceLength);

    // Fragment lengths are only tracked per position in GC-content-weighted mode.
    int fragmentSlots = coverageMode == CanvasCoverageMode.GCContentWeighted ? sequenceLength : 0;
    fragmentLengths[chromosome] = new Int16[fragmentSlots];

    // Upper-case letters in the fasta mark the positions where a unique k-mer starts.
    int index = 0;
    foreach (char nucleotide in sequence)
    {
        if (char.IsUpper(nucleotide))
        {
            uniquePositions[index] = true;
        }
        index++;
    }
}
/// <summary>
/// Builds a <see cref="CanvasRunner"/> that uses this factory's logger, work doer, checkpoint runner
/// and runtime settings, rooted at the folder of the assembly that contains CanvasRunner.
/// </summary>
/// <param name="isSomatic">Passed through to the runner.</param>
/// <param name="coverageMode">Passed through to the runner.</param>
/// <param name="countsPerBin">Passed through to the runner.</param>
/// <param name="customParameters">Passed through to the runner.</param>
/// <returns>The newly constructed runner.</returns>
public CanvasRunner Create(bool isSomatic, CanvasCoverageMode coverageMode, int countsPerBin, Dictionary<string, string> customParameters)
{
    var settings = IsasConfigurationSettings.GetConfigSettings();
    var canvasFolder = new DirectoryLocation(Isas.Framework.Utilities.Utilities.GetAssemblyFolder(typeof(CanvasRunner)));

    // Command execution plumbing needed by the tabix wrapper.
    var executableProcessor = new ExecutableProcessor(settings, _logger, canvasFolder);
    var commandManager = new CommandManager(executableProcessor);
    var tabixWrapper = TabixWrapperFactory.GetTabixWrapper(_logger, _workDoer, commandManager);

    // Bedgraph writer chain, innermost first: facade -> rounding (constructed with 4) -> bgzf (with tabix wrapper) -> b-allele.
    var plainWriter = new BedGraphWriterFacade();
    var roundingWriter = new RoundingBedGraphWriter(plainWriter, 4);
    var compressedWriter = new BgzfBedGraphWriter(roundingWriter, tabixWrapper);
    var bAlleleBedGraphWriter = new BAlleleBedGraphWriter(compressedWriter);

    return new CanvasRunner(
        _logger,
        _workDoer,
        _checkpointRunner,
        _runtimeExecutable,
        _runtimeCommandPrefix,
        isSomatic,
        coverageMode,
        countsPerBin,
        bAlleleBedGraphWriter,
        customParameters,
        canvasFolder.FullName);
}
/// <summary>
/// Creates a runner and records its collaborators and binning settings.
/// The Canvas folder is taken from the location of the assembly that contains CanvasRunner.
/// </summary>
/// <param name="logger">Logger stored for later use.</param>
/// <param name="workManager">Work manager stored for later use.</param>
/// <param name="checkpointRunner">Checkpoint runner stored for later use.</param>
/// <param name="isSomatic">Stored as-is.</param>
/// <param name="coverageMode">Stored as-is.</param>
/// <param name="countsPerBin">Stored as-is.</param>
/// <param name="customParameters">
/// Optional. When non-null, this exact dictionary instance (not a copy) replaces the field's existing value;
/// when null, the field is left as it was.
/// </param>
public CanvasRunner(ILogger logger, IWorkManager workManager, ICheckpointRunner checkpointRunner, bool isSomatic, CanvasCoverageMode coverageMode, int countsPerBin, Dictionary<string, string> customParameters = null)
{
    // Collaborators.
    _logger = logger;
    _workManager = workManager;
    _checkpointRunner = checkpointRunner;

    // Run configuration.
    _isSomatic = isSomatic;
    string assemblyFolder = Utilities.GetAssemblyFolder(typeof(CanvasRunner));
    _canvasFolder = Path.Combine(assemblyFolder);
    _coverageMode = coverageMode;
    _countsPerBin = countsPerBin;

    // Keep the current field value when the caller supplies nothing.
    _customParameters = customParameters ?? _customParameters;
}
/// <summary>
/// Creates a runner and records its collaborators and binning settings.
/// The Canvas folder is taken from the location of the assembly that contains CanvasRunner.
/// </summary>
/// <param name="logger">Logger stored for later use.</param>
/// <param name="workManager">Work manager stored for later use.</param>
/// <param name="checkpointRunner">Asynchronous checkpoint runner stored for later use.</param>
/// <param name="isSomatic">Stored as-is.</param>
/// <param name="coverageMode">Stored as-is.</param>
/// <param name="countsPerBin">Stored as-is.</param>
/// <param name="customParameters">
/// Optional. When non-null, its entries are copied into a new dictionary whose keys compare
/// case-insensitively under the invariant culture; when null, the field is left as it was.
/// </param>
public CanvasRunner(ILogger logger, IWorkManager workManager, ICheckpointRunnerAsync checkpointRunner, bool isSomatic, CanvasCoverageMode coverageMode, int countsPerBin, Dictionary<string, string> customParameters = null)
{
    // Collaborators.
    _logger = logger;
    _workManager = workManager;
    _checkpointRunner = checkpointRunner;

    // Run configuration.
    _isSomatic = isSomatic;
    string assemblyFolder = Utilities.GetAssemblyFolder(typeof(CanvasRunner));
    _canvasFolder = Path.Combine(assemblyFolder);
    _coverageMode = coverageMode;
    _countsPerBin = countsPerBin;

    if (customParameters == null)
    {
        return;
    }

    // Copy so that key lookups ignore case regardless of the caller's comparer.
    _customParameters = new Dictionary<string, string>(customParameters, StringComparer.InvariantCultureIgnoreCase);
}
/// <summary>
/// Checks that GetExecutablePath returns the runtime executable path and appends the path of the
/// component's dll (under the Canvas folder) plus a trailing space to the command line builder.
/// </summary>
public void GetExecutablePath_test()
{
    // Arrange
    const string dotnetPath = @"C:\path\to\dotnet.exe";
    const string canvasFolder = @"C:\path\to\Canvas\";
    const string canvasExecutableStub = "CanvasBin";
    bool isSomatic = true;
    int countsPerBin = 0;
    Func<string, ICommandFactory> runtimePrefix = component => Substitute.For<ICommandFactory>();

    var canvasRunner = new CanvasRunner(
        Substitute.For<ILogger>(),
        Substitute.For<IWorkDoer>(),
        Substitute.For<ICheckpointRunner>(),
        new FileLocation(dotnetPath),
        runtimePrefix,
        isSomatic,
        new CanvasCoverageMode(),
        countsPerBin,
        Substitute.For<IBAlleleBedGraphWriter>(),
        null,
        canvasFolder);
    var commandLineBuilder = new StringBuilder("something before ");

    // Act
    string fullName = canvasRunner.GetExecutablePath(canvasExecutableStub, commandLineBuilder);

    // Assert
    Assert.Equal(dotnetPath, fullName);
    Assert.Equal(@"something before C:\path\to\Canvas\CanvasBin\CanvasBin.dll ", commandLineBuilder.ToString());
}
/// <summary>
/// Reads one protobuf-serialized IntermediateData file and merges its per-chromosome arrays into the
/// shared dictionaries. The merge happens while holding <paramref name="semaphore"/>, so several
/// callers can load different files concurrently into the same dictionaries.
/// </summary>
/// <param name="inputFile">Path of the serialized intermediate data file.</param>
/// <param name="possibleAlignments">Shared dictionary receiving the possible-alignment BitArrays.</param>
/// <param name="observedAlignments">Shared dictionary receiving the observed-alignment HitArrays.</param>
/// <param name="fragmentLengths">Shared dictionary receiving the Int16 fragment length arrays (GCContentWeighted mode only).</param>
/// <param name="semaphore">Object locked while the shared dictionaries are modified.</param>
/// <param name="coverageMode">Fragment lengths are merged only when this is GCContentWeighted.</param>
/// <exception cref="ArgumentException">A chromosome key from the file is already present in a target dictionary.</exception>
public static void DeserializeCanvasData(string inputFile, Dictionary<string, BitArray> possibleAlignments,
    Dictionary<string, HitArray> observedAlignments, Dictionary<string, Int16[]> fragmentLengths,
    Object semaphore, CanvasCoverageMode coverageMode)
{
    IntermediateData data;
    using (FileStream stream = new FileStream(inputFile, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        // Time only the deserialization itself and report it on the console.
        Stopwatch timer = Stopwatch.StartNew();
        data = ProtoBuf.Serializer.Deserialize<IntermediateData>(stream);
        timer.Stop();
        Console.WriteLine("File: {0}", inputFile);
        Console.WriteLine("Time elapsed: {0}", timer.Elapsed);
    }

    Dictionary<string, BitArray> loadedPossible;
    Dictionary<string, HitArray> loadedObserved;
    Dictionary<string, Int16[]> loadedFragmentLengths;
    data.GetData(out loadedPossible, out loadedObserved, out loadedFragmentLengths);

    lock (semaphore)
    {
        foreach (var entry in loadedPossible)
        {
            possibleAlignments.Add(entry.Key, entry.Value);
        }

        foreach (var entry in loadedObserved)
        {
            observedAlignments.Add(entry.Key, entry.Value);
        }

        if (coverageMode != CanvasCoverageMode.GCContentWeighted)
        {
            return;
        }

        foreach (var entry in loadedFragmentLengths)
        {
            fragmentLengths.Add(entry.Key, entry.Value);
        }
    }
}
/// <summary>
/// Deserializes an IntermediateData object from a protobuf file and adds its per-chromosome contents
/// to the caller's dictionaries. Additions are made under a lock on <paramref name="semaphore"/>.
/// </summary>
/// <param name="inputFile">File holding the serialized per-chromosome data.</param>
/// <param name="possibleAlignments">Target for the possible-alignment BitArrays.</param>
/// <param name="observedAlignments">Target for the observed-alignment HitArrays.</param>
/// <param name="fragmentLengths">Target for the Int16 fragment length arrays; filled only in GCContentWeighted mode.</param>
/// <param name="semaphore">Lock object guarding the three target dictionaries.</param>
/// <param name="coverageMode">Coverage mode; decides whether fragment lengths are copied.</param>
/// <exception cref="ArgumentException">A chromosome key from the file already exists in a target dictionary.</exception>
public static void DeserializeCanvasData(string inputFile, Dictionary<string, BitArray> possibleAlignments,
    Dictionary<string, HitArray> observedAlignments, Dictionary<string, Int16[]> fragmentLengths,
    Object semaphore, CanvasCoverageMode coverageMode)
{
    IntermediateData deserialized = null;
    using (FileStream input = new FileStream(inputFile, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        Stopwatch stopwatch = new Stopwatch();
        stopwatch.Start();
        deserialized = ProtoBuf.Serializer.Deserialize<IntermediateData>(input);
        stopwatch.Stop();

        // Progress report: which file was read and how long deserialization took.
        Console.WriteLine("File: {0}", inputFile);
        Console.WriteLine("Time elapsed: {0}", stopwatch.Elapsed);
    }

    Dictionary<string, BitArray> filePossible;
    Dictionary<string, HitArray> fileObserved;
    Dictionary<string, Int16[]> fileFragmentLengths;
    deserialized.GetData(out filePossible, out fileObserved, out fileFragmentLengths);

    bool copyFragmentLengths = coverageMode == CanvasCoverageMode.GCContentWeighted;

    lock (semaphore)
    {
        foreach (string chromosome in filePossible.Keys)
        {
            possibleAlignments.Add(chromosome, filePossible[chromosome]);
        }

        foreach (string chromosome in fileObserved.Keys)
        {
            observedAlignments.Add(chromosome, fileObserved[chromosome]);
        }

        if (copyFragmentLengths)
        {
            foreach (string chromosome in fileFragmentLengths.Keys)
            {
                fragmentLengths.Add(chromosome, fileFragmentLengths[chromosome]);
            }
        }
    }
}
/// <summary>
/// Captures the per-chromosome alignment data in the array-based members of this object.
/// </summary>
/// <param name="possibleAlignments">
/// BitArrays packed into byte arrays; the value of (bit count % 8) is recorded per chromosome
/// (0 when the bit count is a multiple of 8).
/// </param>
/// <param name="observedAlignments">HitArrays whose Data member is stored by reference.</param>
/// <param name="fragmentLengths">Int16 arrays stored by reference, only when the mode is GCContentWeighted.</param>
/// <param name="coverageMode">Decides whether fragment lengths are stored.</param>
public IntermediateData(Dictionary<string, BitArray> possibleAlignments, Dictionary<string, HitArray> observedAlignments,
    Dictionary<string, Int16[]> fragmentLengths, CanvasCoverageMode coverageMode)
{
    foreach (var entry in possibleAlignments)
    {
        BitArray bits = entry.Value;
        int remainder = bits.Length % 8;

        // Round the byte count up when the bits do not fill the last byte.
        int byteCount = bits.Length / 8;
        if (remainder != 0)
        {
            byteCount++;
        }

        byte[] packed = new byte[byteCount];
        bits.CopyTo(packed, 0);
        this.PossibleAlignments[entry.Key] = packed;
        BitsInLastBytePossibleAlignments[entry.Key] = remainder;
    }

    foreach (var entry in observedAlignments)
    {
        this.ObservedAlignments[entry.Key] = entry.Value.Data;
    }

    if (coverageMode != CanvasCoverageMode.GCContentWeighted)
    {
        return;
    }

    foreach (var entry in fragmentLengths)
    {
        this.FragmentLengths[entry.Key] = entry.Value;
    }
}
/// <summary>
/// Populate the list of GenomicBin objects for this chromosome.
/// </summary>
/// <remarks>
/// Two modes, selected by whether <c>arguments.Bins</c> already contains entries:
/// <list type="bullet">
/// <item>Empty list: bins are created on the fly; a bin is closed as soon as it contains
/// <c>arguments.BinSize</c> possible alignment positions, and is appended to the list.</item>
/// <item>Non-empty list: the existing (predefined) bins are walked in order and their GC and Count
/// members are overwritten; a bin is closed at position <c>Stop - 1</c>.</item>
/// </list>
/// The observed count of a bin depends on the coverage mode: TruncatedDynamicRange caps every
/// position at 10 hits; GCContentWeighted divides every position by its observed-vs-expected GC
/// factor, caps the result at 10 and rounds the sum; all other modes sum the raw hits.
/// </remarks>
/// <param name="arguments">Sequence, alignment arrays, bin list and settings for one chromosome.</param>
static void BinCountsForChromosome(BinTaskArguments arguments)
{
    List<GenomicBin> bins = arguments.Bins;
    bool usePredefinedBins = bins.Any();
    int predefinedBinIndex = 0;
    GenericRead fastaEntry = arguments.FastaEntry;
    BinState currentBin = new BinState();
    string chr = arguments.Chromosome;
    BitArray possibleAlignments = arguments.PossibleAlignments;
    HitArray observedAlignments = arguments.ObservedAlignments;
    CanvasCoverageMode coverageMode = arguments.CoverageMode;

    int pos = usePredefinedBins ? bins[predefinedBinIndex].Start : 0;

    // Skip past leading Ns. The bounds check keeps a sequence made only of Ns from indexing past its end.
    while (pos < fastaEntry.Bases.Length && fastaEntry.Bases[pos] == 'n')
    {
        pos++;
    }

    // Per-position values of the bin currently being filled (possible alignment positions only).
    List<float> binPositions = new List<float>();   // observed-vs-expected GC factor (GCContentWeighted only)
    List<int> binObservations = new List<int>();    // raw hit count

    for (; pos < fastaEntry.Bases.Length; pos++)
    {
        // Sets the start of the bin
        if (currentBin.StartPosition == -1)
        {
            currentBin.StartPosition = pos;
        }

        // Count only real nucleotides. This must be a char-to-char comparison: comparing the char
        // against the string "n" with Equals is always false, which made every N count as a
        // nucleotide and diluted the GC percentage of bins that contain Ns.
        if (fastaEntry.Bases[pos] != 'n')
        {
            currentBin.NucleotideCount++;
        }

        switch (fastaEntry.Bases[pos])
        {
            case 'C':
            case 'c':
            case 'G':
            case 'g':
                currentBin.GCCount++;
                break;
        }

        if (possibleAlignments[pos])
        {
            currentBin.PossibleCount++;
            currentBin.ObservedCount += observedAlignments.Data[pos];
            binObservations.Add(observedAlignments.Data[pos]);
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                binPositions.Add(arguments.ObservedVsExpectedGC[arguments.ReadGCContent[pos]]);
            }
        }

        // A dynamic bin is complete once it has the desired number of possible alignment positions;
        // a predefined bin is complete at its last position.
        bool binComplete = usePredefinedBins
            ? pos == bins[predefinedBinIndex].Stop - 1
            : currentBin.PossibleCount == arguments.BinSize;
        if (!binComplete)
        {
            continue;
        }

        if (coverageMode == CanvasCoverageMode.TruncatedDynamicRange)
        {
            // Truncated dynamic range: no single position contributes more than 10 hits.
            currentBin.ObservedCount = 0;
            foreach (int observation in binObservations)
            {
                currentBin.ObservedCount += Math.Min(10, observation);
            }
        }

        if (coverageMode == CanvasCoverageMode.GCContentWeighted)
        {
            // Read GC content weighted: scale each position by its GC factor, cap at 10, round the total.
            float weightedCount = 0;
            for (int i = 0; i < binObservations.Count; i++)
            {
                weightedCount += Math.Min(10, (float)binObservations[i] / binPositions[i]);
            }
            currentBin.ObservedCount = (int)Math.Round(weightedCount);
        }

        // A bin consisting only of Ns has no nucleotides; report 0% GC instead of dividing by zero.
        int gc = currentBin.NucleotideCount > 0
            ? (int)(100 * currentBin.GCCount / currentBin.NucleotideCount)
            : 0;

        if (usePredefinedBins)
        {
            bins[predefinedBinIndex].GC = gc;
            bins[predefinedBinIndex].Count = currentBin.ObservedCount;
            predefinedBinIndex++;
            if (predefinedBinIndex >= bins.Count)
            {
                break; // we have processed all the bins
            }
            pos = bins[predefinedBinIndex].Start - 1; // jump to right before the next predefined bin
        }
        else
        {
            // Note the pos + 1 to make the first three conform to bed specification
            GenomicBin bin = new GenomicBin(chr, currentBin.StartPosition, pos + 1, gc, currentBin.ObservedCount);
            bins.Add(bin);
        }

        // Reset all relevant variables
        currentBin.Reset();
        binObservations.Clear();
        binPositions.Clear();
    }
}
/// <summary>
/// Bins the alignments of every chromosome that appears both in the reference fasta and in
/// <paramref name="possibleAlignments"/> (and in <paramref name="predefinedBins"/>, when given),
/// processing chromosomes in parallel and returning the bins in reference order.
/// </summary>
/// <param name="referenceFile">Reference fasta file.</param>
/// <param name="binSize">Desired number of possible alignment positions per bin.</param>
/// <param name="coverageMode">Coverage mode; GCContentWeighted enables the read GC content computation.</param>
/// <param name="manifest">Passed through to ComputeObservedVsExpectedGC.</param>
/// <param name="possibleAlignments">Per-chromosome BitArrays of possible alignments.</param>
/// <param name="observedAlignments">Per-chromosome HitArrays of observed alignments.</param>
/// <param name="fragmentLengths">Per-chromosome fragment lengths; read only in GCContentWeighted mode.</param>
/// <param name="predefinedBins">Pre-defined bins per chromosome. null if not available.</param>
/// <param name="outFile">Passed through to ComputeObservedVsExpectedGC.</param>
/// <returns>A list of bins, chromosome by chromosome in the order of the reference file.</returns>
static List<GenomicBin> BinCounts(string referenceFile, int binSize, CanvasCoverageMode coverageMode, NexteraManifest manifest,
    Dictionary<string, BitArray> possibleAlignments, Dictionary<string, HitArray> observedAlignments,
    Dictionary<string, Int16[]> fragmentLengths, Dictionary<string, List<GenomicBin>> predefinedBins, string outFile)
{
    bool debugGCCorrection = false; // write value of GC bins and correction factor
    bool weightByReadGC = coverageMode == CanvasCoverageMode.GCContentWeighted;
    var fastaEntries = new Dictionary<string, GenericRead>();
    var chromosomes = new List<string>(); // chromosome names in reference order

    Int16 meanFragmentCutoff = 3;
    Int16 meanFragmentSize = 0;
    if (weightByReadGC)
    {
        meanFragmentSize = MeanFragmentSize(fragmentLengths);
    }

    // Load every chromosome of the reference; a fresh GenericRead is handed to the reader for each entry.
    using (FastaReader reader = new FastaReader(referenceFile))
    {
        for (GenericRead entry = new GenericRead(); reader.GetNextEntry(ref entry); entry = new GenericRead())
        {
            chromosomes.Add(entry.Name);
            fastaEntries[entry.Name] = entry;
        }
    }

    // GC content of the forward read at every position along the genome, one task per chromosome.
    var readGCContent = new Dictionary<string, byte[]>();
    if (weightByReadGC)
    {
        byte gcCap = (byte)numberOfGCbins;
        var normalizationTasks = new List<ThreadStart>();
        foreach (KeyValuePair<string, Int16[]> fragmentLengthsKVP in fragmentLengths)
        {
            string chr = fragmentLengthsKVP.Key;
            Int16[] chrFragmentLengths = fragmentLengthsKVP.Value;
            GenericRead fastaEntry = fastaEntries[chr];
            normalizationTasks.Add(() =>
            {
                var bases = fastaEntry.Bases;
                byte[] gcContent = new byte[bases.Length];

                // Fragments are capped at (mean * cutoff); stop early enough that such a fragment fits in the sequence.
                int maxFragment = meanFragmentSize * meanFragmentCutoff;
                int scanEnd = bases.Length - maxFragment - 1;
                for (int pos = 0; pos < scanEnd; pos++)
                {
                    // No fragment recorded at this position: fall back to the mean fragment size.
                    Int16 recordedLength = chrFragmentLengths[pos];
                    Int16 currentFragment = recordedLength == 0
                        ? meanFragmentSize
                        : Convert.ToInt16(Math.Min(recordedLength, maxFragment));

                    uint gcCounter = 0;
                    int fragmentEnd = pos + currentFragment;
                    for (int i = pos; i < fragmentEnd; i++)
                    {
                        char nucleotide = bases[i];
                        if (nucleotide == 'C' || nucleotide == 'c' || nucleotide == 'G' || nucleotide == 'g')
                        {
                            gcCounter++;
                        }
                    }

                    // GC percentage of the fragment, limited to the number of GC bins.
                    gcContent[pos] = (byte)Math.Min(100 * gcCounter / currentFragment, gcCap);
                }

                lock (readGCContent)
                {
                    readGCContent[chr] = gcContent;
                }
            });
        }

        Console.WriteLine("{0} Launching normalization tasks.", DateTime.Now);
        Console.Out.Flush();
        Isas.Shared.Utilities.DoWorkParallelThreads(normalizationTasks);
        Console.WriteLine("{0} Normalization tasks complete.", DateTime.Now);
        Console.Out.Flush();
    }

    // Observed vs expected read GC bin vector; empty unless GC weighting is on.
    float[] observedVsExpectedGC = weightByReadGC
        ? ComputeObservedVsExpectedGC(observedAlignments, readGCContent, manifest, debugGCCorrection, outFile)
        : new float[0];

    var perChromosomeBins = new Dictionary<string, List<GenomicBin>>();
    var binningTasks = new List<ThreadStart>();
    foreach (KeyValuePair<string, GenericRead> fastaEntryKVP in fastaEntries)
    {
        string chr = fastaEntryKVP.Key;

        // Skip chromosomes without alignment data or, when bins are predefined, without bins.
        if (!possibleAlignments.ContainsKey(chr) || (predefinedBins != null && !predefinedBins.ContainsKey(chr)))
        {
            continue;
        }

        List<GenomicBin> chrBins = predefinedBins == null ? new List<GenomicBin>() : predefinedBins[chr];
        perChromosomeBins[chr] = chrBins;

        BinTaskArguments args = new BinTaskArguments
        {
            FastaEntry = fastaEntryKVP.Value,
            Chromosome = chr,
            PossibleAlignments = possibleAlignments[chr],
            ObservedAlignments = observedAlignments[chr],
            CoverageMode = coverageMode,
            Bins = chrBins,
            BinSize = binSize,
            ReadGCContent = weightByReadGC ? readGCContent[chr] : null,
            ObservedVsExpectedGC = observedVsExpectedGC
        };
        binningTasks.Add(() => BinCountsForChromosome(args));
    }

    Console.WriteLine("{0} Launch BinCountsForChromosome jobs...", DateTime.Now);
    Console.Out.WriteLine();
    Isas.Shared.Utilities.DoWorkParallelThreads(binningTasks);
    Console.WriteLine("{0} Completed BinCountsForChromosome jobs.", DateTime.Now);
    Console.Out.WriteLine();

    // Concatenate the per-chromosome results in reference order.
    List<GenomicBin> finalBins = new List<GenomicBin>();
    foreach (string chr in chromosomes)
    {
        List<GenomicBin> chrBins;
        if (perChromosomeBins.TryGetValue(chr, out chrBins))
        {
            finalBins.AddRange(chrBins);
        }
    }
    return finalBins;
}
/// <summary>
/// Scans the reads of one chromosome in an indexed bam file and records, per alignment start
/// position, that a read was observed (and, in GCContentWeighted mode, its fragment length).
/// </summary>
/// <param name="bamFile">Bam file to read alignments from; a ".bai" index must exist next to it.</param>
/// <param name="isPairedEnd">When true, reads that are not part of a proper pair are skipped.</param>
/// <param name="chromosome">Name of the reference sequence to process.</param>
/// <param name="coverageMode">Binary writes 1 into the hit array; every other mode calls HitArray.Set.</param>
/// <param name="observed">Hit array for this chromosome, indexed by alignment position.</param>
/// <param name="fragmentLengths">Fragment length per alignment position; written only in GCContentWeighted mode.</param>
/// <exception cref="Exception">The bam index file does not exist.</exception>
/// <exception cref="ApplicationException">The chromosome is not a reference sequence of the bam file.</exception>
static void LoadObservedAlignmentsBAM(string bamFile, bool isPairedEnd, string chromosome, CanvasCoverageMode coverageMode, HitArray observed, Int16[] fragmentLengths)
{
    // Sanity check: the .bai file must exist, in order for us to seek to our target chromosome!
    string indexPath = bamFile + ".bai";
    if (!File.Exists(indexPath))
    {
        throw new Exception(string.Format("Fatal error: Bam index not found at {0}", indexPath));
    }

    using (BamReader reader = new BamReader(bamFile))
    {
        int desiredRefIndex = reader.GetReferenceIndex(chromosome);
        if (desiredRefIndex == -1)
        {
            throw new ApplicationException(
                string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", chromosome, bamFile));
        }

        // A failed jump is not necessarily an error: it just means there are no reads for this
        // chromosome in this .bam file. That is not uncommon e.g. for truseq amplicon.
        if (!reader.Jump(desiredRefIndex, 0))
        {
            return;
        }

        int readCount = 0;
        int keptReadCount = 0;
        // NOTE(review): the header text is not used below; confirm whether this call can be dropped.
        string header = reader.GetHeader();
        BamAlignment alignment = new BamAlignment();
        bool weightByReadGC = coverageMode == CanvasCoverageMode.GCContentWeighted;

        while (reader.GetNextAlignment(ref alignment, true))
        {
            readCount++;

            // Flag check - require forward-strand primary reads that are aligned, passing filter, non-duplicate.
            bool flagsPass = alignment.IsMapped()
                && !alignment.IsFailedQC()
                && !alignment.IsDuplicate()
                && !alignment.IsReverseStrand()
                && alignment.IsMainAlignment();
            if (!flagsPass)
            {
                continue;
            }

            // Require the alignment to start with 35 bases of non-indel.
            var leadingCigar = alignment.CigarData[0];
            if (leadingCigar.Type != 'M' || leadingCigar.Length < 35)
            {
                continue;
            }

            if (isPairedEnd && !alignment.IsProperPair())
            {
                continue;
            }

            // Quit if the current reference index is different from the desired reference index.
            int refID = alignment.RefID;
            if (refID != desiredRefIndex)
            {
                break;
            }
            if (refID == -1)
            {
                continue;
            }

            keptReadCount++;
            var position = alignment.Position;
            if (coverageMode == CanvasCoverageMode.Binary)
            {
                observed.Data[position] = 1;
            }
            else
            {
                observed.Set(position);
            }

            // Store fragment size, clamped to [0, Int16.MaxValue] (simplification for now).
            if (weightByReadGC)
            {
                fragmentLengths[position] = Convert.ToInt16(Math.Max(Math.Min(Int16.MaxValue, alignment.FragmentLength), 0));
            }
        }

        Console.WriteLine("Kept {0} of {1} total reads", keptReadCount, readCount);
    }
}
/// <summary>
/// Builds the serializable representation of the per-chromosome alignment data.
/// </summary>
/// <param name="possibleAlignments">
/// Each BitArray is copied into a byte array; the number of bits used in the final byte
/// (bit count modulo 8, so 0 for an exact multiple of 8) is stored alongside it.
/// </param>
/// <param name="observedAlignments">The Data member of each HitArray is stored by reference.</param>
/// <param name="fragmentLengths">Stored by reference, and only in GCContentWeighted mode.</param>
/// <param name="coverageMode">Coverage mode controlling whether fragment lengths are kept.</param>
public IntermediateData(Dictionary<string, BitArray> possibleAlignments, Dictionary<string, HitArray> observedAlignments,
    Dictionary<string, Int16[]> fragmentLengths, CanvasCoverageMode coverageMode)
{
    foreach (string chromosome in possibleAlignments.Keys)
    {
        BitArray bitArray = possibleAlignments[chromosome];
        int bitCount = bitArray.Length;
        int trailingBits = bitCount % 8;
        int wholeBytes = bitCount / 8;

        // One extra byte is needed when the bits spill past the last whole byte.
        byte[] buffer = new byte[trailingBits == 0 ? wholeBytes : wholeBytes + 1];
        bitArray.CopyTo(buffer, 0);

        this.PossibleAlignments[chromosome] = buffer;
        BitsInLastBytePossibleAlignments[chromosome] = trailingBits;
    }

    foreach (string chromosome in observedAlignments.Keys)
    {
        this.ObservedAlignments[chromosome] = observedAlignments[chromosome].Data;
    }

    bool keepFragmentLengths = coverageMode == CanvasCoverageMode.GCContentWeighted;
    if (keepFragmentLengths)
    {
        foreach (string chromosome in fragmentLengths.Keys)
        {
            this.FragmentLengths[chromosome] = fragmentLengths[chromosome];
        }
    }
}
/// <summary>
/// Bins the alignments of every chromosome that appears both in the reference fasta and in
/// <paramref name="possibleAlignments"/> (and in <paramref name="predefinedBins"/>, when given).
/// Chromosomes are processed in parallel; the result is ordered as in the reference file.
/// </summary>
/// <param name="referenceFile">Reference fasta file.</param>
/// <param name="binSize">Desired number of possible alignment positions per bin.</param>
/// <param name="coverageMode">Coverage mode; GCContentWeighted enables the read GC content computation.</param>
/// <param name="manifest">Passed through to ComputeObservedVsExpectedGC.</param>
/// <param name="possibleAlignments">Per-chromosome BitArrays of possible alignments.</param>
/// <param name="observedAlignments">Per-chromosome HitArrays of observed alignments.</param>
/// <param name="fragmentLengths">Per-chromosome fragment lengths; read only in GCContentWeighted mode.</param>
/// <param name="predefinedBins">Pre-defined bins per chromosome. null if not available.</param>
/// <param name="outFile">Passed through to ComputeObservedVsExpectedGC.</param>
/// <returns>A list of bins, chromosome by chromosome in the order of the reference file.</returns>
static List<GenomicBin> BinCounts(string referenceFile, int binSize, CanvasCoverageMode coverageMode, NexteraManifest manifest,
    Dictionary<string, BitArray> possibleAlignments, Dictionary<string, HitArray> observedAlignments,
    Dictionary<string, Int16[]> fragmentLengths, Dictionary<string, List<GenomicBin>> predefinedBins, string outFile)
{
    bool debugGCCorrection = false; // write value of GC bins and correction factor
    bool gcWeighted = coverageMode == CanvasCoverageMode.GCContentWeighted;
    var referenceEntries = new Dictionary<string, GenericRead>();
    var referenceOrder = new List<string>();

    Int16 meanFragmentCutoff = 3;
    Int16 meanFragmentSize = 0;
    if (gcWeighted)
    {
        meanFragmentSize = MeanFragmentSize(fragmentLengths);
    }

    // Loop through each chromosome in the reference, giving the reader a new GenericRead per entry.
    using (FastaReader reader = new FastaReader(referenceFile))
    {
        GenericRead current = new GenericRead();
        while (reader.GetNextEntry(ref current))
        {
            string name = current.Name;
            referenceOrder.Add(name);
            referenceEntries[name] = current;
            current = new GenericRead();
        }
    }

    // Calculate GC content of the forward read at every position along the genome.
    var readGCContent = new Dictionary<string, byte[]>();
    if (gcWeighted)
    {
        byte gcCap = (byte)numberOfGCbins;
        var normalizationTasks = new List<ThreadStart>();
        foreach (KeyValuePair<string, Int16[]> fragmentLengthsKVP in fragmentLengths)
        {
            string chr = fragmentLengthsKVP.Key;
            Int16[] lengths = fragmentLengthsKVP.Value;
            GenericRead fastaEntry = referenceEntries[chr];
            ThreadStart task = () =>
            {
                var bases = fastaEntry.Bases;

                // Holds the GC content of the forward read at every position of the current chromosome.
                byte[] gcContent = new byte[bases.Length];
                int fragmentCap = meanFragmentSize * meanFragmentCutoff;
                int limit = bases.Length - fragmentCap - 1;

                int pos = 0;
                while (pos < limit)
                {
                    // Positions without a recorded fragment use the mean fragment size;
                    // recorded fragments are capped at (mean * cutoff).
                    Int16 currentFragment;
                    if (lengths[pos] == 0)
                    {
                        currentFragment = meanFragmentSize;
                    }
                    else
                    {
                        currentFragment = Convert.ToInt16(Math.Min(lengths[pos], fragmentCap));
                    }

                    int gcCounter = 0;
                    for (int offset = 0; offset < currentFragment; offset++)
                    {
                        switch (bases[pos + offset])
                        {
                            case 'C':
                            case 'c':
                            case 'G':
                            case 'g':
                                gcCounter++;
                                break;
                        }
                    }

                    // GC percentage of the fragment, limited to the number of GC bins.
                    gcContent[pos] = (byte)Math.Min(100 * gcCounter / currentFragment, gcCap);
                    pos++;
                }

                lock (readGCContent)
                {
                    readGCContent[chr] = gcContent;
                }
            };
            normalizationTasks.Add(task);
        }

        Console.WriteLine("{0} Launching normalization tasks.", DateTime.Now);
        Console.Out.Flush();
        Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(normalizationTasks);
        Console.WriteLine("{0} Normalization tasks complete.", DateTime.Now);
        Console.Out.Flush();
    }

    // Populate observed and expected read GC bin vectors (left empty unless GC weighting is on).
    float[] observedVsExpectedGC;
    if (gcWeighted)
    {
        observedVsExpectedGC = ComputeObservedVsExpectedGC(observedAlignments, readGCContent, manifest, debugGCCorrection, outFile);
    }
    else
    {
        observedVsExpectedGC = new float[0];
    }

    var perChromosomeBins = new Dictionary<string, List<GenomicBin>>();
    var binningTasks = new List<ThreadStart>();
    foreach (KeyValuePair<string, GenericRead> fastaEntryKVP in referenceEntries)
    {
        string chr = fastaEntryKVP.Key;

        // Only chromosomes with alignment data and, when bins are predefined, with bins are processed.
        bool skip = !possibleAlignments.ContainsKey(chr)
            || (predefinedBins != null && !predefinedBins.ContainsKey(chr));
        if (skip)
        {
            continue;
        }

        BinTaskArguments args = new BinTaskArguments();
        args.FastaEntry = fastaEntryKVP.Value;
        args.Chromosome = chr;
        args.PossibleAlignments = possibleAlignments[chr];
        args.ObservedAlignments = observedAlignments[chr];
        args.CoverageMode = coverageMode;

        List<GenomicBin> target = predefinedBins == null ? new List<GenomicBin>() : predefinedBins[chr];
        perChromosomeBins[chr] = target;
        args.Bins = target;
        args.BinSize = binSize;
        args.ReadGCContent = gcWeighted ? readGCContent[chr] : null;
        args.ObservedVsExpectedGC = observedVsExpectedGC;

        binningTasks.Add(() => BinCountsForChromosome(args));
    }

    Console.WriteLine("{0} Launch BinCountsForChromosome jobs...", DateTime.Now);
    Console.Out.WriteLine();
    Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(binningTasks);
    Console.WriteLine("{0} Completed BinCountsForChromosome jobs.", DateTime.Now);
    Console.Out.WriteLine();

    // Emit the per-chromosome bins in reference order.
    List<GenomicBin> finalBins = new List<GenomicBin>();
    foreach (string chr in referenceOrder)
    {
        if (perChromosomeBins.ContainsKey(chr))
        {
            finalBins.AddRange(perChromosomeBins[chr]);
        }
    }
    return finalBins;
}
/// <summary>
/// Reads the alignments of a single chromosome from an indexed bam file and marks each kept read's
/// start position in the hit array; in GCContentWeighted mode the fragment length is stored as well.
/// </summary>
/// <param name="bamFile">Bam file to read alignments from; its ".bai" index must exist.</param>
/// <param name="isPairedEnd">When true, only properly paired reads are kept.</param>
/// <param name="chromosome">Reference sequence name to process.</param>
/// <param name="coverageMode">Binary stores 1 per position; other modes call HitArray.Set for every kept read.</param>
/// <param name="observed">Hit array of the chromosome, indexed by alignment position.</param>
/// <param name="fragmentLengths">Per-position fragment lengths; touched only in GCContentWeighted mode.</param>
/// <exception cref="Exception">The bam index file is missing.</exception>
/// <exception cref="ApplicationException">The bam file has no reference sequence named <paramref name="chromosome"/>.</exception>
static void LoadObservedAlignmentsBAM(string bamFile, bool isPairedEnd, string chromosome, CanvasCoverageMode coverageMode, HitArray observed, Int16[] fragmentLengths)
{
    // Sanity check: the .bai file must exist, in order for us to seek to our target chromosome!
    string baiPath = bamFile + ".bai";
    bool indexPresent = File.Exists(baiPath);
    if (!indexPresent)
    {
        string message = string.Format("Fatal error: Bam index not found at {0}", baiPath);
        throw new Exception(message);
    }

    using (BamReader bam = new BamReader(bamFile))
    {
        int targetRefIndex = bam.GetReferenceIndex(chromosome);
        if (targetRefIndex == -1)
        {
            string message = string.Format("Unable to retrieve the reference sequence index for {0} in {1}.", chromosome, bamFile);
            throw new ApplicationException(message);
        }

        bool jumped = bam.Jump(targetRefIndex, 0);
        if (!jumped)
        {
            // Not necessarily an error: there simply are no reads for this chromosome in this
            // .bam file. That is not uncommon e.g. for truseq amplicon.
            return;
        }

        int totalReads = 0;
        int keptReads = 0;
        // NOTE(review): the header text is not used below; confirm whether this call can be dropped.
        string header = bam.GetHeader();
        BamAlignment read = new BamAlignment();

        while (bam.GetNextAlignment(ref read, true))
        {
            totalReads++;

            // Flag check - require reads to be aligned, passing filter, non-duplicate,
            // on the forward strand, and the main alignment.
            if (!read.IsMapped()) { continue; }
            if (read.IsFailedQC()) { continue; }
            if (read.IsDuplicate()) { continue; }
            if (read.IsReverseStrand()) { continue; }
            if (!read.IsMainAlignment()) { continue; }

            // Require the alignment to start with 35 bases of non-indel.
            bool startsWithLongMatch = read.CigarData[0].Type == 'M' && read.CigarData[0].Length >= 35;
            if (!startsWithLongMatch) { continue; }

            if (isPairedEnd && !read.IsProperPair()) { continue; }

            // Quit once the reads belong to a different reference sequence.
            int refID = read.RefID;
            if (refID != targetRefIndex) { break; }
            if (refID == -1) { continue; }

            keptReads++;

            if (coverageMode != CanvasCoverageMode.Binary)
            {
                observed.Set(read.Position);
            }
            else
            {
                observed.Data[read.Position] = 1;
            }

            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                // Keep the fragment size within Int16 range and non-negative (simplification for now).
                var clamped = Math.Max(Math.Min(Int16.MaxValue, read.FragmentLength), 0);
                fragmentLengths[read.Position] = Convert.ToInt16(clamped);
            }
        }

        Console.WriteLine("Kept {0} of {1} total reads", keptReads, totalReads);
    }
}
/// <summary>
/// Creates the alignment arrays for one chromosome of a uniquemer-marked fasta file and stores
/// them in the supplied dictionaries under the chromosome name.
/// </summary>
/// <param name="fastaFile">Fasta file containing uniquemer-marked reference genome.</param>
/// <param name="chromosome">Chromosome whose sequence is loaded; used as the dictionary key.</param>
/// <param name="coverageMode">GCContentWeighted allocates one fragment length slot per base; other modes allocate none.</param>
/// <param name="possibleAlignments">Stores which alignments are possible: one bit per base, set for upper-case bases.</param>
/// <param name="observedAlignments">Stores a new HitArray constructed with the sequence length.</param>
/// <param name="fragmentLengths">Stores fragment length (Int16).</param>
static void InitializeAlignmentArrays(string fastaFile, string chromosome, CanvasCoverageMode coverageMode,
    IDictionary<string, BitArray> possibleAlignments, IDictionary<string, HitArray> observedAlignments,
    IDictionary<string, Int16[]> fragmentLengths)
{
    string bases = FastaLoader.LoadFastaSequence(fastaFile, chromosome);

    BitArray uniqueMask = new BitArray(bases.Length);
    possibleAlignments[chromosome] = uniqueMask;
    observedAlignments[chromosome] = new HitArray(bases.Length);

    Int16[] lengths;
    if (coverageMode != CanvasCoverageMode.GCContentWeighted)
    {
        lengths = new Int16[0];
    }
    else
    {
        lengths = new Int16[bases.Length];
    }
    fragmentLengths[chromosome] = lengths;

    // Mark which k-mers in the fasta file are unique. These are indicated by upper-case letters.
    for (int position = 0; position < bases.Length; position++)
    {
        if (!char.IsUpper(bases[position]))
        {
            continue;
        }
        uniqueMask[position] = true;
    }
}