/// <summary>
/// Re-reads the FASTA file and writes a copy of every entry in which the case of
/// each base reflects its flag in <c>ChromosomeNonUniqueFlags</c>: lowercase where
/// the flag is set, uppercase where it is clear.
/// </summary>
/// <param name="fastaPath">Path of the FASTA file to read.</param>
/// <param name="outputPath">Path of the FASTA file to write.</param>
private void WriteOutputs(string fastaPath, string outputPath)
{
    using (FastaReader reader = new FastaReader(fastaPath))
    using (FastaWriter writer = new FastaWriter(outputPath))
    {
        GenericRead entry = new GenericRead();

        // The flag arrays are looked up by the entry's ordinal position in the file.
        for (int entryIndex = 0; reader.GetNextEntry(ref entry); entryIndex++)
        {
            StringBuilder recased = new StringBuilder();
            BitArray flags = ChromosomeNonUniqueFlags[entryIndex];

            for (int i = 0; i < entry.Bases.Length; i++)
            {
                char nucleotide = entry.Bases[i];
                recased.Append(flags[i]
                    ? char.ToLowerInvariant(nucleotide)
                    : char.ToUpperInvariant(nucleotide));
            }

            writer.WriteEntry(entry.Name, recased.ToString());
        }
    }
}
/// <summary>
/// Compares two case-flagged FASTA files base by base and prints, after each entry,
/// the cumulative counts of positions that are flagged unique in both files, in A only,
/// in B only, or in neither, followed by the cumulative percent agreement.
/// </summary>
/// <remarks>
/// A base counts as "unique" when its character sorts below 'a' (for example an
/// uppercase letter). The first entry of file A is read and discarded before the
/// comparison starts, so the second entry of A is compared with the first entry of B.
/// </remarks>
/// <param name="fastaPathA">Path of the first FASTA file; its first entry is skipped.</param>
/// <param name="fastaPathB">Path of the second FASTA file.</param>
/// <returns>Always 0.</returns>
/// <exception cref="System.IO.InvalidDataException">
/// File B has fewer entries than file A needs, or a pair of entries differs in length.
/// </exception>
public static int ProcessReferenceFASTA(string fastaPathA, string fastaPathB)
{
    GenericRead chrA = new GenericRead();
    GenericRead chrB = new GenericRead();
    long countAB = 0;
    long countA = 0;
    long countB = 0;
    long countNeither = 0;

    using (FastaReader readerA = new FastaReader(fastaPathA))
    using (FastaReader readerB = new FastaReader(fastaPathB))
    {
        readerA.GetNextEntry(ref chrA); // Discard chrM from new output

        while (readerA.GetNextEntry(ref chrA))
        {
            // Previously the result was ignored, so a short file B was silently
            // compared against a stale (or empty) entry.
            if (!readerB.GetNextEntry(ref chrB))
            {
                throw new System.IO.InvalidDataException(string.Format(
                    "'{0}' has no entry to compare against '{1}' from '{2}'.",
                    fastaPathB, chrA.Name, fastaPathA));
            }

            if (chrA.Bases.Length != chrB.Bases.Length)
            {
                throw new System.IO.InvalidDataException(string.Format(
                    "Length mismatch: '{0}' has {1} bases but '{2}' has {3} bases.",
                    chrA.Name, chrA.Bases.Length, chrB.Name, chrB.Bases.Length));
            }

            for (int baseIndex = 0; baseIndex < chrA.Bases.Length; baseIndex++)
            {
                bool isUniqueA = chrA.Bases[baseIndex] < 'a';
                bool isUniqueB = chrB.Bases[baseIndex] < 'a';

                if (isUniqueA && isUniqueB)
                {
                    countAB++;
                }
                else if (isUniqueA)
                {
                    countA++;
                }
                else if (isUniqueB)
                {
                    countB++;
                }
                else
                {
                    countNeither++;
                }
            }

            Console.WriteLine("After {0}: {1},{2},{3},{4}", chrA.Name, countAB, countA, countB, countNeither);
            double percentAgreement = 100 * (countAB + countNeither) / (double)(countAB + countA + countB + countNeither);
            Console.WriteLine("Percent agreement: {0:F2}", percentAgreement);
        }
    }

    return 0;
}
/// <summary>
/// Debugging aid: scans every entry of a hard-coded hg19 FASTA file for a fixed set
/// of 35-base reads and prints each occurrence of the read and of its reverse
/// complement (read index, read, entry name, zero-based position).
/// </summary>
/// <remarks>
/// Searches are ordinal. The previous culture-sensitive <c>IndexOf(string, int)</c>
/// overload is far slower on chromosome-sized strings and is not the intended
/// exact-match semantics for nucleotide text.
/// </remarks>
public static void CheckUniqueness()
{
    string fastaPath = @"D:\Genomes\Homo_sapiens\UCSC\hg19\Sequence\WholeGenomeFasta\genome.fa";
    string[] reads = new string[]
    {
        "AACCCTAACCCAACCCTAACCCTAACCCTAACCCT", // 10097 B
        "ACCCTAACCCAACCCTAACCCTAACCCTAACCCTA", // 10098 B
        "AGAGGACAACGCAGCTCCGCCCTCGCGGTGCTCTC", // 10553 A
        "TTTTTTCCTATACATACATACCCATGATAAAGTTT"  // 30763880 A
    };

    using (FastaReader readerA = new FastaReader(fastaPath))
    {
        GenericRead chrA = new GenericRead();
        while (readerA.GetNextEntry(ref chrA))
        {
            Console.WriteLine(chrA.Name);

            // Upper-case once so the search is case-insensitive with respect to the reference.
            string bases = chrA.Bases.ToUpperInvariant();

            // Search for each:
            for (int readIndex = 0; readIndex < reads.Length; readIndex++)
            {
                string read = reads[readIndex];

                // Forward-strand occurrences. Restarting at pos + 1 reports overlapping hits too.
                int pos = bases.IndexOf(read, 0, StringComparison.Ordinal);
                while (pos != -1)
                {
                    Console.WriteLine("{0}\t{1}\t{2}\t{3}", readIndex, read, chrA.Name, pos);
                    pos = bases.IndexOf(read, pos + 1, StringComparison.Ordinal);
                }

                // Reverse-strand occurrences.
                string revComp = Isas.Shared.Utilities.GetReverseComplement(read);
                pos = bases.IndexOf(revComp, 0, StringComparison.Ordinal);
                while (pos != -1)
                {
                    Console.WriteLine("{0}\t{1}\t{2}\t{3}\tRevComp", readIndex, read, chrA.Name, pos);
                    pos = bases.IndexOf(revComp, pos + 1, StringComparison.Ordinal);
                }
            }
        }
    }

    Console.WriteLine(">>>Done.");
}
/// <summary>
/// Flags the uniqueness of every kmer start position in a FASTA file and writes a
/// case-flagged copy of the file via <c>WriteOutputs</c>.
/// </summary>
/// <remarks>
/// The file is read repeatedly. Each pass clears the kmer dictionary, processes every
/// chromosome that is not yet finished, and then marks as finished every position
/// still recorded in the dictionary with a non-negative genome position (a kmer seen
/// exactly once). Passes repeat until one ends with <c>IncompletePositions</c> at zero.
/// </remarks>
/// <param name="fastaPath">Path of the FASTA file to analyse.</param>
/// <param name="outputPath">Path of the flagged FASTA file to write.</param>
public void Main(string fastaPath, string outputPath)
{
    Console.WriteLine("{0} Start", DateTime.Now);
    Console.WriteLine("Load FASTA file at {0}, write kmer-flagged output to {1}", fastaPath, outputPath);

    // Make multiple passes over the file, to manage memory usage. In each pass, we'll accumulate a dictionary
    // of up to N kmers, and we'll flag as non-unique all occurrences of those kmers. In subsequent passes,
    // we know we can ignore positions that were already flagged as non-unique. We know that we're complete when
    // we make a pass where the dictionary doesn't fill up.
    this.PassIndex = 0;
    this.IncompletePositions = 1; // This will be set to the number of positions whose uniqueness status is UNKNOWN.

    // For speed, keep track of chromosomes that are already fully processed.
    HashSet<string> finishedChromosomes = new HashSet<string>();

    while (IncompletePositions > 0)
    {
        IncompletePositions = 0;
        PassIndex++;
        Kmers.Clear();
        this.GenomePosition = 0;
        int chromosomeIndex = -1;

        using (FastaReader reader = new FastaReader(fastaPath))
        {
            GenericRead fastaEntry = new GenericRead();
            while (reader.GetNextEntry(ref fastaEntry))
            {
                chromosomeIndex++;

                // Speedup option: Skip over chromosome that's already been fully processed.
                // GenomePosition must still advance so later positions stay genome-wide.
                if (finishedChromosomes.Contains(fastaEntry.Name))
                {
                    GenomePosition += fastaEntry.Bases.Length;
                    continue;
                }

                ProcessOneChromosome(fastaEntry, chromosomeIndex);

                // IncompletePositions is cumulative for the pass, so zero here means nothing
                // up to and including this chromosome was left undecided.
                if (IncompletePositions == 0)
                {
                    finishedChromosomes.Add(fastaEntry.Name);
                }
            } // loop over chromosomes
        }

        // Now let's go back and make a note of the various positions that are now definitively known
        // to be unique kmers!
        int uniqueCount = 0;
        foreach (string key in Kmers.Keys)
        {
            long oldPos = Kmers[key];
            if (oldPos < 0)
            {
                continue; // Negative marks a kmer that was seen more than once.
            }

            uniqueCount++;

            // Translate the genome-wide position into (chromosome, offset). The running
            // offset must be a long: the summed chromosome lengths can exceed int.MaxValue,
            // and an int accumulator would wrap and leave later chromosomes unflagged.
            long tempPos = 0;
            for (int tempIndex = 0; tempIndex < ChromosomeNonUniqueFlags.Count; tempIndex++)
            {
                int chromosomeLength = ChromosomeNonUniqueFlags[tempIndex].Length;
                if (tempPos + chromosomeLength > oldPos)
                {
                    // Note: Assumption is that oldPos - tempPos will always be an int
                    // since no single chromosome's length is longer than maxint.
                    ChromosomeFinishedFlags[tempIndex].Set((int)(oldPos - tempPos), true);
                    break;
                }

                tempPos += chromosomeLength;
            }
        }

        Console.WriteLine("{0} >>>Pass {1}: flagged {2} unique kmers", DateTime.Now, PassIndex, uniqueCount);
        Console.WriteLine();
    } // Pass loop

    Console.WriteLine("{0} Flagging complete", DateTime.Now);
    this.WriteOutputs(fastaPath, outputPath);
    Console.WriteLine("{0} Output written to {1}", DateTime.Now, outputPath);
}
/// <summary>
/// Runs one pass over a single chromosome: every position not yet finished is either
/// flagged as non-unique, recorded in the kmer dictionary as a candidate unique kmer,
/// or counted in <c>IncompletePositions</c> when the dictionary is full.
/// </summary>
/// <remarks>
/// <c>GenomePosition</c> is advanced once per base, including skipped positions, so it
/// stays a genome-wide coordinate across chromosomes.
/// </remarks>
/// <param name="fastaEntry">The chromosome to process.</param>
/// <param name="chromosomeIndex">
/// Zero-based ordinal of the chromosome within the FASTA file; selects its flag arrays.
/// </param>
/// <exception cref="InvalidOperationException">
/// A repeated kmer's first occurrence was already marked finished.
/// </exception>
private void ProcessOneChromosome(GenericRead fastaEntry, int chromosomeIndex)
{
    // First visit of this chromosome: allocate its flag arrays (all bits start false).
    if (chromosomeIndex >= ChromosomeNonUniqueFlags.Count)
    {
        ChromosomeNonUniqueFlags.Add(new BitArray(fastaEntry.Bases.Length));
        ChromosomeFinishedFlags.Add(new BitArray(fastaEntry.Bases.Length));
    }

    BitArray nonUniqueFlags = ChromosomeNonUniqueFlags[chromosomeIndex];
    BitArray finishedFlags = ChromosomeFinishedFlags[chromosomeIndex];
    string bases = fastaEntry.Bases.ToUpperInvariant();

    for (int startPos = 0; startPos < bases.Length; startPos++, GenomePosition++)
    {
        if (startPos % 1000000 == 0)
        {
            Console.WriteLine(">>>{0} {1} {2} dict {3} incomplete {4}", PassIndex, fastaEntry.Name, startPos, Kmers.Keys.Count, IncompletePositions);
        }

        // Skip positions processed to completion in an earlier pass:
        if (finishedFlags[startPos])
        {
            continue;
        }

        // Handle positions at very end of chromosome:
        // NOTE(review): with ">=" the last full-length kmer (startPos + KmerLength == bases.Length)
        // is also flagged non-unique. Left unchanged because it affects the written output; confirm
        // whether this is intentional.
        if (startPos + KmerLength >= bases.Length)
        {
            nonUniqueFlags.Set(startPos, true);
            finishedFlags.Set(startPos, true);
            continue;
        }

        // This position isn't completed yet. Check its kmer against the dictionary:
        string kmer = bases.Substring(startPos, KmerLength);
        string key = GetKeyForKmer(kmer);
        if (key == null)
        {
            // This position isn't a valid unique 35mer, because it has an N or some other bogus character.
            nonUniqueFlags.Set(startPos, true);
            finishedFlags.Set(startPos, true);
            continue;
        }

        long oldPos;
        if (Kmers.TryGetValue(key, out oldPos))
        {
            // This position is not unique. Flag it as known non-unique:
            nonUniqueFlags.Set(startPos, true);
            finishedFlags.Set(startPos, true);

            if (oldPos >= 0)
            {
                // Go back and flag the kmer position, even if on an old chromosome,
                // then mark the key so later repeats don't flag it again.
                MarkGenomePositionNonUnique(oldPos);
                Kmers[key] = -1;
            }
        }
        else if (Kmers.Keys.Count >= MaxDictEntries)
        {
            // Dictionary is full: leave this position for a later pass.
            IncompletePositions++;
        }
        else
        {
            Kmers[key] = GenomePosition;
        }
    } // loop over start positions
}

/// <summary>
/// Marks the kmer start at a genome-wide position as non-unique and finished. Does
/// nothing if the position lies beyond the chromosomes allocated so far.
/// </summary>
private void MarkGenomePositionNonUnique(long genomePosition)
{
    long chromosomeStart = 0;
    for (int index = 0; index < ChromosomeNonUniqueFlags.Count; index++)
    {
        int chromosomeLength = ChromosomeNonUniqueFlags[index].Length;
        if (chromosomeStart + chromosomeLength > genomePosition)
        {
            // Note: Assumption is that genomePosition - chromosomeStart will always be an int
            // since no single chromosome's length is longer than maxint.
            int chrPos = (int)(genomePosition - chromosomeStart);
            if (ChromosomeFinishedFlags[index][chrPos])
            {
                throw new InvalidOperationException("Error: Flagging an already-done position!");
            }

            ChromosomeNonUniqueFlags[index].Set(chrPos, true);
            ChromosomeFinishedFlags[index].Set(chrPos, true);
            return;
        }

        chromosomeStart += chromosomeLength;
    }
}
/// <summary>
/// Populate the list of GenomicBin objects for this chromosome.
/// </summary>
/// <remarks>
/// If <c>arguments.Bins</c> already contains bins they are treated as predefined: each
/// bin's GC and Count are filled in place and the scan jumps from one bin to the next.
/// Otherwise a new bin is appended every time <c>arguments.BinSize</c> possible
/// alignment positions have been accumulated.
/// </remarks>
/// <param name="arguments">Per-chromosome inputs and the bin list to fill.</param>
static void BinCountsForChromosome(BinTaskArguments arguments)
{
    List<GenomicBin> bins = arguments.Bins;
    bool usePredefinedBins = bins.Any();
    int predefinedBinIndex = 0;

    GenericRead fastaEntry = arguments.FastaEntry;
    BinState currentBin = new BinState();
    string chr = arguments.Chromosome;
    BitArray possibleAlignments = arguments.PossibleAlignments;
    HitArray observedAlignments = arguments.ObservedAlignments;
    CanvasCoverageMode coverageMode = arguments.CoverageMode;

    int pos = usePredefinedBins ? bins[predefinedBinIndex].Start : 0;

    // Skip past leading Ns. The bounds check keeps an empty or all-'n' sequence from
    // running off the end of the string.
    while (pos < fastaEntry.Bases.Length && fastaEntry.Bases[pos] == 'n')
    {
        pos++;
    }

    List<float> binPositions = new List<float>();
    List<int> binObservations = new List<int>();

    for (; pos < fastaEntry.Bases.Length; pos++)
    {
        // Sets the start of the bin.
        // NOTE(review): relies on BinState reporting StartPosition == -1 when new or reset — confirm.
        if (currentBin.StartPosition == -1)
        {
            currentBin.StartPosition = pos;
        }

        char currentBase = fastaEntry.Bases[pos];

        // Compare char to char. The previous Equals("n") compared a char with a string,
        // which is always false, so every N was counted as a nucleotide.
        if (currentBase != 'n')
        {
            currentBin.NucleotideCount++;
        }

        switch (currentBase)
        {
            case 'C':
            case 'c':
            case 'G':
            case 'g':
                currentBin.GCCount++;
                break;
        }

        if (possibleAlignments[pos])
        {
            currentBin.PossibleCount++;
            currentBin.ObservedCount += observedAlignments.Data[pos];
            binObservations.Add(observedAlignments.Data[pos]);
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                binPositions.Add(arguments.ObservedVsExpectedGC[arguments.ReadGCContent[pos]]);
            }
        }

        // A dynamic bin closes once it holds the desired number of possible alignment
        // positions; a predefined bin closes on its last base.
        bool binComplete = usePredefinedBins
            ? pos == bins[predefinedBinIndex].Stop - 1
            : currentBin.PossibleCount == arguments.BinSize;
        if (!binComplete)
        {
            continue;
        }

        if (coverageMode == CanvasCoverageMode.TruncatedDynamicRange) // Truncated dynamic range
        {
            // Recompute the count with each position's contribution capped at 10.
            currentBin.ObservedCount = 0;
            foreach (int observation in binObservations)
            {
                currentBin.ObservedCount += Math.Min(10, observation);
            }
        }

        if (coverageMode == CanvasCoverageMode.GCContentWeighted) // read GC content weighted
        {
            // Recompute the count with each position scaled by its observed-vs-expected
            // GC factor and capped at 10.
            float weightedCount = 0;
            for (int i = 0; i < binObservations.Count; i++)
            {
                weightedCount += Math.Min(10, (float)binObservations[i] / binPositions[i]);
            }
            currentBin.ObservedCount = (int)Math.Round(weightedCount);
        }

        // A bin made up entirely of 'n' now has no counted nucleotides; report 0% GC
        // rather than dividing by zero.
        int gc = currentBin.NucleotideCount > 0
            ? (int)(100 * currentBin.GCCount / currentBin.NucleotideCount)
            : 0;

        if (usePredefinedBins)
        {
            bins[predefinedBinIndex].GC = gc;
            bins[predefinedBinIndex].Count = currentBin.ObservedCount;
            predefinedBinIndex++;
            if (predefinedBinIndex >= bins.Count)
            {
                break; // we have processed all the bins
            }

            pos = bins[predefinedBinIndex].Start - 1; // jump to right before the next predefined bin
        }
        else
        {
            // Note the pos + 1 to make the first three conform to bed specification
            GenomicBin bin = new GenomicBin(chr, currentBin.StartPosition, pos + 1, gc, currentBin.ObservedCount);
            bins.Add(bin);
        }

        // Reset all relevant variables
        currentBin.Reset();
        binObservations.Clear();
        binPositions.Clear();
    }
}
/// <summary>
/// Bin alignments.
/// </summary>
/// <remarks>
/// Loads every entry of the reference into memory, optionally computes per-position
/// read GC content (GC-content-weighted mode only), then runs
/// <c>BinCountsForChromosome</c> once per chromosome through
/// <c>Isas.Shared.Utilities.DoWorkParallelThreads</c> and concatenates the
/// per-chromosome bins in reference order. Chromosomes absent from
/// <paramref name="possibleAlignments"/> (or from <paramref name="predefinedBins"/>
/// when that is supplied) are skipped.
/// </remarks>
/// <param name="referenceFile">Reference fasta file.</param>
/// <param name="binSize">Desired number of alignments per bin.</param>
/// <param name="coverageMode">How observed counts are turned into a bin count.</param>
/// <param name="manifest">Passed through to <c>ComputeObservedVsExpectedGC</c>.</param>
/// <param name="possibleAlignments">BitArrays of possible alignments, keyed by chromosome.</param>
/// <param name="observedAlignments">HitArrays of observed alignments, keyed by chromosome.</param>
/// <param name="fragmentLengths">
/// Per-position fragment lengths keyed by chromosome; only read in GC-content-weighted mode.
/// A value of 0 means "use the mean fragment size".
/// </param>
/// <param name="predefinedBins">Pre-defined bins. null if not available. Filled in place when supplied.</param>
/// <param name="outFile">Passed through to <c>ComputeObservedVsExpectedGC</c>.</param>
/// <returns>A list of bins.</returns>
static List<GenomicBin> BinCounts(string referenceFile, int binSize, CanvasCoverageMode coverageMode, NexteraManifest manifest,
    Dictionary<string, BitArray> possibleAlignments,
    Dictionary<string, HitArray> observedAlignments,
    Dictionary<string, Int16[]> fragmentLengths,
    Dictionary<string, List<GenomicBin>> predefinedBins, string outFile)
{
    bool debugGCCorrection = false; // write value of GC bins and correction factor
    Dictionary<string, GenericRead> fastaEntries = new Dictionary<string, GenericRead>();
    List<string> chromosomes = new List<string>();
    Int16 meanFragmentSize = 0;
    Int16 meanFragmentCutoff = 3;
    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
    {
        meanFragmentSize = MeanFragmentSize(fragmentLengths);
    }

    using (FastaReader reader = new FastaReader(referenceFile))
    {
        GenericRead referenceEntry = new GenericRead();

        // Loop through each chromosome in the reference. A fresh GenericRead is used for
        // every entry so the stored references are not overwritten by the next read.
        while (reader.GetNextEntry(ref referenceEntry))
        {
            chromosomes.Add(referenceEntry.Name);
            fastaEntries[referenceEntry.Name] = referenceEntry;
            referenceEntry = new GenericRead();
        }
    }

    // calculate GC content of the forward read at every position along the genome
    Dictionary<string, byte[]> readGCContent = new Dictionary<string, byte[]>();
    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
    {
        byte gcCap = (byte)numberOfGCbins;

        // Fragments longer than this are truncated; it also bounds how far the inner loop reads ahead.
        int maxFragmentLength = meanFragmentSize * meanFragmentCutoff;

        List<ThreadStart> normalizationTasks = new List<ThreadStart>();
        foreach (KeyValuePair<string, Int16[]> fragmentLengthsKVP in fragmentLengths)
        {
            // Copy everything the lambda needs into per-iteration locals. Capturing the
            // foreach variable itself is shared across iterations on pre-C# 5 compilers.
            string chr = fragmentLengthsKVP.Key;
            Int16[] chromosomeFragmentLengths = fragmentLengthsKVP.Value;
            GenericRead fastaEntry = fastaEntries[chr];

            normalizationTasks.Add(new ThreadStart(() =>
            {
                // contains GC content of the forward read at every position for current chr
                byte[] gcContent = new byte[fastaEntry.Bases.Length];

                // Iteratively calculate GC content of "reads" using fasta genome reference.
                // Stopping maxFragmentLength + 1 short of the end keeps pos + currentFragment in range.
                int scanEnd = fastaEntry.Bases.Length - maxFragmentLength - 1;
                for (int pos = 0; pos < scanEnd; pos++)
                {
                    Int16 currentFragment = chromosomeFragmentLengths[pos] == 0
                        ? meanFragmentSize
                        : Convert.ToInt16(Math.Min(chromosomeFragmentLengths[pos], maxFragmentLength));

                    // No bases to count. A zero length (possible when the mean fragment size
                    // is 0) would otherwise divide by zero; a negative length already gave 0.
                    if (currentFragment <= 0)
                    {
                        gcContent[pos] = 0;
                        continue;
                    }

                    uint gcCounter = 0;
                    for (int i = pos; i < pos + currentFragment; i++)
                    {
                        switch (fastaEntry.Bases[i])
                        {
                            case 'C':
                            case 'c':
                            case 'G':
                            case 'g':
                                gcCounter++;
                                break;
                            default:
                                break;
                        }
                    }

                    // Integer GC percentage of the fragment, capped at gcCap.
                    gcContent[pos] = (byte)Math.Min(100 * gcCounter / currentFragment, gcCap);
                }

                // The result dictionary is shared between the tasks, so guard the write.
                lock (readGCContent)
                {
                    readGCContent[chr] = gcContent;
                }
            }));
        }

        Console.WriteLine("{0} Launching normalization tasks.", DateTime.Now);
        Console.Out.Flush();
        Isas.Shared.Utilities.DoWorkParallelThreads(normalizationTasks);
        Console.WriteLine("{0} Normalization tasks complete.", DateTime.Now);
        Console.Out.Flush();
    }

    // populate observed and expected read GC bin vectors
    float[] observedVsExpectedGC = new float[0];
    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
    {
        observedVsExpectedGC = ComputeObservedVsExpectedGC(observedAlignments, readGCContent, manifest, debugGCCorrection, outFile);
    }

    Dictionary<string, List<GenomicBin>> perChromosomeBins = new Dictionary<string, List<GenomicBin>>();
    List<ThreadStart> binningTasks = new List<ThreadStart>();
    foreach (KeyValuePair<string, GenericRead> fastaEntryKVP in fastaEntries)
    {
        string chr = fastaEntryKVP.Key;
        if (!possibleAlignments.ContainsKey(chr))
        {
            continue;
        }
        if (predefinedBins != null && !predefinedBins.ContainsKey(chr))
        {
            continue;
        }

        // With predefined bins the caller's list is filled in place; otherwise start empty.
        perChromosomeBins[chr] = predefinedBins == null ? new List<GenomicBin>() : predefinedBins[chr];

        BinTaskArguments args = new BinTaskArguments();
        args.FastaEntry = fastaEntryKVP.Value;
        args.Chromosome = chr;
        args.PossibleAlignments = possibleAlignments[chr];
        args.ObservedAlignments = observedAlignments[chr];
        args.CoverageMode = coverageMode;
        args.Bins = perChromosomeBins[chr];
        args.BinSize = binSize;
        if (coverageMode == CanvasCoverageMode.GCContentWeighted)
        {
            args.ReadGCContent = readGCContent[chr];
        }
        else
        {
            args.ReadGCContent = null;
        }
        args.ObservedVsExpectedGC = observedVsExpectedGC;

        binningTasks.Add(new ThreadStart(() => { BinCountsForChromosome(args); }));
    }

    Console.WriteLine("{0} Launch BinCountsForChromosome jobs...", DateTime.Now);
    Console.Out.WriteLine();
    Isas.Shared.Utilities.DoWorkParallelThreads(binningTasks);
    Console.WriteLine("{0} Completed BinCountsForChromosome jobs.", DateTime.Now);
    Console.Out.WriteLine();

    // Emit bins in reference order rather than dictionary order.
    List<GenomicBin> finalBins = new List<GenomicBin>();
    foreach (string chr in chromosomes)
    {
        if (!perChromosomeBins.ContainsKey(chr))
        {
            continue;
        }
        finalBins.AddRange(perChromosomeBins[chr]);
    }

    return finalBins;
}