/// <summary>
/// Compares the per-base case flags of two FASTA files entry by entry and prints running agreement statistics.
/// A base whose character is below 'a' (i.e. not lowercase) is counted as "unique" for that file.
/// The first entry of <paramref name="fastaPathA"/> is skipped before the comparison starts.
/// </summary>
/// <param name="fastaPathA">Path of the first FASTA file (its first entry is discarded).</param>
/// <param name="fastaPathB">Path of the second FASTA file.</param>
/// <returns>Always 0.</returns>
/// <exception cref="InvalidOperationException">
/// The second file has fewer entries than expected, or a pair of entries differs in length.
/// </exception>
static public int ProcessReferenceFASTA(string fastaPathA, string fastaPathB)
{
    GenericRead chrA = new GenericRead();
    GenericRead chrB = new GenericRead();
    long CountAB = 0;
    long CountA = 0;
    long CountB = 0;
    long CountNeither = 0;
    using (FastaReader readerA = new FastaReader(fastaPathA))
    using (FastaReader readerB = new FastaReader(fastaPathB))
    {
        readerA.GetNextEntry(ref chrA); // Discard chrM from new output
        while (readerA.GetNextEntry(ref chrA))
        {
            // Previously the result of this call was ignored, so a short second file was
            // silently compared against the previous (stale) entry.
            if (!readerB.GetNextEntry(ref chrB))
            {
                throw new InvalidOperationException(string.Format(
                    "'{0}' has no entry to compare against '{1}' from '{2}'",
                    fastaPathB, chrA.Name, fastaPathA));
            }
            if (chrA.Bases.Length != chrB.Bases.Length)
            {
                throw new InvalidOperationException(string.Format(
                    "Length mismatch: '{0}' has {1} bases but '{2}' has {3} bases",
                    chrA.Name, chrA.Bases.Length, chrB.Name, chrB.Bases.Length));
            }

            for (int baseIndex = 0; baseIndex < chrA.Bases.Length; baseIndex++)
            {
                bool isUniqueA = chrA.Bases[baseIndex] < 'a';
                bool isUniqueB = chrB.Bases[baseIndex] < 'a';
                if (isUniqueA && isUniqueB)
                {
                    CountAB++;
                }
                else if (isUniqueA)
                {
                    CountA++;
                }
                else if (isUniqueB)
                {
                    CountB++;
                }
                else
                {
                    CountNeither++;
                }
            }

            // Counts are cumulative over all entries processed so far.
            Console.WriteLine("After {0}: {1},{2},{3},{4}", chrA.Name, CountAB, CountA, CountB, CountNeither);
            double percentAgreement = 100 * (CountAB + CountNeither) / (double)(CountAB + CountA + CountB + CountNeither);
            Console.WriteLine("Percent agreement: {0:F2}", percentAgreement);
        }
    }
    return 0;
}
/// <summary>
/// Debugging aid: scans every entry of a hard-coded reference FASTA for a fixed set of reads
/// (and their reverse complements) and prints each position where one occurs.
/// </summary>
static public void CheckUniqueness()
{
    string fastaPath = @"D:\Genomes\Homo_sapiens\UCSC\hg19\Sequence\WholeGenomeFasta\genome.fa";
    string[] Reads = new string[]
    {
        "AACCCTAACCCAACCCTAACCCTAACCCTAACCCT", // 10097 B
        "ACCCTAACCCAACCCTAACCCTAACCCTAACCCTA", // 10098 B
        "AGAGGACAACGCAGCTCCGCCCTCGCGGTGCTCTC", // 10553 A
        "TTTTTTCCTATACATACATACCCATGATAAAGTTT" // 30763880 A
    };

    // The reverse complement of a read does not depend on the chromosome, so compute it once.
    string[] reverseComplements = new string[Reads.Length];
    for (int readIndex = 0; readIndex < Reads.Length; readIndex++)
    {
        reverseComplements[readIndex] = Illumina.SecondaryAnalysis.Utilities.GetReverseComplement(Reads[readIndex]);
    }

    using (FastaReader readerA = new FastaReader(fastaPath))
    {
        GenericRead chrA = new GenericRead();
        while (readerA.GetNextEntry(ref chrA))
        {
            Console.WriteLine(chrA.Name);
            string bases = chrA.Bases.ToUpperInvariant();

            // Search for each:
            for (int readIndex = 0; readIndex < Reads.Length; readIndex++)
            {
                // Ordinal comparison: the text is plain nucleotide letters, so culture-sensitive
                // matching (the default for IndexOf(string, int)) is both unnecessary and slow.
                int pos = bases.IndexOf(Reads[readIndex], 0, StringComparison.Ordinal);
                while (pos != -1)
                {
                    Console.WriteLine("{0}\t{1}\t{2}\t{3}", readIndex, Reads[readIndex], chrA.Name, pos);
                    pos = bases.IndexOf(Reads[readIndex], pos + 1, StringComparison.Ordinal);
                }

                string revComp = reverseComplements[readIndex];
                pos = bases.IndexOf(revComp, 0, StringComparison.Ordinal);
                while (pos != -1)
                {
                    Console.WriteLine("{0}\t{1}\t{2}\t{3}\tRevComp", readIndex, Reads[readIndex], chrA.Name, pos);
                    pos = bases.IndexOf(revComp, pos + 1, StringComparison.Ordinal);
                }
            }
        }
    }
    Console.WriteLine(">>>Done.");
}
/// <summary>
/// Flags kmer uniqueness across the FASTA file at <paramref name="fastaPath"/> using repeated passes,
/// then writes the flagged output via <c>WriteOutputs</c>.
/// </summary>
/// <param name="fastaPath">Path of the FASTA file to read (re-read once per pass and once more for output).</param>
/// <param name="outputPath">Path the kmer-flagged output is written to.</param>
public void Main(string fastaPath, string outputPath)
{
    Console.WriteLine("{0} Start", DateTime.Now);
    Console.WriteLine("Load FASTA file at {0}, write kmer-flagged output to {1}", fastaPath, outputPath);

    // Make multiple passes over the file, to manage memory usage. In each pass, we'll accumulate a dictionary
    // of up to N kmers, and we'll flag as non-unique all occurrences of those kmers. In subsequent passes,
    // we know we can ignore positions that were already flagged as non-unique. We know that we're complete when
    // we make a pass where the dictionary doesn't fill up.
    this.PassIndex = 0;
    this.IncompletePositions = 1; // This will be set to the number of positions whose uniqueness status is UNKNOWN.
    HashSet<string> finishedChromosomes = new HashSet<string>(); // For speed, keep track of chromosomes that are already fully processed.
    while (IncompletePositions > 0)
    {
        IncompletePositions = 0;
        PassIndex++;
        Kmers.Clear();
        this.GenomePosition = 0;
        int chromosomeIndex = -1;
        using (FastaReader reader = new FastaReader(fastaPath))
        {
            GenericRead fastaEntry = new GenericRead();
            while (reader.GetNextEntry(ref fastaEntry))
            {
                chromosomeIndex++;

                // Speedup option: Skip over chromosome that's already been fully processed:
                if (finishedChromosomes.Contains(fastaEntry.Name))
                {
                    GenomePosition += fastaEntry.Bases.Length;
                    continue;
                }

                ProcessOneChromosome(fastaEntry, chromosomeIndex);

                // Finished chromosomes were skipped above, so this name is not in the set yet.
                if (IncompletePositions == 0)
                {
                    finishedChromosomes.Add(fastaEntry.Name);
                }
            } // loop over chromosomes

            // Now let's go back and make a note of the various positions that are now definitively known
            // to be unique kmers!
            int UniqueCount = 0;
            foreach (string key in Kmers.Keys)
            {
                long oldPos = Kmers[key];
                if (oldPos < 0) continue;
                UniqueCount++;

                // Running start offset of the current chromosome within the whole genome. This must be
                // a long: the sum of chromosome lengths can exceed int.MaxValue even though each
                // individual chromosome fits in an int.
                long chromosomeStart = 0;
                for (int tempIndex = 0; tempIndex < ChromosomeNonUniqueFlags.Count; tempIndex++)
                {
                    int chromosomeLength = ChromosomeNonUniqueFlags[tempIndex].Length;
                    if (chromosomeStart + chromosomeLength > oldPos)
                    {
                        // Note: Assumption is that oldPos - chromosomeStart will always be an int
                        // since no single chromosome's length is longer than maxint.
                        ChromosomeFinishedFlags[tempIndex].Set((int)(oldPos - chromosomeStart), true);
                        break;
                    }
                    chromosomeStart += chromosomeLength;
                }
            }
            Console.WriteLine("{0} >>>Pass {1}: flagged {2} unique kmers", DateTime.Now, PassIndex, UniqueCount);
            Console.WriteLine();
        }
    } // Pass loop

    Console.WriteLine("{0} Flagging complete", DateTime.Now);
    this.WriteOutputs(fastaPath, outputPath);
    Console.WriteLine("{0} Output written to {1}", DateTime.Now, outputPath);
}
/// <summary>
/// Re-reads the FASTA file and writes every entry to <paramref name="outputPath"/>, lower-casing each
/// base whose bit is set in the matching <c>ChromosomeNonUniqueFlags</c> array and upper-casing the rest.
/// </summary>
/// <param name="fastaPath">Path of the FASTA file to read.</param>
/// <param name="outputPath">Path of the FASTA file to write.</param>
private void WriteOutputs(string fastaPath, string outputPath)
{
    using (FastaReader reader = new FastaReader(fastaPath))
    using (FastaWriter writer = new FastaWriter(outputPath))
    {
        GenericRead entry = new GenericRead();

        // entryIndex tracks the 0-based position of the entry in the file; it selects the flag array.
        for (int entryIndex = 0; reader.GetNextEntry(ref entry); entryIndex++)
        {
            BitArray flags = ChromosomeNonUniqueFlags[entryIndex];
            StringBuilder recased = new StringBuilder();

            int baseCount = entry.Bases.Length;
            int offset = 0;
            while (offset < baseCount)
            {
                char recasedBase = flags[offset]
                    ? char.ToLowerInvariant(entry.Bases[offset])
                    : char.ToUpperInvariant(entry.Bases[offset]);
                recased.Append(recasedBase);
                offset++;
            }

            writer.WriteEntry(entry.Name, recased.ToString());
        }
    }
}
/// <summary>
/// Bin alignments.
/// </summary>
/// <param name="referenceFile">Reference fasta file.</param>
/// <param name="binSize">Desired number of alignments per bin.</param>
/// <param name="coverageMode">Coverage mode. GC content is only computed when this is <c>CanvasCoverageMode.GCContentWeighted</c>.</param>
/// <param name="manifest">Manifest passed through to <c>ComputeObservedVsExpectedGC</c> (GC-weighted mode only).</param>
/// <param name="possibleAlignments">BitArrays of possible alignments.</param>
/// <param name="observedAlignments">HitArrays of observed alignments.</param>
/// <param name="fragmentLengths">Per-chromosome fragment length at each position; a value of 0 means "use the mean fragment size".</param>
/// <param name="predefinedBins">Pre-defined bins. null if not available.</param>
/// <param name="outFile">Output path passed through to <c>ComputeObservedVsExpectedGC</c> (GC-weighted mode only).</param>
/// <returns>A list of bins, concatenated in reference (FASTA) chromosome order.</returns>
static List<GenomicBin> BinCounts(string referenceFile, int binSize, CanvasCoverageMode coverageMode, NexteraManifest manifest,
    Dictionary<string, BitArray> possibleAlignments,
    Dictionary<string, HitArray> observedAlignments,
    Dictionary<string, Int16[]> fragmentLengths,
    Dictionary<string, List<GenomicBin>> predefinedBins,
    string outFile)
{
    bool debugGCCorrection = false; // write value of GC bins and correction factor
    Dictionary<string, GenericRead> fastaEntries = new Dictionary<string, GenericRead>();
    List<string> chromosomes = new List<string>();
    Int16 meanFragmentSize = 0;
    Int16 meanFragmentCutoff = 3;
    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
        meanFragmentSize = MeanFragmentSize(fragmentLengths);

    using (FastaReader reader = new FastaReader(referenceFile))
    {
        GenericRead fastaEntry = new GenericRead();

        // Loop through each chromosome in the reference.
        while (reader.GetNextEntry(ref fastaEntry))
        {
            chromosomes.Add(fastaEntry.Name);
            fastaEntries[fastaEntry.Name] = fastaEntry;
            // Allocate a fresh entry so the one just stored is not overwritten by the next read.
            fastaEntry = new GenericRead();
        }
    }

    // calculate GC content of the forward read at every position along the genome
    Dictionary<string, byte[]> readGCContent = new Dictionary<string, byte[]>();
    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
    {
        byte gcCap = (byte)numberOfGCbins;
        // Fragments longer than this are truncated; the same amount is left unscored at the chromosome end.
        int maxFragmentSize = meanFragmentSize * meanFragmentCutoff;
        List<ThreadStart> normalizationTasks = new List<ThreadStart>();
        foreach (KeyValuePair<string, Int16[]> fragmentLengthsKVP in fragmentLengths)
        {
            // Copy everything the lambda needs into per-iteration locals. Capturing the foreach
            // iteration variable directly is shared across closures on pre-C# 5 compilers.
            string chr = fragmentLengthsKVP.Key;
            Int16[] chrFragmentLengths = fragmentLengthsKVP.Value;
            GenericRead fastaEntry = fastaEntries[chr];

            normalizationTasks.Add(new ThreadStart(() =>
            {
                // contains GC content of the forward read at every position for current chr
                byte[] gcContent = new byte[fastaEntry.Bases.Length];
                int positionLimit = fastaEntry.Bases.Length - maxFragmentSize - 1;

                // Iteratively calculate GC content of "reads" using fasta genome reference
                // NOTE(review): a currentFragment of 0 (possible when meanFragmentSize is 0) makes the
                // division below throw DivideByZeroException -- confirm MeanFragmentSize never returns 0.
                for (int pos = 0; pos < positionLimit; pos++)
                {
                    Int16 currentFragment;
                    if (chrFragmentLengths[pos] == 0)
                        currentFragment = meanFragmentSize;
                    else
                        currentFragment = Convert.ToInt16(Math.Min(chrFragmentLengths[pos], maxFragmentSize));

                    int gcCounter = 0;
                    for (int i = pos; i < pos + currentFragment; i++)
                    {
                        switch (fastaEntry.Bases[i])
                        {
                            case 'C':
                            case 'c':
                            case 'G':
                            case 'g':
                                gcCounter++;
                                break;
                            default:
                                break;
                        }
                    }

                    // Integer GC percentage, capped at gcCap.
                    gcContent[pos] = (byte)Math.Min(100 * gcCounter / currentFragment, gcCap);
                }

                // The dictionary is shared by all normalization tasks, so guard the write.
                lock (readGCContent)
                {
                    readGCContent[chr] = gcContent;
                }
            }));
        }

        Console.WriteLine("{0} Launching normalization tasks.", DateTime.Now);
        Console.Out.Flush();
        //Parallel.ForEach(normalizationTasks, t => { t.Invoke(); });
        Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(normalizationTasks);
        Console.WriteLine("{0} Normalization tasks complete.", DateTime.Now);
        Console.Out.Flush();
    }

    // populate observed and expected read GC bin vectors
    float[] observedVsExpectedGC = new float[0];
    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
        observedVsExpectedGC = ComputeObservedVsExpectedGC(observedAlignments, readGCContent, manifest, debugGCCorrection, outFile);

    Dictionary<string, List<GenomicBin>> perChromosomeBins = new Dictionary<string, List<GenomicBin>>();
    List<ThreadStart> binningTasks = new List<ThreadStart>();
    foreach (KeyValuePair<string, GenericRead> fastaEntryKVP in fastaEntries)
    {
        string chr = fastaEntryKVP.Key;

        // Skip chromosomes with no possible alignments, or with no bins when bins are pre-defined.
        BitArray chrPossibleAlignments;
        if (!possibleAlignments.TryGetValue(chr, out chrPossibleAlignments)) continue;
        List<GenomicBin> chrBins = null;
        if (predefinedBins != null && !predefinedBins.TryGetValue(chr, out chrBins)) continue;
        if (predefinedBins == null) chrBins = new List<GenomicBin>();

        BinTaskArguments args = new BinTaskArguments();
        args.FastaEntry = fastaEntryKVP.Value;
        args.Chromosome = chr;
        args.PossibleAlignments = chrPossibleAlignments;
        args.ObservedAlignments = observedAlignments[chr];
        args.CoverageMode = coverageMode;
        perChromosomeBins[chr] = chrBins;
        args.Bins = chrBins;
        args.BinSize = binSize;
        if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            args.ReadGCContent = readGCContent[chr];
        else
            args.ReadGCContent = null;
        args.ObservedVsExpectedGC = observedVsExpectedGC;
        binningTasks.Add(new ThreadStart(() => { BinCountsForChromosome(args); }));
    }

    Console.WriteLine("{0} Launch BinCountsForChromosome jobs...", DateTime.Now);
    Console.Out.WriteLine();
    //Parallel.ForEach(binningTasks, t => { t.Invoke(); });
    Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(binningTasks);
    Console.WriteLine("{0} Completed BinCountsForChromosome jobs.", DateTime.Now);
    Console.Out.WriteLine();

    // Concatenate per-chromosome results in the order the chromosomes appear in the reference.
    List<GenomicBin> finalBins = new List<GenomicBin>();
    foreach (string chr in chromosomes)
    {
        List<GenomicBin> chrBins;
        if (perChromosomeBins.TryGetValue(chr, out chrBins))
            finalBins.AddRange(chrBins);
    }
    return finalBins;
}