/// <summary>
/// Compares two reference FASTA files entry by entry and reports how often they agree on
/// per-base flagging. A base counts as "unique" in a file when its character is below 'a'
/// (i.e. it is not a lowercase letter). Running totals and the percent agreement are written
/// to the console after every entry.
/// </summary>
/// <param name="fastaPathA">Path of the first FASTA file. Its first entry is skipped before comparison starts.</param>
/// <param name="fastaPathB">Path of the second FASTA file.</param>
/// <returns>Always 0.</returns>
/// <exception cref="System.IO.InvalidDataException">
/// File B has fewer entries than file A (after A's skipped entry), or a pair of entries differs in length.
/// </exception>
public static int ProcessReferenceFASTA(string fastaPathA, string fastaPathB)
{
    GenericRead chrA = new GenericRead();
    GenericRead chrB = new GenericRead();
    long countBoth = 0;
    long countOnlyA = 0;
    long countOnlyB = 0;
    long countNeither = 0;

    using (FastaReader readerA = new FastaReader(fastaPathA))
    using (FastaReader readerB = new FastaReader(fastaPathB))
    {
        readerA.GetNextEntry(ref chrA); // Discard chrM from new output

        while (readerA.GetNextEntry(ref chrA))
        {
            // Previously the result was ignored, so a short file B was silently compared
            // against the stale contents of the previous entry.
            if (!readerB.GetNextEntry(ref chrB))
            {
                throw new System.IO.InvalidDataException(
                    string.Format("'{0}' has no entry to compare against '{1}' from '{2}'", fastaPathB, chrA.Name, fastaPathA));
            }

            if (chrA.Bases.Length != chrB.Bases.Length)
            {
                throw new System.IO.InvalidDataException(
                    string.Format("Length mismatch: entry '{0}' has {1} bases but entry '{2}' has {3} bases",
                        chrA.Name, chrA.Bases.Length, chrB.Name, chrB.Bases.Length));
            }

            for (int baseIndex = 0; baseIndex < chrA.Bases.Length; baseIndex++)
            {
                bool isUniqueA = chrA.Bases[baseIndex] < 'a';
                bool isUniqueB = chrB.Bases[baseIndex] < 'a';
                if (isUniqueA && isUniqueB)
                {
                    countBoth++;
                }
                else if (isUniqueA)
                {
                    countOnlyA++;
                }
                else if (isUniqueB)
                {
                    countOnlyB++;
                }
                else
                {
                    countNeither++;
                }
            }

            Console.WriteLine("After {0}: {1},{2},{3},{4}", chrA.Name, countBoth, countOnlyA, countOnlyB, countNeither);
            // Totals are cumulative over all entries compared so far.
            double percentAgreement = 100 * (countBoth + countNeither) / (double)(countBoth + countOnlyA + countOnlyB + countNeither);
            Console.WriteLine("Percent agreement: {0:F2}", percentAgreement);
        }
    }
    return 0;
}
/// <summary>
/// Debugging aid: scans every entry of a hard-coded reference FASTA for a fixed set of reads
/// and prints each position at which a read, or its reverse complement, occurs.
/// The search is performed against the upper-cased bases of each entry.
/// </summary>
public static void CheckUniqueness()
{
    string fastaPath = @"D:\Genomes\Homo_sapiens\UCSC\hg19\Sequence\WholeGenomeFasta\genome.fa";
    string[] reads = new string[]
    {
        "AACCCTAACCCAACCCTAACCCTAACCCTAACCCT", // 10097 B
        "ACCCTAACCCAACCCTAACCCTAACCCTAACCCTA", // 10098 B
        "AGAGGACAACGCAGCTCCGCCCTCGCGGTGCTCTC", // 10553 A
        "TTTTTTCCTATACATACATACCCATGATAAAGTTT"  // 30763880 A
    };

    using (FastaReader readerA = new FastaReader(fastaPath))
    {
        GenericRead chrA = new GenericRead();
        while (readerA.GetNextEntry(ref chrA))
        {
            Console.WriteLine(chrA.Name);
            string bases = chrA.Bases.ToUpperInvariant();

            // Search for each:
            for (int readIndex = 0; readIndex < reads.Length; readIndex++)
            {
                string read = reads[readIndex];
                string revComp = Illumina.SecondaryAnalysis.Utilities.GetReverseComplement(read);

                // Ordinal comparison: the default IndexOf(string, int) overload is culture-sensitive,
                // which is both unnecessary and very slow on chromosome-sized strings.
                // Restarting at pos + 1 also reports overlapping occurrences.
                int pos = -1;
                while ((pos = bases.IndexOf(read, pos + 1, StringComparison.Ordinal)) != -1)
                {
                    Console.WriteLine("{0}\t{1}\t{2}\t{3}", readIndex, read, chrA.Name, pos);
                }

                pos = -1;
                while ((pos = bases.IndexOf(revComp, pos + 1, StringComparison.Ordinal)) != -1)
                {
                    Console.WriteLine("{0}\t{1}\t{2}\t{3}\tRevComp", readIndex, read, chrA.Name, pos);
                }
            }
        }
    }
    Console.WriteLine(">>>Done.");
}
// Implement IDisposable

/// <summary>
/// Reads the next FASTA entry: one header line followed by every line up to (but not including)
/// the next line that begins with '>' or the end of the input. The sequence lines are
/// concatenated without separators.
/// </summary>
/// <param name="sequence">Receives the entry's name and bases.</param>
/// <returns>Returns false if no more entries are available.</returns>
/// <exception cref="ApplicationException">The line read as a header does not start with '>'.</exception>
public bool GetNextEntry(ref GenericRead sequence)
{
    // read the header
    string header = _mReader.ReadLine();
    if (header == null)
    {
        return false;
    }

    // sanity check (ordinal: the marker is a fixed ASCII character, not linguistic text)
    if (!header.StartsWith(">", StringComparison.Ordinal))
    {
        throw new ApplicationException("Encountered a FASTA header that did not start with '>'");
    }

    // extract the sequence name
    if (SkipHeaderParsing)
    {
        // Everything after the leading '>' is used verbatim.
        sequence.Name = header.Substring(1);
    }
    else
    {
        // NOTE: Groups[1].Value is an empty string when the pattern does not match,
        // so an unparseable header yields an empty name rather than an error.
        Match nameMatch = _mNameRegex.Match(header);
        sequence.Name = nameMatch.Groups[1].Value;
    }

    // read the bases
    StringBuilder sb = new StringBuilder();
    int peek = _mReader.Peek();
    while ((peek != -1) && (peek != '>'))
    {
        string line = _mReader.ReadLine();
        if (line == null)
        {
            break;
        }
        sb.Append(line);
        peek = _mReader.Peek();
    }

    sequence.Bases = sb.ToString();
    return true;
}
/// <summary>
/// Flags kmer uniqueness across the FASTA file at <paramref name="fastaPath"/> and writes the
/// flagged result via <c>WriteOutputs</c>. The file is read repeatedly; passes continue until a
/// pass ends with <c>IncompletePositions</c> equal to zero.
/// </summary>
/// <param name="fastaPath">Path of the input FASTA file.</param>
/// <param name="outputPath">Path the kmer-flagged output is written to.</param>
public void Main(string fastaPath, string outputPath)
{
    Console.WriteLine("{0} Start", DateTime.Now);
    Console.WriteLine("Load FASTA file at {0}, write kmer-flagged output to {1}", fastaPath, outputPath);

    // Make multiple passes over the file, to manage memory usage.  In each pass, we'll accumulate a dictionary
    // of up to N kmers, and we'll flag as non-unique all occurrences of those kmers.  In subsequent passes,
    // we know we can ignore positions that were already flagged as non-unique.  We know that we're complete when
    // we make a pass where the dictionary doesn't fill up.
    this.PassIndex = 0;
    this.IncompletePositions = 1; // This will be set to the number of positions whose uniqueness status is UNKNOWN.
    HashSet<string> finishedChromosomes = new HashSet<string>(); // For speed, keep track of chromosomes that are already fully processed.

    while (IncompletePositions > 0)
    {
        IncompletePositions = 0;
        PassIndex++;
        Kmers.Clear();
        this.GenomePosition = 0;
        int chromosomeIndex = -1;

        using (FastaReader reader = new FastaReader(fastaPath))
        {
            GenericRead fastaEntry = new GenericRead();
            while (reader.GetNextEntry(ref fastaEntry))
            {
                chromosomeIndex++;

                // Speedup option: Skip over chromosome that's already been fully processed:
                if (finishedChromosomes.Contains(fastaEntry.Name))
                {
                    // Keep the genome-wide coordinate in step even though the entry is skipped.
                    GenomePosition += fastaEntry.Bases.Length;
                    continue;
                }

                ProcessOneChromosome(fastaEntry, chromosomeIndex);

                // IncompletePositions is cumulative for the pass, so it is still zero only if no
                // position has been deferred so far (in this or any earlier chromosome of the pass).
                if (IncompletePositions == 0)
                {
                    finishedChromosomes.Add(fastaEntry.Name);
                }
            } // loop over chromosomes

            // Now let's go back and make a note of the various positions that are now definitively known
            // to be unique kmers!
            int UniqueCount = 0;
            foreach (string key in Kmers.Keys)
            {
                long oldPos = Kmers[key];
                if (oldPos < 0)
                {
                    continue; // Negative entries mark kmers already resolved as non-unique.
                }
                UniqueCount++;

                // Translate the genome-wide position into (chromosome, offset).
                // The running offset must be a long: summed chromosome lengths exceed int.MaxValue
                // for large genomes, and an int accumulator would wrap and never match.
                long tempPos = 0;
                for (int tempIndex = 0; tempIndex < ChromosomeNonUniqueFlags.Count; tempIndex++)
                {
                    if (tempPos + ChromosomeNonUniqueFlags[tempIndex].Length > oldPos)
                    {
                        // Note: Assumption is that oldPos - tempPos will always be an int
                        // since no single chromosome's length is longer than maxint.
                        ChromosomeFinishedFlags[tempIndex].Set((int)(oldPos - tempPos), true);
                        break;
                    }
                    tempPos += ChromosomeNonUniqueFlags[tempIndex].Length;
                }
            }
            Console.WriteLine("{0} >>>Pass {1}: flagged {2} unique kmers", DateTime.Now, PassIndex, UniqueCount);
            Console.WriteLine();
        }
    } // Pass loop

    Console.WriteLine("{0} Flagging complete", DateTime.Now);
    this.WriteOutputs(fastaPath, outputPath);
    Console.WriteLine("{0} Output written to {1}", DateTime.Now, outputPath);
}
/// <summary>
/// Copies every entry of the input FASTA to the output FASTA, re-casing each base according to
/// the per-chromosome non-unique flags: lowercase where the flag is set, uppercase where it is not.
/// Entries are matched to flag arrays by their order in the file.
/// </summary>
/// <param name="fastaPath">Path of the FASTA file to read.</param>
/// <param name="outputPath">Path of the FASTA file to write.</param>
private void WriteOutputs(string fastaPath, string outputPath)
{
    using (FastaReader reader = new FastaReader(fastaPath))
    using (FastaWriter writer = new FastaWriter(outputPath))
    {
        GenericRead entry = new GenericRead();
        for (int entryIndex = 0; reader.GetNextEntry(ref entry); entryIndex++)
        {
            BitArray flags = ChromosomeNonUniqueFlags[entryIndex];
            string sourceBases = entry.Bases;

            // Build the re-cased sequence in a fixed-size buffer, one character per base.
            char[] recased = new char[sourceBases.Length];
            for (int i = 0; i < recased.Length; i++)
            {
                char original = sourceBases[i];
                recased[i] = flags[i]
                    ? char.ToLowerInvariant(original)
                    : char.ToUpperInvariant(original);
            }

            writer.WriteEntry(entry.Name, new string(recased));
        }
    }
}
/// <summary>
/// Processes one chromosome for the current pass: every position not yet finished is either
/// flagged (finished, and possibly non-unique), recorded in <c>Kmers</c> with its genome-wide
/// position, or counted in <c>IncompletePositions</c> when the dictionary is full.
/// <c>GenomePosition</c> is advanced by one for every position of the chromosome.
/// </summary>
/// <param name="fastaEntry">The chromosome's FASTA entry; its bases are upper-cased before kmers are taken.</param>
/// <param name="chromosomeIndex">Zero-based index of the chromosome within the FASTA file.</param>
/// <exception cref="InvalidOperationException">
/// A first occurrence recorded in the dictionary turns out to be already marked finished.
/// </exception>
private void ProcessOneChromosome(GenericRead fastaEntry, int chromosomeIndex)
{
    // First time this chromosome is seen: allocate its flag arrays (all bits start false).
    if (chromosomeIndex >= ChromosomeNonUniqueFlags.Count)
    {
        ChromosomeNonUniqueFlags.Add(new BitArray(fastaEntry.Bases.Length));
        ChromosomeFinishedFlags.Add(new BitArray(fastaEntry.Bases.Length));
    }
    BitArray nonUniqueFlags = ChromosomeNonUniqueFlags[chromosomeIndex];
    BitArray finishedFlags = ChromosomeFinishedFlags[chromosomeIndex];

    string bases = fastaEntry.Bases.ToUpperInvariant();

    // GenomePosition is advanced in the loop header so that every 'continue' below keeps it in step.
    for (int startPos = 0; startPos < bases.Length; startPos++, GenomePosition++)
    {
        if (startPos % 1000000 == 0)
        {
            Console.WriteLine(">>>{0} {1} {2} dict {3} incomplete {4}", PassIndex, fastaEntry.Name, startPos, Kmers.Count, IncompletePositions);
        }

        // Skip positions processed to completion in an earlier pass:
        if (finishedFlags[startPos])
        {
            continue;
        }

        // Handle positions at very end of chromosome:
        // NOTE(review): '>=' also flags startPos == bases.Length - KmerLength, where a full kmer
        // still fits. Possibly an off-by-one; left unchanged because fixing it alters output.
        if (startPos + KmerLength >= bases.Length)
        {
            nonUniqueFlags.Set(startPos, true);
            finishedFlags.Set(startPos, true);
            continue;
        }

        // This position isn't completed yet.  Check its kmer against the dictionary:
        string kmer = bases.Substring(startPos, KmerLength);
        string key = GetKeyForKmer(kmer);
        if (key == null)
        {
            // This position isn't a valid unique 35mer, because it has an N or some other bogus character.
            nonUniqueFlags.Set(startPos, true);
            finishedFlags.Set(startPos, true);
            continue;
        }

        long oldPos;
        if (!Kmers.TryGetValue(key, out oldPos))
        {
            // Not seen in this pass: remember it if there is room, otherwise defer it to a later pass.
            if (Kmers.Count >= MaxDictEntries)
            {
                IncompletePositions++;
            }
            else
            {
                Kmers[key] = GenomePosition;
            }
            continue;
        }

        // This position is not unique.  Flag it as known non-unique:
        nonUniqueFlags.Set(startPos, true);
        finishedFlags.Set(startPos, true);

        if (oldPos < 0)
        {
            continue; // The first occurrence was already flagged when an earlier duplicate was found.
        }

        // Go back and flag the kmer position, even if on an old chromosome:
        long tempPos = 0;
        for (int tempIndex = 0; tempIndex < ChromosomeNonUniqueFlags.Count; tempIndex++)
        {
            if (tempPos + ChromosomeNonUniqueFlags[tempIndex].Length > oldPos)
            {
                // Note: Assumption is that oldPos - tempPos will always be an int
                // since no single chromosome's length is longer than maxint.
                int chrPos = (int)(oldPos - tempPos);
                if (ChromosomeFinishedFlags[tempIndex][chrPos])
                {
                    throw new InvalidOperationException("Error: Flagging an already-done position!");
                }
                ChromosomeNonUniqueFlags[tempIndex].Set(chrPos, true);
                ChromosomeFinishedFlags[tempIndex].Set(chrPos, true);
                break;
            }
            tempPos += ChromosomeNonUniqueFlags[tempIndex].Length;
        }

        // -1 marks the kmer as already resolved so its first occurrence is not flagged again.
        Kmers[key] = -1;
    } // loop over start positions
}
/// <summary>
/// Bin alignments.
/// </summary>
/// <param name="referenceFile">Reference fasta file.</param>
/// <param name="binSize">Desired number of alignments per bin.</param>
/// <param name="coverageMode">Coverage mode. The GC-content steps run only for <c>GCContentWeighted</c>.</param>
/// <param name="manifest">Manifest passed through to <c>ComputeObservedVsExpectedGC</c>.</param>
/// <param name="possibleAlignments">BitArrays of possible alignments.</param>
/// <param name="observedAlignments">BitArrays of observed alignments.</param>
/// <param name="fragmentLengths">
/// Per-chromosome fragment lengths indexed by position. A value of 0 is replaced by the mean
/// fragment size; other values are capped at three times the mean fragment size.
/// </param>
/// <param name="predefinedBins">Pre-defined bins. null if not available.</param>
/// <param name="outFile">Output file path passed through to <c>ComputeObservedVsExpectedGC</c>.</param>
/// <returns>A list of bins, concatenated in the order the chromosomes appear in the reference file.</returns>
static List<GenomicBin> BinCounts(string referenceFile, int binSize, CanvasCoverageMode coverageMode, NexteraManifest manifest,
    Dictionary<string, BitArray> possibleAlignments,
    Dictionary<string, HitArray> observedAlignments,
    Dictionary<string, Int16[]> fragmentLengths,
    Dictionary<string, List<GenomicBin>> predefinedBins, string outFile)
{
    bool debugGCCorrection = false; // write value of GC bins and correction factor
    Dictionary<string, GenericRead> fastaEntries = new Dictionary<string, GenericRead>();
    List<string> chromosomes = new List<string>();
    Int16 meanFragmentSize = 0;
    Int16 meanFragmentCutoff = 3;
    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
    {
        meanFragmentSize = MeanFragmentSize(fragmentLengths);
    }

    using (FastaReader reader = new FastaReader(referenceFile))
    {
        GenericRead fastaEntry = new GenericRead();

        // Loop through each chromosome in the reference.
        while (reader.GetNextEntry(ref fastaEntry))
        {
            chromosomes.Add(fastaEntry.Name);
            fastaEntries[fastaEntry.Name] = fastaEntry;
            // Fresh instance so the entry just stored is not overwritten by the next read.
            fastaEntry = new GenericRead();
        }
    }

    // calculate GC content of the forward read at every position along the genome
    Dictionary<string, byte[]> readGCContent = new Dictionary<string, byte[]>();
    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
    {
        byte gcCap = (byte)numberOfGCbins;
        int maxFragmentLength = meanFragmentSize * meanFragmentCutoff;
        List<ThreadStart> normalizationTasks = new List<ThreadStart>();
        foreach (KeyValuePair<string, Int16[]> fragmentLengthsKVP in fragmentLengths)
        {
            // Copy everything the task needs into per-iteration locals. Capturing the foreach
            // variable itself is shared across iterations under pre-C# 5 compilers, which would
            // make every task read the last chromosome's fragment lengths.
            string chr = fragmentLengthsKVP.Key;
            Int16[] chrFragmentLengths = fragmentLengthsKVP.Value;
            GenericRead fastaEntry = fastaEntries[chr];

            normalizationTasks.Add(new ThreadStart(() =>
            {
                // contains GC content of the forward read at every position for current chr
                byte[] gcContent = new byte[fastaEntry.Bases.Length];

                // Iteratively calculate GC content of "reads" using fasta genome reference.
                // The upper bound keeps pos + currentFragment inside the chromosome.
                for (int pos = 0; pos < fastaEntry.Bases.Length - maxFragmentLength - 1; pos++)
                {
                    Int16 currentFragment;
                    if (chrFragmentLengths[pos] == 0)
                    {
                        currentFragment = meanFragmentSize;
                    }
                    else
                    {
                        currentFragment = Convert.ToInt16(Math.Min(chrFragmentLengths[pos], maxFragmentLength));
                    }

                    int gcCounter = 0;
                    for (int i = pos; i < pos + currentFragment; i++)
                    {
                        switch (fastaEntry.Bases[i])
                        {
                            case 'C':
                            case 'c':
                            case 'G':
                            case 'g':
                                gcCounter++;
                                break;
                            default:
                                break;
                        }
                    }

                    // Integer percentage of G/C bases in the fragment, capped at gcCap.
                    gcContent[pos] = (byte)Math.Min(100 * gcCounter / currentFragment, gcCap);
                }

                // The result dictionary is shared by all tasks.
                lock (readGCContent)
                {
                    readGCContent[chr] = gcContent;
                }
            }));
        }

        Console.WriteLine("{0} Launching normalization tasks.", DateTime.Now);
        Console.Out.Flush();
        //Parallel.ForEach(normalizationTasks, t => { t.Invoke(); });
        Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(normalizationTasks);
        Console.WriteLine("{0} Normalization tasks complete.", DateTime.Now);
        Console.Out.Flush();
    }

    // populate observed and expected read GC bin vectors
    float[] observedVsExpectedGC = new float[0];
    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
    {
        observedVsExpectedGC = ComputeObservedVsExpectedGC(observedAlignments, readGCContent, manifest, debugGCCorrection, outFile);
    }

    Dictionary<string, List<GenomicBin>> perChromosomeBins = new Dictionary<string, List<GenomicBin>>();
    List<ThreadStart> binningTasks = new List<ThreadStart>();
    foreach (KeyValuePair<string, GenericRead> fastaEntryKVP in fastaEntries)
    {
        string chr = fastaEntryKVP.Key;
        if (!possibleAlignments.ContainsKey(chr))
        {
            continue;
        }
        if (predefinedBins != null && !predefinedBins.ContainsKey(chr))
        {
            continue;
        }

        BinTaskArguments args = new BinTaskArguments();
        args.FastaEntry = fastaEntryKVP.Value;
        args.Chromosome = chr;
        args.PossibleAlignments = possibleAlignments[chr];
        args.ObservedAlignments = observedAlignments[chr];
        args.CoverageMode = coverageMode;
        // When pre-defined bins are supplied, the caller's list for this chromosome is used directly.
        perChromosomeBins[chr] = predefinedBins == null ? new List<GenomicBin>() : predefinedBins[chr];
        args.Bins = perChromosomeBins[chr];
        args.BinSize = binSize;
        if (coverageMode == CanvasCoverageMode.GCContentWeighted)
        {
            args.ReadGCContent = readGCContent[chr];
        }
        else
        {
            args.ReadGCContent = null;
        }
        args.ObservedVsExpectedGC = observedVsExpectedGC;
        binningTasks.Add(new ThreadStart(() => { BinCountsForChromosome(args); }));
    }

    Console.WriteLine("{0} Launch BinCountsForChromosome jobs...", DateTime.Now);
    Console.Out.WriteLine();
    //Parallel.ForEach(binningTasks, t => { t.Invoke(); });
    Illumina.SecondaryAnalysis.Utilities.DoWorkParallelThreads(binningTasks);
    Console.WriteLine("{0} Completed BinCountsForChromosome jobs.", DateTime.Now);
    Console.Out.WriteLine();

    // Assemble the result in reference order; chromosomes skipped above have no bin list.
    List<GenomicBin> finalBins = new List<GenomicBin>();
    foreach (string chr in chromosomes)
    {
        if (!perChromosomeBins.ContainsKey(chr))
        {
            continue;
        }
        finalBins.AddRange(perChromosomeBins[chr]);
    }
    return finalBins;
}