示例#1
0
        /// <summary>
        /// Re-reads the FASTA file and writes a re-cased copy of every entry: a base is written
        /// in lower case when its bit in <c>ChromosomeNonUniqueFlags</c> is set, and in upper
        /// case otherwise.
        /// </summary>
        /// <param name="fastaPath">Path of the FASTA file to read.</param>
        /// <param name="outputPath">Path handed to the <c>FastaWriter</c>.</param>
        private void WriteOutputs(string fastaPath, string outputPath)
        {
            using (FastaReader reader = new FastaReader(fastaPath))
            using (FastaWriter writer = new FastaWriter(outputPath))
            {
                GenericRead entry = new GenericRead();

                // Entries are paired with their flag arrays purely by their ordinal position in the file.
                for (int entryIndex = 0; reader.GetNextEntry(ref entry); entryIndex++)
                {
                    StringBuilder recased  = new StringBuilder();
                    BitArray      flags    = ChromosomeNonUniqueFlags[entryIndex];
                    string        sequence = entry.Bases;

                    for (int offset = 0; offset < sequence.Length; offset++)
                    {
                        bool isNonUnique = flags[offset];
                        char current     = sequence[offset];
                        recased.Append(isNonUnique
                                       ? char.ToLowerInvariant(current)
                                       : char.ToUpperInvariant(current));
                    }

                    writer.WriteEntry(entry.Name, recased.ToString());
                }
            }
        }
示例#2
0
        /// <summary>
        /// Compares two FASTA files base by base and prints running agreement statistics.
        /// A base is treated as "unique" when its character sorts before 'a' (i.e. it is not a
        /// lower-case letter). The first entry of file A is discarded before pairing starts, so
        /// entry N+1 of file A is compared against entry N of file B.
        /// </summary>
        /// <param name="fastaPathA">Path of the first FASTA file (its first entry is skipped).</param>
        /// <param name="fastaPathB">Path of the second FASTA file.</param>
        /// <returns>Always 0.</returns>
        /// <exception cref="InvalidOperationException">
        /// File B runs out of entries before file A, or a paired entry differs in length.
        /// </exception>
        public static int ProcessReferenceFASTA(string fastaPathA, string fastaPathB)
        {
            GenericRead chrA         = new GenericRead();
            GenericRead chrB         = new GenericRead();
            long        countAB      = 0; // unique in both files
            long        countA       = 0; // unique in A only
            long        countB       = 0; // unique in B only
            long        countNeither = 0; // unique in neither file

            using (FastaReader readerA = new FastaReader(fastaPathA))
            using (FastaReader readerB = new FastaReader(fastaPathB))
            {
                readerA.GetNextEntry(ref chrA); // Discard chrM from new output
                while (readerA.GetNextEntry(ref chrA))
                {
                    // Previously the result of this read was ignored, so once file B was exhausted
                    // the stale previous entry was silently compared against the new entry from A.
                    if (!readerB.GetNextEntry(ref chrB))
                    {
                        throw new InvalidOperationException(string.Format(
                            "File B ({0}) has no entry left to pair with entry {1} of file A ({2}).",
                            fastaPathB, chrA.Name, fastaPathA));
                    }
                    if (chrA.Bases.Length != chrB.Bases.Length)
                    {
                        throw new InvalidOperationException(string.Format(
                            "Sequence length mismatch: {0} has {1} bases in file A but {2} has {3} bases in file B.",
                            chrA.Name, chrA.Bases.Length, chrB.Name, chrB.Bases.Length));
                    }

                    for (int baseIndex = 0; baseIndex < chrA.Bases.Length; baseIndex++)
                    {
                        bool isUniqueA = chrA.Bases[baseIndex] < 'a';
                        bool isUniqueB = chrB.Bases[baseIndex] < 'a';
                        if (isUniqueA && isUniqueB)
                        {
                            countAB++;
                        }
                        else if (isUniqueA)
                        {
                            countA++;
                        }
                        else if (isUniqueB)
                        {
                            countB++;
                        }
                        else
                        {
                            countNeither++;
                        }
                    }

                    // Counters are cumulative across entries, so these are running totals.
                    Console.WriteLine("After {0}: {1},{2},{3},{4}", chrA.Name,
                                      countAB, countA, countB, countNeither);
                    double percentAgreement = 100 * (countAB + countNeither) / (double)(countAB + countA + countB + countNeither);
                    Console.WriteLine("Percent agreement: {0:F2}", percentAgreement);
                }
            }
            return 0;
        }
示例#3
0
        /// <summary>
        /// Diagnostic helper: scans every entry of a hard-coded reference FASTA for a fixed set
        /// of reads and prints each position at which a read, or its reverse complement, occurs.
        /// Entry bases are upper-cased before searching, so matching ignores the case of the reference.
        /// </summary>
        public static void CheckUniqueness()
        {
            string fastaPath = @"D:\Genomes\Homo_sapiens\UCSC\hg19\Sequence\WholeGenomeFasta\genome.fa";

            string[] reads = new string[]
            {
                "AACCCTAACCCAACCCTAACCCTAACCCTAACCCT", // 10097 B
                "ACCCTAACCCAACCCTAACCCTAACCCTAACCCTA", // 10098 B
                "AGAGGACAACGCAGCTCCGCCCTCGCGGTGCTCTC", // 10553 A
                "TTTTTTCCTATACATACATACCCATGATAAAGTTT"  // 30763880 A
            };

            // The reverse complements do not depend on the chromosome, so compute them once.
            string[] reverseComplements = new string[reads.Length];
            for (int readIndex = 0; readIndex < reads.Length; readIndex++)
            {
                reverseComplements[readIndex] = Isas.Shared.Utilities.GetReverseComplement(reads[readIndex]);
            }

            using (FastaReader readerA = new FastaReader(fastaPath))
            {
                GenericRead chrA = new GenericRead();
                while (readerA.GetNextEntry(ref chrA))
                {
                    Console.WriteLine(chrA.Name);
                    string bases = chrA.Bases.ToUpperInvariant();

                    for (int readIndex = 0; readIndex < reads.Length; readIndex++)
                    {
                        // Ordinal comparison: the data is plain nucleotide text, and the culture-sensitive
                        // default of IndexOf(string, int) is both slower and dependent on the current culture.
                        // Advancing by one (not by the read length) reports overlapping occurrences too.
                        int pos = bases.IndexOf(reads[readIndex], StringComparison.Ordinal);
                        while (pos != -1)
                        {
                            Console.WriteLine("{0}\t{1}\t{2}\t{3}", readIndex, reads[readIndex], chrA.Name, pos);
                            pos = bases.IndexOf(reads[readIndex], pos + 1, StringComparison.Ordinal);
                        }

                        pos = bases.IndexOf(reverseComplements[readIndex], StringComparison.Ordinal);
                        while (pos != -1)
                        {
                            Console.WriteLine("{0}\t{1}\t{2}\t{3}\tRevComp", readIndex, reads[readIndex], chrA.Name, pos);
                            pos = bases.IndexOf(reverseComplements[readIndex], pos + 1, StringComparison.Ordinal);
                        }
                    }
                }
            }
            Console.WriteLine(">>>Done.");
        }
示例#4
0
        /// <summary>
        /// Flags every position of the FASTA file according to whether the k-mer starting there is
        /// unique, then writes the re-cased FASTA via <c>WriteOutputs</c>.
        /// The file is read in repeated passes; a pass ends the loop when it leaves
        /// <c>IncompletePositions</c> at zero.
        /// </summary>
        /// <param name="fastaPath">Path of the input FASTA file; it is re-read on every pass.</param>
        /// <param name="outputPath">Path of the flagged FASTA file to write.</param>
        public void Main(string fastaPath, string outputPath)
        {
            Console.WriteLine("{0} Start", DateTime.Now);
            Console.WriteLine("Load FASTA file at {0}, write kmer-flagged output to {1}", fastaPath, outputPath);
            // Make multiple passes over the file, to manage memory usage.  In each pass, we'll accumulate a dictionary
            // of up to N kmers, and we'll flag as non-unique all occurrences of those kmers.  In subsequent passes,
            // we know we can ignore positions that were already flagged as non-unique.  We know that we're complete when
            // we make a pass where the dictionary doesn't fill up.
            this.PassIndex           = 0;
            this.IncompletePositions = 1;                                // This will be set to the number of positions whose uniqueness status is UNKNOWN.
            HashSet<string> finishedChromosomes = new HashSet<string>(); // For speed, keep track of chromosomes that are already fully processed.

            while (IncompletePositions > 0)
            {
                IncompletePositions = 0;
                PassIndex++;
                Kmers.Clear();
                this.GenomePosition = 0;
                int chromosomeIndex = -1;
                using (FastaReader reader = new FastaReader(fastaPath))
                {
                    GenericRead fastaEntry = new GenericRead();
                    while (reader.GetNextEntry(ref fastaEntry))
                    {
                        chromosomeIndex++;
                        // Speedup option: Skip over chromosome that's already been fully processed.
                        // GenomePosition must still advance so later chromosomes keep the same offsets.
                        if (finishedChromosomes.Contains(fastaEntry.Name))
                        {
                            GenomePosition += fastaEntry.Bases.Length;
                            continue;
                        }
                        ProcessOneChromosome(fastaEntry, chromosomeIndex);
                        // No position has been left undecided so far in this pass, so this chromosome is done.
                        if (IncompletePositions == 0)
                        {
                            finishedChromosomes.Add(fastaEntry.Name);
                        }
                    } // loop over chromosomes

                    // Now let's go back and make a note of the various positions that are now definitively known
                    // to be unique kmers!  Dictionary values >= 0 are genome-wide positions of kmers seen exactly
                    // once in this pass; negative values mark kmers already found to be repeated.
                    int uniqueCount = 0;
                    foreach (long oldPos in Kmers.Values)
                    {
                        if (oldPos < 0)
                        {
                            continue;
                        }
                        uniqueCount++;

                        // Translate the genome-wide position back into (chromosome, offset).  The running
                        // offset must be a long: the total genome length can exceed int.MaxValue (hg19 is
                        // about 3.1 Gb), and an int accumulator wraps around, so positions on later
                        // chromosomes would never be matched.
                        long tempPos = 0;
                        for (int tempIndex = 0; tempIndex < ChromosomeNonUniqueFlags.Count; tempIndex++)
                        {
                            if (tempPos + ChromosomeNonUniqueFlags[tempIndex].Length > oldPos)
                            {
                                // Note: Assumption is that oldPos - tempPos will always be an int
                                // since no single chromosome's length is longer than maxint.
                                ChromosomeFinishedFlags[tempIndex].Set((int)(oldPos - tempPos), true);
                                break;
                            }
                            tempPos += ChromosomeNonUniqueFlags[tempIndex].Length;
                        }
                    }
                    Console.WriteLine("{0} >>>Pass {1}: flagged {2} unique kmers", DateTime.Now, PassIndex, uniqueCount);
                    Console.WriteLine();
                }
            } // Pass loop

            Console.WriteLine("{0} Flagging complete", DateTime.Now);
            this.WriteOutputs(fastaPath, outputPath);
            Console.WriteLine("{0} Output written to {1}", DateTime.Now, outputPath);
        }
示例#5
0
        /// <summary>
        /// Runs one pass over a single FASTA entry, updating the per-chromosome "non-unique" and
        /// "finished" bit arrays and the shared <c>Kmers</c> dictionary.
        /// <c>GenomePosition</c> is advanced by one for every position of the entry, including
        /// positions that are skipped.
        /// </summary>
        /// <param name="fastaEntry">The entry to scan; its bases are upper-cased before k-mers are extracted.</param>
        /// <param name="chromosomeIndex">
        /// Ordinal index of the entry in the FASTA file. The flag arrays are created the first time
        /// an index equal to the current list size is seen.
        /// </param>
        /// <exception cref="InvalidOperationException">
        /// The first occurrence of a repeated k-mer is already marked finished, which indicates
        /// inconsistent bookkeeping.
        /// </exception>
        private void ProcessOneChromosome(GenericRead fastaEntry, int chromosomeIndex)
        {
            if (chromosomeIndex >= ChromosomeNonUniqueFlags.Count)
            {
                // First visit to this chromosome: allocate its flag arrays (all bits start false).
                ChromosomeNonUniqueFlags.Add(new BitArray(fastaEntry.Bases.Length));
                ChromosomeFinishedFlags.Add(new BitArray(fastaEntry.Bases.Length));
            }
            BitArray nonUniqueFlags = ChromosomeNonUniqueFlags[chromosomeIndex];
            BitArray finishedFlags  = ChromosomeFinishedFlags[chromosomeIndex];

            string bases = fastaEntry.Bases.ToUpperInvariant();

            for (int startPos = 0; startPos < bases.Length; startPos++, GenomePosition++)
            {
                if (startPos % 1000000 == 0)
                {
                    Console.WriteLine(">>>{0} {1} {2} dict {3} incomplete {4}", PassIndex, fastaEntry.Name, startPos, Kmers.Count, IncompletePositions);
                }

                // Skip positions processed to completion in an earlier pass:
                if (finishedFlags[startPos])
                {
                    continue;
                }

                // Handle positions at very end of chromosome, where a full kmer no longer fits.
                // Strictly greater-than: when startPos + KmerLength == bases.Length the kmer ends on
                // the last base and is still a complete kmer (the previous ">=" discarded it).
                if (startPos + KmerLength > bases.Length)
                {
                    nonUniqueFlags.Set(startPos, true);
                    finishedFlags.Set(startPos, true);
                    continue;
                }

                // This position isn't completed yet.  Check its kmer against the dictionary:
                string kmer = bases.Substring(startPos, KmerLength);
                string key  = GetKeyForKmer(kmer);

                if (key == null)
                {
                    // This position isn't a valid unique kmer, because it has an N or some other bogus character.
                    nonUniqueFlags.Set(startPos, true);
                    finishedFlags.Set(startPos, true);
                    continue;
                }

                long oldPos;
                if (Kmers.TryGetValue(key, out oldPos))
                {
                    // This position is not unique.  Flag it as known non-unique:
                    nonUniqueFlags.Set(startPos, true);
                    finishedFlags.Set(startPos, true);

                    // A non-negative value is the genome-wide position of the first occurrence, which has
                    // not been flagged yet.  A negative value means it was already flagged.
                    if (oldPos >= 0)
                    {
                        // Go back and flag the kmer position, even if on an old chromosome:
                        long tempPos = 0;
                        for (int tempIndex = 0; tempIndex < ChromosomeNonUniqueFlags.Count; tempIndex++)
                        {
                            if (tempPos + ChromosomeNonUniqueFlags[tempIndex].Length > oldPos)
                            {
                                // Note: Assumption is that oldPos - tempPos will always be an int
                                // since no single chromosome's length is longer than maxint.
                                int chrPos = (int)(oldPos - tempPos);
                                if (ChromosomeFinishedFlags[tempIndex][chrPos])
                                {
                                    throw new InvalidOperationException("Error: Flagging an already-done position!");
                                }

                                ChromosomeNonUniqueFlags[tempIndex].Set(chrPos, true);
                                ChromosomeFinishedFlags[tempIndex].Set(chrPos, true);
                                break;
                            }
                            tempPos += ChromosomeNonUniqueFlags[tempIndex].Length;
                        }
                        Kmers[key] = -1;
                    }
                }
                else if (Kmers.Count >= MaxDictEntries)
                {
                    // Dictionary is full: this position's status stays unknown until a later pass.
                    IncompletePositions++;
                }
                else
                {
                    Kmers[key] = GenomePosition;
                }
            } // loop over start positions
        }
示例#6
0
        /// <summary>
        /// Populate the list of GenomicBin objects for this chromosome.
        /// </summary>
        /// <remarks>
        /// Two modes, chosen by whether <c>arguments.Bins</c> already holds bins:
        /// with predefined bins, each bin is closed at position <c>Stop - 1</c> and only its
        /// <c>GC</c> and <c>Count</c> are filled in; otherwise a new bin is appended each time
        /// <c>arguments.BinSize</c> possible-alignment positions have been seen.
        /// Positions that never complete a bin (the tail of the sequence) produce no bin.
        /// </remarks>
        /// <param name="arguments">Per-chromosome inputs; <c>arguments.Bins</c> is modified in place.</param>
        static void BinCountsForChromosome(BinTaskArguments arguments)
        {
            List<GenomicBin>   bins = arguments.Bins;
            bool               usePredefinedBins  = bins.Any();
            int                predefinedBinIndex = 0;
            GenericRead        fastaEntry         = arguments.FastaEntry;
            BinState           currentBin         = new BinState();
            string             chr = arguments.Chromosome;
            BitArray           possibleAlignments = arguments.PossibleAlignments;
            HitArray           observedAlignments = arguments.ObservedAlignments;
            CanvasCoverageMode coverageMode       = arguments.CoverageMode;
            int                pos = usePredefinedBins ? bins[predefinedBinIndex].Start : 0;

            // Skip past leading Ns.  The bounds check keeps an all-N (or empty) sequence from
            // running off the end of the string.  Only lower-case 'n' is matched, as before.
            // NOTE(review): presumably the reference always encodes N in lower case - confirm.
            while (pos < fastaEntry.Bases.Length && fastaEntry.Bases[pos] == 'n')
            {
                pos++;
            }
            List<float> binPositions    = new List<float>(); // per-position GC weights (GCContentWeighted mode only)
            List<int>   binObservations = new List<int>();   // per-position observed counts for the current bin

            for (; pos < fastaEntry.Bases.Length; pos++)
            {
                // Sets the start of the bin
                if (currentBin.StartPosition == -1)
                {
                    currentBin.StartPosition = pos;
                }

                char currentBase = fastaEntry.Bases[pos];

                // Compare char to char.  The previous code called Equals("n") with a string argument,
                // which is always false for a char, so every N was counted as a nucleotide.
                if (currentBase != 'n')
                {
                    currentBin.NucleotideCount++;
                }

                switch (currentBase)
                {
                case 'C':
                case 'c':
                case 'G':
                case 'g':
                    currentBin.GCCount++;
                    break;
                }

                if (possibleAlignments[pos])
                {
                    currentBin.PossibleCount++;
                    currentBin.ObservedCount += observedAlignments.Data[pos];
                    binObservations.Add(observedAlignments.Data[pos]);
                    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                    {
                        binPositions.Add(arguments.ObservedVsExpectedGC[arguments.ReadGCContent[pos]]);
                    }
                }

                // A bin is complete when we've seen the desired number of possible alignment
                // positions, or (predefined bins) when we've reached the last position of the bin.
                bool binComplete = usePredefinedBins
                                   ? pos == bins[predefinedBinIndex].Stop - 1
                                   : currentBin.PossibleCount == arguments.BinSize;
                if (!binComplete)
                {
                    continue;
                }

                if (coverageMode == CanvasCoverageMode.TruncatedDynamicRange)
                {
                    // Truncated dynamic range: each position contributes at most 10 observations.
                    currentBin.ObservedCount = 0;
                    foreach (int observation in binObservations)
                    {
                        currentBin.ObservedCount += Math.Min(10, observation);
                    }
                }
                if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                {
                    // Read GC content weighted: divide each observation by its weight, cap at 10, sum, round.
                    float weightedCount = 0;
                    for (int i = 0; i < binObservations.Count; i++)
                    {
                        weightedCount += Math.Min(10, (float)binObservations[i] / binPositions[i]);
                    }
                    currentBin.ObservedCount = (int)Math.Round(weightedCount);
                }

                // A bin made up entirely of Ns has no nucleotides; report 0% GC rather than dividing by zero.
                int gc = currentBin.NucleotideCount > 0
                         ? (int)(100 * currentBin.GCCount / currentBin.NucleotideCount)
                         : 0;

                if (usePredefinedBins)
                {
                    bins[predefinedBinIndex].GC    = gc;
                    bins[predefinedBinIndex].Count = currentBin.ObservedCount;
                    predefinedBinIndex++;
                    if (predefinedBinIndex >= bins.Count)
                    {
                        break; // we have processed all the bins
                    }
                    pos = bins[predefinedBinIndex].Start - 1; // jump to right before the next predefined bin
                }
                else
                {
                    // Note the pos + 1 to make the first three conform to bed specification
                    bins.Add(new GenomicBin(chr, currentBin.StartPosition, pos + 1, gc, currentBin.ObservedCount));
                }

                // Reset all relevant variables
                currentBin.Reset();
                binObservations.Clear();
                binPositions.Clear();
            }
        }
示例#7
0
        /// <summary>
        /// Bin alignments: loads every entry of the reference FASTA, optionally computes
        /// per-position read GC content, runs <c>BinCountsForChromosome</c> for each eligible
        /// chromosome on worker threads, and returns the bins in reference order.
        /// </summary>
        /// <param name="referenceFile">Reference fasta file.</param>
        /// <param name="binSize">Desired number of possible-alignment positions per bin (used when no bins are predefined).</param>
        /// <param name="coverageMode">Coverage mode; the GC-content steps run only for <c>GCContentWeighted</c>.</param>
        /// <param name="manifest">Passed through to <c>ComputeObservedVsExpectedGC</c>.</param>
        /// <param name="possibleAlignments">Per-chromosome BitArrays of possible alignments. Chromosomes missing here are skipped.</param>
        /// <param name="observedAlignments">Per-chromosome HitArrays of observed alignments.</param>
        /// <param name="fragmentLengths">
        /// Per-chromosome, per-position fragment lengths; a value of 0 falls back to the mean fragment size.
        /// Only read in <c>GCContentWeighted</c> mode.
        /// </param>
        /// <param name="predefinedBins">
        /// Pre-defined bins per chromosome, or null if not available. When supplied, chromosomes missing
        /// here are skipped and the supplied lists are reused, so their bins are updated in place.
        /// </param>
        /// <param name="outFile">Passed through to <c>ComputeObservedVsExpectedGC</c>.</param>
        /// <returns>A list of bins, ordered by the chromosome order of the reference file.</returns>
        static List<GenomicBin> BinCounts(string referenceFile, int binSize, CanvasCoverageMode coverageMode, NexteraManifest manifest,
                                          Dictionary<string, BitArray> possibleAlignments,
                                          Dictionary<string, HitArray> observedAlignments,
                                          Dictionary<string, Int16[]> fragmentLengths,
                                          Dictionary<string, List<GenomicBin>> predefinedBins,
                                          string outFile)
        {
            bool debugGCCorrection = false; // write value of GC bins and correction factor
            Dictionary<string, GenericRead> fastaEntries = new Dictionary<string, GenericRead>();
            List<string> chromosomes        = new List<string>(); // preserves the reference order for the final output
            Int16        meanFragmentSize   = 0;
            Int16        meanFragmentCutoff = 3;

            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                meanFragmentSize = MeanFragmentSize(fragmentLengths);
            }

            using (FastaReader reader = new FastaReader(referenceFile))
            {
                GenericRead fastaEntry = new GenericRead();

                // Loop through each chromosome in the reference.
                while (reader.GetNextEntry(ref fastaEntry))
                {
                    chromosomes.Add(fastaEntry.Name);
                    fastaEntries[fastaEntry.Name] = fastaEntry;
                    fastaEntry = new GenericRead(); // fresh instance so the stored entry is not overwritten
                }
            }

            // calculate GC content of the forward read at every position along the genome
            Dictionary<string, byte[]> readGCContent = new Dictionary<string, byte[]>();

            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                byte gcCap = (byte)numberOfGCbins;
                List<ThreadStart> normalizationTasks = new List<ThreadStart>();
                foreach (KeyValuePair<string, Int16[]> fragmentLengthsKVP in fragmentLengths)
                {
                    // Copy everything the task needs into per-iteration locals.  Capturing the foreach
                    // variable itself inside the lambda is only safe on C# 5 and later compilers.
                    string      chr = fragmentLengthsKVP.Key;
                    Int16[]     chrFragmentLengths = fragmentLengthsKVP.Value;
                    GenericRead fastaEntry         = fastaEntries[chr];

                    normalizationTasks.Add(new ThreadStart(() =>
                    {
                        // contains GC content of the forward read at every position for current chr
                        byte[] gcContent = new byte[fastaEntry.Bases.Length];

                        // Fragments are truncated to this many bases; the loop stops early enough
                        // that a maximal fragment never reads past the end of the sequence.
                        int maxFragment = meanFragmentSize * meanFragmentCutoff;

                        // Iteratively calculate GC content of "reads" using fasta genome reference
                        for (int pos = 0; pos < fastaEntry.Bases.Length - maxFragment - 1; pos++)
                        {
                            Int16 currentFragment;
                            if (chrFragmentLengths[pos] == 0)
                            {
                                currentFragment = meanFragmentSize;
                            }
                            else
                            {
                                currentFragment = Convert.ToInt16(Math.Min(chrFragmentLengths[pos], maxFragment));
                            }

                            // An empty fragment has no GC content; leave the entry at 0.  This also avoids a
                            // divide-by-zero on the worker thread when the mean fragment size is 0.
                            if (currentFragment <= 0)
                            {
                                continue;
                            }

                            uint gcCounter = 0;
                            for (int i = pos; i < pos + currentFragment; i++)
                            {
                                switch (fastaEntry.Bases[i])
                                {
                                case 'C':
                                case 'c':
                                case 'G':
                                case 'g':
                                    gcCounter++;
                                    break;

                                default:
                                    break;
                                }
                            }
                            // Integer GC percentage of the fragment, capped at gcCap.
                            gcContent[pos] = (byte)Math.Min(100 * gcCounter / currentFragment, gcCap);
                        }

                        // The result dictionary is shared by all tasks, so guard the write.
                        lock (readGCContent)
                        {
                            readGCContent[chr] = gcContent;
                        }
                    }));
                }

                Console.WriteLine("{0} Launching normalization tasks.", DateTime.Now);
                Console.Out.Flush();
                Isas.Shared.Utilities.DoWorkParallelThreads(normalizationTasks);
                Console.WriteLine("{0} Normalization tasks complete.", DateTime.Now);
                Console.Out.Flush();
            }

            // populate observed and expected read GC bin vectors
            float[] observedVsExpectedGC = new float[0];
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                observedVsExpectedGC = ComputeObservedVsExpectedGC(observedAlignments, readGCContent, manifest, debugGCCorrection, outFile);
            }

            Dictionary<string, List<GenomicBin>> perChromosomeBins = new Dictionary<string, List<GenomicBin>>();
            List<ThreadStart> binningTasks = new List<ThreadStart>();

            foreach (KeyValuePair<string, GenericRead> fastaEntryKVP in fastaEntries)
            {
                string chr = fastaEntryKVP.Key;
                if (!possibleAlignments.ContainsKey(chr))
                {
                    continue;
                }
                if (predefinedBins != null && !predefinedBins.ContainsKey(chr))
                {
                    continue;
                }

                BinTaskArguments args = new BinTaskArguments();
                args.FastaEntry         = fastaEntryKVP.Value;
                args.Chromosome         = chr;
                args.PossibleAlignments = possibleAlignments[chr];
                args.ObservedAlignments = observedAlignments[chr];
                args.CoverageMode       = coverageMode;
                // Predefined bin lists are reused as-is, so the task fills in the caller's bins.
                perChromosomeBins[chr]  = predefinedBins == null ? new List<GenomicBin>() : predefinedBins[chr];
                args.Bins    = perChromosomeBins[chr];
                args.BinSize = binSize;
                if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                {
                    args.ReadGCContent = readGCContent[chr];
                }
                else
                {
                    args.ReadGCContent = null;
                }
                args.ObservedVsExpectedGC = observedVsExpectedGC;
                binningTasks.Add(new ThreadStart(() => { BinCountsForChromosome(args); }));
            }
            Console.WriteLine("{0} Launch BinCountsForChromosome jobs...", DateTime.Now);
            Console.Out.WriteLine();
            Isas.Shared.Utilities.DoWorkParallelThreads(binningTasks);
            Console.WriteLine("{0} Completed BinCountsForChromosome jobs.", DateTime.Now);
            Console.Out.WriteLine();

            // Concatenate per-chromosome results in the order the chromosomes appear in the reference.
            List<GenomicBin> finalBins = new List<GenomicBin>();

            foreach (string chr in chromosomes)
            {
                if (!perChromosomeBins.ContainsKey(chr))
                {
                    continue;
                }
                finalBins.AddRange(perChromosomeBins[chr]);
            }
            return finalBins;
        }