Beispiel #1
0
        /// <summary>
        /// Reads genomic bins from a tab-separated text file opened through <c>GzipReader</c>.
        /// Each row must provide, in this order: chromosome, start, stop, count, GC.
        /// </summary>
        /// <param name="infile">Path handed to <c>GzipReader</c>.</param>
        /// <returns>One <c>GenomicBin</c> per row, in file order.</returns>
        /// <exception cref="FormatException">
        /// A row has fewer than five columns, or one of its numeric columns cannot be parsed.
        /// </exception>
        public static List<GenomicBin> ReadFromTextFile(string infile)
        {
            // The file is machine-written, so numbers are parsed with the invariant culture.
            // With the current culture, "1.5" is misread on hosts that use ',' as the decimal separator.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
            List<GenomicBin> bins = new List<GenomicBin>();

            using (GzipReader reader = new GzipReader(infile))
            {
                string row;

                while ((row = reader.ReadLine()) != null)
                {
                    string[] fields = row.Split('\t');
                    if (fields.Length < 5)
                    {
                        // Fail with the offending row instead of a bare IndexOutOfRangeException.
                        throw new FormatException(String.Format("Expected at least 5 tab-separated columns in {0}; Line: {1}", infile, row));
                    }

                    string chr   = fields[0];
                    int    start = Convert.ToInt32(fields[1], invariant);
                    int    stop  = Convert.ToInt32(fields[2], invariant);
                    float  count = float.Parse(fields[3], invariant);
                    int    gc    = Convert.ToInt32(fields[4], invariant);

                    bins.Add(new GenomicBin(chr, start, stop, gc, count));
                }
            }
            return bins;
        }
Beispiel #2
0
 /// <summary>
 /// Creates a sample bin from a chromosome name, a <c>BedInterval</c> built from
 /// <paramref name="start"/> and <paramref name="stop"/>, and an observed count.
 /// This overload assigns neither GC nor CountDeviation.
 /// </summary>
 /// <param name="chr">Chromosome name stored on the bin.</param>
 /// <param name="start">Start coordinate passed to <c>BedInterval</c>.</param>
 /// <param name="stop">Stop coordinate passed to <c>BedInterval</c>.</param>
 /// <param name="count">Value stored in <c>Count</c>.</param>
 public SampleGenomicBin(string chr, int start, int stop, float count)
 {
     _genomicBin = new GenomicBin
     {
         Chromosome = chr,
         Interval   = new BedInterval(start, stop)
     };
     Count = count;
 }
Beispiel #3
0
        /// <summary>
        /// Loads the bins listed in a tab-separated file that is read via <c>GzipReader</c>.
        /// Columns are expected in the order chromosome, start, stop, count, GC.
        /// </summary>
        /// <param name="infile">Path passed to the <c>GzipReader</c> constructor.</param>
        /// <returns>The parsed bins in the order their rows appear.</returns>
        /// <exception cref="FormatException">
        /// Thrown when a row has fewer than five columns or a numeric column is malformed.
        /// </exception>
        public static List<GenomicBin> ReadFromTextFile(string infile)
        {
            // Parse with the invariant culture: the count column uses '.' as the decimal
            // separator regardless of the regional settings of the machine running the code.
            IFormatProvider numberFormat = System.Globalization.CultureInfo.InvariantCulture;
            List<GenomicBin> bins = new List<GenomicBin>();

            using (GzipReader reader = new GzipReader(infile))
            {
                string row;

                while ((row = reader.ReadLine()) != null)
                {
                    string[] fields = row.Split('\t');
                    if (fields.Length < 5)
                    {
                        // Report the bad row rather than letting the indexer throw without context.
                        throw new FormatException(String.Format("Expected at least 5 tab-separated columns in {0}; Line: {1}", infile, row));
                    }

                    string chr = fields[0];
                    int start = Convert.ToInt32(fields[1], numberFormat);
                    int stop = Convert.ToInt32(fields[2], numberFormat);
                    float count = float.Parse(fields[3], numberFormat);
                    int gc = Convert.ToInt32(fields[4], numberFormat);

                    bins.Add(new GenomicBin(chr, start, stop, gc, count));
                }
            }
            return bins;
        }
Beispiel #4
0
 /// <summary>
 /// Creates a sample bin from a chromosome name, a <c>BedInterval</c> and a GC value.
 /// CountDeviation is set to -1; Count is not assigned by this overload.
 /// </summary>
 /// <param name="chr">Chromosome name stored on the bin.</param>
 /// <param name="start">Start coordinate passed to <c>BedInterval</c>.</param>
 /// <param name="stop">Stop coordinate passed to <c>BedInterval</c>.</param>
 /// <param name="gc">Value stored in the bin's <c>GC</c>.</param>
 public SampleGenomicBin(string chr, int start, int stop, int gc)
 {
     _genomicBin = new GenomicBin
     {
         Chromosome = chr,
         Interval   = new BedInterval(start, stop),
         GC         = gc
     };
     CountDeviation = -1;
 }
Beispiel #5
0
 /// <summary>
 /// Computes the confidence interval around the end of <paramref name="thisBin"/>.
 /// When <paramref name="nextBin"/> is null or does not start exactly where
 /// <paramref name="thisBin"/> ends, the result of <c>GetConfidenceInterval(thisBin)</c> is returned.
 /// Otherwise the interval runs from minus the half length of <paramref name="thisBin"/>
 /// to plus the half length of <paramref name="nextBin"/>.
 /// </summary>
 private static Tuple<int, int> GetEndConfidenceInterval(GenomicBin thisBin, GenomicBin nextBin)
 {
     bool nextBinIsAdjacent = nextBin != null && thisBin.Interval.End == nextBin.Interval.Start;
     if (nextBinIsAdjacent)
     {
         int lowerOffset = -GetHalfLength(thisBin);
         int upperOffset = GetHalfLength(nextBin);
         return Tuple.Create(lowerOffset, upperOffset);
     }

     return GetConfidenceInterval(thisBin);
 }
Beispiel #6
0
 /// <summary>
 /// Computes the confidence interval around the start of <paramref name="thisBin"/>.
 /// When <paramref name="previousBin"/> is null or does not end exactly where
 /// <paramref name="thisBin"/> starts, the result of <c>GetConfidenceInterval(thisBin)</c> is returned.
 /// Otherwise the interval runs from minus the half length of <paramref name="previousBin"/>
 /// to plus the half length of <paramref name="thisBin"/>.
 /// </summary>
 private static Tuple<int, int> GetStartConfidenceInterval(GenomicBin thisBin, GenomicBin previousBin)
 {
     bool previousBinIsAdjacent = previousBin != null && previousBin.Interval.End == thisBin.Interval.Start;

     return previousBinIsAdjacent
         ? Tuple.Create(-GetHalfLength(previousBin), GetHalfLength(thisBin))
         : GetConfidenceInterval(thisBin);
 }
Beispiel #7
0
 /// <summary>
 /// Creates a placeholder sample bin: null chromosome, a default-constructed
 /// <c>GenomicInterval</c>, and -1 for both GC and CountDeviation.
 /// </summary>
 public SampleGenomicBin()
 {
     _genomicBin = new GenomicBin
     {
         Chromosome = null,
         Interval   = new GenomicInterval(),
         GC         = -1
     };
     CountDeviation = -1;
 }
Beispiel #8
0
 /// <summary>
 /// Creates a fully specified sample bin: chromosome, <c>BedInterval</c>, GC,
 /// observed count and count deviation.
 /// </summary>
 /// <param name="chr">Chromosome name stored on the bin.</param>
 /// <param name="start">Start coordinate passed to <c>BedInterval</c>.</param>
 /// <param name="stop">Stop coordinate passed to <c>BedInterval</c>.</param>
 /// <param name="gc">Value stored in the bin's <c>GC</c>.</param>
 /// <param name="count">Value stored in <c>Count</c>.</param>
 /// <param name="MadOfDIffs">Value stored in <c>CountDeviation</c>.</param>
 public SampleGenomicBin(string chr, int start, int stop, int gc, float count, double MadOfDIffs)
 {
     _genomicBin = new GenomicBin
     {
         Chromosome = chr,
         Interval   = new BedInterval(start, stop),
         GC         = gc
     };
     Count          = count;
     CountDeviation = MadOfDIffs;
 }
Beispiel #9
0
 /// <summary>
 /// Creates a sample bin whose interval is a <c>GenomicInterval</c> with the given
 /// start and end. CountDeviation is set to -1.
 /// </summary>
 /// <param name="chr">Chromosome name passed to the <c>GenomicBin</c> constructor.</param>
 /// <param name="start">Value assigned to the interval's <c>Start</c>.</param>
 /// <param name="stop">Value assigned to the interval's <c>End</c>.</param>
 /// <param name="gc">GC value passed to the <c>GenomicBin</c> constructor.</param>
 /// <param name="count">Value stored in <c>Count</c>.</param>
 public SampleGenomicBin(string chr, int start, int stop, int gc, float count)
 {
     var interval = new GenomicInterval();
     interval.Start = start;
     interval.End   = stop;

     _genomicBin    = new GenomicBin(chr, interval, gc);
     Count          = count;
     CountDeviation = -1;
 }
Beispiel #10
0
        /// <summary>
        /// Loads the intervals of a tab-separated BED file, grouped by chromosome name.
        /// </summary>
        /// <param name="bedPath">Path of the BED file to read.</param>
        /// <param name="gcIndex">
        /// Optional zero-based index of a column holding an integer GC value. Rows that do not
        /// have that many columns keep whatever GC a new <c>GenomicBin</c> starts with.
        /// </param>
        /// <returns>For every chromosome seen, its intervals in file order.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="gcIndex"/> is negative.</exception>
        /// <exception cref="ApplicationException">
        /// A row has fewer than three columns, a negative start, or a start that is not less than its stop.
        /// </exception>
        public static Dictionary<string, List<GenomicBin>> LoadBedFile(string bedPath, int? gcIndex = null)
        {
            // A negative index would pass the "< bits.Length" test below and then index out of range.
            if (gcIndex.HasValue && gcIndex.Value < 0)
            {
                throw new ArgumentOutOfRangeException("gcIndex", gcIndex.Value, "GC column index must be non-negative");
            }

            // BED coordinates are machine-written; parse them independently of the host culture.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
            Dictionary<string, List<GenomicBin>> excludedIntervals = new Dictionary<string, List<GenomicBin>>();
            int count = 0;

            using (StreamReader reader = new StreamReader(bedPath))
            {
                string fileLine;
                while ((fileLine = reader.ReadLine()) != null)
                {
                    string[] bits = fileLine.Split('\t');
                    if (bits.Length < 3)
                    {
                        // Name the offending row instead of failing with a bare IndexOutOfRangeException.
                        throw new ApplicationException(String.Format("Expected at least 3 tab-separated columns in a BED file: {0}", fileLine));
                    }

                    string chr = bits[0];
                    List<GenomicBin> chromosomeIntervals;
                    if (!excludedIntervals.TryGetValue(chr, out chromosomeIntervals))
                    {
                        chromosomeIntervals = new List<GenomicBin>();
                        excludedIntervals[chr] = chromosomeIntervals;
                    }

                    GenomicBin interval = new GenomicBin();
                    interval.Chromosome = chr;
                    interval.Start      = int.Parse(bits[1], invariant);
                    interval.Stop       = int.Parse(bits[2], invariant);
                    if (interval.Start < 0)
                    {
                        throw new ApplicationException(String.Format("Start must be non-negative in a BED file: {0}", fileLine));
                    }
                    if (interval.Start >= interval.Stop) // Do not allow empty intervals
                    {
                        throw new ApplicationException(String.Format("Start must be less than Stop in a BED file: {0}", fileLine));
                    }
                    if (gcIndex.HasValue && gcIndex.Value < bits.Length)
                    {
                        interval.GC = int.Parse(bits[gcIndex.Value], invariant);
                    }

                    chromosomeIntervals.Add(interval);
                    count++;
                }
            }
            Console.WriteLine(">>> Loaded {0} intervals for {1} sequences", count, excludedIntervals.Keys.Count);
            return excludedIntervals;
        }
Beispiel #11
0
        /// <summary>
        /// Builds an interval centred on zero whose bounds are minus and plus
        /// the value <c>GetHalfLength</c> returns for <paramref name="bin"/>.
        /// </summary>
        private static Tuple<int, int> GetConfidenceInterval(GenomicBin bin)
        {
            int half = GetHalfLength(bin);
            return new Tuple<int, int>(-half, half);
        }
Beispiel #12
0
 /// <summary>
 /// Checks that a bin built with start 12345 and stop 678910 reports a Size of
 /// 666565 (stop minus start).
 /// </summary>
 public void TestBins()
 {
     GenomicBin bin = new GenomicBin("chr1", 12345, 678910, 20, 100);

     // Assert.AreEqual takes (expected, actual); with the arguments swapped a failure
     // would report the real size as the "expected" value.
     Assert.AreEqual(666565, bin.Size);
 }
Beispiel #13
0
 /// <summary>
 /// Returns half of the bin's interval length as an integer; an exact .5 is
 /// rounded away from zero.
 /// </summary>
 private static int GetHalfLength(GenomicBin bin)
 {
     double exactHalf   = bin.Interval.Length / 2.0;
     double roundedHalf = Math.Round(exactHalf, MidpointRounding.AwayFromZero);
     return (int)roundedHalf;
 }
Beispiel #14
0
 /// <summary>
 /// Creates a sample bin whose interval is a <c>BedInterval</c> built from
 /// <paramref name="start"/> and <paramref name="stop"/>. CountDeviation is set to -1.
 /// </summary>
 /// <param name="chr">Chromosome name passed to the <c>GenomicBin</c> constructor.</param>
 /// <param name="start">Start coordinate passed to <c>BedInterval</c>.</param>
 /// <param name="stop">Stop coordinate passed to <c>BedInterval</c>.</param>
 /// <param name="gc">GC value passed to the <c>GenomicBin</c> constructor.</param>
 /// <param name="count">Value stored in <c>Count</c>.</param>
 public SampleGenomicBin(string chr, int start, int stop, int gc, float count)
 {
     var interval = new BedInterval(start, stop);

     _genomicBin    = new GenomicBin(chr, interval, gc);
     Count          = count;
     CountDeviation = -1;
 }
Beispiel #15
0
        /// <summary>
        /// Populate the list of GenomicBin objects for this chromosome.
        /// When <c>arguments.Bins</c> already contains bins, their GC and Count are filled in;
        /// otherwise a new bin is appended each time <c>arguments.BinSize</c> possible alignment
        /// positions have been seen.
        /// </summary>
        /// <param name="arguments">
        /// Per-chromosome inputs: the sequence, the possible/observed alignment arrays, the coverage
        /// mode and the bin list that receives the results.
        /// </param>
        static void BinCountsForChromosome(BinTaskArguments arguments)
        {
            List<GenomicBin> bins = arguments.Bins;
            bool usePredefinedBins = bins.Any();
            int predefinedBinIndex = 0;
            GenericRead fastaEntry = arguments.FastaEntry;
            BinState currentBin = new BinState();
            string chr = arguments.Chromosome;
            BitArray possibleAlignments = arguments.PossibleAlignments;
            HitArray observedAlignments = arguments.ObservedAlignments;
            CanvasCoverageMode coverageMode = arguments.CoverageMode;
            int pos = usePredefinedBins ? bins[predefinedBinIndex].Start : 0;

            // Skip past leading Ns. The bounds check keeps a sequence made only of 'n'
            // from running past the end of the array.
            while (pos < fastaEntry.Bases.Length && fastaEntry.Bases[pos].Equals('n'))
            {
                pos++;
            }

            // Per-position values of the bin under construction (only possible alignment positions).
            List<float> binPositions = new List<float>();
            List<int> binObservations = new List<int>();
            for (; pos < fastaEntry.Bases.Length; pos++)
            {
                // Sets the start of the bin
                if (currentBin.StartPosition == -1)
                {
                    currentBin.StartPosition = pos;
                }

                // Compare against the char 'n'. The previous comparison against the string "n"
                // was always false, so 'n' bases were counted as nucleotides and diluted GC%.
                // NOTE(review): only lowercase 'n' is treated as N, as in the skip loop above —
                // confirm the sequence never carries uppercase 'N'.
                if (!fastaEntry.Bases[pos].Equals('n'))
                {
                    currentBin.NucleotideCount++;
                }

                switch (fastaEntry.Bases[pos])
                {
                    case 'C':
                    case 'c':
                    case 'G':
                    case 'g':
                        currentBin.GCCount++;
                        break;
                }

                if (possibleAlignments[pos])
                {
                    currentBin.PossibleCount++;
                    currentBin.ObservedCount += observedAlignments.Data[pos];
                    binObservations.Add(observedAlignments.Data[pos]);
                    if (coverageMode == CanvasCoverageMode.GCContentWeighted)
                    {
                        binPositions.Add(arguments.ObservedVsExpectedGC[arguments.ReadGCContent[pos]]);
                    }
                }

                // We've seen the desired number of possible alignment positions
                // (or reached the last base of the current predefined bin).
                if ((!usePredefinedBins && currentBin.PossibleCount == arguments.BinSize)
                    || (usePredefinedBins && pos == bins[predefinedBinIndex].Stop - 1))
                {
                    if (coverageMode == CanvasCoverageMode.TruncatedDynamicRange) // Truncated dynamic range
                    {
                        // Recompute the count with each position's contribution capped at 10.
                        currentBin.ObservedCount = 0;
                        foreach (int observation in binObservations)
                        {
                            currentBin.ObservedCount += Math.Min(10, observation);
                        }
                    }
                    if (coverageMode == CanvasCoverageMode.GCContentWeighted) // read GC content weighted
                    {
                        // Divide each observation by its weight, cap at 10, then round the sum.
                        currentBin.ObservedCount = 0;
                        float tmpObservedCount = 0;
                        for (int i = 0; i < binObservations.Count; i++)
                        {
                            tmpObservedCount += Math.Min(10, (float)binObservations[i] / binPositions[i]);
                        }
                        currentBin.ObservedCount = (int)Math.Round(tmpObservedCount);
                    }

                    // A bin consisting only of 'n' has no nucleotides; report 0 instead of dividing by zero.
                    int gc = currentBin.NucleotideCount > 0
                        ? (int)(100 * currentBin.GCCount / currentBin.NucleotideCount)
                        : 0;

                    if (usePredefinedBins)
                    {
                        bins[predefinedBinIndex].GC = gc;
                        bins[predefinedBinIndex].Count = currentBin.ObservedCount;
                        predefinedBinIndex++;
                        if (predefinedBinIndex >= bins.Count) { break; } // we have processed all the bins
                        pos = bins[predefinedBinIndex].Start - 1; // jump to right before the next predefined bin
                    }
                    else
                    {
                        // Note the pos + 1 to make the first three conform to bed specification
                        GenomicBin bin = new GenomicBin(chr, currentBin.StartPosition, pos + 1, gc, currentBin.ObservedCount);
                        bins.Add(bin);
                    }

                    // Reset all relevant variables
                    currentBin.Reset();
                    binObservations.Clear();
                    binPositions.Clear();
                }
            }
        }
Beispiel #16
0
        /// <summary>
        /// Read predefined bins from a BED file. Assume the bins are sorted by genomic coordinates.
        /// Lines starting with '#' and lines with fewer than three tab-separated columns are skipped.
        /// </summary>
        /// <param name="predefinedBinsFile">input BED file</param>
        /// <returns>
        /// predefined bins by chromosome; an empty dictionary when the file does not exist
        /// </returns>
        /// <exception cref="InvalidDataException">
        /// A line could not be turned into a bin; the original error is kept as the inner exception.
        /// </exception>
        static Dictionary<string, List<GenomicBin>> ReadPredefinedBins(string predefinedBinsFile)
        {
            Dictionary<string, List<GenomicBin>> predefinedBins = new Dictionary<string, List<GenomicBin>>();
            if (!File.Exists(predefinedBinsFile)) { return predefinedBins; }

            // Coordinates are machine-written; parse them independently of the host culture.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            using (StreamReader reader = new StreamReader(predefinedBinsFile))
            {
                string row;

                while ((row = reader.ReadLine()) != null)
                {
                    // Ordinal comparison: '#' is a format marker, not linguistic text.
                    if (row.StartsWith("#", StringComparison.Ordinal)) { continue; } // ignore comments
                    string[] fields = row.Split('\t');
                    if (fields.Length < 3) { continue; }

                    string chr = fields[0];
                    GenomicBin bin;
                    try
                    {
                        int start = Convert.ToInt32(fields[1], invariant);
                        int stop = Convert.ToInt32(fields[2], invariant);
                        bin = new GenomicBin(chr, start, stop, 0, 0);
                    }
                    catch (Exception e)
                    {
                        // A specific exception type (still an Exception for existing callers)
                        // that carries the file and the offending line.
                        throw new InvalidDataException(String.Format("Failed to parse {0}; Line: {1}", predefinedBinsFile, row), e);
                    }

                    List<GenomicBin> chromosomeBins;
                    if (!predefinedBins.TryGetValue(chr, out chromosomeBins))
                    {
                        chromosomeBins = new List<GenomicBin>();
                        predefinedBins[chr] = chromosomeBins;
                    }
                    chromosomeBins.Add(bin);
                }
            }

            return predefinedBins;
        }
Beispiel #17
0
 /// <summary>
 /// Pairs a genomic bin with a list of counts.
 /// </summary>
 /// <param name="genomicBin">Bin stored in <c>Bin</c>.</param>
 /// <param name="counts">List stored in <c>Counts</c>; the reference is kept, not copied.</param>
 public MultiSampleGenomicBin(GenomicBin genomicBin, List<float> counts)
 {
     this.Bin    = genomicBin;
     this.Counts = counts;
 }
Beispiel #18
0
 /// <summary>
 /// Loads the intervals of a tab-separated BED file, grouped by chromosome name.
 /// </summary>
 /// <param name="bedPath">Path of the BED file to read.</param>
 /// <returns>For every chromosome seen, its intervals in file order.</returns>
 /// <exception cref="ApplicationException">A row has fewer than three columns.</exception>
 public static Dictionary<string, List<GenomicBin>> LoadBedFile(string bedPath)
 {
     // BED coordinates are machine-written; parse them independently of the host culture.
     System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
     Dictionary<string, List<GenomicBin>> excludedIntervals = new Dictionary<string, List<GenomicBin>>();
     int count = 0;
     using (StreamReader reader = new StreamReader(bedPath))
     {
         string fileLine;
         while ((fileLine = reader.ReadLine()) != null)
         {
             string[] bits = fileLine.Split('\t');
             if (bits.Length < 3)
             {
                 // Name the offending row instead of failing with a bare IndexOutOfRangeException.
                 throw new ApplicationException(String.Format("Expected at least 3 tab-separated columns in a BED file: {0}", fileLine));
             }

             string chr = bits[0];
             List<GenomicBin> chromosomeIntervals;
             if (!excludedIntervals.TryGetValue(chr, out chromosomeIntervals))
             {
                 chromosomeIntervals = new List<GenomicBin>();
                 excludedIntervals[chr] = chromosomeIntervals;
             }

             GenomicBin interval = new GenomicBin();
             // Record the chromosome on the bin itself so it stays meaningful once it is
             // taken out of the per-chromosome list.
             interval.Chromosome = chr;
             interval.Start = int.Parse(bits[1], invariant);
             interval.Stop = int.Parse(bits[2], invariant);
             chromosomeIntervals.Add(interval);
             count++;
         }
     }
     Console.WriteLine(">>> Loaded {0} excluded intervals for {1} sequences", count, excludedIntervals.Keys.Count);
     return excludedIntervals;
 }
Beispiel #19
0
        /// <summary>
        /// Loads .cleaned bed files and merges their bins across samples. Bins are keyed by
        /// chromosome and start position; a bin is kept only when it collected one count per
        /// input file, and its counts are listed in input-file order.
        /// </summary>
        /// <param name="canvasCleanBedPaths">One bed file per sample, read through <c>GzipReader</c>.</param>
        /// <returns>For every chromosome seen, the bins that were present in all samples.</returns>
        /// <exception cref="Illumina.Common.IlluminaException">
        /// A retained bin has a negative start, or a start that is not less than its stop.
        /// </exception>
        public static Dictionary<string, List<MultiSampleGenomicBin>> MergeMultiSampleCleanedBedFile(List<IFileLocation> canvasCleanBedPaths)
        {
            // Bed files are machine-written; parse numbers independently of the host culture.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            // Per chromosome: start position -> stop position, and start position -> one count per sample.
            var multiSampleGenomicBins = new Dictionary<string, List<MultiSampleGenomicBin>>();
            var stop      = new Dictionary<string, Dictionary<int, int>>();
            var binCounts = new Dictionary<string, Dictionary<int, List<float>>>();
            HashSet<string> chromosomes = new HashSet<string>();

            Console.WriteLine("Merge and normalize CanvasClean bed files");

            // First pass: collect the chromosome names and report the number of lines per file.
            foreach (IFileLocation bedPath in canvasCleanBedPaths)
            {
                int count = 0;
                using (GzipReader reader = new GzipReader(bedPath.FullName))
                {
                    string fileLine;
                    while ((fileLine = reader.ReadLine()) != null)
                    {
                        // HashSet.Add ignores names that are already present.
                        chromosomes.Add(fileLine.Split('\t')[0]);
                        count++;
                    }
                }
                Console.WriteLine($"count {count}");
            }
            foreach (string chr in chromosomes)
            {
                stop[chr]      = new Dictionary<int, int>();
                binCounts[chr] = new Dictionary<int, List<float>>();
            }

            // Second pass: read the stop position and count of every bin.
            foreach (IFileLocation bedPath in canvasCleanBedPaths)
            {
                Console.WriteLine(bedPath);

                using (GzipReader reader = new GzipReader(bedPath.FullName))
                {
                    string fileLine;
                    while ((fileLine = reader.ReadLine()) != null)
                    {
                        string[] lineBedFile = fileLine.Split('\t');
                        string   chr         = lineBedFile[0];
                        int      pos         = int.Parse(lineBedFile[1], invariant);
                        stop[chr][pos] = int.Parse(lineBedFile[2], invariant);
                        float binCount = float.Parse(lineBedFile[3], invariant);

                        List<float> sampleCounts;
                        if (!binCounts[chr].TryGetValue(pos, out sampleCounts))
                        {
                            sampleCounts = new List<float>();
                            binCounts[chr][pos] = sampleCounts;
                        }
                        sampleCounts.Add(binCount);
                    }
                }
            }
            Console.WriteLine("create GenomeBin intervals");

            // create GenomeBin intervals
            foreach (string chr in chromosomes)
            {
                var mergedBins = new List<MultiSampleGenomicBin>();
                multiSampleGenomicBins[chr] = mergedBins;

                foreach (KeyValuePair<int, int> binBounds in stop[chr])
                {
                    int binStart = binBounds.Key;
                    int binStop  = binBounds.Value;
                    List<float> sampleCounts = binCounts[chr][binStart];

                    // if outlier is removed in one sample, remove it in all samples
                    if (sampleCounts.Count < canvasCleanBedPaths.Count)
                    {
                        continue;
                    }
                    if (binStart < 0)
                    {
                        throw new Illumina.Common.IlluminaException("Start must be non-negative");
                    }
                    if (binStart >= binStop) // Do not allow empty intervals
                    {
                        throw new Illumina.Common.IlluminaException("Start must be less than Stop");
                    }
                    GenomicBin interval = new GenomicBin(chr, new BedInterval(binStart, binStop));
                    mergedBins.Add(new MultiSampleGenomicBin(interval, sampleCounts));
                }
            }
            return multiSampleGenomicBins;
        }