/// <summary>
/// Reads genomic bins from a tab-delimited text file opened through <c>GzipReader</c>.
/// </summary>
/// <remarks>
/// Each row is expected to carry, in order: chromosome, start, stop, count, GC.
/// Numeric fields are parsed with the invariant culture so that the result does not
/// depend on the regional settings of the machine running the code.
/// A row with fewer than five tab-separated fields raises <see cref="IndexOutOfRangeException"/>.
/// </remarks>
/// <param name="infile">Path passed to <c>GzipReader</c>.</param>
/// <returns>One <c>GenomicBin</c> per row, in file order.</returns>
public static List<GenomicBin> ReadFromTextFile(string infile)
{
    // Fully qualified so the block does not rely on a using directive outside its view.
    var culture = System.Globalization.CultureInfo.InvariantCulture;
    List<GenomicBin> bins = new List<GenomicBin>();
    using (GzipReader reader = new GzipReader(infile))
    {
        string row;
        while ((row = reader.ReadLine()) != null)
        {
            string[] fields = row.Split('\t');
            string chr = fields[0];
            int start = Convert.ToInt32(fields[1], culture);
            int stop = Convert.ToInt32(fields[2], culture);
            // The count column may hold fractional values, hence float rather than int.
            float count = float.Parse(fields[3], culture);
            int gc = Convert.ToInt32(fields[4], culture);
            bins.Add(new GenomicBin(chr, start, stop, gc, count));
        }
    }
    return bins;
}
/// <summary>
/// Creates a sample bin covering <paramref name="start"/>..<paramref name="stop"/> on
/// <paramref name="chr"/> with the given count. GC is left at the default of a new <c>GenomicBin</c>.
/// </summary>
/// <param name="chr">Chromosome name stored on the underlying bin.</param>
/// <param name="start">Interval start passed to <c>BedInterval</c>.</param>
/// <param name="stop">Interval stop passed to <c>BedInterval</c>.</param>
/// <param name="count">Value assigned to <c>Count</c>.</param>
public SampleGenomicBin(string chr, int start, int stop, float count)
{
    // NOTE(review): assumes the GenomicBin property simply exposes _genomicBin — confirm.
    _genomicBin = new GenomicBin
    {
        Chromosome = chr,
        Interval = new BedInterval(start, stop)
    };
    Count = count;
}
/// <summary>
/// Loads every row of a tab-delimited bin file (read through <c>GzipReader</c>) into a list of bins.
/// </summary>
/// <remarks>
/// Column order: chromosome, start, stop, count, GC. All numbers are parsed with the
/// invariant culture; parsing with the current culture would misread a value such as
/// "1.5" on systems whose decimal separator is a comma.
/// </remarks>
/// <param name="infile">Path passed to <c>GzipReader</c>.</param>
/// <returns>The bins in the order they appear in the file.</returns>
public static List<GenomicBin> ReadFromTextFile(string infile)
{
    // Fully qualified so the block does not rely on a using directive outside its view.
    IFormatProvider invariant = System.Globalization.CultureInfo.InvariantCulture;
    var bins = new List<GenomicBin>();
    using (var reader = new GzipReader(infile))
    {
        string row;
        while ((row = reader.ReadLine()) != null)
        {
            string[] fields = row.Split('\t');
            string chr = fields[0];
            int start = Convert.ToInt32(fields[1], invariant);
            int stop = Convert.ToInt32(fields[2], invariant);
            float count = float.Parse(fields[3], invariant);
            int gc = Convert.ToInt32(fields[4], invariant);
            GenomicBin bin = new GenomicBin(chr, start, stop, gc, count);
            bins.Add(bin);
        }
    }
    return bins;
}
/// <summary>
/// Creates a sample bin with a known GC value. <c>CountDeviation</c> is set to -1
/// (presumably a "not computed" marker — confirm against consumers).
/// </summary>
/// <param name="chr">Chromosome name stored on the underlying bin.</param>
/// <param name="start">Interval start passed to <c>BedInterval</c>.</param>
/// <param name="stop">Interval stop passed to <c>BedInterval</c>.</param>
/// <param name="gc">Value assigned to the underlying bin's <c>GC</c>.</param>
public SampleGenomicBin(string chr, int start, int stop, int gc)
{
    // NOTE(review): assumes the GenomicBin property simply exposes _genomicBin — confirm.
    _genomicBin = new GenomicBin
    {
        Chromosome = chr,
        Interval = new BedInterval(start, stop),
        GC = gc
    };
    CountDeviation = -1;
}
/// <summary>
/// Computes the pair of offsets used as the confidence interval at the end of <paramref name="thisBin"/>.
/// </summary>
/// <param name="thisBin">Bin whose end is being described.</param>
/// <param name="nextBin">Following bin, or null when there is none.</param>
/// <returns>
/// (-half length of <paramref name="thisBin"/>, +half length of <paramref name="nextBin"/>) when the
/// next bin starts exactly where this one ends; otherwise the symmetric interval of
/// <paramref name="thisBin"/> alone.
/// </returns>
private static Tuple<int, int> GetEndConfidenceInterval(GenomicBin thisBin, GenomicBin nextBin)
{
    bool noAdjoiningBin = nextBin == null || thisBin.Interval.End != nextBin.Interval.Start;
    return noAdjoiningBin
        ? GetConfidenceInterval(thisBin)
        : Tuple.Create(-GetHalfLength(thisBin), GetHalfLength(nextBin));
}
/// <summary>
/// Computes the pair of offsets used as the confidence interval at the start of <paramref name="thisBin"/>.
/// </summary>
/// <param name="thisBin">Bin whose start is being described.</param>
/// <param name="previousBin">Preceding bin, or null when there is none.</param>
/// <returns>
/// (-half length of <paramref name="previousBin"/>, +half length of <paramref name="thisBin"/>) when the
/// previous bin ends exactly where this one starts; otherwise the symmetric interval of
/// <paramref name="thisBin"/> alone.
/// </returns>
private static Tuple<int, int> GetStartConfidenceInterval(GenomicBin thisBin, GenomicBin previousBin)
{
    bool noAdjoiningBin = previousBin == null || previousBin.Interval.End != thisBin.Interval.Start;
    return noAdjoiningBin
        ? GetConfidenceInterval(thisBin)
        : Tuple.Create(-GetHalfLength(previousBin), GetHalfLength(thisBin));
}
/// <summary>
/// Creates an empty sample bin: null chromosome, a default <c>GenomicInterval</c>,
/// and -1 for both <c>GC</c> and <c>CountDeviation</c>.
/// </summary>
public SampleGenomicBin()
{
    // NOTE(review): assumes the GenomicBin property simply exposes _genomicBin — confirm.
    _genomicBin = new GenomicBin
    {
        Chromosome = null,
        Interval = new GenomicInterval(),
        GC = -1
    };
    CountDeviation = -1;
}
/// <summary>
/// Creates a fully specified sample bin, including its count deviation.
/// </summary>
/// <param name="chr">Chromosome name stored on the underlying bin.</param>
/// <param name="start">Interval start passed to <c>BedInterval</c>.</param>
/// <param name="stop">Interval stop passed to <c>BedInterval</c>.</param>
/// <param name="gc">Value assigned to the underlying bin's <c>GC</c>.</param>
/// <param name="count">Value assigned to <c>Count</c>.</param>
/// <param name="MadOfDIffs">Value assigned to <c>CountDeviation</c>.</param>
public SampleGenomicBin(string chr, int start, int stop, int gc, float count, double MadOfDIffs)
{
    // NOTE(review): assumes the GenomicBin property simply exposes _genomicBin — confirm.
    _genomicBin = new GenomicBin
    {
        Chromosome = chr,
        Interval = new BedInterval(start, stop),
        GC = gc
    };
    Count = count;
    CountDeviation = MadOfDIffs;
}
/// <summary>
/// Creates a sample bin from coordinates, GC and count. <c>CountDeviation</c> is set to -1.
/// </summary>
/// <param name="chr">Chromosome name passed to the <c>GenomicBin</c> constructor.</param>
/// <param name="start">Assigned to the interval's <c>Start</c>.</param>
/// <param name="stop">Assigned to the interval's <c>End</c>.</param>
/// <param name="gc">GC value passed to the <c>GenomicBin</c> constructor.</param>
/// <param name="count">Value assigned to <c>Count</c>.</param>
public SampleGenomicBin(string chr, int start, int stop, int gc, float count)
{
    var interval = new GenomicInterval
    {
        Start = start,
        End = stop
    };
    _genomicBin = new GenomicBin(chr, interval, gc);
    Count = count;
    CountDeviation = -1;
}
/// <summary>
/// Loads the intervals of a tab-delimited BED file, grouped by chromosome.
/// </summary>
/// <param name="bedPath">Path of the BED file to read.</param>
/// <param name="gcIndex">
/// Optional zero-based column index holding an integer GC value. It is read only when the
/// row has a column at that index.
/// </param>
/// <returns>Chromosome name mapped to its intervals, in file order.</returns>
/// <exception cref="ApplicationException">
/// A row has a negative start, or a start that is not strictly less than its stop.
/// </exception>
public static Dictionary<string, List<GenomicBin>> LoadBedFile(string bedPath, int? gcIndex = null)
{
    var intervalsByChromosome = new Dictionary<string, List<GenomicBin>>();
    int loaded = 0;
    using (var input = new StreamReader(bedPath))
    {
        string line;
        while ((line = input.ReadLine()) != null)
        {
            string[] columns = line.Split('\t');
            string chr = columns[0];

            List<GenomicBin> chromosomeIntervals;
            if (!intervalsByChromosome.TryGetValue(chr, out chromosomeIntervals))
            {
                chromosomeIntervals = new List<GenomicBin>();
                intervalsByChromosome[chr] = chromosomeIntervals;
            }

            var interval = new GenomicBin
            {
                Chromosome = chr,
                Start = int.Parse(columns[1]),
                Stop = int.Parse(columns[2])
            };
            if (interval.Start < 0)
            {
                throw new ApplicationException(String.Format("Start must be non-negative in a BED file: {0}", line));
            }
            // Empty intervals are rejected as well.
            if (interval.Start >= interval.Stop)
            {
                throw new ApplicationException(String.Format("Start must be less than Stop in a BED file: {0}", line));
            }

            // Lifted comparison: false when gcIndex is null.
            if (gcIndex < columns.Length)
            {
                interval.GC = int.Parse(columns[gcIndex.Value]);
            }

            chromosomeIntervals.Add(interval);
            loaded++;
        }
    }
    Console.WriteLine(">>> Loaded {0} intervals for {1} sequences", loaded, intervalsByChromosome.Count);
    return intervalsByChromosome;
}
/// <summary>
/// Builds the symmetric interval (-h, +h), where h is the half length of <paramref name="bin"/>
/// as returned by <c>GetHalfLength</c>.
/// </summary>
/// <param name="bin">Bin whose half length defines the interval.</param>
/// <returns>A tuple of the negated and the positive half length.</returns>
private static Tuple<int, int> GetConfidenceInterval(GenomicBin bin)
{
    int half = GetHalfLength(bin);
    return new Tuple<int, int>(-half, half);
}
/// <summary>
/// Checks that a bin built with start 12345 and stop 678910 reports a <c>Size</c> of 666565.
/// </summary>
public void TestBins()
{
    GenomicBin bin = new GenomicBin("chr1", 12345, 678910, 20, 100);

    // Expected value goes first so that a failure reports expected/actual the right way round.
    Assert.AreEqual(666565, bin.Size);
}
/// <summary>
/// Returns half of the bin's interval length, rounded to the nearest integer with
/// midpoints rounded away from zero (so an odd length rounds up).
/// </summary>
/// <param name="bin">Bin whose <c>Interval.Length</c> is halved.</param>
/// <returns>The rounded half length.</returns>
private static int GetHalfLength(GenomicBin bin)
{
    var half = bin.Interval.Length / 2.0;
    var rounded = Math.Round(half, MidpointRounding.AwayFromZero);
    return (int)rounded;
}
/// <summary>
/// Creates a sample bin from coordinates, GC and count. <c>CountDeviation</c> is set to -1.
/// </summary>
/// <param name="chr">Chromosome name passed to the <c>GenomicBin</c> constructor.</param>
/// <param name="start">Interval start passed to <c>BedInterval</c>.</param>
/// <param name="stop">Interval stop passed to <c>BedInterval</c>.</param>
/// <param name="gc">GC value passed to the <c>GenomicBin</c> constructor.</param>
/// <param name="count">Value assigned to <c>Count</c>.</param>
public SampleGenomicBin(string chr, int start, int stop, int gc, float count)
{
    var interval = new BedInterval(start, stop);
    _genomicBin = new GenomicBin(chr, interval, gc);
    Count = count;
    CountDeviation = -1;
}
/// <summary>
/// Populate the list of GenomicBin objects for this chromosome.
/// </summary>
/// <remarks>
/// If <c>arguments.Bins</c> already contains bins they are treated as predefined and each one
/// has its <c>GC</c> and <c>Count</c> filled in. Otherwise new bins are appended to the list,
/// each closed as soon as it holds <c>arguments.BinSize</c> positions flagged in
/// <c>arguments.PossibleAlignments</c>; a trailing bin that never reaches that size is not emitted.
/// GC is reported as an integer percentage of the non-'n' bases seen in the bin.
/// </remarks>
/// <param name="arguments">Sequence, alignment arrays, coverage mode and the bin list to fill.</param>
static void BinCountsForChromosome(BinTaskArguments arguments)
{
    // Per-position observation cap applied by the truncated and GC-weighted coverage modes.
    const int maxObservationsPerPosition = 10;

    List<GenomicBin> bins = arguments.Bins;
    bool usePredefinedBins = bins.Any();
    int predefinedBinIndex = 0;
    GenericRead fastaEntry = arguments.FastaEntry;
    BinState currentBin = new BinState();
    string chr = arguments.Chromosome;
    BitArray possibleAlignments = arguments.PossibleAlignments;
    HitArray observedAlignments = arguments.ObservedAlignments;
    CanvasCoverageMode coverageMode = arguments.CoverageMode;
    int pos = usePredefinedBins ? bins[predefinedBinIndex].Start : 0;

    // Skip past leading Ns. The bounds check stops a sequence made only of 'n'
    // from indexing past the end of the bases.
    while (pos < fastaEntry.Bases.Length && fastaEntry.Bases[pos].Equals('n'))
    {
        pos++;
    }

    // binPositions holds, per possible-alignment position, the observed-vs-expected GC weight
    // (GC-weighted mode only); binObservations holds the raw observed count at that position.
    List<float> binPositions = new List<float>();
    List<int> binObservations = new List<int>();

    for (; pos < fastaEntry.Bases.Length; pos++)
    {
        // Sets the start of the bin
        if (currentBin.StartPosition == -1)
        {
            currentBin.StartPosition = pos;
        }

        // Compare against the char 'n'. Comparing a char with the string "n" is never equal,
        // which made every position (including 'n') count as a nucleotide.
        if (!fastaEntry.Bases[pos].Equals('n'))
        {
            currentBin.NucleotideCount++;
        }

        switch (fastaEntry.Bases[pos])
        {
            case 'C':
            case 'c':
            case 'G':
            case 'g':
                currentBin.GCCount++;
                break;
        }

        if (possibleAlignments[pos])
        {
            currentBin.PossibleCount++;
            currentBin.ObservedCount += observedAlignments.Data[pos];
            binObservations.Add(observedAlignments.Data[pos]);
            if (coverageMode == CanvasCoverageMode.GCContentWeighted)
            {
                binPositions.Add(arguments.ObservedVsExpectedGC[arguments.ReadGCContent[pos]]);
            }
        }

        // A bin closes when we've seen the desired number of possible alignment positions,
        // or, for predefined bins, when we reach the bin's last position.
        bool binComplete = usePredefinedBins
            ? pos == bins[predefinedBinIndex].Stop - 1
            : currentBin.PossibleCount == arguments.BinSize;
        if (!binComplete)
        {
            continue;
        }

        if (coverageMode == CanvasCoverageMode.TruncatedDynamicRange)
        {
            // Truncated dynamic range: recompute the count with each position capped.
            currentBin.ObservedCount = 0;
            foreach (int observation in binObservations)
            {
                currentBin.ObservedCount += Math.Min(maxObservationsPerPosition, observation);
            }
        }

        if (coverageMode == CanvasCoverageMode.GCContentWeighted)
        {
            // Read GC content weighted: divide each observation by its weight, then cap it.
            float weightedCount = 0;
            for (int i = 0; i < binObservations.Count; i++)
            {
                weightedCount += Math.Min(maxObservationsPerPosition, (float)binObservations[i] / binPositions[i]);
            }
            currentBin.ObservedCount = (int)Math.Round(weightedCount);
        }

        // A bin made up entirely of 'n' has no nucleotides; report 0 instead of dividing by zero.
        int gc = currentBin.NucleotideCount == 0
            ? 0
            : (int)(100 * currentBin.GCCount / currentBin.NucleotideCount);

        if (usePredefinedBins)
        {
            bins[predefinedBinIndex].GC = gc;
            bins[predefinedBinIndex].Count = currentBin.ObservedCount;
            predefinedBinIndex++;
            if (predefinedBinIndex >= bins.Count)
            {
                break; // we have processed all the bins
            }
            // Jump to right before the next predefined bin; the loop increment lands on its start.
            pos = bins[predefinedBinIndex].Start - 1;
        }
        else
        {
            // Note the pos + 1 to make the first three conform to bed specification
            GenomicBin bin = new GenomicBin(chr, currentBin.StartPosition, pos + 1, gc, currentBin.ObservedCount);
            bins.Add(bin);
        }

        // Reset all relevant variables
        currentBin.Reset();
        binObservations.Clear();
        binPositions.Clear();
    }
}
/// <summary>
/// Read predefined bins from a BED file. Assume the bins are sorted by genomic coordinates.
/// Rows starting with '#' and rows with fewer than three tab-separated fields are ignored.
/// A missing file yields an empty dictionary.
/// </summary>
/// <param name="predefinedBinsFile">input BED file</param>
/// <returns>predefined bins by chromosome, each created with GC and count of 0</returns>
/// <exception cref="Exception">A row could not be parsed; the original error is the inner exception.</exception>
static Dictionary<string, List<GenomicBin>> ReadPredefinedBins(string predefinedBinsFile)
{
    var binsByChromosome = new Dictionary<string, List<GenomicBin>>();
    if (File.Exists(predefinedBinsFile))
    {
        using (var input = new StreamReader(predefinedBinsFile))
        {
            for (string line = input.ReadLine(); line != null; line = input.ReadLine())
            {
                try
                {
                    // ignore comments
                    if (line.StartsWith("#"))
                    {
                        continue;
                    }

                    string[] columns = line.Split('\t');
                    if (columns.Length < 3)
                    {
                        continue;
                    }

                    string chr = columns[0];
                    int start = Convert.ToInt32(columns[1]);
                    int stop = Convert.ToInt32(columns[2]);
                    GenomicBin bin = new GenomicBin(chr, start, stop, 0, 0);

                    List<GenomicBin> chromosomeBins;
                    if (!binsByChromosome.TryGetValue(chr, out chromosomeBins))
                    {
                        chromosomeBins = new List<GenomicBin>();
                        binsByChromosome[chr] = chromosomeBins;
                    }
                    chromosomeBins.Add(bin);
                }
                catch (Exception e)
                {
                    throw new Exception(String.Format("Failed to parse {0}; Line: {1}", predefinedBinsFile, line), e);
                }
            }
        }
    }
    return binsByChromosome;
}
/// <summary>
/// Pairs a genomic bin with a list of counts.
/// </summary>
/// <param name="genomicBin">Bin stored in <c>Bin</c>.</param>
/// <param name="counts">List stored in <c>Counts</c>; the reference is kept, not copied.</param>
public MultiSampleGenomicBin(GenomicBin genomicBin, List<float> counts)
{
    this.Bin = genomicBin;
    this.Counts = counts;
}
/// <summary>
/// Loads the intervals of a tab-delimited BED file, grouped by chromosome.
/// </summary>
/// <remarks>
/// Each returned interval carries its chromosome name as well as its start and stop, so an
/// interval stays self-describing once it is taken out of the dictionary.
/// </remarks>
/// <param name="bedPath">Path of the BED file to read.</param>
/// <returns>Chromosome name mapped to its intervals, in file order.</returns>
public static Dictionary<string, List<GenomicBin>> LoadBedFile(string bedPath)
{
    Dictionary<string, List<GenomicBin>> excludedIntervals = new Dictionary<string, List<GenomicBin>>();
    int count = 0;
    using (StreamReader reader = new StreamReader(bedPath))
    {
        string fileLine;
        while ((fileLine = reader.ReadLine()) != null)
        {
            string[] bits = fileLine.Split('\t');
            string chr = bits[0];

            // Single lookup instead of ContainsKey followed by the indexer.
            List<GenomicBin> chromosomeIntervals;
            if (!excludedIntervals.TryGetValue(chr, out chromosomeIntervals))
            {
                chromosomeIntervals = new List<GenomicBin>();
                excludedIntervals[chr] = chromosomeIntervals;
            }

            GenomicBin interval = new GenomicBin();
            interval.Chromosome = chr; // previously left unset
            interval.Start = int.Parse(bits[1]);
            interval.Stop = int.Parse(bits[2]);
            chromosomeIntervals.Add(interval);
            count++;
        }
    }
    Console.WriteLine(">>> Loaded {0} excluded intervals for {1} sequences", count, excludedIntervals.Keys.Count);
    return excludedIntervals;
}
/// <summary>
/// Loads .cleaned bed files, merges bins from multiple samples and returns, per chromosome,
/// the bins together with one count per sample.
/// </summary>
/// <remarks>
/// Bins are keyed by chromosome and start position. A bin is kept only if it collected at least
/// as many counts as there are input files, so a bin that is missing from any sample is dropped.
/// When the same start position appears with different stops, the last stop read wins.
/// Each file is read once; coordinates and counts are parsed with the invariant culture so the
/// result does not depend on the machine's regional settings.
/// </remarks>
/// <param name="canvasCleanBedPaths">Bed files, one per sample, read through <c>GzipReader</c>.</param>
/// <returns>Chromosome name mapped to its merged multi-sample bins.</returns>
/// <exception cref="Illumina.Common.IlluminaException">
/// A retained bin has a negative start, or a start that is not strictly less than its stop.
/// </exception>
public static Dictionary<string, List<MultiSampleGenomicBin>> MergeMultiSampleCleanedBedFile(List<IFileLocation> canvasCleanBedPaths)
{
    // Fully qualified so the block does not rely on a using directive outside its view.
    var culture = System.Globalization.CultureInfo.InvariantCulture;

    // chromosome -> bin start -> bin stop / per-sample counts
    var stopByStart = new Dictionary<string, Dictionary<int, int>>();
    var countsByStart = new Dictionary<string, Dictionary<int, List<float>>>();

    Console.WriteLine("Merge and normalize CanvasClean bed files");
    foreach (IFileLocation bedPath in canvasCleanBedPaths)
    {
        Console.WriteLine(bedPath);
        int count = 0;
        using (GzipReader reader = new GzipReader(bedPath.FullName))
        {
            string fileLine;
            while ((fileLine = reader.ReadLine()) != null)
            {
                string[] lineBedFile = fileLine.Split('\t');
                string chr = lineBedFile[0];

                Dictionary<int, int> chromosomeStops;
                if (!stopByStart.TryGetValue(chr, out chromosomeStops))
                {
                    chromosomeStops = new Dictionary<int, int>();
                    stopByStart[chr] = chromosomeStops;
                    countsByStart[chr] = new Dictionary<int, List<float>>();
                }
                Dictionary<int, List<float>> chromosomeCounts = countsByStart[chr];

                int pos = int.Parse(lineBedFile[1], culture);
                chromosomeStops[pos] = int.Parse(lineBedFile[2], culture);
                float binCount = float.Parse(lineBedFile[3], culture);

                List<float> sampleCounts;
                if (!chromosomeCounts.TryGetValue(pos, out sampleCounts))
                {
                    sampleCounts = new List<float>();
                    chromosomeCounts[pos] = sampleCounts;
                }
                sampleCounts.Add(binCount);
                count++;
            }
        }
        Console.WriteLine($"count {count}");
    }

    Console.WriteLine("create GenomeBin intervals");
    var multiSampleGenomicBins = new Dictionary<string, List<MultiSampleGenomicBin>>();
    foreach (KeyValuePair<string, Dictionary<int, int>> chromosomeEntry in stopByStart)
    {
        string chr = chromosomeEntry.Key;
        var mergedBins = new List<MultiSampleGenomicBin>();
        multiSampleGenomicBins[chr] = mergedBins;

        foreach (KeyValuePair<int, int> binEntry in chromosomeEntry.Value)
        {
            int binStart = binEntry.Key;
            int binStop = binEntry.Value;
            List<float> sampleCounts = countsByStart[chr][binStart];

            // if outlier is removed in one sample, remove it in all samples
            if (sampleCounts.Count < canvasCleanBedPaths.Count)
            {
                continue;
            }
            if (binStart < 0)
            {
                throw new Illumina.Common.IlluminaException("Start must be non-negative");
            }
            // Do not allow empty intervals
            if (binStart >= binStop)
            {
                throw new Illumina.Common.IlluminaException("Start must be less than Stop");
            }

            GenomicBin interval = new GenomicBin(chr, new BedInterval(binStart, binStop));
            mergedBins.Add(new MultiSampleGenomicBin(interval, sampleCounts));
        }
    }
    return multiSampleGenomicBins;
}