Esempio n. 1
0
        /// <summary>
        /// Lazily yields one <see cref="VcfVariant"/> for every line still available from <c>Reader</c>.
        /// Typical usage: foreach (VcfVariant variant in reader.GetVariants())
        /// </summary>
        /// <returns>
        /// A deferred sequence of variants; the sequence is empty when <c>IsOpen</c> is false.
        /// </returns>
        /// <exception cref="InvalidDataException">
        /// Raised during enumeration when <c>RequireGenotypes</c> is set and a parsed line has no genotypes.
        /// </exception>
        public IEnumerable<VcfVariant> GetVariants()
        {
            // Nothing to enumerate unless the file is open.
            if (IsOpen)
            {
                string vcfLine;
                while ((vcfLine = Reader.ReadLine()) != null)
                {
                    var parsed = new VcfVariant();

                    // Tab-split the line and let the converter populate the variant.
                    ConvertColumnsToVariant(vcfLine.Split('\t'), parsed);

                    if (RequireGenotypes)
                    {
                        bool genotypesMissing = parsed.Genotypes == null || parsed.Genotypes.Count == 0;
                        if (genotypesMissing)
                        {
                            throw new InvalidDataException("Missing genotype columns in VCF file");
                        }
                    }

                    yield return parsed;
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Reads every row of a tab-delimited bin file (opened through <c>GzipReader</c>) into memory.
        /// Columns are read as: 0 = chromosome, 1 = start, 2 = stop, 3 = count (float), 4 = GC (int).
        /// </summary>
        /// <param name="infile">Path of the file to read.</param>
        /// <returns>One <c>GenomicBin</c> per input row, in file order.</returns>
        public static List<GenomicBin> ReadFromTextFile(string infile)
        {
            var loadedBins = new List<GenomicBin>();

            using (var input = new GzipReader(infile))
            {
                for (string record = input.ReadLine(); record != null; record = input.ReadLine())
                {
                    string[] columns = record.Split('\t');

                    // Parse in column order so a malformed row fails on its first bad field.
                    int binStart = int.Parse(columns[1]);
                    int binStop = int.Parse(columns[2]);
                    float binCount = float.Parse(columns[3]);
                    int gcContent = int.Parse(columns[4]);

                    loadedBins.Add(new GenomicBin(columns[0], binStart, binStop, gcContent, binCount));
                }
            }

            return loadedBins;
        }
Esempio n. 3
0
        /// <summary>
        /// Reads the count column (index 3) of every row in a tab-delimited bin file and records which
        /// bins overlap a manifest region on the same chromosome.
        /// </summary>
        /// <param name="binnedPath">Path of the bin file (chromosome, start, stop, count, ...).</param>
        /// <param name="manifest">Manifest supplying the target regions, grouped by chromosome.</param>
        /// <param name="binCounts">Receives the count of every bin, in file order.</param>
        /// <param name="onTargetIndices">Receives the zero-based row indices of bins that overlap a region.</param>
        /// <remarks>
        /// The region cursor only moves forward within a chromosome, so bins and regions are presumably
        /// expected to be sorted by position within each chromosome - verify against callers.
        /// </remarks>
        private static void LoadBinCounts(string binnedPath, NexteraManifest manifest, out List<double> binCounts,
                                          out List<int> onTargetIndices)
        {
            binCounts = new List<double>();
            onTargetIndices = new List<int>();

            var manifestRegions = manifest.GetManifestRegionsByChromosome();
            List<NexteraManifest.ManifestRegion> chromRegions = null; // 1-based regions
            string previousChrom = null;
            int cursor = -1;

            using (var input = new GzipReader(binnedPath))
            {
                for (string row = input.ReadLine(); row != null; row = input.ReadLine())
                {
                    string[] fields = row.Split('\t');
                    string chrom = fields[0];
                    int start = int.Parse(fields[1]); // 0-based, inclusive
                    int stop = int.Parse(fields[2]);  // 0-based, exclusive

                    // Switch to the region list of the new chromosome (if it has one).
                    if (chrom != previousChrom)
                    {
                        previousChrom = chrom;
                        if (manifestRegions.ContainsKey(chrom))
                        {
                            chromRegions = manifestRegions[chrom];
                            cursor = 0;
                        }
                        else
                        {
                            chromRegions = null;
                        }
                    }

                    bool overlapsTarget = false;
                    if (chromRegions != null)
                    {
                        // Skip regions that end before the first (1-based) base of this bin.
                        while (cursor < chromRegions.Count && chromRegions[cursor].End < start + 1)
                        {
                            cursor++;
                        }

                        // The current region overlaps when it starts at or before the bin's last base.
                        overlapsTarget = cursor < chromRegions.Count && chromRegions[cursor].Start <= stop;
                    }

                    if (overlapsTarget)
                    {
                        // binCounts.Count is the zero-based index of the bin about to be appended.
                        onTargetIndices.Add(binCounts.Count);
                    }

                    binCounts.Add(double.Parse(fields[3]));
                }
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Loads the "oracle" of known copy numbers: every non-empty, non-'#' line of the file is handed
        /// to <c>ParseCnInterval</c> and the resulting intervals are grouped by chromosome.
        /// </summary>
        /// <param name="oracleVcfPath">Path of the file holding the known copy-number records.</param>
        /// <returns>Intervals keyed by chromosome, each list in file order.</returns>
        protected static Dictionary<string, List<CNInterval>> LoadKnownCNVCF(string oracleVcfPath)
        {
            var intervalsByChromosome = new Dictionary<string, List<CNInterval>>();
            int loaded = 0;

            using (var input = new GzipReader(oracleVcfPath))
            {
                string record;
                while ((record = input.ReadLine()) != null)
                {
                    // Blank lines and header/comment lines carry no interval.
                    bool blankOrHeader = record.Length == 0 || record[0] == '#';
                    if (blankOrHeader)
                    {
                        continue;
                    }

                    var interval = ParseCnInterval(oracleVcfPath, record);

                    List<CNInterval> bucket;
                    if (!intervalsByChromosome.TryGetValue(interval.Chromosome, out bucket))
                    {
                        bucket = new List<CNInterval>();
                        intervalsByChromosome[interval.Chromosome] = bucket;
                    }

                    bucket.Add(interval);
                    loaded++;
                }
            }

            Console.WriteLine(">>>Loaded {0} known-CN intervals", loaded);
            return intervalsByChromosome;
        }
Esempio n. 5
0
        /// <summary>
        /// Produces a merged bin file from one or more binned sample files.
        /// With exactly one sample the file is copied (when it exists); otherwise column 3 of every row
        /// is replaced by a weighted average of the samples' counts, each sample weighted by the inverse
        /// of its on-target median bin count (weights normalised to sum to 1).
        /// </summary>
        /// <param name="binnedPaths">Paths of the binned sample files. Enumerated exactly once.</param>
        /// <param name="mergedBinnedPath">Output path; an existing file is replaced in the single-sample case.</param>
        /// <param name="manifest">Optional manifest forwarded to <c>BinCounts</c>.</param>
        /// <remarks>
        /// Rows are taken from the first file and counts are matched by line number, so every input is
        /// expected to list the same bins in the same order.
        /// </remarks>
        private static void GetWeightedAverageBinCount(IEnumerable<string> binnedPaths, string mergedBinnedPath,
                                                       NexteraManifest manifest = null)
        {
            // Materialise once: the original called Count(), First() and ElementAt() repeatedly, which
            // re-enumerates a lazy sequence every time and is quadratic for non-indexable sources.
            List<string> paths = binnedPaths.ToList();
            int sampleCount = paths.Count;

            if (sampleCount == 1) // copy file
            {
                string onlyPath = paths[0];
                if (File.Exists(onlyPath))
                {
                    if (File.Exists(mergedBinnedPath))
                    {
                        File.Delete(mergedBinnedPath);
                    }
                    File.Copy(onlyPath, mergedBinnedPath);
                }
                return;
            }

            // merge normal samples
            double[] weights = new double[sampleCount];
            List<double>[] binCountsBySample = new List<double>[sampleCount];
            for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++)
            {
                var binCounts = new BinCounts(paths[sampleIndex], manifest: manifest);
                // If a manifest is available, get the median of bins overlapping the targeted regions only.
                // For small panels, there could be a lot of bins with zero count and the median would be 0
                // if taken over all the bins, resulting in division by zero.
                double median = binCounts.OnTargetMedianBinCount;
                weights[sampleIndex] = median > 0 ? 1.0 / median : 0;
                binCountsBySample[sampleIndex] = binCounts.AllCounts;
            }

            double weightSum = weights.Sum();
            if (weightSum > 0)
            {
                for (int i = 0; i < sampleCount; i++)
                {
                    weights[i] /= weightSum; // so weights sum to 1
                }
            }
            else
            {
                // Every sample had a non-positive median, so all weights are 0 and normalising would
                // compute 0/0 = NaN for every bin. Fall back to a plain (equal-weight) average instead.
                for (int i = 0; i < sampleCount; i++)
                {
                    weights[i] = 1.0 / sampleCount;
                }
            }

            // Compute the weighted average of bin counts across samples.
            // paths.First() (rather than paths[0]) keeps the original InvalidOperationException when
            // no paths were supplied at all.
            using (GzipReader reader = new GzipReader(paths.First()))
            using (GzipWriter writer = new GzipWriter(mergedBinnedPath))
            {
                string line;
                int lineIdx = 0;
                while ((line = reader.ReadLine()) != null)
                {
                    string[] toks = line.Split('\t');
                    double weightedBinCount = 0;
                    for (int i = 0; i < sampleCount; i++)
                    {
                        weightedBinCount += weights[i] * binCountsBySample[i][lineIdx];
                    }
                    toks[3] = String.Format("{0}", weightedBinCount);
                    writer.WriteLine(String.Join("\t", toks));
                    lineIdx++;
                }
            }
        }
Esempio n. 6
0
            /// <summary>
            /// Loads a model from a tab-delimited file: columns 0-3 of each row (chromosome, start, stop,
            /// mean) become a <c>SampleGenomicBin</c> in <paramref name="mu"/>, and every column from
            /// index 4 onwards contributes one value to the corresponding axis. Each axis is normalised
            /// with <c>NormalizeBy2Norm</c> after loading.
            /// </summary>
            /// <param name="modelFile">Location of the model file.</param>
            /// <param name="mu">Receives one bin per row, created with -1 as the fourth constructor argument.</param>
            /// <param name="axes">Receives one normalised vector per extra column of the first row.</param>
            /// <exception cref="Illumina.Common.IlluminaException">
            /// The file contains no rows, or the loaded axes fail the <c>AreOrthogonal</c> check.
            /// </exception>
            private static void LoadModel(IFileLocation modelFile, out List<SampleGenomicBin> mu, out List<double[]> axes)
            {
                mu = new List<SampleGenomicBin>();
                axes = new List<double[]>();
                var tempAxes = new List<List<double>>();

                using (GzipReader reader = new GzipReader(modelFile.FullName))
                {
                    string line = reader.ReadLine();
                    if (line == null)
                    {
                        // Previously an empty file crashed with a NullReferenceException on line.Split.
                        throw new Illumina.Common.IlluminaException(String.Format("Model file {0} is empty.",
                                                                                  modelFile.FullName));
                    }

                    // The first row fixes the number of axes; compute it once instead of re-splitting
                    // the line on every loop iteration.
                    int axisCount = line.Split('\t').Length - 4;
                    for (int i = 0; i < axisCount; i++) // initialize axes
                    {
                        tempAxes.Add(new List<double>());
                    }

                    while (line != null)
                    {
                        string[] toks = line.Split('\t');
                        string chrom = toks[0];
                        int start = int.Parse(toks[1]);
                        int stop = int.Parse(toks[2]);
                        float mean = float.Parse(toks[3]);
                        mu.Add(new SampleGenomicBin(chrom, start, stop, -1, mean));
                        for (int i = 0; i < tempAxes.Count; i++)
                        {
                            tempAxes[i].Add(double.Parse(toks[i + 4]));
                        }
                        line = reader.ReadLine();
                    }
                }

                foreach (var axis in tempAxes)
                {
                    axes.Add(CanvasCommon.Utilities.NormalizeBy2Norm(axis.ToArray()));
                }

                if (!AreOrthogonal(axes))
                {
                    throw new Illumina.Common.IlluminaException(String.Format("Axes are not orthogonal to each other in {0}.",
                                                                              modelFile.FullName));
                }
            }
Esempio n. 7
0
        /// <summary>
        /// Loads in data produced by CanvasPartition.exe: consecutive rows sharing the same value in
        /// column 4 are collapsed into one <c>CanvasSegment</c>.
        /// </summary>
        /// <param name="infile">Input file (tab-delimited: chromosome, start, end, count, segment id).</param>
        /// <returns>A list of segments, in file order.</returns>
        /// <remarks>
        /// NOTE(review): the same counts list instance is handed to every CanvasSegment and cleared
        /// afterwards, so the constructor presumably copies it - confirm.
        /// </remarks>
        public static List<CanvasSegment> ReadSegments(string infile)
        {
            Console.WriteLine("{0} Read segments from {1}", DateTime.Now, infile);
            var loaded = new List<CanvasSegment>();

            // -1 doubles as "no segment started yet".
            const int NoSegment = -1;
            int segmentId = NoSegment;
            string segmentChr = null;
            int segmentBegin = -1;
            int segmentEnd = -1;
            var segmentCounts = new List<float>();

            using (var input = new GzipReader(infile))
            {
                for (string row = input.ReadLine(); row != null; row = input.ReadLine())
                {
                    string[] cols = row.Split('\t');
                    int rowSegmentId = int.Parse(cols[4]);

                    if (rowSegmentId != segmentId)
                    {
                        // A new segment id closes the segment accumulated so far (if any)...
                        if (segmentId != NoSegment)
                        {
                            loaded.Add(new CanvasSegment(segmentChr, segmentBegin, segmentEnd, segmentCounts));
                            segmentCounts.Clear();
                        }

                        // ...and opens the next one at this row.
                        segmentChr = cols[0];
                        segmentBegin = int.Parse(cols[1]);
                        segmentId = rowSegmentId;
                    }

                    // Every row extends the current segment to its own end and contributes its count.
                    segmentEnd = int.Parse(cols[2]);
                    segmentCounts.Add(float.Parse(cols[3]));
                }

                // Flush the segment that was still open when the input ended.
                if (segmentId != NoSegment)
                {
                    loaded.Add(new CanvasSegment(segmentChr, segmentBegin, segmentEnd, segmentCounts));
                }
            }

            Console.WriteLine("{0} Loaded {1} segments", DateTime.Now, loaded.Count);
            return loaded;
        }
Esempio n. 8
0
        /// <summary>
        /// Lazily yields the alleles described by every line still available from <c>_Reader</c>.
        /// Typical usage: foreach (CalledAllele allele in reader.GetVariants())
        /// </summary>
        /// <returns>
        /// A deferred sequence with one <c>CalledAllele</c> per comma-separated ALT entry, excluding
        /// those that <c>IsPlaceHolderAllele</c> rejects; empty when <c>_IsOpen</c> is false.
        /// </returns>
        public IEnumerable<CalledAllele> GetVariants()
        {
            // Nothing to enumerate unless the file is open.
            if (_IsOpen)
            {
                string vcfLine;
                while ((vcfLine = _Reader.ReadLine()) != null)
                {
                    string[] columns = vcfLine.Split('\t');

                    // One allele is produced per comma-separated entry of the ALT column.
                    int altCount = columns[VcfCommon.AltIndex].Split(',').Length;

                    for (int altIndex = 0; altIndex < altCount; altIndex++)
                    {
                        var allele = new CalledAllele();
                        ConvertColumnsToVariant(_shouldTrimComplexAlleles, columns, allele, altIndex);

                        if (IsPlaceHolderAllele(allele))
                        {
                            continue;
                        }

                        yield return allele;
                    }
                }
            }
        }
Esempio n. 9
0
        /// <summary>
        /// Reads the count column (index 3) of every row in a tab-delimited bin file.
        /// </summary>
        /// <param name="binnedPath">Path of the bin file.</param>
        /// <param name="binCounts">Receives one parsed count per row, in file order.</param>
        private static void LoadBinCounts(string binnedPath, out List<double> binCounts)
        {
            binCounts = new List<double>();

            using (var input = new GzipReader(binnedPath))
            {
                for (string row = input.ReadLine(); row != null; row = input.ReadLine())
                {
                    string countField = row.Split('\t')[3];
                    binCounts.Add(double.Parse(countField));
                }
            }
        }
Esempio n. 10
0
        /// <summary>
        /// Loads <c>InputBinPath</c> into the per-chromosome <c>StartByChr</c>, <c>EndByChr</c> and
        /// <c>ScoreByChr</c> arrays, leaving out every bin that the forbidden-interval filter skips.
        /// Rows are assumed to be sorted by ascending start position; the code neither checks nor sorts.
        /// </summary>
        /// <remarks>
        /// Any exception raised while reading is reported on standard error and the process is
        /// terminated with exit code 1.
        /// </remarks>
        private void ReadBEDInput()
        {
            var forbiddenFilter = new GenomicBinFilter(ForbiddenIntervalBedPath);

            try
            {
                var starts = new Dictionary<string, List<uint>>();
                var ends = new Dictionary<string, List<uint>>();
                var scores = new Dictionary<string, List<double>>();

                using (var input = new GzipReader(this.InputBinPath))
                {
                    for (string row = input.ReadLine(); row != null; row = input.ReadLine())
                    {
                        string[] cols = row.Split('\t');
                        string chrom = cols[Segmentation.idxChr].Trim();
                        uint binStart = Convert.ToUInt32(cols[Segmentation.idxStart].Trim());
                        uint binEnd = Convert.ToUInt32(cols[Segmentation.idxEnd].Trim());

                        if (!forbiddenFilter.SkipBin(chrom, binStart, binEnd))
                        {
                            // The three dictionaries always gain a chromosome together.
                            List<uint> chromStarts;
                            if (!starts.TryGetValue(chrom, out chromStarts))
                            {
                                chromStarts = new List<uint>();
                                starts.Add(chrom, chromStarts);
                                ends.Add(chrom, new List<uint>());
                                scores.Add(chrom, new List<double>());
                            }

                            chromStarts.Add(binStart);
                            ends[chrom].Add(binEnd);
                            scores[chrom].Add(Convert.ToDouble(cols[this.idxScore].Trim()));
                        }
                    }

                    // Freeze the accumulated lists into the arrays kept on the instance.
                    foreach (var perChrom in starts)
                    {
                        string name = perChrom.Key;
                        this.StartByChr[name] = perChrom.Value.ToArray();
                        this.EndByChr[name] = ends[name].ToArray();
                        this.ScoreByChr[name] = scores[name].ToArray();
                    }
                }
            }
            catch (Exception ex)
            {
                Console.Error.WriteLine("File {0} could not be read:", this.InputBinPath);
                Console.Error.WriteLine(ex.Message);
                Environment.Exit(1);
            }
        }
Esempio n. 11
0
        /// <summary>
        /// Builds a <c>PloidyInfo</c> from a tab-delimited file: column 0 = chromosome, 1 = start,
        /// 2 = end, 4 = ploidy. Lines starting with "##" are remembered as the header line (the last
        /// one wins); other '#' lines and empty lines are ignored.
        /// </summary>
        /// <param name="filePath">Path of the ploidy file; null or empty yields an empty result.</param>
        /// <returns>The loaded ploidy information.</returns>
        public static PloidyInfo LoadPloidyFromBedFile(string filePath)
        {
            var result = new PloidyInfo();
            if (string.IsNullOrEmpty(filePath))
            {
                return result;
            }

            int intervalCount = 0;
            using (var input = new GzipReader(filePath))
            {
                string record;
                while ((record = input.ReadLine()) != null)
                {
                    if (record.StartsWith("##"))
                    {
                        // save anything that looks like a vcf header line (we will add it to the output vcf)
                        // TODO: support adding multiple header lines to the output vcf
                        result.HeaderLine = record.Trim();
                    }
                    else if (record.Length != 0 && record[0] != '#')
                    {
                        string[] cols = record.Split('\t');
                        string chrom = cols[0];
                        if (!result.PloidyByChromosome.ContainsKey(chrom))
                        {
                            result.PloidyByChromosome[chrom] = new List<PloidyInterval>();
                        }

                        var interval = new PloidyInterval(chrom)
                        {
                            Start = int.Parse(cols[1]),
                            End = int.Parse(cols[2]),
                            Ploidy = int.Parse(cols[4])
                        };
                        result.PloidyByChromosome[chrom].Add(interval);
                        intervalCount++;
                    }
                }
            }

            Console.WriteLine("Reference ploidy: Loaded {0} intervals across {1} chromosomes", intervalCount, result.PloidyByChromosome.Keys.Count);
            return result;
        }
Esempio n. 12
0
        /// <summary>
        /// Writes a bin file whose column 3 is the tumor/normal count ratio of each bin, scaled by
        /// <c>CanvasDiploidBinRatioFactor</c>, by half the reference ploidy returned from
        /// <c>GetPloidy</c>, and by a library-size factor (normal median / tumor median of the
        /// on-target bins, or 1 when either median is not positive).
        /// </summary>
        /// <param name="tumorBinnedPath">Tumor bin file.</param>
        /// <param name="normalBinnedPath">Normal bin file; supplies the coordinates of every output row.</param>
        /// <param name="ratioBinnedPath">Output path.</param>
        /// <param name="ploidyBedPath">Optional ploidy BED file; null or empty means no reference ploidy is loaded.</param>
        /// <param name="manifest">Optional manifest forwarded to <c>BinCounts</c>.</param>
        /// <exception cref="System.IO.InvalidDataException">
        /// The tumor file ends before the normal file (the two files are read in lock-step).
        /// </exception>
        private static void GetBinRatio(string tumorBinnedPath, string normalBinnedPath, string ratioBinnedPath,
                                        string ploidyBedPath, NexteraManifest manifest = null)
        {
            PloidyInfo referencePloidy = String.IsNullOrEmpty(ploidyBedPath) ? null : PloidyInfo.LoadPloidyFromBedFile(ploidyBedPath);
            double tumorMedian = (new BinCounts(tumorBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
            double normalMedian = (new BinCounts(normalBinnedPath, manifest: manifest)).OnTargetMedianBinCount;
            double librarySizeFactor = (tumorMedian > 0 && normalMedian > 0) ? normalMedian / tumorMedian : 1;

            using (GzipReader tumorReader = new GzipReader(tumorBinnedPath))
            using (GzipReader normalReader = new GzipReader(normalBinnedPath))
            using (GzipWriter writer = new GzipWriter(ratioBinnedPath))
            {
                string normalLine;
                while ((normalLine = normalReader.ReadLine()) != null)
                {
                    string tumorLine = tumorReader.ReadLine();
                    if (tumorLine == null)
                    {
                        // Previously this surfaced as a NullReferenceException on tumorLine.Split.
                        throw new System.IO.InvalidDataException(String.Format(
                            "Tumor bin file {0} has fewer lines than normal bin file {1}.",
                            tumorBinnedPath, normalBinnedPath));
                    }

                    string[] normalToks = normalLine.Split('\t');
                    string[] tumorToks = tumorLine.Split('\t');
                    double normalCount = double.Parse(normalToks[3]);
                    double tumorCount = double.Parse(tumorToks[3]);

                    // The weighted average count of a bin could be less than 1.
                    // Using these small counts for coverage normalization creates large ratios.
                    // It would be better to just drop these bins so we don't introduce too much noise
                    // into segmentation and CNV calling.
                    if (normalCount < 1)
                    {
                        continue; // skip the bin
                    }

                    string chrom = normalToks[0];
                    int start = int.Parse(normalToks[1]);
                    int end = int.Parse(normalToks[2]);

                    // Scale by the reference ploidy of the bin relative to diploid.
                    double factor = CanvasDiploidBinRatioFactor * GetPloidy(referencePloidy, chrom, start, end) / 2.0;
                    double ratio = tumorCount / normalCount * factor * librarySizeFactor;
                    normalToks[3] = String.Format("{0}", ratio);
                    writer.WriteLine(String.Join("\t", normalToks));
                }
            }
        }
Esempio n. 13
0
 /// <summary>
 /// Loads <c>InputBinPath</c> into the per-chromosome <c>StartByChr</c>, <c>EndByChr</c> and
 /// <c>ScoreByChr</c> arrays.
 /// Rows are assumed to be sorted by ascending start position; the code neither checks nor sorts.
 /// </summary>
 /// <remarks>
 /// Any exception raised while reading is reported on standard error and the process is
 /// terminated with exit code 1.
 /// </remarks>
 private void ReadBEDInput()
 {
     try
     {
         var starts = new Dictionary<string, List<uint>>();
         var ends = new Dictionary<string, List<uint>>();
         var scores = new Dictionary<string, List<double>>();

         // The using block disposes the reader once every row has been consumed.
         using (var input = new GzipReader(this.InputBinPath))
         {
             for (string row = input.ReadLine(); row != null; row = input.ReadLine())
             {
                 string[] cols = row.Split('\t');
                 string chrom = cols[Segmentation.idxChr].Trim();

                 // The three dictionaries always gain a chromosome together.
                 List<uint> chromStarts;
                 if (!starts.TryGetValue(chrom, out chromStarts))
                 {
                     chromStarts = new List<uint>();
                     starts.Add(chrom, chromStarts);
                     ends.Add(chrom, new List<uint>());
                     scores.Add(chrom, new List<double>());
                 }

                 chromStarts.Add(Convert.ToUInt32(cols[Segmentation.idxStart].Trim()));
                 ends[chrom].Add(Convert.ToUInt32(cols[Segmentation.idxEnd].Trim()));
                 scores[chrom].Add(Convert.ToDouble(cols[this.idxScore].Trim()));
             }

             // Freeze the accumulated lists into the arrays kept on the instance.
             foreach (var perChrom in starts)
             {
                 string name = perChrom.Key;
                 this.StartByChr[name] = perChrom.Value.ToArray();
                 this.EndByChr[name] = ends[name].ToArray();
                 this.ScoreByChr[name] = scores[name].ToArray();
             }
         }
     }
     catch (Exception ex)
     {
         Console.Error.WriteLine("File {0} could not be read:", this.InputBinPath);
         Console.Error.WriteLine(ex.Message);
         Environment.Exit(1);
     }
 }
Esempio n. 14
0
        /// <summary>
        /// Streams a tab-delimited bin file row by row without loading it fully into memory.
        /// Columns are read as: 0 = chromosome, 1 = start, 2 = stop, 3 = count (float), 4 = GC (int).
        /// </summary>
        /// <param name="infile">Path of the file to read.</param>
        /// <returns>
        /// A deferred sequence of <c>SampleGenomicBin</c>; the file is opened when enumeration starts
        /// and the reader is disposed when the enumerator finishes or is disposed.
        /// </returns>
        public static IEnumerable<SampleGenomicBin> IterateThroughTextFile(string infile)
        {
            using (var input = new GzipReader(infile))
            {
                for (string record = input.ReadLine(); record != null; record = input.ReadLine())
                {
                    string[] columns = record.Split('\t');

                    // Parse in column order so a malformed row fails on its first bad field.
                    int binStart = int.Parse(columns[1]);
                    int binStop = int.Parse(columns[2]);
                    float binCount = float.Parse(columns[3]);
                    int gcContent = int.Parse(columns[4]);

                    yield return new SampleGenomicBin(columns[0], binStart, binStop, gcContent, binCount);
                }
            }
        }
Esempio n. 15
0
        /// <summary>
        /// Builds a <c>PloidyInfo</c> from a tab-delimited file: column 0 = chromosome, 1 = start,
        /// 2 = end, 4 = ploidy. A line starting with "##ExpectedSexChromosomeKaryotype" is stored
        /// (trimmed) as the header line; other '#' lines and empty lines are ignored.
        /// </summary>
        /// <param name="filePath">Path of the ploidy file.</param>
        /// <returns>The loaded ploidy information.</returns>
        public static PloidyInfo LoadPloidyFromBedFile(string filePath)
        {
            var result = new PloidyInfo();
            int intervalCount = 0;

            using (var input = new GzipReader(filePath))
            {
                for (string record = input.ReadLine(); record != null; record = input.ReadLine())
                {
                    if (record.StartsWith("##ExpectedSexChromosomeKaryotype"))
                    {
                        result.HeaderLine = record.Trim();
                        continue;
                    }

                    bool blankOrComment = record.Length == 0 || record[0] == '#';
                    if (blankOrComment)
                    {
                        continue;
                    }

                    string[] cols = record.Split('\t');
                    string chrom = cols[0];
                    if (!result.PloidyByChromosome.ContainsKey(chrom))
                    {
                        result.PloidyByChromosome[chrom] = new List<PloidyInterval>();
                    }

                    var interval = new PloidyInterval
                    {
                        Start = int.Parse(cols[1]),
                        End = int.Parse(cols[2]),
                        Ploidy = int.Parse(cols[4])
                    };
                    result.PloidyByChromosome[chrom].Add(interval);
                    intervalCount++;
                }
            }

            Console.WriteLine("Reference ploidy: Loaded {0} intervals across {1} chromosomes", intervalCount, result.PloidyByChromosome.Keys.Count);
            return result;
        }
Esempio n. 16
0
        /// <summary>
        /// Parse the outputs of CanvasSNV, and note these variant frequencies in the appropriate segment.
        /// </summary>
        /// <param name="variantFrequencyFile">
        /// Tab-delimited file; column 0 = chromosome, 1 = 1-based position, 4 = reference count,
        /// 5 = alternate count. Lines with fewer than 6 columns are reported on standard error and skipped.
        /// </param>
        /// <param name="segments">
        /// Segments that receive the frequencies. The lookup is a binary search, so the per-chromosome
        /// lists are presumably sorted by position - verify against <c>GetSegmentsByChromosome</c>.
        /// </param>
        /// <returns>
        /// Mean total coverage (ref + alt) over the variants that landed in a segment, or 0 when none did.
        /// </returns>
        public static float LoadVariantFrequencies(string variantFrequencyFile, List<CanvasSegment> segments)
        {
            Console.WriteLine("{0} Load variant frequencies from {1}", DateTime.Now, variantFrequencyFile);
            Dictionary<string, List<CanvasSegment>> segmentsByChromosome = CanvasSegment.GetSegmentsByChromosome(segments);
            Dictionary<string, string> alternativeNames = GetChromosomeAlternativeNames(segmentsByChromosome.Keys);
            int usable = 0;        // variants that were assigned to a segment
            long coverageSum = 0;  // summed coverage of those variants only

            using (var input = new GzipReader(variantFrequencyFile))
            {
                string record;
                while ((record = input.ReadLine()) != null)
                {
                    if (record.Length == 0 || record[0] == '#')
                    {
                        continue; // Skip headers
                    }

                    string[] cols = record.Split('\t');
                    if (cols.Length < 6)
                    {
                        Console.Error.WriteLine("* Bad line in {0}: '{1}'", variantFrequencyFile, record);
                        continue;
                    }

                    // Resolve the chromosome, falling back to its alternative name when one is known.
                    string chrom = cols[0];
                    if (!segmentsByChromosome.ContainsKey(chrom))
                    {
                        string mapped;
                        if (!alternativeNames.TryGetValue(chrom, out mapped))
                        {
                            continue;
                        }
                        chrom = mapped;
                    }

                    int position = int.Parse(cols[1]); // 1-based (from the input VCF to Canvas SNV)
                    int refReads = int.Parse(cols[4]);
                    int altReads = int.Parse(cols[5]);
                    int depth = refReads + altReads;
                    if (depth < 10)
                    {
                        continue;
                    }
                    float frequency = altReads / (float)depth;

                    // Binary search for the segment this variant hits:
                    List<CanvasSegment> candidates = segmentsByChromosome[chrom];
                    int lo = 0;
                    int hi = candidates.Count - 1;
                    while (lo <= hi)
                    {
                        int mid = (lo + hi) / 2;
                        CanvasSegment probe = candidates[mid];
                        if (probe.End < position) // CanvasSegment.End is already 1-based
                        {
                            lo = mid + 1;
                        }
                        else if (probe.Begin + 1 > position) // Convert CanvasSegment.Begin to 1-based by adding 1
                        {
                            hi = mid - 1;
                        }
                        else
                        {
                            probe.VariantFrequencies.Add(frequency);
                            probe.VariantTotalCoverage.Add(depth);
                            coverageSum += depth; // use only coverage information in segments
                            usable++;
                            break;
                        }
                    }
                }
            }

            float meanCoverage = 0;
            if (usable > 0)
            {
                meanCoverage = coverageSum / Math.Max(1f, usable);
            }

            Console.WriteLine("{0} Loaded a total of {1} usable variant frequencies", DateTime.Now, usable);
            return meanCoverage;
        }
Esempio n. 17
0
        /// <summary>
        /// Loads the "oracle" of known copy numbers from a VCF file into KnownCN, keyed by chromosome name.
        /// </summary>
        /// <remarks>
        /// Every non-header line contributes one interval: Start is taken from the second column, End from
        /// the "END=" entry of the INFO column and CN from its "CN=" entry (rounded to an integer, with X.5
        /// rounded up).  When both a FORMAT and a sample column are present and FORMAT lists a "CN" key, the
        /// matching sample value replaces the INFO-derived copy number.  Records with fewer than 8 columns,
        /// without an END, or without a non-negative CN are echoed to the console and skipped.
        /// </remarks>
        /// <param name="oracleVCFPath">Path of the VCF file; it is opened through GzipReader.</param>
        protected void LoadKnownCNVCF(string oracleVCFPath)
        {
            // Load our "oracle" of known copy numbers:
            this.KnownCN = new Dictionary<string, List<CNInterval>>();
            int count = 0;

            using (GzipReader reader = new GzipReader(oracleVCFPath))
            {
                string fileLine;
                while ((fileLine = reader.ReadLine()) != null)
                {
                    if (fileLine.Length == 0 || fileLine[0] == '#')
                    {
                        continue; // skip blank lines and header lines
                    }
                    string[] bits = fileLine.Split('\t');
                    if (bits.Length == 1 && bits[0].Trim().Length == 0)
                    {
                        continue; // skip empty lines!
                    }
                    if (bits.Length < 8)
                    {
                        // The INFO column (index 7) is required below; treat a truncated line like any
                        // other unusable record instead of failing with an IndexOutOfRangeException.
                        Console.WriteLine("Error - bogus record!");
                        Console.WriteLine(fileLine);
                        continue;
                    }

                    string chromosome = bits[0];
                    List<CNInterval> chromosomeIntervals;
                    if (!KnownCN.TryGetValue(chromosome, out chromosomeIntervals))
                    {
                        chromosomeIntervals = new List<CNInterval>();
                        KnownCN[chromosome] = chromosomeIntervals;
                    }

                    CNInterval interval = new CNInterval();
                    interval.Start = int.Parse(bits[1]);
                    interval.CN    = -1; // sentinel: no copy number found yet

                    foreach (string subBit in bits[7].Split(';'))
                    {
                        if (subBit.StartsWith("CN=", StringComparison.Ordinal))
                        {
                            float tempCN = float.Parse(subBit.Substring(3));
                            if (subBit.EndsWith(".5", StringComparison.Ordinal))
                            {
                                // Math.Round uses banker's rounding by default, so nudge the value
                                // to make X.5 always round up to X+1.
                                interval.CN = (int)Math.Round(tempCN + 0.1);
                            }
                            else
                            {
                                interval.CN = (int)Math.Round(tempCN); // Round off
                            }
                        }
                        if (subBit.StartsWith("END=", StringComparison.Ordinal))
                        {
                            interval.End = int.Parse(subBit.Substring(4));
                        }
                    }

                    // Parse CN from Canvas output.  Both the FORMAT column (index 8) and the sample
                    // column (index 9) must exist before either is read.
                    if (bits.Length > 9)
                    {
                        string[] formatKeys   = bits[8].Split(':');
                        string[] sampleValues = bits[9].Split(':');
                        int pairedFields = Math.Min(formatKeys.Length, sampleValues.Length);
                        for (int fieldIndex = 0; fieldIndex < pairedFields; fieldIndex++)
                        {
                            if (formatKeys[fieldIndex] == "CN")
                            {
                                interval.CN = int.Parse(sampleValues[fieldIndex]);
                            }
                        }
                    }

                    if (interval.End == 0 || interval.CN < 0)
                    {
                        Console.WriteLine("Error - bogus record!");
                        Console.WriteLine(fileLine);
                    }
                    else
                    {
                        chromosomeIntervals.Add(interval);
                        count++;
                    }
                }
            }
            Console.WriteLine(">>>Loaded {0} known-CN intervals", count);
        }
Esempio n. 18
0
        /// <summary>
        /// Reads the bin-level output of CanvasPartition.exe and groups consecutive rows that share a
        /// segment index into segments.
        /// </summary>
        /// <param name="infile">Tab-separated input file, opened through GzipReader.  The columns read are:
        /// chromosome, bin start, bin end, count and segment index.</param>
        /// <returns>The segments, in the order they appear in the file.</returns>
        public static List<CanvasSegment> ReadSegments(string infile)
        {
            Console.WriteLine("{0} Read segments from {1}", DateTime.Now, infile);
            var result = new List<CanvasSegment>();

            // State of the segment currently being accumulated.
            string segmentChromosome = null;
            int segmentBegin = -1;
            int lastSegmentId = -1; // -1 until the first row has been seen
            int lastBinStart = 0;
            int lastBinEnd = 0;
            var binCounts = new List<float>();
            Tuple<int, int> pendingStartCI = null;

            using (var reader = new GzipReader(infile))
            {
                for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
                {
                    string[] fields = row.Split('\t');

                    int segmentId = Convert.ToInt32(fields[4]);
                    int binStart = Convert.ToInt32(fields[1]);
                    int binEnd = Convert.ToInt32(fields[2]);

                    if (segmentId != lastSegmentId)
                    {
                        // Half the width of the bin that opens the new segment.
                        int halfNewBin = (binEnd - binStart) / 2;

                        if (lastSegmentId == -1)
                        {
                            // Very first segment: symmetric interval around its start.
                            pendingStartCI = Tuple.Create(-halfNewBin, halfNewBin);
                        }
                        else
                        {
                            // Close the segment that just ended.
                            var finished = new CanvasSegment(segmentChromosome, segmentBegin, lastBinEnd, binCounts);

                            // Negative half-width of the finished segment's last bin.
                            int negHalfLastBin = -(lastBinEnd - lastBinStart) / 2;
                            // When the two segments abut, each boundary's interval extends into the
                            // neighbouring bin on the other side; otherwise it is symmetric.
                            bool segmentsAbut = lastBinEnd == binStart;

                            int endUpper = segmentsAbut ? halfNewBin : -negHalfLastBin;
                            finished.EndConfidenceInterval = Tuple.Create(negHalfLastBin, endUpper);
                            finished.StartConfidenceInterval = pendingStartCI;
                            result.Add(finished);
                            binCounts.Clear();

                            int startLower = segmentsAbut ? negHalfLastBin : -halfNewBin;
                            pendingStartCI = Tuple.Create(startLower, halfNewBin);
                        }

                        segmentChromosome = fields[0];
                        segmentBegin = binStart;
                        lastSegmentId = segmentId;
                    }

                    lastBinStart = binStart;
                    lastBinEnd = binEnd;
                    binCounts.Add(float.Parse(fields[3]));
                }

                // Flush the segment still being accumulated, if any row was read.
                if (lastSegmentId != -1)
                {
                    var finalSegment = new CanvasSegment(segmentChromosome, segmentBegin, lastBinEnd, binCounts);
                    finalSegment.StartConfidenceInterval = pendingStartCI;
                    result.Add(finalSegment);
                }
            }

            Console.WriteLine("{0} Loaded {1} segments", DateTime.Now, result.Count);
            return result;
        }
Esempio n. 19
0
        /// <summary>
        /// Loads .cleaned bed files, merges bins from multiple samples and returns GenomicBin objects with MultiSampleCount
        /// </summary>
        /// <remarks>
        /// Every file is read twice: a first pass collects the chromosome names and prints the number of
        /// lines per file, a second pass gathers, per chromosome and bin start, the stop position and the
        /// list of counts (one entry per line that mentions the bin).  Bins with fewer count entries than
        /// there are input files are left out of the result.
        /// </remarks>
        /// <param name="canvasCleanBedPaths">The bed files to merge; each is opened through GzipReader.
        /// Columns read are: chromosome, start, stop and count.</param>
        /// <returns>The merged bins, grouped by chromosome name.</returns>
        /// <exception cref="Illumina.Common.IlluminaException">A retained bin has a negative start, or a
        /// start that is not less than its stop.</exception>
        public static Dictionary<string, List<MultiSampleGenomicBin>> MergeMultiSampleCleanedBedFile(List<IFileLocation> canvasCleanBedPaths)
        {
            // initialize variables to hold multi-sample bed files
            var multiSampleGenomicBins = new Dictionary<string, List<MultiSampleGenomicBin>>();
            var stop      = new Dictionary<string, Dictionary<int, int>>();          // chr -> bin start -> bin stop
            var binCounts = new Dictionary<string, Dictionary<int, List<float>>>();  // chr -> bin start -> counts
            var chromosomes = new HashSet<string>();

            Console.WriteLine("Merge and normalize CanvasClean bed files");

            // first pass: collect chromosome names and report the number of lines per file
            foreach (IFileLocation bedPath in canvasCleanBedPaths)
            {
                int count = 0;
                using (GzipReader reader = new GzipReader(bedPath.FullName))
                {
                    string fileLine;
                    while ((fileLine = reader.ReadLine()) != null)
                    {
                        chromosomes.Add(fileLine.Split('\t')[0]); // Add() ignores names already present
                        count++;
                    }
                }
                Console.WriteLine($"count {count}");
            }
            foreach (string chr in chromosomes)
            {
                stop[chr]      = new Dictionary<int, int>();
                binCounts[chr] = new Dictionary<int, List<float>>();
            }

            // second pass: read bin coordinates and counts
            foreach (IFileLocation bedPath in canvasCleanBedPaths)
            {
                Console.WriteLine(bedPath);

                using (GzipReader reader = new GzipReader(bedPath.FullName))
                {
                    string fileLine;
                    while ((fileLine = reader.ReadLine()) != null)
                    {
                        string[] lineBedFile = fileLine.Split('\t');
                        string   chr         = lineBedFile[0];
                        int      pos         = int.Parse(lineBedFile[1]);
                        stop[chr][pos] = int.Parse(lineBedFile[2]);
                        float binCount = float.Parse(lineBedFile[3]);

                        Dictionary<int, List<float>> chromosomeCounts = binCounts[chr];
                        List<float> sampleCounts;
                        if (!chromosomeCounts.TryGetValue(pos, out sampleCounts))
                        {
                            sampleCounts = new List<float>();
                            chromosomeCounts[pos] = sampleCounts;
                        }
                        sampleCounts.Add(binCount);
                    }
                }
            }
            Console.WriteLine("create GenomeBin intervals");

            // create GenomeBin intervals
            foreach (string chr in chromosomes)
            {
                var chromosomeBins = new List<MultiSampleGenomicBin>();
                multiSampleGenomicBins[chr] = chromosomeBins;

                foreach (KeyValuePair<int, int> bin in stop[chr])
                {
                    int binStartPosition = bin.Key;
                    int binStopPosition  = bin.Value;
                    List<float> sampleCounts = binCounts[chr][binStartPosition];

                    // if outlier is removed in one sample, remove it in all samples
                    if (sampleCounts.Count < canvasCleanBedPaths.Count)
                    {
                        continue;
                    }
                    if (binStartPosition < 0)
                    {
                        throw new Illumina.Common.IlluminaException("Start must be non-negative");
                    }
                    if (binStartPosition >= binStopPosition) // Do not allow empty intervals
                    {
                        throw new Illumina.Common.IlluminaException("Start must be less than Stop");
                    }
                    GenomicBin interval = new GenomicBin(chr, new BedInterval(binStartPosition, binStopPosition));
                    chromosomeBins.Add(new MultiSampleGenomicBin(interval, sampleCounts));
                }
            }
            return multiSampleGenomicBins;
        }
Esempio n. 20
0
        /// <summary>
        /// Intersect bins with the targeted regions defined in callset.Manifest.
        /// Assumes that the targeted regions don't intersect, the bins are sorted by genomic location and the bins don't intersect.
        /// </summary>
        /// <remarks>
        /// The existing file at <paramref name="partitionedPath"/> is moved to a sibling with ".raw" appended
        /// (replacing any previous one) and a new file is written in its place.  Each input row is emitted
        /// once per manifest region it overlaps, with its start/end columns clipped to the overlap; rows on
        /// chromosomes without manifest regions, or overlapping no region, are dropped.
        /// </remarks>
        /// <param name="callset">Supplies the manifest whose regions are intersected with the bins.</param>
        /// <param name="partitionedPath">Output of CanvasPartition. Bins are assumed to be sorted</param>
        /// <returns><paramref name="partitionedPath"/>, unchanged if the file does not exist.</returns>
        private IFileLocation IntersectBinsWithTargetedRegions(CanvasCallset callset, IFileLocation partitionedPath)
        {
            if (!partitionedPath.Exists)
            {
                return partitionedPath;
            }
            var rawPartitionedPath = partitionedPath.AppendName(".raw");

            if (rawPartitionedPath.Exists)
            {
                rawPartitionedPath.Delete();
            }
            partitionedPath.MoveTo(rawPartitionedPath);

            Dictionary<string, List<NexteraManifest.ManifestRegion>> manifestRegionsByChrom = callset.Manifest.GetManifestRegionsByChromosome();

            // CanvasPartition output file is in the BED format
            //   start: 0-based, inclusive
            //   end: 0-based, exclusive
            // Manifest
            //   start: 1-based, inclusive
            //   end: 1-based, inclusive
            using (GzipReader reader = new GzipReader(rawPartitionedPath.FullName))
            using (GzipWriter writer = new GzipWriter(partitionedPath.FullName))
            {
                string currentChrom = null;
                List<NexteraManifest.ManifestRegion> regions = null; // regions of currentChrom, null if it has none
                int manifestRegionIdx = 0;                           // first region that may still overlap a bin
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    string[] toks = line.Split('\t');
                    string chrom = toks[0];
                    int    start = int.Parse(toks[1]) + 1; // 1-based, inclusive
                    int    end   = int.Parse(toks[2]);     // 1-based, inclusive
                    if (chrom != currentChrom)
                    {
                        currentChrom      = chrom;
                        manifestRegionIdx = 0;
                        if (!manifestRegionsByChrom.TryGetValue(chrom, out regions))
                        {
                            regions = null;
                        }
                    }
                    if (regions == null)
                    {
                        continue;
                    }

                    // |- manifest region -| |- bin -|
                    // Regions ending before this bin cannot overlap it or any later bin.
                    while (manifestRegionIdx < regions.Count && regions[manifestRegionIdx].End < start)
                    {
                        manifestRegionIdx++;
                    }

                    // |- bin -|
                    //       |- manifest region -|
                    // Scan with a local index: the last region overlapping this bin may extend past the
                    // bin's end and overlap the next bin too, so the shared cursor must not move past it.
                    // The loop body does not run when no region is left or the next one starts after the bin.
                    for (int regionIdx = manifestRegionIdx;
                         regionIdx < regions.Count && end >= regions[regionIdx].Start;
                         regionIdx++)
                    {
                        // calculate intersection
                        int intersectionStart = Math.Max(start, regions[regionIdx].Start); // 1-based, inclusive
                        int intersectionEnd   = Math.Min(end, regions[regionIdx].End);     // 1-based, inclusive

                        // start/end in BED format
                        toks[1] = String.Format("{0}", intersectionStart - 1); // 0-based, inclusive
                        toks[2] = String.Format("{0}", intersectionEnd);       // 0-based, exclusive

                        // write intersected bin
                        writer.WriteLine(String.Join("\t", toks));
                    }
                }
            }

            return partitionedPath;
        }
Esempio n. 21
0
        /// <summary>
        /// Runs CanvasSNV.exe once per autosome and allosome of the callset's genome, then concatenates
        /// the per-chromosome outputs into callset.VfSummaryPath.
        /// </summary>
        /// <remarks>
        /// Only the first line starting with '#' across all outputs is copied; later ones are dropped.
        /// An expected output file that does not exist is reported on the console and skipped.
        /// </remarks>
        /// <param name="callset">Supplies the genome metadata, input BAM/VCF paths, temp folder, id and
        /// summary output path.</param>
        protected void InvokeCanvasSnv(CanvasCallset callset)
        {
            var jobs = new List<UnitOfWork>();
            var perChromosomeOutputs = new List<string>();
            GenomeMetadata metadata = callset.GenomeMetadata;

            string bamPath = callset.Bam.BamFile.FullName;
            string vcfPath = callset.NormalVcfPath.FullName;

            foreach (GenomeMetadata.SequenceMetadata chromosome in metadata.Sequences)
            {
                // Only invoke for autosomes + allosomes;
                // mitochondrial chromosome, extra contigs and decoys are left out.
                bool isCallable = chromosome.Type == GenomeMetadata.SequenceType.Allosome || chromosome.IsAutosome();
                if (!isCallable)
                {
                    continue;
                }

                var job = new UnitOfWork { ExecutablePath = Path.Combine(_canvasFolder, "CanvasSNV.exe") };
                if (CrossPlatform.IsThisMono())
                {
                    // Under Mono the runtime is launched and the .exe becomes its first argument.
                    job.CommandLine    = job.ExecutablePath;
                    job.ExecutablePath = Utilities.GetMonoPath();
                }

                string outputPath = Path.Combine(callset.TempFolder, $"{chromosome.Name}-{callset.Id}.SNV.txt.gz");
                perChromosomeOutputs.Add(outputPath);

                job.CommandLine += string.Format(" {0} {1} {2} {3}", chromosome.Name, vcfPath, bamPath, outputPath);
                if (_customParameters.ContainsKey("CanvasSNV"))
                {
                    job.CommandLine = Utilities.MergeCommandLineOptions(job.CommandLine, _customParameters["CanvasSNV"], true);
                }

                job.LoggingFolder = _workManager.LoggingFolder.FullName;
                job.LoggingStub   = $"CanvasSNV-{callset.Id}-{chromosome.Name}";
                jobs.Add(job);
            }
            Console.WriteLine("Invoking {0} processor jobs...", jobs.Count);

            // Invoke CanvasSNV jobs:
            Console.WriteLine(">>>CanvasSNV start...");
            _workManager.DoWorkParallelThreads(jobs);
            Console.WriteLine(">>>CanvasSNV complete!");

            // Concatenate CanvasSNV results:
            using (var writer = new GzipWriter(callset.VfSummaryPath))
            {
                bool headerWritten = false;
                foreach (string outputPath in perChromosomeOutputs)
                {
                    if (!File.Exists(outputPath))
                    {
                        Console.WriteLine("Error: Expected output file not found at {0}", outputPath);
                        continue;
                    }

                    using (var reader = new GzipReader(outputPath))
                    {
                        string fileLine;
                        while ((fileLine = reader.ReadLine()) != null)
                        {
                            bool isHeaderLine = fileLine.Length > 0 && fileLine[0] == '#';
                            if (isHeaderLine && headerWritten)
                            {
                                continue; // a header has already been copied
                            }
                            headerWritten |= isHeaderLine;
                            writer.WriteLine(fileLine);
                        }
                    }
                }
            }
        }