Exemplo n.º 1
0
        /// <summary>
        /// Builds per-group smallRNA category count files from the sample list in
        /// <c>options.InputFile</c> and, when the R template is available, renders them with R.
        /// </summary>
        /// <remarks>
        /// Each input line is split on tabs; lines with fewer than three columns are ignored.
        /// The first three columns are read as group name, sample name and the path of the
        /// sample's smallRNA mapped-info file. For every group the method writes
        /// "{group}.catcount" (one row per sample/category/level) and "{group}.catcount.tsv"
        /// (category-by-sample table) into <c>options.OutputDirectory</c>. If
        /// "smallrna_category_group.r" exists in the template directory, a
        /// "{group}.catcount.r" script is generated and executed through the "R" command.
        /// </remarks>
        /// <returns>
        /// The paths of the generated ".catcount" and ".catcount.tsv" files, in the order they were written.
        /// The generated R script is not included.
        /// </returns>
        public override IEnumerable<string> Process()
        {
            // R treats the backslash as an escape character inside string literals, so a raw
            // Windows path such as C:\out\a.catcount would be rejected or mangled by R.
            // Escape backslashes and double quotes before embedding a path in the script.
            Func<string, string> toRString = path => path.Replace("\\", "\\\\").Replace("\"", "\\\"");

            var entries = (from line in File.ReadAllLines(options.InputFile)
                           let parts = line.Split('\t')
                           where parts.Length >= 3
                           select new { GroupName = parts[0], SampleName = parts[1], SmallRNAFile = parts[2] }).ToList();

            var groups = entries.GroupBy(m => m.GroupName).ToList();

            var result = new List<string>();

            foreach (var group in groups)
            {
                // Step 1: long-format category file, one row per sample/category/level.
                var catfile = Path.Combine(options.OutputDirectory, group.Key + ".catcount");
                result.Add(catfile);
                using (var sw = new StreamWriter(catfile))
                {
                    sw.WriteLine("SampleName\tCategory\tLevel\tCount");

                    foreach (var entry in group)
                    {
                        Progress.SetMessage("Reading smallRNA mapped info file " + entry.SmallRNAFile + " ...");

                        var map = new MapItemReader(0, 1, hasHeader: false).ReadFromFile(entry.SmallRNAFile);

                        var totalReads    = Math.Round(double.Parse(map["TotalReads"].Value));
                        var mappedReads   = Math.Round(double.Parse(map["MappedReads"].Value));
                        var smallRNAReads = Math.Round(double.Parse(map["FeatureReads"].Value));

                        // Level 0: the raw totals.
                        sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, TotalReadsKey, totalReads);
                        sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, MappedReadsKey, mappedReads);
                        sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

                        // Level 1: total reads split into unmapped / other mapped / smallRNA.
                        sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, UnmappedKey, totalReads - mappedReads);
                        sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, OtherMappedKey, mappedReads - smallRNAReads);
                        sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

                        // Level 2: one row per biotype present in the mapped-info file.
                        foreach (var biotype in SmallRNAConsts.Biotypes)
                        {
                            if (map.ContainsKey(biotype))
                            {
                                sw.WriteLine("{0}\t{1}\t{2}\t{3}", entry.SampleName, biotype, 2, Math.Round(double.Parse(map[biotype].Value)));
                            }
                        }
                    }
                }

                // Step 2: read the category file back and pivot it into a category-by-sample table.
                // The level-1 smallRNA row duplicates the level-0 one, so it is dropped to keep
                // (sample, category) unique. Counts are parsed as long because a read total can
                // exceed int.MaxValue.
                var data = (from line in File.ReadAllLines(catfile).Skip(1)
                            where !string.IsNullOrWhiteSpace(line)
                            let parts = line.Split('\t')
                            let level = double.Parse(parts[2])
                            where !(parts[1].Equals(smallRNAKey) && level == 1)
                            select new
                            {
                                SampleName = parts[0],
                                Category = parts[1],
                                Level = level,
                                Count = long.Parse(parts[3])
                            }).ToList();

                var tablefile = catfile + ".tsv";
                result.Add(tablefile);
                using (var sw = new StreamWriter(tablefile))
                {
                    var samples = (from d in data
                                   select d.SampleName).Distinct().OrderBy(m => m).ToList();

                    sw.WriteLine("Category\t{0}", samples.Merge("\t"));

                    var categories = new string[] { TotalReadsKey, MappedReadsKey, UnmappedKey, OtherMappedKey, smallRNAKey }.Union(SmallRNAConsts.Biotypes).ToList();

                    Console.WriteLine(categories.Merge("\n"));

                    var map = data.ToDoubleDictionary(m => m.SampleName, m => m.Category);
                    foreach (var cat in categories)
                    {
                        // A sample without this category gets an empty cell.
                        sw.WriteLine("{0}\t{1}", cat,
                                     (from sample in samples
                                      let dic = map[sample]
                                      select dic.ContainsKey(cat) ? dic[cat].Count.ToString() : "").Merge("\t"));
                    }
                }

                // Step 3: generate and run the R script when the template is installed.
                var rfile = new FileInfo(FileUtils.GetTemplateDir() + "/smallrna_category_group.r").FullName;
                if (File.Exists(rfile))
                {
                    var targetrfile = catfile + ".r";
                    using (var sw = new StreamWriter(targetrfile))
                    {
                        sw.WriteLine("catfile<-\"{0}\"", toRString(catfile));
                        sw.WriteLine("outputdir<-\"{0}\"", toRString(options.OutputDirectory));
                        sw.WriteLine("ispdf<-{0}", options.PdfGraph ? "1" : "0");

                        // Copy the template after its "#predefine_end" marker line, replacing the
                        // template's own predefined variables with the ones written above.
                        // Without a marker the index is -1 and the whole template is copied.
                        var templateLines = File.ReadAllLines(rfile);
                        var markerIndex = Array.FindIndex(templateLines, text => text.Contains("#predefine_end"));
                        foreach (var templateLine in templateLines.Skip(markerIndex + 1))
                        {
                            sw.WriteLine(templateLine);
                        }
                    }
                    SystemUtils.Execute("R", "--vanilla --slave -f \"" + targetrfile + "\"");
                }
            }
            return result;
        }
        /// <summary>
        /// Reads the count table in <c>options.InputFile</c> and converts every count in place to
        /// count * 1e9 / (gene length * sample reads).
        /// </summary>
        /// <remarks>
        /// Gene lengths come from the "length" column (case-insensitive) of
        /// <c>options.GeneLengthFile</c>. Sample read totals come from
        /// <c>options.SampleReadsFile</c> when that file exists; otherwise each sample's total is
        /// the sum of its counts in the table. When <c>options.KeyRegex</c> is set, gene keys from
        /// both the length file and the count table are replaced by the regex's first capture group.
        /// </remarks>
        /// <param name="sampleCounts">Receives the read total of each sample, in <c>counts.Samples</c> order.</param>
        /// <param name="geneLengths">Receives the length of each gene, in <c>counts.GeneValues</c> order.</param>
        /// <returns>The count table that was read, with its counts overwritten by the normalized values.</returns>
        /// <exception cref="Exception">
        /// The length column is missing, a sample has no read total, or a gene has no length.
        /// </exception>
        public GeneCountTable CalculateFPKM(out double[] sampleCounts, out double[] geneLengths)
        {
            Progress.SetMessage("Reading gene length from {0} ...", options.GeneLengthFile);
            var columnNames = FileUtils.ReadColumnNames(options.GeneLengthFile);
            var lengthIndex = columnNames.ToList().FindIndex(m => string.Equals(m, "length", StringComparison.OrdinalIgnoreCase));

            if (lengthIndex < 0)
            {
                throw new Exception("Cannot find length column in file " + options.GeneLengthFile);
            }
            var geneLengthMap = new MapItemReader(0, lengthIndex).ReadFromFile(options.GeneLengthFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));

            Progress.SetMessage("Reading count table from {0} ...", options.InputFile);
            var counts = new GeneCountTableFormat().ReadFromFile(options.InputFile);

            if (!string.IsNullOrEmpty(options.KeyRegex))
            {
                var reg = new Regex(options.KeyRegex);
                geneLengthMap = geneLengthMap.ToDictionary(l => reg.Match(l.Key).Groups[1].Value, l => l.Value);

                // Normalize every gene key of the count table, not only the first row:
                // the length map above was re-keyed for all genes, so any row left
                // untouched would fail the lookup below whenever the regex changes its key.
                foreach (var geneRow in counts.GeneValues)
                {
                    geneRow[0] = reg.Match(geneRow[0]).Groups[1].Value;
                }
            }

            Dictionary<string, double> sampleReads;

            if (File.Exists(options.SampleReadsFile))
            {
                Progress.SetMessage("Reading sample reads from {0} ...", options.SampleReadsFile);
                sampleReads = new MapItemReader(0, 1).ReadFromFile(options.SampleReadsFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));
            }
            else //use total mapped reads as total reads
            {
                sampleReads = new Dictionary<string, double>();
                for (int iSample = 0; iSample < counts.Samples.Length; iSample++)
                {
                    double itotal = 0.0;
                    for (int iGene = 0; iGene < counts.GeneValues.Count; iGene++)
                    {
                        itotal += counts.Count[iGene, iSample];
                    }

                    sampleReads[counts.Samples[iSample]] = itotal;
                }
            }

            // Fail early with a descriptive message instead of a KeyNotFoundException below.
            foreach (var sample in counts.Samples)
            {
                if (!sampleReads.ContainsKey(sample))
                {
                    throw new Exception(string.Format("No sample {0} found at sample reads file {1}", sample, options.SampleReadsFile));
                }
            }

            foreach (var geneValues in counts.GeneValues)
            {
                if (!geneLengthMap.ContainsKey(geneValues[0]))
                {
                    throw new Exception(string.Format("No gene {0} found at gene length file {1}", geneValues[0], options.GeneLengthFile));
                }
            }

            sampleCounts = (from sample in counts.Samples
                            select sampleReads[sample]).ToArray();

            geneLengths = (from geneValues in counts.GeneValues
                           select geneLengthMap[geneValues[0]]).ToArray();

            for (int iGene = 0; iGene < geneLengths.Length; iGene++)
            {
                for (int iSample = 0; iSample < sampleCounts.Length; iSample++)
                {
                    // Scale by 1e9 and divide by gene length and the sample's read total.
                    counts.Count[iGene, iSample] = counts.Count[iGene, iSample] * 1000000000 / (geneLengths[iGene] * sampleCounts[iSample]);
                }
            }
            return counts;
        }
    /// <summary>
    /// Builds per-group smallRNA category count files from the sample list in
    /// <c>options.InputFile</c> and, when the R template is available, renders them with R.
    /// </summary>
    /// <remarks>
    /// Each input line is split on tabs; lines with fewer than three columns are ignored.
    /// The first three columns are read as group name, sample name and the path of the
    /// sample's smallRNA mapped-info file. For every group the method writes
    /// "{group}.catcount" (one row per sample/category/level) and "{group}.catcount.tsv"
    /// (category-by-sample table) into <c>options.OutputDirectory</c>. If
    /// "smallrna_category_group.r" exists in the template directory, a
    /// "{group}.catcount.r" script is generated and executed through the "R" command.
    /// </remarks>
    /// <returns>
    /// The paths of the generated ".catcount" and ".catcount.tsv" files, in the order they were written.
    /// The generated R script is not included.
    /// </returns>
    public override IEnumerable<string> Process()
    {
      // R treats the backslash as an escape character inside string literals, so a raw
      // Windows path such as C:\out\a.catcount would be rejected or mangled by R.
      // Escape backslashes and double quotes before embedding a path in the script.
      Func<string, string> toRString = path => path.Replace("\\", "\\\\").Replace("\"", "\\\"");

      var entries = (from line in File.ReadAllLines(options.InputFile)
                     let parts = line.Split('\t')
                     where parts.Length >= 3
                     select new { GroupName = parts[0], SampleName = parts[1], SmallRNAFile = parts[2] }).ToList();

      var groups = entries.GroupBy(m => m.GroupName).ToList();

      var result = new List<string>();

      foreach (var group in groups)
      {
        // Step 1: long-format category file, one row per sample/category/level.
        var catfile = Path.Combine(options.OutputDirectory, group.Key + ".catcount");
        result.Add(catfile);
        using (var sw = new StreamWriter(catfile))
        {
          sw.WriteLine("SampleName\tCategory\tLevel\tCount");

          foreach (var entry in group)
          {
            Progress.SetMessage("Reading smallRNA mapped info file " + entry.SmallRNAFile + " ...");

            var map = new MapItemReader(0, 1, hasHeader: false).ReadFromFile(entry.SmallRNAFile);

            var totalReads = Math.Round(double.Parse(map["TotalReads"].Value));
            var mappedReads = Math.Round(double.Parse(map["MappedReads"].Value));
            var smallRNAReads = Math.Round(double.Parse(map["FeatureReads"].Value));

            // Level 0: the raw totals.
            sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, TotalReadsKey, totalReads);
            sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, MappedReadsKey, mappedReads);
            sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

            // Level 1: total reads split into unmapped / other mapped / smallRNA.
            sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, UnmappedKey, totalReads - mappedReads);
            sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, OtherMappedKey, mappedReads - smallRNAReads);
            sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

            // Level 2: one row per biotype present in the mapped-info file.
            foreach (var biotype in SmallRNAConsts.Biotypes)
            {
              if (map.ContainsKey(biotype))
              {
                sw.WriteLine("{0}\t{1}\t{2}\t{3}", entry.SampleName, biotype, 2, Math.Round(double.Parse(map[biotype].Value)));
              }
            }
          }
        }

        // Step 2: read the category file back and pivot it into a category-by-sample table.
        // The level-1 smallRNA row duplicates the level-0 one, so it is dropped to keep
        // (sample, category) unique. Counts are parsed as long because a read total can
        // exceed int.MaxValue.
        var data = (from line in File.ReadAllLines(catfile).Skip(1)
                    where !string.IsNullOrWhiteSpace(line)
                    let parts = line.Split('\t')
                    let level = double.Parse(parts[2])
                    where !(parts[1].Equals(smallRNAKey) && level == 1)
                    select new
                    {
                      SampleName = parts[0],
                      Category = parts[1],
                      Level = level,
                      Count = long.Parse(parts[3])
                    }).ToList();

        var tablefile = catfile + ".tsv";
        result.Add(tablefile);
        using (var sw = new StreamWriter(tablefile))
        {
          var samples = (from d in data
                         select d.SampleName).Distinct().OrderBy(m => m).ToList();

          sw.WriteLine("Category\t{0}", samples.Merge("\t"));

          var categories = new string[] { TotalReadsKey, MappedReadsKey, UnmappedKey, OtherMappedKey, smallRNAKey }.Union(SmallRNAConsts.Biotypes).ToList();

          Console.WriteLine(categories.Merge("\n"));

          var map = data.ToDoubleDictionary(m => m.SampleName, m => m.Category);
          foreach (var cat in categories)
          {
            // A sample without this category gets an empty cell.
            sw.WriteLine("{0}\t{1}", cat,
              (from sample in samples
               let dic = map[sample]
               select dic.ContainsKey(cat) ? dic[cat].Count.ToString() : "").Merge("\t"));
          }
        }

        // Step 3: generate and run the R script when the template is installed.
        var rfile = new FileInfo(FileUtils.GetTemplateDir() + "/smallrna_category_group.r").FullName;
        if (File.Exists(rfile))
        {
          var targetrfile = catfile + ".r";
          using (var sw = new StreamWriter(targetrfile))
          {
            sw.WriteLine("catfile<-\"{0}\"", toRString(catfile));
            sw.WriteLine("outputdir<-\"{0}\"", toRString(options.OutputDirectory));
            sw.WriteLine("ispdf<-{0}", options.PdfGraph ? "1" : "0");

            // Copy the template after its "#predefine_end" marker line, replacing the
            // template's own predefined variables with the ones written above.
            // Without a marker the index is -1 and the whole template is copied.
            var templateLines = File.ReadAllLines(rfile);
            var markerIndex = Array.FindIndex(templateLines, text => text.Contains("#predefine_end"));
            foreach (var templateLine in templateLines.Skip(markerIndex + 1))
            {
              sw.WriteLine(templateLine);
            }
          }
          SystemUtils.Execute("R", "--vanilla --slave -f \"" + targetrfile + "\"");
        }
      }
      return result;
    }
    /// <summary>
    /// Reads the count table in <c>options.InputFile</c> and converts every count in place to
    /// count * 1e9 / (gene length * sample reads).
    /// </summary>
    /// <remarks>
    /// Gene lengths come from the "length" column (case-insensitive) of
    /// <c>options.GeneLengthFile</c>. Sample read totals come from
    /// <c>options.SampleReadsFile</c> when that file exists; otherwise each sample's total is
    /// the sum of its counts in the table. When <c>options.KeyRegex</c> is set, gene keys from
    /// both the length file and the count table are replaced by the regex's first capture group.
    /// </remarks>
    /// <param name="sampleCounts">Receives the read total of each sample, in <c>counts.Samples</c> order.</param>
    /// <param name="geneLengths">Receives the length of each gene, in <c>counts.GeneValues</c> order.</param>
    /// <returns>The count table that was read, with its counts overwritten by the normalized values.</returns>
    /// <exception cref="Exception">
    /// The length column is missing, a sample has no read total, or a gene has no length.
    /// </exception>
    public GeneCountTable CalculateFPKM(out double[] sampleCounts, out double[] geneLengths)
    {
      Progress.SetMessage("Reading gene length from {0} ...", options.GeneLengthFile);
      var columnNames = FileUtils.ReadColumnNames(options.GeneLengthFile);
      var lengthIndex = columnNames.ToList().FindIndex(m => string.Equals(m, "length", StringComparison.OrdinalIgnoreCase));
      if (lengthIndex < 0)
      {
        throw new Exception("Cannot find length column in file " + options.GeneLengthFile);
      }
      var geneLengthMap = new MapItemReader(0, lengthIndex).ReadFromFile(options.GeneLengthFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));

      Progress.SetMessage("Reading count table from {0} ...", options.InputFile);
      var counts = new GeneCountTableFormat().ReadFromFile(options.InputFile);

      if (!string.IsNullOrEmpty(options.KeyRegex))
      {
        var reg = new Regex(options.KeyRegex);
        geneLengthMap = geneLengthMap.ToDictionary(l => reg.Match(l.Key).Groups[1].Value, l => l.Value);

        // Normalize every gene key of the count table, not only the first row:
        // the length map above was re-keyed for all genes, so any row left
        // untouched would fail the lookup below whenever the regex changes its key.
        foreach (var geneRow in counts.GeneValues)
        {
          geneRow[0] = reg.Match(geneRow[0]).Groups[1].Value;
        }
      }

      Dictionary<string, double> sampleReads;
      if (File.Exists(options.SampleReadsFile))
      {
        Progress.SetMessage("Reading sample reads from {0} ...", options.SampleReadsFile);
        sampleReads = new MapItemReader(0, 1).ReadFromFile(options.SampleReadsFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));
      }
      else //use total mapped reads as total reads
      {
        sampleReads = new Dictionary<string, double>();
        for (int iSample = 0; iSample < counts.Samples.Length; iSample++)
        {
          double itotal = 0.0;
          for (int iGene = 0; iGene < counts.GeneValues.Count; iGene++)
          {
            itotal += counts.Count[iGene, iSample];
          }

          sampleReads[counts.Samples[iSample]] = itotal;
        }
      }

      // Fail early with a descriptive message instead of a KeyNotFoundException below.
      foreach (var sample in counts.Samples)
      {
        if (!sampleReads.ContainsKey(sample))
        {
          throw new Exception(string.Format("No sample {0} found at sample reads file {1}", sample, options.SampleReadsFile));
        }
      }

      foreach (var geneValues in counts.GeneValues)
      {
        if (!geneLengthMap.ContainsKey(geneValues[0]))
        {
          throw new Exception(string.Format("No gene {0} found at gene length file {1}", geneValues[0], options.GeneLengthFile));
        }
      }

      sampleCounts = (from sample in counts.Samples
                      select sampleReads[sample]).ToArray();

      geneLengths = (from geneValues in counts.GeneValues
                     select geneLengthMap[geneValues[0]]).ToArray();

      for (int iGene = 0; iGene < geneLengths.Length; iGene++)
      {
        for (int iSample = 0; iSample < sampleCounts.Length; iSample++)
        {
          // Scale by 1e9 and divide by gene length and the sample's read total.
          counts.Count[iGene, iSample] = counts.Count[iGene, iSample] * 1000000000 / (geneLengths[iGene] * sampleCounts[iSample]);
        }
      }
      return counts;
    }