Exemple #1
0
        /// <summary>
        /// Reads every "*.design.tsv" file in <paramref name="dir"/> and writes two tab-separated
        /// reports into the same directory: "design_overlap.tsv" (one row per sample, with a "+"
        /// under each platform the sample appears with) and "design_summary.tsv" (a
        /// platform-by-platform matrix of overlap counts).
        /// </summary>
        /// <param name="dir">Directory that holds the design files and receives both reports.</param>
        public static void SummarizeExtractedData(string dir)
        {
            var designs = Directory.GetFiles(dir, "*.design.tsv");

            // Path.Combine instead of a hard-coded "\\" so the report paths are valid on every OS.
            using (var sw = new StreamWriter(Path.Combine(dir, "design_overlap.tsv")))
            using (var swSummary = new StreamWriter(Path.Combine(dir, "design_summary.tsv")))
            {
                // NOTE(review): 2 and 4 are presumably the key (sample) and value (platform)
                // column indices -- confirm against MapItemReader.
                var reader = new MapItemReader(2, 4);
                var data   = (from design in designs
                              from item in reader.ReadFromFile(design)
                              select item).ToList();
                var dataMap   = data.ToGroupDictionary(m => m.Key);
                var samples   = (from d in data select d.Key).Distinct().OrderBy(m => m).ToList();
                var platforms = (from d in data select d.Value.Value).Distinct().ToList();

                // Two-level lookup built with platform as the first selector and sample as the second.
                var dMap = (from d in data
                            select new { Sample = d.Key, Platform = d.Value.Value }).ToDoubleDictionary(m => m.Platform, m => m.Sample);

                // Overlap report: header row of platforms, then one presence row per sample.
                sw.WriteLine("Sample\t" + platforms.Merge("\t"));
                foreach (var sample in samples)
                {
                    var sampleMap = new HashSet<string>(dataMap[sample].ConvertAll(m => m.Value.Value));
                    sw.WriteLine("{0}\t{1}",
                                 sample,
                                 (from p in platforms
                                  select sampleMap.Contains(p) ? "+" : "").Merge("\t"));
                }

                // Summary report: cell (i, j) is the number of keys shared by platforms i and j.
                swSummary.WriteLine("\t" + platforms.Merge("\t"));
                for (int i = 0; i < platforms.Count; i++)
                {
                    swSummary.Write(platforms[i]);

                    // The row platform's keys do not change across columns, so look them up once.
                    var rowKeys = dMap[platforms[i]].Keys;
                    for (int j = 0; j < platforms.Count; j++)
                    {
                        swSummary.Write("\t{0}", rowKeys.Intersect(dMap[platforms[j]].Keys).Count());
                    }
                    swSummary.WriteLine();
                }
            }
        }
Exemple #2
0
        /// <summary>
        /// Builds smallRNA category count reports, one set per group listed in options.InputFile.
        /// </summary>
        /// <remarks>
        /// options.InputFile is read as tab-separated lines of group name, sample name and
        /// smallRNA info file; lines with fewer than three columns are ignored. For each group
        /// the method writes "&lt;group&gt;.catcount" (long format: sample, category, level,
        /// count), pivots it into "&lt;group&gt;.catcount.tsv" (categories as rows, samples as
        /// columns) and, when the "smallrna_category_group.r" template exists, generates
        /// "&lt;group&gt;.catcount.r" and runs it with R.
        /// </remarks>
        /// <returns>The ".catcount" and ".catcount.tsv" paths of every group, in processing order.</returns>
        public override IEnumerable<string> Process()
        {
            var entries = (from line in File.ReadAllLines(options.InputFile)
                           let parts = line.Split('\t')
                           where parts.Length >= 3
                           select new { GroupName = parts[0], SampleName = parts[1], SmallRNAFile = parts[2] }).ToList();

            var groups = entries.GroupBy(m => m.GroupName).ToList();

            var result = new List<string>();

            foreach (var group in groups)
            {
                var catfile = Path.Combine(options.OutputDirectory, group.Key + ".catcount");
                result.Add(catfile);
                using (var sw = new StreamWriter(catfile))
                {
                    sw.WriteLine("SampleName\tCategory\tLevel\tCount");

                    foreach (var entry in group)
                    {
                        Progress.SetMessage("Reading smallRNA mapped info file " + entry.SmallRNAFile + " ...");

                        var map = new MapItemReader(0, 1, hasHeader: false).ReadFromFile(entry.SmallRNAFile);

                        var totalReads    = Math.Round(double.Parse(map["TotalReads"].Value));
                        var mappedReads   = Math.Round(double.Parse(map["MappedReads"].Value));
                        var smallRNAReads = Math.Round(double.Parse(map["FeatureReads"].Value));

                        // Level 0: the raw totals.
                        sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, TotalReadsKey, totalReads);
                        sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, MappedReadsKey, mappedReads);
                        sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

                        // Level 1: the total split into unmapped / other mapped / smallRNA.
                        sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, UnmappedKey, totalReads - mappedReads);
                        sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, OtherMappedKey, mappedReads - smallRNAReads);
                        sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

                        // Level 2: one row per biotype present in the info file.
                        foreach (var biotype in SmallRNAConsts.Biotypes)
                        {
                            if (map.ContainsKey(biotype))
                            {
                                sw.WriteLine("{0}\t{1}\t{2}\t{3}", entry.SampleName, biotype, 2, Math.Round(double.Parse(map[biotype].Value)));
                            }
                        }
                    }
                }

                // Re-read the long-format file for pivoting. The level-1 smallRNA row duplicates
                // the level-0 one, so it is dropped. Counts were written from rounded doubles and
                // may exceed int.MaxValue, hence long.Parse rather than int.Parse.
                var data = (from line in File.ReadAllLines(catfile).Skip(1)
                            where !string.IsNullOrWhiteSpace(line)
                            let parts = line.Split('\t')
                            let level = double.Parse(parts[2])
                            where !(parts[1].Equals(smallRNAKey) && level == 1)
                            select new
                            {
                                SampleName = parts[0],
                                Category = parts[1],
                                Level = level,
                                Count = long.Parse(parts[3])
                            }).ToList();

                var tablefile = catfile + ".tsv";
                result.Add(tablefile);
                using (var sw = new StreamWriter(tablefile))
                {
                    var samples = (from d in data
                                   select d.SampleName).Distinct().OrderBy(m => m).ToList();

                    sw.WriteLine("Category\t{0}", samples.Merge("\t"));

                    var categories = new string[] { TotalReadsKey, MappedReadsKey, UnmappedKey, OtherMappedKey, smallRNAKey }.Union(SmallRNAConsts.Biotypes).ToList();

                    Console.WriteLine(categories.Merge("\n"));

                    var map = data.ToDoubleDictionary(m => m.SampleName, m => m.Category);
                    foreach (var cat in categories)
                    {
                        // A sample without the category gets an empty cell.
                        sw.WriteLine("{0}\t{1}", cat,
                                     (from sample in samples
                                      let dic = map[sample]
                                      select dic.ContainsKey(cat) ? dic[cat].Count.ToString() : "").Merge("\t"));
                    }
                }

                var rfile = new FileInfo(FileUtils.GetTemplateDir() + "/smallrna_category_group.r").FullName;
                if (File.Exists(rfile))
                {
                    var targetrfile = catfile + ".r";
                    using (var sw = new StreamWriter(targetrfile))
                    {
                        sw.WriteLine("catfile<-\"{0}\"", catfile);
                        sw.WriteLine("outputdir<-\"{0}\"", options.OutputDirectory);
                        sw.WriteLine("ispdf<-{0}", options.PdfGraph ? "1" : "0");

                        // Read the template once. If it contains a "#predefine_end" marker, every
                        // line up to and including the first marker line is replaced by the three
                        // definitions above; otherwise the whole template is appended.
                        var templateLines = File.ReadAllLines(rfile);
                        var firstBodyLine = Array.FindIndex(templateLines, l => l.Contains("#predefine_end")) + 1;
                        for (int i = firstBodyLine; i < templateLines.Length; i++)
                        {
                            sw.WriteLine(templateLines[i]);
                        }
                    }
                    SystemUtils.Execute("R", "--vanilla --slave -f \"" + targetrfile + "\"");
                }
            }
            return result;
        }
Exemple #3
0
        /// <summary>
        /// Streams the reads of options.InputFile, strips a trailing "CCAA", "CCA" or "CC" from
        /// each accepted read, and writes the result to <paramref name="outputFile"/>.
        /// </summary>
        /// <remarks>
        /// A trailing "CC" is only stripped when options.CCAFile flags the read name as true.
        /// Each written read gets a Reference of name + SmallRNAConsts.NTA_TAG + the stripped
        /// suffix (empty when nothing was stripped). Output goes to "&lt;outputFile&gt;.tmp"
        /// first and is moved to <paramref name="outputFile"/> once every read is processed.
        /// </remarks>
        /// <param name="accept">Filter; reads for which it returns false are skipped.</param>
        /// <param name="map">Supplies the count of each read and whether a ".dupcount" file is written.</param>
        /// <param name="outputFile">Target FASTQ file; written compressed when the name ends with ".gz".</param>
        /// <param name="dic">Counters keyed by the read length before clipping; updated in place.</param>
        private void DoProcess(Func<FastqSequence, bool> accept, SmallRNACountMap map, string outputFile, Dictionary<int, CountItem> dic)
        {
            Progress.SetMessage("Processing " + options.InputFile + " and writing to " + outputFile + "...");

            var ccaMap = new MapItemReader(0, 1).ReadFromFile(options.CCAFile).ToDictionary(m => m.Key, m => bool.Parse(m.Value.Value));

            var parser = new FastqReader();
            var writer = new FastqWriter();

            StreamWriter swCount = null;

            try
            {
                // Opened inside the try so the writer is closed even if writing the header fails.
                if (map.HasCountFile)
                {
                    swCount = new StreamWriter(outputFile + ".dupcount");
                    swCount.WriteLine("Query\tCount\tSequence");
                }

                int readcount = 0;
                var tmpFile   = outputFile + ".tmp";
                using (var sr = StreamUtils.GetReader(options.InputFile))
                {
                    using (var sw = StreamUtils.GetWriter(tmpFile, outputFile.EndsWith(".gz", StringComparison.OrdinalIgnoreCase)))
                    {
                        FastqSequence seq;
                        while ((seq = parser.Parse(sr)) != null)
                        {
                            readcount++;
                            if (readcount % 100000 == 0)
                            {
                                Progress.SetMessage("{0} reads processed", readcount);
                            }

                            if (!accept(seq))
                            {
                                continue;
                            }

                            var name     = seq.Name;
                            var sequence = seq.SeqString;
                            var score    = seq.Score;
                            var count    = map.GetCount(seq.Name);

                            // NOTE(review): each accepted read is logged to the .dupcount file twice,
                            // here with the original sequence and again after clipping -- confirm
                            // that this is intended.
                            if (swCount != null)
                            {
                                swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                            }

                            // Counters are bucketed by the length of the read before clipping.
                            CountItem item;
                            if (!dic.TryGetValue(sequence.Length, out item))
                            {
                                item = new CountItem();
                                dic[sequence.Length] = item;
                            }

                            // Longest suffix first; ordinal comparison because these are plain
                            // sequence letters, not linguistic text.
                            string clipped;
                            if (sequence.EndsWith("CCAA", StringComparison.Ordinal))
                            {
                                clipped    = "CCAA";
                                sequence   = sequence.Substring(0, sequence.Length - 4);
                                item.CCAA += count;
                            }
                            else if (sequence.EndsWith("CCA", StringComparison.Ordinal))
                            {
                                clipped   = "CCA";
                                sequence  = sequence.Substring(0, sequence.Length - 3);
                                item.CCA += count;
                            }
                            else if (sequence.EndsWith("CC", StringComparison.Ordinal))
                            {
                                // "CC" alone is only clipped when the CCA file flags this read.
                                bool isCCA;
                                if (ccaMap.TryGetValue(name, out isCCA) && isCCA)
                                {
                                    clipped  = "CC";
                                    sequence = sequence.Substring(0, sequence.Length - 2);
                                    item.CC += count;
                                }
                                else
                                {
                                    clipped      = string.Empty;
                                    item.notNTA += count;
                                }
                            }
                            else
                            {
                                clipped      = string.Empty;
                                item.notNTA += count;
                            }

                            if (!string.IsNullOrEmpty(clipped))
                            {
                                // Keep the quality string the same length as the clipped sequence.
                                var newlen = sequence.Length;
                                seq.SeqString = sequence;
                                seq.Score     = score.Substring(0, newlen);
                                seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
                            }
                            else
                            {
                                seq.Reference = string.Format("{0}{1}", name, SmallRNAConsts.NTA_TAG);
                            }
                            writer.Write(sw, seq);
                            if (swCount != null)
                            {
                                swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                            }
                        }
                    }
                }

                File.Move(tmpFile, outputFile);
            }
            finally
            {
                // Keyed on the writer itself so cleanup cannot dereference null.
                if (swCount != null)
                {
                    swCount.Close();
                }
            }
        }
    /// <summary>
    /// Writes per-group smallRNA category count files for the groups listed in options.InputFile.
    /// </summary>
    /// <remarks>
    /// Each input line holds group name, sample name and smallRNA info file, separated by tabs;
    /// shorter lines are skipped. Per group, three outputs are produced: a long-format
    /// "&lt;group&gt;.catcount" file, a pivoted "&lt;group&gt;.catcount.tsv" table, and - only
    /// when the "smallrna_category_group.r" template is found - a "&lt;group&gt;.catcount.r"
    /// script that is executed with R.
    /// </remarks>
    /// <returns>Paths of the ".catcount" and ".catcount.tsv" files that were written.</returns>
    public override IEnumerable<string> Process()
    {
      var entries = (from line in File.ReadAllLines(options.InputFile)
                     let parts = line.Split('\t')
                     where parts.Length >= 3
                     select new { GroupName = parts[0], SampleName = parts[1], SmallRNAFile = parts[2] }).ToList();

      var groups = entries.GroupBy(m => m.GroupName).ToList();

      var result = new List<string>();

      foreach (var group in groups)
      {
        var catfile = Path.Combine(options.OutputDirectory, group.Key + ".catcount");
        result.Add(catfile);
        using (var sw = new StreamWriter(catfile))
        {
          sw.WriteLine("SampleName\tCategory\tLevel\tCount");

          foreach (var entry in group)
          {
            Progress.SetMessage("Reading smallRNA mapped info file " + entry.SmallRNAFile + " ...");

            var map = new MapItemReader(0, 1, hasHeader: false).ReadFromFile(entry.SmallRNAFile);

            var totalReads = Math.Round(double.Parse(map["TotalReads"].Value));
            var mappedReads = Math.Round(double.Parse(map["MappedReads"].Value));
            var smallRNAReads = Math.Round(double.Parse(map["FeatureReads"].Value));

            // Level 0 rows carry the raw totals.
            sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, TotalReadsKey, totalReads);
            sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, MappedReadsKey, mappedReads);
            sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

            // Level 1 rows partition the total into unmapped / other mapped / smallRNA.
            sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, UnmappedKey, totalReads - mappedReads);
            sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, OtherMappedKey, mappedReads - smallRNAReads);
            sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

            // Level 2 rows: one per biotype found in the info file.
            foreach (var biotype in SmallRNAConsts.Biotypes)
            {
              if (map.ContainsKey(biotype))
              {
                sw.WriteLine("{0}\t{1}\t{2}\t{3}", entry.SampleName, biotype, 2, Math.Round(double.Parse(map[biotype].Value)));
              }
            }
          }
        }

        // Load the file back for pivoting, leaving out the level-1 smallRNA row (the same value
        // is already present at level 0). long.Parse is used because the counts originate from
        // doubles and can be larger than int.MaxValue.
        var data = (from line in File.ReadAllLines(catfile).Skip(1)
                    where !string.IsNullOrWhiteSpace(line)
                    let parts = line.Split('\t')
                    let level = double.Parse(parts[2])
                    where !(parts[1].Equals(smallRNAKey) && level == 1)
                    select new
                    {
                      SampleName = parts[0],
                      Category = parts[1],
                      Level = level,
                      Count = long.Parse(parts[3])
                    }).ToList();

        var tablefile = catfile + ".tsv";
        result.Add(tablefile);
        using (var sw = new StreamWriter(tablefile))
        {
          var samples = (from d in data
                         select d.SampleName).Distinct().OrderBy(m => m).ToList();

          sw.WriteLine("Category\t{0}", samples.Merge("\t"));

          var categories = new string[] { TotalReadsKey, MappedReadsKey, UnmappedKey, OtherMappedKey, smallRNAKey }.Union(SmallRNAConsts.Biotypes).ToList();

          Console.WriteLine(categories.Merge("\n"));

          var map = data.ToDoubleDictionary(m => m.SampleName, m => m.Category);
          foreach (var cat in categories)
          {
            // Samples lacking the category contribute an empty cell.
            sw.WriteLine("{0}\t{1}", cat,
              (from sample in samples
               let dic = map[sample]
               select dic.ContainsKey(cat) ? dic[cat].Count.ToString() : "").Merge("\t"));
          }
        }

        var rfile = new FileInfo(FileUtils.GetTemplateDir() + "/smallrna_category_group.r").FullName;
        if (File.Exists(rfile))
        {
          var targetrfile = catfile + ".r";
          using (var sw = new StreamWriter(targetrfile))
          {
            sw.WriteLine("catfile<-\"{0}\"", catfile);
            sw.WriteLine("outputdir<-\"{0}\"", options.OutputDirectory);
            sw.WriteLine("ispdf<-{0}", options.PdfGraph ? "1" : "0");

            // The template is loaded a single time. Lines up to and including the first one
            // containing "#predefine_end" are skipped (the definitions above take their place);
            // without a marker the complete template is copied.
            var templateLines = File.ReadAllLines(rfile);
            var firstBodyLine = Array.FindIndex(templateLines, l => l.Contains("#predefine_end")) + 1;
            for (int i = firstBodyLine; i < templateLines.Length; i++)
            {
              sw.WriteLine(templateLines[i]);
            }
          }
          SystemUtils.Execute("R", "--vanilla --slave -f \"" + targetrfile + "\"");
        }
      }
      return result;
    }
        /// <summary>
        /// Converts the count table in options.InputFile to FPKM values in place:
        /// count * 1e9 / (gene length * sample read total).
        /// </summary>
        /// <remarks>
        /// Gene lengths come from the "length" column (matched case-insensitively) of
        /// options.GeneLengthFile. Sample totals come from options.SampleReadsFile when that file
        /// exists; otherwise each sample's total is the sum of its counts over all genes. When
        /// options.KeyRegex is set, its first capture group replaces the gene key on both the
        /// gene-length side and the count-table side before they are matched.
        /// </remarks>
        /// <param name="sampleCounts">Receives the read total of each sample, in count-table column order.</param>
        /// <param name="geneLengths">Receives the length of each gene, in count-table row order.</param>
        /// <returns>The count table that was read, with its counts replaced by FPKM values.</returns>
        /// <exception cref="Exception">
        /// The length column is missing, or a sample or gene of the count table has no entry in
        /// the sample-reads or gene-length data.
        /// </exception>
        public GeneCountTable CalculateFPKM(out double[] sampleCounts, out double[] geneLengths)
        {
            Progress.SetMessage("Reading gene length from {0} ...", options.GeneLengthFile);
            var columnNames = FileUtils.ReadColumnNames(options.GeneLengthFile);
            var lengthIndex = columnNames.ToList().FindIndex(m => m.ToLower().Equals("length"));

            if (lengthIndex < 0)
            {
                throw new Exception("Cannot find length column in file " + options.GeneLengthFile);
            }
            var geneLengthMap = new MapItemReader(0, lengthIndex).ReadFromFile(options.GeneLengthFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));

            Progress.SetMessage("Reading count table from {0} ...", options.InputFile);
            var counts = new GeneCountTableFormat().ReadFromFile(options.InputFile);

            if (!string.IsNullOrEmpty(options.KeyRegex))
            {
                var reg = new Regex(options.KeyRegex);
                geneLengthMap = geneLengthMap.ToDictionary(l => reg.Match(l.Key).Groups[1].Value, l => l.Value);

                // Rewrite the key of EVERY gene row, not just the first one; otherwise all
                // remaining rows keep their raw key and fail the gene-length lookup below.
                for (int iGene = 0; iGene < counts.GeneValues.Count; iGene++)
                {
                    counts.GeneValues[iGene][0] = reg.Match(counts.GeneValues[iGene][0]).Groups[1].Value;
                }
            }

            Dictionary<string, double> sampleReads;

            if (File.Exists(options.SampleReadsFile))
            {
                Progress.SetMessage("Reading sample reads from {0} ...", options.SampleReadsFile);
                sampleReads = new MapItemReader(0, 1).ReadFromFile(options.SampleReadsFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));
            }
            else //use total mapped reads as total reads
            {
                sampleReads = new Dictionary<string, double>();
                for (int iSample = 0; iSample < counts.Samples.Length; iSample++)
                {
                    double itotal = 0.0;
                    for (int iGene = 0; iGene < counts.GeneValues.Count; iGene++)
                    {
                        itotal += counts.Count[iGene, iSample];
                    }

                    sampleReads[counts.Samples[iSample]] = itotal;
                }
            }

            // Fail early with a descriptive message if any sample or gene cannot be resolved.
            foreach (var sample in counts.Samples)
            {
                if (!sampleReads.ContainsKey(sample))
                {
                    throw new Exception(string.Format("No sample {0} found at sample reads file {1}", sample, options.SampleReadsFile));
                }
            }

            foreach (var geneValues in counts.GeneValues)
            {
                if (!geneLengthMap.ContainsKey(geneValues[0]))
                {
                    throw new Exception(string.Format("No gene {0} found at gene length file {1}", geneValues[0], options.GeneLengthFile));
                }
            }

            sampleCounts = (from sample in counts.Samples
                            select sampleReads[sample]).ToArray();

            geneLengths = (from geneValues in counts.GeneValues
                           select geneLengthMap[geneValues[0]]).ToArray();

            for (int iGene = 0; iGene < geneLengths.Length; iGene++)
            {
                for (int iSample = 0; iSample < sampleCounts.Length; iSample++)
                {
                    counts.Count[iGene, iSample] = counts.Count[iGene, iSample] * 1000000000 / (geneLengths[iGene] * sampleCounts[iSample]);
                }
            }
            return counts;
        }
    /// <summary>
    /// Reads options.InputFile read by read, removes a trailing "CCAA", "CCA" or "CC" from every
    /// accepted read and writes the reads to <paramref name="outputFile"/>.
    /// </summary>
    /// <remarks>
    /// A bare "CC" suffix is removed only if options.CCAFile maps the read name to true. The
    /// Reference of every written read is set to name + SmallRNAConsts.NTA_TAG + the removed
    /// suffix (nothing when the read was left intact). Reads are written to
    /// "&lt;outputFile&gt;.tmp", which is moved to <paramref name="outputFile"/> at the end.
    /// </remarks>
    /// <param name="accept">Predicate deciding which reads are processed; rejected reads are dropped.</param>
    /// <param name="map">Provides per-read counts and tells whether a ".dupcount" file is produced.</param>
    /// <param name="outputFile">Destination FASTQ path; compressed output when it ends with ".gz".</param>
    /// <param name="dic">Per-length counters (length measured before clipping); modified in place.</param>
    private void DoProcess(Func<FastqSequence, bool> accept, SmallRNACountMap map, string outputFile, Dictionary<int, CountItem> dic)
    {
      Progress.SetMessage("Processing " + options.InputFile + " and writing to " + outputFile + "...");

      var ccaMap = new MapItemReader(0, 1).ReadFromFile(options.CCAFile).ToDictionary(m => m.Key, m => bool.Parse(m.Value.Value));

      var parser = new FastqReader();
      var writer = new FastqWriter();

      StreamWriter swCount = null;

      try
      {
        // Created inside the try block so a failure while writing the header still closes it.
        if (map.HasCountFile)
        {
          swCount = new StreamWriter(outputFile + ".dupcount");
          swCount.WriteLine("Query\tCount\tSequence");
        }

        int readcount = 0;
        var tmpFile = outputFile + ".tmp";
        using (var sr = StreamUtils.GetReader(options.InputFile))
        {
          using (var sw = StreamUtils.GetWriter(tmpFile, outputFile.EndsWith(".gz", StringComparison.OrdinalIgnoreCase)))
          {
            FastqSequence seq;
            while ((seq = parser.Parse(sr)) != null)
            {
              readcount++;
              if (readcount % 100000 == 0)
              {
                Progress.SetMessage("{0} reads processed", readcount);
              }

              if (!accept(seq))
              {
                continue;
              }

              var name = seq.Name;
              var sequence = seq.SeqString;
              var score = seq.Score;
              var count = map.GetCount(seq.Name);

              // NOTE(review): the .dupcount file receives two lines per accepted read - this one
              // with the unclipped sequence and one more after clipping. Confirm this is wanted.
              if (swCount != null)
              {
                swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
              }

              // The counter bucket is chosen by the read length before any clipping.
              CountItem item;
              if (!dic.TryGetValue(sequence.Length, out item))
              {
                item = new CountItem();
                dic[sequence.Length] = item;
              }

              // Test the longest suffix first. Ordinal comparison: the data is plain sequence
              // letters, so culture rules must not take part.
              string clipped;
              if (sequence.EndsWith("CCAA", StringComparison.Ordinal))
              {
                clipped = "CCAA";
                sequence = sequence.Substring(0, sequence.Length - 4);
                item.CCAA += count;
              }
              else if (sequence.EndsWith("CCA", StringComparison.Ordinal))
              {
                clipped = "CCA";
                sequence = sequence.Substring(0, sequence.Length - 3);
                item.CCA += count;
              }
              else if (sequence.EndsWith("CC", StringComparison.Ordinal))
              {
                // A lone "CC" is clipped only for reads flagged in the CCA file.
                bool isCCA;
                if (ccaMap.TryGetValue(name, out isCCA) && isCCA)
                {
                  clipped = "CC";
                  sequence = sequence.Substring(0, sequence.Length - 2);
                  item.CC += count;
                }
                else
                {
                  clipped = string.Empty;
                  item.notNTA += count;
                }
              }
              else
              {
                clipped = string.Empty;
                item.notNTA += count;
              }

              if (!string.IsNullOrEmpty(clipped))
              {
                // Trim the quality string to the length of the clipped sequence.
                var newlen = sequence.Length;
                seq.SeqString = sequence;
                seq.Score = score.Substring(0, newlen);
                seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
              }
              else
              {
                seq.Reference = string.Format("{0}{1}", name, SmallRNAConsts.NTA_TAG);
              }
              writer.Write(sw, seq);
              if (swCount != null)
              {
                swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
              }
            }
          }
        }

        File.Move(tmpFile, outputFile);
      }
      finally
      {
        // Checks the writer itself, so the cleanup can never dereference null.
        if (swCount != null)
        {
          swCount.Close();
        }
      }
    }
    /// <summary>
    /// Merges the count files returned by options.GetCountFiles() into options.OutputFile and,
    /// depending on the options, writes category, read and contig tables next to it.
    /// </summary>
    /// <remarks>
    /// Items are merged by their first name; every query is tagged with the name of the count
    /// file it came from. The category table is written only when options.CategoryMapFile
    /// exists; the read and contig tables are controlled by options.OutputReadTable and
    /// options.OutputReadContigTable.
    /// </remarks>
    /// <returns>Paths of all files written, in the order they were produced.</returns>
    public override IEnumerable<string> Process()
    {
      var outputs = new List<string>();

      var countFiles = options.GetCountFiles();
      countFiles.Sort((left, right) => left.Name.CompareTo(right.Name));

      var format = new ChromosomeCountSlimItemXmlFormat(outputSample: true);

      // Merged items, keyed by the first name of each item.
      var mergedByName = new Dictionary<string, ChromosomeCountSlimItem>();

      int processedFiles = 0;
      foreach (var file in countFiles)
      {
        processedFiles++;
        Progress.SetMessage("Reading {0}/{1}: {2} ...", processedFiles, countFiles.Count, file.File);

        var fileItems = format.ReadFromFile(file.File);

        // Warn once per file when the first query carries no sequence.
        if (fileItems.Count > 0)
        {
          var firstQuery = fileItems[0].Queries[0];
          if (string.IsNullOrEmpty(firstQuery.Sequence))
          {
            Console.WriteLine("Didn't read in the sequence of query " + fileItems[0].Queries[0].Qname);
          }
        }

        // First pass: stamp every query with the name of the file it was read from.
        foreach (var fileItem in fileItems)
        {
          foreach (var query in fileItem.Queries)
          {
            query.Sample = file.Name;
          }
        }

        // Second pass: fold the items of this file into the merged map.
        foreach (var fileItem in fileItems)
        {
          var key = fileItem.Names.First();
          ChromosomeCountSlimItem existing;
          if (!mergedByName.TryGetValue(key, out existing))
          {
            mergedByName[key] = fileItem;
          }
          else
          {
            existing.Queries.AddRange(fileItem.Queries);
          }
        }
      }

      var counts = mergedByName.Values.ToList();

      WriteOutput(options.OutputFile, countFiles, format, counts);
      outputs.Add(options.OutputFile);

      if (File.Exists(options.CategoryMapFile))
      {
        Progress.SetMessage("Reading category map ...");
        var categoryMap = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile);
        var uniqueQueries = new HashSet<SAMChromosomeItem>(counts.SelectMany(countItem => countItem.Queries));

        var byCategory = new Dictionary<string, ChromosomeCountSlimItem>();
        foreach (var query in uniqueQueries)
        {
          // Replace the chromosomes of the query with their distinct, sorted categories.
          query.Chromosomes = query.Chromosomes
                                   .Select(chromosome => categoryMap[chromosome].Value)
                                   .Distinct()
                                   .OrderBy(category => category)
                                   .ToList();

          foreach (var category in query.Chromosomes)
          {
            ChromosomeCountSlimItem categoryItem;
            if (!byCategory.TryGetValue(category, out categoryItem))
            {
              categoryItem = new ChromosomeCountSlimItem();
              categoryItem.Names.Add(category);
              byCategory[category] = categoryItem;
            }
            categoryItem.Queries.Add(query);
          }
        }

        var catFile = Path.ChangeExtension(options.OutputFile, ".category" + Path.GetExtension(options.OutputFile));
        WriteOutput(catFile, countFiles, format, byCategory.Values.ToList());
        outputs.Add(catFile);
      }

      if (options.OutputReadTable || options.OutputReadContigTable)
      {
        Progress.SetMessage("Building sequence map...");
        var reads = SmallRNASequenceUtils.ConvertFrom(counts);

        if (options.OutputReadTable)
        {
          Progress.SetMessage("Saving read file...");
          var readOutput = Path.ChangeExtension(options.OutputFile, ".read" + Path.GetExtension(options.OutputFile));
          var readFormat = new SmallRNASequenceFormat(int.MaxValue, false);
          readFormat.WriteToFile(readOutput, reads);
          outputs.Add(readOutput);
        }

        if (options.OutputReadContigTable)
        {
          Progress.SetMessage("Building sequence contig by similarity ...");
          var contigs = SmallRNASequenceUtils.BuildContigByIdenticalSimilarity(reads, options.MinimumOverlapRate, options.MaximumExtensionBase, progress: Progress);

          Progress.SetMessage("Contig number = {0}", contigs.Count);

          Progress.SetMessage("Saving contig file...");
          var contigOutput = Path.ChangeExtension(options.OutputFile, ".contig" + Path.GetExtension(options.OutputFile));
          new SmallRNASequenceContigFormat().WriteToFile(contigOutput, contigs);
          outputs.Add(contigOutput);

          Progress.SetMessage("Saving sequence contig details...");
          var detailOutput = contigOutput + ".details";
          new SmallRNASequenceContigDetailFormat().WriteToFile(detailOutput, contigs);
          outputs.Add(detailOutput);
        }
      }

      Progress.End();

      return outputs;
    }
    /// <summary>
    /// Reads the count table from options.InputFile and replaces every count by its FPKM value,
    /// computed as count * 1e9 / (gene length * sample read total).
    /// </summary>
    /// <remarks>
    /// Gene lengths are taken from the column named "length" (any letter case) of
    /// options.GeneLengthFile. Sample read totals are taken from options.SampleReadsFile if it
    /// exists, and otherwise summed from the count table itself. If options.KeyRegex is set,
    /// gene keys on both sides are replaced by the first capture group of that regex.
    /// </remarks>
    /// <param name="sampleCounts">Set to the read total per sample, ordered like the count-table samples.</param>
    /// <param name="geneLengths">Set to the length per gene, ordered like the count-table rows.</param>
    /// <returns>The loaded count table holding FPKM values instead of counts.</returns>
    /// <exception cref="Exception">
    /// No length column was found, or the count table contains a sample or gene that is absent
    /// from the sample-reads or gene-length data.
    /// </exception>
    public GeneCountTable CalculateFPKM(out double[] sampleCounts, out double[] geneLengths)
    {
      Progress.SetMessage("Reading gene length from {0} ...", options.GeneLengthFile);
      var columnNames = FileUtils.ReadColumnNames(options.GeneLengthFile);
      var lengthIndex = columnNames.ToList().FindIndex(m => m.ToLower().Equals("length"));
      if (lengthIndex < 0)
      {
        throw new Exception("Cannot find length column in file " + options.GeneLengthFile);
      }
      var geneLengthMap = new MapItemReader(0, lengthIndex).ReadFromFile(options.GeneLengthFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));

      Progress.SetMessage("Reading count table from {0} ...", options.InputFile);
      var counts = new GeneCountTableFormat().ReadFromFile(options.InputFile);

      if (!string.IsNullOrEmpty(options.KeyRegex))
      {
        var reg = new Regex(options.KeyRegex);
        geneLengthMap = geneLengthMap.ToDictionary(l => reg.Match(l.Key).Groups[1].Value, l => l.Value);

        // Apply the regex to the key of each gene row. Rewriting only row 0 would leave all
        // other rows with raw keys that no longer match the rewritten gene-length keys.
        for (int iGene = 0; iGene < counts.GeneValues.Count; iGene++)
        {
          counts.GeneValues[iGene][0] = reg.Match(counts.GeneValues[iGene][0]).Groups[1].Value;
        }
      }

      Dictionary<string, double> sampleReads;
      if (File.Exists(options.SampleReadsFile))
      {
        Progress.SetMessage("Reading sample reads from {0} ...", options.SampleReadsFile);
        sampleReads = new MapItemReader(0, 1).ReadFromFile(options.SampleReadsFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));
      }
      else //use total mapped reads as total reads
      {
        sampleReads = new Dictionary<string, double>();
        for (int iSample = 0; iSample < counts.Samples.Length; iSample++)
        {
          double itotal = 0.0;
          for (int iGene = 0; iGene < counts.GeneValues.Count; iGene++)
          {
            itotal += counts.Count[iGene, iSample];
          }

          sampleReads[counts.Samples[iSample]] = itotal;
        }
      }

      // Validate up front so a missing sample or gene is reported with a clear message.
      foreach (var sample in counts.Samples)
      {
        if (!sampleReads.ContainsKey(sample))
        {
          throw new Exception(string.Format("No sample {0} found at sample reads file {1}", sample, options.SampleReadsFile));
        }
      }

      foreach (var geneValues in counts.GeneValues)
      {
        if (!geneLengthMap.ContainsKey(geneValues[0]))
        {
          throw new Exception(string.Format("No gene {0} found at gene length file {1}", geneValues[0], options.GeneLengthFile));
        }
      }

      sampleCounts = (from sample in counts.Samples
                      select sampleReads[sample]).ToArray();

      geneLengths = (from geneValues in counts.GeneValues
                     select geneLengthMap[geneValues[0]]).ToArray();

      for (int iGene = 0; iGene < geneLengths.Length; iGene++)
      {
        for (int iSample = 0; iSample < sampleCounts.Length; iSample++)
        {
          counts.Count[iGene, iSample] = counts.Count[iGene, iSample] * 1000000000 / (geneLengths[iGene] * sampleCounts[iSample]);
        }
      }
      return counts;
    }
Exemple #9
0
        /// <summary>
        /// Builds a tab-delimited count table with one row per distinct key taken from the
        /// information column of the additional files (written under the "Sequence" header)
        /// and one column per count file, then writes it to <c>options.OutputFile</c>.
        /// </summary>
        /// <remarks>
        /// For every count file, the names of all queries whose SAM flag does not contain
        /// <c>SAMFlags.UnmappedQuery</c> are collected, and each name is looked up in the map
        /// read from the file's additional file. Rows are ordered by descending total count
        /// across all files; a file that has no entry for a row gets "0".
        /// </remarks>
        /// <returns>A single-element array holding the full path of the output file.</returns>
        /// <exception cref="UserTerminatedException">
        /// Thrown when cancellation is pending; this is checked every 1000 lines read.
        /// </exception>
        public override IEnumerable <string> Process()
        {
            var countFiles = options.GetCountFiles();
            countFiles.Sort((left, right) => left.Name.CompareTo(right.Name));

            // file name -> (information value -> count)
            var countsByFile = new Dictionary<string, Dictionary<string, int>>();

            int fileNumber = 0;
            foreach (var file in countFiles)
            {
                fileNumber++;
                Progress.SetMessage("Reading {0}/{1}: {2} ...", fileNumber, countFiles.Count, file.File);

                // Names of every query that has at least one mapped record in this file.
                var mappedQueries = new HashSet<string>();
                using (var sr = SAMFactory.GetReader(file.File, true))
                {
                    int lineNumber = 0;
                    for (string line = sr.ReadLine(); line != null; line = sr.ReadLine())
                    {
                        lineNumber++;

                        // Poll for cancellation only once every 1000 lines.
                        if (lineNumber % 1000 == 0 && Progress.IsCancellationPending())
                        {
                            throw new UserTerminatedException();
                        }

                        var fields = line.Split('\t');
                        var flag   = (SAMFlags)int.Parse(fields[SAMFormatConst.FLAG_INDEX]);

                        // Records flagged as unmapped contribute nothing.
                        if (!flag.HasFlag(SAMFlags.UnmappedQuery))
                        {
                            mappedQueries.Add(fields[SAMFormatConst.QNAME_INDEX]);
                        }
                    }
                }

                var fileCounts = new Dictionary<string, int>();
                countsByFile[file.Name] = fileCounts;

                var queryInfo = new MapItemReader(0, 1, informationIndex: 2).ReadFromFile(file.AdditionalFile);
                foreach (var queryName in mappedQueries)
                {
                    var entry = queryInfo[queryName];
                    fileCounts[entry.Information] = int.Parse(entry.Value);
                }

                Progress.SetMessage("{0} reads mapped.", mappedQueries.Count);
            }

            // Every distinct key seen in any file, in first-seen order.
            var allSequences = countsByFile.Values
                               .SelectMany(perFile => perFile.Keys)
                               .Distinct()
                               .ToArray();

            // Total each key over all files and rank by that total, largest first.
            var rankedSequences = allSequences
                                  .Select(seq => new
                                  {
                                      Sequence = seq,
                                      Count    = countsByFile.Values
                                                 .Where(perFile => perFile.ContainsKey(seq))
                                                 .Sum(perFile => perFile[seq])
                                  })
                                  .OrderByDescending(row => row.Count)
                                  .ToArray();

            using (var sw = new StreamWriter(options.OutputFile))
            {
                sw.WriteLine("Sequence\t" + countFiles.Select(cf => cf.Name).Merge("\t"));

                foreach (var row in rankedSequences)
                {
                    sw.Write(row.Sequence);

                    foreach (var cf in countFiles)
                    {
                        int cell;
                        if (!countsByFile[cf.Name].TryGetValue(row.Sequence, out cell))
                        {
                            sw.Write("\t0");
                            continue;
                        }

                        sw.Write("\t{0}", cell);
                    }

                    sw.WriteLine();
                }
            }

            Progress.End();

            return new string[] { Path.GetFullPath(options.OutputFile) };
        }
        /// <summary>
        /// Merges the chromosome count items read from all count files, keyed by each item's
        /// first name, writes the merged table to <c>options.OutputFile</c>, and optionally
        /// writes category, read, contig and contig-detail outputs next to it.
        /// </summary>
        /// <remarks>
        /// <para>Every query read from a file is tagged with that file's name as its sample
        /// before merging.</para>
        /// <para>If <c>options.CategoryMapFile</c> exists, each query's chromosomes are replaced
        /// by their mapped category values (distinct, sorted) and a per-category table is
        /// written to a ".category" variant of the output file name.</para>
        /// <para><c>options.OutputReadTable</c> and <c>options.OutputReadContigTable</c> control
        /// the ".read" and ".contig" (plus ".details") outputs.</para>
        /// </remarks>
        /// <returns>The paths of all files written, in the order they were produced.</returns>
        public override IEnumerable <string> Process()
        {
            var outputs = new List<string>();

            // Derives a sibling file name by inserting an infix before the output file's extension.
            Func<string, string> derivedPath = infix =>
                Path.ChangeExtension(options.OutputFile, infix + Path.GetExtension(options.OutputFile));

            var countFiles = options.GetCountFiles();
            countFiles.Sort((left, right) => left.Name.CompareTo(right.Name));

            var format = new ChromosomeCountSlimItemXmlFormat(outputSample: true);

            // first name of an item -> merged item across all files
            var mergedByName = new Dictionary<string, ChromosomeCountSlimItem>();

            int fileNumber = 0;
            foreach (var file in countFiles)
            {
                fileNumber++;
                Progress.SetMessage("Reading {0}/{1}: {2} ...", fileNumber, countFiles.Count, file.File);

                var fileItems = format.ReadFromFile(file.File);

                // Report when the first query of the first item carries no sequence.
                if (fileItems.Count > 0 && string.IsNullOrEmpty(fileItems[0].Queries[0].Sequence))
                {
                    Console.WriteLine("Didn't read in the sequence of query " + fileItems[0].Queries[0].Qname);
                }

                // Tag every query with the file it came from before any merging happens.
                foreach (var fileItem in fileItems)
                {
                    foreach (var query in fileItem.Queries)
                    {
                        query.Sample = file.Name;
                    }
                }

                foreach (var fileItem in fileItems)
                {
                    var key = fileItem.Names.First();

                    ChromosomeCountSlimItem existing;
                    if (!mergedByName.TryGetValue(key, out existing))
                    {
                        mergedByName[key] = fileItem;
                        continue;
                    }

                    existing.Queries.AddRange(fileItem.Queries);
                }
            }

            var counts = mergedByName.Values.ToList();

            WriteOutput(options.OutputFile, countFiles, format, counts);
            outputs.Add(options.OutputFile);

            if (File.Exists(options.CategoryMapFile))
            {
                Progress.SetMessage("Reading category map ...");
                var categoryMap = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile);

                // A query may appear under several items; the set keeps each one once.
                var uniqueQueries = new HashSet<SAMChromosomeItem>(counts.SelectMany(c => c.Queries));

                var itemsByCategory = new Dictionary<string, ChromosomeCountSlimItem>();
                foreach (var query in uniqueQueries)
                {
                    // Replace the query's chromosomes with their categories, distinct and sorted.
                    query.Chromosomes = query.Chromosomes
                                        .Select(chrom => categoryMap[chrom].Value)
                                        .Distinct()
                                        .OrderBy(category => category)
                                        .ToList();

                    foreach (var category in query.Chromosomes)
                    {
                        ChromosomeCountSlimItem categoryItem;
                        if (!itemsByCategory.TryGetValue(category, out categoryItem))
                        {
                            categoryItem = new ChromosomeCountSlimItem();
                            categoryItem.Names.Add(category);
                            itemsByCategory[category] = categoryItem;
                        }

                        categoryItem.Queries.Add(query);
                    }
                }

                var catFile = derivedPath(".category");
                WriteOutput(catFile, countFiles, format, itemsByCategory.Values.ToList());
                outputs.Add(catFile);
            }

            if (options.OutputReadTable || options.OutputReadContigTable)
            {
                Progress.SetMessage("Building sequence map...");
                var reads = SmallRNASequenceUtils.ConvertFrom(counts);

                if (options.OutputReadTable)
                {
                    Progress.SetMessage("Saving read file...");
                    var readFile = derivedPath(".read");
                    new SmallRNASequenceFormat(int.MaxValue, false).WriteToFile(readFile, reads);
                    outputs.Add(readFile);
                }

                if (options.OutputReadContigTable)
                {
                    Progress.SetMessage("Building sequence contig by similarity ...");
                    var contigs = SmallRNASequenceUtils.BuildContigByIdenticalSimilarity(reads, options.MinimumOverlapRate, options.MaximumExtensionBase, progress: Progress);

                    Progress.SetMessage("Contig number = {0}", contigs.Count);

                    Progress.SetMessage("Saving contig file...");
                    var contigFile = derivedPath(".contig");
                    new SmallRNASequenceContigFormat().WriteToFile(contigFile, contigs);
                    outputs.Add(contigFile);

                    Progress.SetMessage("Saving sequence contig details...");
                    var detailFile = contigFile + ".details";
                    new SmallRNASequenceContigDetailFormat().WriteToFile(detailFile, contigs);
                    outputs.Add(detailFile);
                }
            }

            Progress.End();

            return outputs;
        }