Beispiel #1
0
    /// <summary>
    /// Reads the fixture table genetable.tsv and checks its dimensions, the
    /// annotation values of the first gene row, the first sample name and the
    /// counts stored in the first row.
    /// </summary>
    public void TestReadFromFile()
    {
      var table = new GeneCountTableFormat().ReadFromFile(@"../../../data/genetable.tsv");

      // Table dimensions: number of gene rows and number of samples.
      Assert.AreEqual(2, table.GeneValues.Count);
      Assert.AreEqual(4, table.Samples.Length);

      // Leading annotation values of the first gene row and the first sample name.
      var firstGene = table.GeneValues[0];
      Assert.AreEqual("ENSG00000000003", firstGene[0]);
      Assert.AreEqual("TSPAN6", firstGene[1]);
      Assert.AreEqual("IG-001", table.Samples[0]);

      // Counts of the first gene row, checked column by column in order.
      var expectedFirstRow = new[] { 128, 60, 288, 9 };
      for (int column = 0; column < expectedFirstRow.Length; column++)
      {
        Assert.AreEqual(expectedFirstRow[column], table.Count[0, column]);
      }
    }
    /// <summary>
    /// Reads the count table from <c>options.InputFile</c> and replaces every count, in place,
    /// with <c>count * 1e9 / (geneLength * sampleReads)</c>.
    /// Gene lengths come from the "length" column (matched case-insensitively) of
    /// <c>options.GeneLengthFile</c>. Per-sample read totals come from
    /// <c>options.SampleReadsFile</c> when that file exists; otherwise the column sum of the
    /// count table is used for each sample.
    /// </summary>
    /// <param name="sampleCounts">On return, the read total used for each sample, in the order of the table's samples.</param>
    /// <param name="geneLengths">On return, the gene length used for each gene row, in row order.</param>
    /// <returns>The count table that was read, with its counts overwritten by the computed values.</returns>
    /// <exception cref="Exception">
    /// Thrown when the gene length file has no "length" column, when a sample of the count table
    /// has no read total, or when a gene of the count table has no length.
    /// </exception>
    public GeneCountTable CalculateFPKM(out double[] sampleCounts, out double[] geneLengths)
    {
      const double FpkmScale = 1000000000;

      // Numbers in the input files are machine-written; parse them independently of the OS locale.
      var invariant = System.Globalization.CultureInfo.InvariantCulture;

      Progress.SetMessage("Reading gene length from {0} ...", options.GeneLengthFile);
      var columnNames = FileUtils.ReadColumnNames(options.GeneLengthFile);
      var lengthIndex = columnNames.ToList().FindIndex(m => string.Equals(m, "length", StringComparison.OrdinalIgnoreCase));
      if (lengthIndex < 0)
      {
        throw new Exception("Cannot find length column in file " + options.GeneLengthFile);
      }
      var geneLengthMap = new MapItemReader(0, lengthIndex).ReadFromFile(options.GeneLengthFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value, invariant));

      Progress.SetMessage("Reading count table from {0} ...", options.InputFile);
      var counts = new GeneCountTableFormat().ReadFromFile(options.InputFile);

      if (!string.IsNullOrEmpty(options.KeyRegex))
      {
        var reg = new Regex(options.KeyRegex);
        geneLengthMap = geneLengthMap.ToDictionary(l => reg.Match(l.Key).Groups[1].Value, l => l.Value);

        // Apply the same key extraction to EVERY gene row. Rewriting only the first row
        // would leave the remaining rows with raw keys that no longer match the
        // regex-normalized keys of the gene length map.
        foreach (var row in counts.GeneValues)
        {
          row[0] = reg.Match(row[0]).Groups[1].Value;
        }
      }

      Dictionary<string, double> sampleReads;
      if (File.Exists(options.SampleReadsFile))
      {
        Progress.SetMessage("Reading sample reads from {0} ...", options.SampleReadsFile);
        sampleReads = new MapItemReader(0, 1).ReadFromFile(options.SampleReadsFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value, invariant));
      }
      else //use total mapped reads as total reads
      {
        sampleReads = new Dictionary<string, double>();
        for (int iSample = 0; iSample < counts.Samples.Length; iSample++)
        {
          double itotal = 0.0;
          for (int iGene = 0; iGene < counts.GeneValues.Count; iGene++)
          {
            itotal += counts.Count[iGene, iSample];
          }

          sampleReads[counts.Samples[iSample]] = itotal;
        }
      }

      // Resolve the read total of every sample; a single lookup both validates and fetches.
      sampleCounts = new double[counts.Samples.Length];
      for (int iSample = 0; iSample < sampleCounts.Length; iSample++)
      {
        var sample = counts.Samples[iSample];
        double reads;
        if (!sampleReads.TryGetValue(sample, out reads))
        {
          throw new Exception(string.Format("No sample {0} found at sample reads file {1}", sample, options.SampleReadsFile));
        }
        sampleCounts[iSample] = reads;
      }

      // Resolve the length of every gene row by its key (first value of the row).
      geneLengths = new double[counts.GeneValues.Count];
      for (int iGene = 0; iGene < geneLengths.Length; iGene++)
      {
        var geneKey = counts.GeneValues[iGene][0];
        double length;
        if (!geneLengthMap.TryGetValue(geneKey, out length))
        {
          throw new Exception(string.Format("No gene {0} found at gene length file {1}", geneKey, options.GeneLengthFile));
        }
        geneLengths[iGene] = length;
      }

      // Overwrite each count in place with count * 1e9 / (gene length * sample reads).
      for (int iGene = 0; iGene < geneLengths.Length; iGene++)
      {
        for (int iSample = 0; iSample < sampleCounts.Length; iSample++)
        {
          counts.Count[iGene, iSample] = counts.Count[iGene, iSample] * FpkmScale / (geneLengths[iGene] * sampleCounts[iSample]);
        }
      }
      return counts;
    }