/// <summary>
/// Builds smallRNA category count files for every group listed in <c>options.InputFile</c>.
/// Each input line is tab-delimited: group name, sample name, smallRNA mapped info file;
/// lines with fewer than three fields are ignored.
/// For every group this writes "&lt;group&gt;.catcount" (one row per sample/category/level),
/// "&lt;group&gt;.catcount.tsv" (category-by-sample table) and, when the R template
/// "smallrna_category_group.r" exists, "&lt;group&gt;.catcount.r", which is then run with R.
/// </summary>
/// <returns>The paths of the .catcount and .tsv files that were written.</returns>
public override IEnumerable<string> Process()
{
    var entries = (from line in File.ReadAllLines(options.InputFile)
                   let parts = line.Split('\t')
                   where parts.Length >= 3
                   select new { GroupName = parts[0], SampleName = parts[1], SmallRNAFile = parts[2] }).ToList();

    var groups = entries.GroupBy(m => m.GroupName).ToList();

    var result = new List<string>();
    foreach (var group in groups)
    {
        var catfile = Path.Combine(options.OutputDirectory, group.Key + ".catcount");
        result.Add(catfile);

        using (var sw = new StreamWriter(catfile))
        {
            sw.WriteLine("SampleName\tCategory\tLevel\tCount");
            foreach (var entry in group)
            {
                Progress.SetMessage("Reading smallRNA mapped info file " + entry.SmallRNAFile + " ...");
                var map = new MapItemReader(0, 1, hasHeader: false).ReadFromFile(entry.SmallRNAFile);

                var totalReads = Math.Round(double.Parse(map["TotalReads"].Value));
                var mappedReads = Math.Round(double.Parse(map["MappedReads"].Value));
                var smallRNAReads = Math.Round(double.Parse(map["FeatureReads"].Value));

                // Level 0: overall totals.
                sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, TotalReadsKey, totalReads);
                sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, MappedReadsKey, mappedReads);
                sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

                // Level 1: total reads split into unmapped / other mapped / smallRNA.
                sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, UnmappedKey, totalReads - mappedReads);
                sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, OtherMappedKey, mappedReads - smallRNAReads);
                sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

                // Level 2: one row per biotype present in the info file.
                foreach (var biotype in SmallRNAConsts.Biotypes)
                {
                    if (map.ContainsKey(biotype))
                    {
                        sw.WriteLine("{0}\t{1}\t{2}\t{3}", entry.SampleName, biotype, 2, Math.Round(double.Parse(map[biotype].Value)));
                    }
                }
            }
        }

        // Read the long-format file back. The level 1 smallRNA row duplicates the level 0 row,
        // so it is dropped to keep one value per (sample, category).
        // Counts are parsed as long: they were written from doubles and can exceed int.MaxValue.
        var data = (from line in File.ReadAllLines(catfile).Skip(1)
                    where !string.IsNullOrWhiteSpace(line)
                    let parts = line.Split('\t')
                    let level = int.Parse(parts[2])
                    where !(parts[1].Equals(smallRNAKey) && level == 1)
                    select new
                    {
                        SampleName = parts[0],
                        Category = parts[1],
                        Level = level,
                        Count = long.Parse(parts[3])
                    }).ToList();

        var tablefile = catfile + ".tsv";
        result.Add(tablefile);
        using (var sw = new StreamWriter(tablefile))
        {
            var samples = (from d in data select d.SampleName).Distinct().OrderBy(m => m).ToList();
            sw.WriteLine("Category\t{0}", samples.Merge("\t"));

            var categories = new string[] { TotalReadsKey, MappedReadsKey, UnmappedKey, OtherMappedKey, smallRNAKey }.Union(SmallRNAConsts.Biotypes).ToList();
            Console.WriteLine(categories.Merge("\n"));

            var map = data.ToDoubleDictionary(m => m.SampleName, m => m.Category);
            foreach (var cat in categories)
            {
                // A sample without this category gets an empty cell.
                sw.WriteLine("{0}\t{1}", cat,
                    (from sample in samples
                     let dic = map[sample]
                     select dic.ContainsKey(cat) ? dic[cat].Count.ToString() : "").Merge("\t"));
            }
        }

        var rfile = new FileInfo(FileUtils.GetTemplateDir() + "/smallrna_category_group.r").FullName;
        if (File.Exists(rfile))
        {
            var targetrfile = catfile + ".r";
            using (var sw = new StreamWriter(targetrfile))
            {
                // R treats '\' inside a string literal as an escape character, so Windows paths
                // are written with forward slashes, which R accepts on every platform.
                sw.WriteLine("catfile<-\"{0}\"", catfile.Replace('\\', '/'));
                sw.WriteLine("outputdir<-\"{0}\"", options.OutputDirectory.Replace('\\', '/'));
                sw.WriteLine("ispdf<-{0}", options.PdfGraph ? "1" : "0");

                // Copy the template once. When it contains a "#predefine_end" marker, everything
                // up to and including the first marker line is skipped, because the definitions
                // written above replace that section. Without a marker (index -1) all lines are copied.
                var templateLines = File.ReadAllLines(rfile);
                var markerIndex = Array.FindIndex(templateLines, l => l.Contains("#predefine_end"));
                foreach (var templateLine in templateLines.Skip(markerIndex + 1))
                {
                    sw.WriteLine(templateLine);
                }
            }

            SystemUtils.Execute("R", "--vanilla --slave -f \"" + targetrfile + "\"");
        }
    }

    return result;
}
/// <summary>
/// Reads the count table in <c>options.InputFile</c> and replaces every count with
/// count * 10^9 / (gene length * sample read total).
/// Gene lengths come from the "length" column of <c>options.GeneLengthFile</c>. Sample read
/// totals come from <c>options.SampleReadsFile</c> when that file exists; otherwise each
/// sample's total is the sum of its counts in the table.
/// When <c>options.KeyRegex</c> is set, gene keys in both the length file and the count table
/// are replaced by the regex's first capture group before they are matched.
/// </summary>
/// <param name="sampleCounts">Receives the read total used for each sample, in table sample order.</param>
/// <param name="geneLengths">Receives the length used for each gene, in table row order.</param>
/// <returns>The count table that was read, with its counts overwritten by the computed values.</returns>
/// <exception cref="Exception">
/// The gene length file has no "length" column, a sample has no read total,
/// or a gene has no entry in the gene length file.
/// </exception>
public GeneCountTable CalculateFPKM(out double[] sampleCounts, out double[] geneLengths)
{
    Progress.SetMessage("Reading gene length from {0} ...", options.GeneLengthFile);
    var columnNames = FileUtils.ReadColumnNames(options.GeneLengthFile);
    var lengthIndex = columnNames.ToList().FindIndex(m => m.Equals("length", StringComparison.OrdinalIgnoreCase));
    if (lengthIndex < 0)
    {
        throw new Exception("Cannot find length column in file " + options.GeneLengthFile);
    }

    var geneLengthMap = new MapItemReader(0, lengthIndex).ReadFromFile(options.GeneLengthFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));

    Progress.SetMessage("Reading count table from {0} ...", options.InputFile);
    var counts = new GeneCountTableFormat().ReadFromFile(options.InputFile);

    if (!string.IsNullOrEmpty(options.KeyRegex))
    {
        var reg = new Regex(options.KeyRegex);
        geneLengthMap = geneLengthMap.ToDictionary(l => reg.Match(l.Key).Groups[1].Value, l => l.Value);

        // Every gene row must get the same key transformation as the length map; rewriting
        // only the first row would make all other genes fail the length lookup below.
        foreach (var geneValues in counts.GeneValues)
        {
            geneValues[0] = reg.Match(geneValues[0]).Groups[1].Value;
        }
    }

    Dictionary<string, double> sampleReads;
    if (File.Exists(options.SampleReadsFile))
    {
        Progress.SetMessage("Reading sample reads from {0} ...", options.SampleReadsFile);
        sampleReads = new MapItemReader(0, 1).ReadFromFile(options.SampleReadsFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));
    }
    else
    {
        // No sample reads file: use the total of each sample's counts as its read total.
        sampleReads = new Dictionary<string, double>();
        for (int iSample = 0; iSample < counts.Samples.Length; iSample++)
        {
            double itotal = 0.0;
            for (int iGene = 0; iGene < counts.GeneValues.Count; iGene++)
            {
                itotal += counts.Count[iGene, iSample];
            }

            sampleReads[counts.Samples[iSample]] = itotal;
        }
    }

    // Validate all lookups up front so a missing key is reported by name.
    foreach (var sample in counts.Samples)
    {
        if (!sampleReads.ContainsKey(sample))
        {
            throw new Exception(string.Format("No sample {0} found at sample reads file {1}", sample, options.SampleReadsFile));
        }
    }

    foreach (var geneValues in counts.GeneValues)
    {
        if (!geneLengthMap.ContainsKey(geneValues[0]))
        {
            throw new Exception(string.Format("No gene {0} found at gene length file {1}", geneValues[0], options.GeneLengthFile));
        }
    }

    sampleCounts = (from sample in counts.Samples
                    select sampleReads[sample]).ToArray();

    geneLengths = (from geneValues in counts.GeneValues
                   select geneLengthMap[geneValues[0]]).ToArray();

    for (int iGene = 0; iGene < geneLengths.Length; iGene++)
    {
        for (int iSample = 0; iSample < sampleCounts.Length; iSample++)
        {
            counts.Count[iGene, iSample] = counts.Count[iGene, iSample] * 1000000000 / (geneLengths[iGene] * sampleCounts[iSample]);
        }
    }

    return counts;
}
/// <summary>
/// Builds smallRNA category count files for every group listed in <c>options.InputFile</c>.
/// Each input line is tab-delimited: group name, sample name, smallRNA mapped info file;
/// lines with fewer than three fields are ignored.
/// For every group this writes "&lt;group&gt;.catcount" (one row per sample/category/level),
/// "&lt;group&gt;.catcount.tsv" (category-by-sample table) and, when the R template
/// "smallrna_category_group.r" exists, "&lt;group&gt;.catcount.r", which is then run with R.
/// </summary>
/// <returns>The paths of the .catcount and .tsv files that were written.</returns>
public override IEnumerable<string> Process()
{
    var entries = (from line in File.ReadAllLines(options.InputFile)
                   let parts = line.Split('\t')
                   where parts.Length >= 3
                   select new { GroupName = parts[0], SampleName = parts[1], SmallRNAFile = parts[2] }).ToList();

    var groups = entries.GroupBy(m => m.GroupName).ToList();

    var result = new List<string>();
    foreach (var group in groups)
    {
        var catfile = Path.Combine(options.OutputDirectory, group.Key + ".catcount");
        result.Add(catfile);

        using (var sw = new StreamWriter(catfile))
        {
            sw.WriteLine("SampleName\tCategory\tLevel\tCount");
            foreach (var entry in group)
            {
                Progress.SetMessage("Reading smallRNA mapped info file " + entry.SmallRNAFile + " ...");
                var map = new MapItemReader(0, 1, hasHeader: false).ReadFromFile(entry.SmallRNAFile);

                var totalReads = Math.Round(double.Parse(map["TotalReads"].Value));
                var mappedReads = Math.Round(double.Parse(map["MappedReads"].Value));
                var smallRNAReads = Math.Round(double.Parse(map["FeatureReads"].Value));

                // Level 0: overall totals.
                sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, TotalReadsKey, totalReads);
                sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, MappedReadsKey, mappedReads);
                sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

                // Level 1: total reads split into unmapped / other mapped / smallRNA.
                sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, UnmappedKey, totalReads - mappedReads);
                sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, OtherMappedKey, mappedReads - smallRNAReads);
                sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

                // Level 2: one row per biotype present in the info file.
                foreach (var biotype in SmallRNAConsts.Biotypes)
                {
                    if (map.ContainsKey(biotype))
                    {
                        sw.WriteLine("{0}\t{1}\t{2}\t{3}", entry.SampleName, biotype, 2, Math.Round(double.Parse(map[biotype].Value)));
                    }
                }
            }
        }

        // Read the long-format file back. The level 1 smallRNA row duplicates the level 0 row,
        // so it is dropped to keep one value per (sample, category).
        // Counts are parsed as long: they were written from doubles and can exceed int.MaxValue.
        var data = (from line in File.ReadAllLines(catfile).Skip(1)
                    where !string.IsNullOrWhiteSpace(line)
                    let parts = line.Split('\t')
                    let level = int.Parse(parts[2])
                    where !(parts[1].Equals(smallRNAKey) && level == 1)
                    select new
                    {
                        SampleName = parts[0],
                        Category = parts[1],
                        Level = level,
                        Count = long.Parse(parts[3])
                    }).ToList();

        var tablefile = catfile + ".tsv";
        result.Add(tablefile);
        using (var sw = new StreamWriter(tablefile))
        {
            var samples = (from d in data select d.SampleName).Distinct().OrderBy(m => m).ToList();
            sw.WriteLine("Category\t{0}", samples.Merge("\t"));

            var categories = new string[] { TotalReadsKey, MappedReadsKey, UnmappedKey, OtherMappedKey, smallRNAKey }.Union(SmallRNAConsts.Biotypes).ToList();
            Console.WriteLine(categories.Merge("\n"));

            var map = data.ToDoubleDictionary(m => m.SampleName, m => m.Category);
            foreach (var cat in categories)
            {
                // A sample without this category gets an empty cell.
                sw.WriteLine("{0}\t{1}", cat,
                    (from sample in samples
                     let dic = map[sample]
                     select dic.ContainsKey(cat) ? dic[cat].Count.ToString() : "").Merge("\t"));
            }
        }

        var rfile = new FileInfo(FileUtils.GetTemplateDir() + "/smallrna_category_group.r").FullName;
        if (File.Exists(rfile))
        {
            var targetrfile = catfile + ".r";
            using (var sw = new StreamWriter(targetrfile))
            {
                // R treats '\' inside a string literal as an escape character, so Windows paths
                // are written with forward slashes, which R accepts on every platform.
                sw.WriteLine("catfile<-\"{0}\"", catfile.Replace('\\', '/'));
                sw.WriteLine("outputdir<-\"{0}\"", options.OutputDirectory.Replace('\\', '/'));
                sw.WriteLine("ispdf<-{0}", options.PdfGraph ? "1" : "0");

                // Copy the template once. When it contains a "#predefine_end" marker, everything
                // up to and including the first marker line is skipped, because the definitions
                // written above replace that section. Without a marker (index -1) all lines are copied.
                var templateLines = File.ReadAllLines(rfile);
                var markerIndex = Array.FindIndex(templateLines, l => l.Contains("#predefine_end"));
                foreach (var templateLine in templateLines.Skip(markerIndex + 1))
                {
                    sw.WriteLine(templateLine);
                }
            }

            SystemUtils.Execute("R", "--vanilla --slave -f \"" + targetrfile + "\"");
        }
    }

    return result;
}
/// <summary>
/// Reads the count table in <c>options.InputFile</c> and replaces every count with
/// count * 10^9 / (gene length * sample read total).
/// Gene lengths come from the "length" column of <c>options.GeneLengthFile</c>. Sample read
/// totals come from <c>options.SampleReadsFile</c> when that file exists; otherwise each
/// sample's total is the sum of its counts in the table.
/// When <c>options.KeyRegex</c> is set, gene keys in both the length file and the count table
/// are replaced by the regex's first capture group before they are matched.
/// </summary>
/// <param name="sampleCounts">Receives the read total used for each sample, in table sample order.</param>
/// <param name="geneLengths">Receives the length used for each gene, in table row order.</param>
/// <returns>The count table that was read, with its counts overwritten by the computed values.</returns>
/// <exception cref="Exception">
/// The gene length file has no "length" column, a sample has no read total,
/// or a gene has no entry in the gene length file.
/// </exception>
public GeneCountTable CalculateFPKM(out double[] sampleCounts, out double[] geneLengths)
{
    Progress.SetMessage("Reading gene length from {0} ...", options.GeneLengthFile);
    var columnNames = FileUtils.ReadColumnNames(options.GeneLengthFile);
    var lengthIndex = columnNames.ToList().FindIndex(m => m.Equals("length", StringComparison.OrdinalIgnoreCase));
    if (lengthIndex < 0)
    {
        throw new Exception("Cannot find length column in file " + options.GeneLengthFile);
    }

    var geneLengthMap = new MapItemReader(0, lengthIndex).ReadFromFile(options.GeneLengthFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));

    Progress.SetMessage("Reading count table from {0} ...", options.InputFile);
    var counts = new GeneCountTableFormat().ReadFromFile(options.InputFile);

    if (!string.IsNullOrEmpty(options.KeyRegex))
    {
        var reg = new Regex(options.KeyRegex);
        geneLengthMap = geneLengthMap.ToDictionary(l => reg.Match(l.Key).Groups[1].Value, l => l.Value);

        // Every gene row must get the same key transformation as the length map; rewriting
        // only the first row would make all other genes fail the length lookup below.
        foreach (var geneValues in counts.GeneValues)
        {
            geneValues[0] = reg.Match(geneValues[0]).Groups[1].Value;
        }
    }

    Dictionary<string, double> sampleReads;
    if (File.Exists(options.SampleReadsFile))
    {
        Progress.SetMessage("Reading sample reads from {0} ...", options.SampleReadsFile);
        sampleReads = new MapItemReader(0, 1).ReadFromFile(options.SampleReadsFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));
    }
    else
    {
        // No sample reads file: use the total of each sample's counts as its read total.
        sampleReads = new Dictionary<string, double>();
        for (int iSample = 0; iSample < counts.Samples.Length; iSample++)
        {
            double itotal = 0.0;
            for (int iGene = 0; iGene < counts.GeneValues.Count; iGene++)
            {
                itotal += counts.Count[iGene, iSample];
            }

            sampleReads[counts.Samples[iSample]] = itotal;
        }
    }

    // Validate all lookups up front so a missing key is reported by name.
    foreach (var sample in counts.Samples)
    {
        if (!sampleReads.ContainsKey(sample))
        {
            throw new Exception(string.Format("No sample {0} found at sample reads file {1}", sample, options.SampleReadsFile));
        }
    }

    foreach (var geneValues in counts.GeneValues)
    {
        if (!geneLengthMap.ContainsKey(geneValues[0]))
        {
            throw new Exception(string.Format("No gene {0} found at gene length file {1}", geneValues[0], options.GeneLengthFile));
        }
    }

    sampleCounts = (from sample in counts.Samples
                    select sampleReads[sample]).ToArray();

    geneLengths = (from geneValues in counts.GeneValues
                   select geneLengthMap[geneValues[0]]).ToArray();

    for (int iGene = 0; iGene < geneLengths.Length; iGene++)
    {
        for (int iSample = 0; iSample < sampleCounts.Length; iSample++)
        {
            counts.Count[iGene, iSample] = counts.Count[iGene, iSample] * 1000000000 / (geneLengths[iGene] * sampleCounts[iSample]);
        }
    }

    return counts;
}