/// <summary>
/// Summarizes every "*.design.tsv" file found in <paramref name="dir"/> into two
/// tab-separated reports written to the same directory:
/// "design_overlap.tsv" has one row per sample with a "+" under every platform the sample
/// is listed with, and "design_summary.tsv" is a platform-by-platform matrix holding the
/// number of samples two platforms have in common.
/// </summary>
/// <param name="dir">Directory that contains the design files and receives both reports.</param>
public static void SummarizeExtractedData(string dir)
{
    var designs = Directory.GetFiles(dir, "*.design.tsv");

    // Path.Combine instead of a hard-coded "\\": with the literal backslash the reports end up
    // as oddly named files outside of dir on platforms whose separator is '/'.
    using (var sw = new StreamWriter(Path.Combine(dir, "design_overlap.tsv")))
    using (var swSummary = new StreamWriter(Path.Combine(dir, "design_summary.tsv")))
    {
        // NOTE(review): 2 and 4 are presumably the key (sample) and value (platform) column
        // indices of the design file - confirm against MapItemReader.
        var reader = new MapItemReader(2, 4);
        var data = (from design in designs
                    from item in reader.ReadFromFile(design)
                    select item).ToList();

        var dataMap = data.ToGroupDictionary(m => m.Key);
        var samples = (from d in data select d.Key).Distinct().OrderBy(m => m).ToList();
        var platforms = (from d in data select d.Value.Value).Distinct().ToList();

        // Looked up by platform first; the keys of the inner dictionary are the samples.
        var dMap = (from d in data
                    select new { Sample = d.Key, Platform = d.Value.Value })
                   .ToDoubleDictionary(m => m.Platform, m => m.Sample);

        // Overlap report: sample x platform presence table.
        sw.WriteLine("Sample\t" + platforms.Merge("\t"));
        foreach (var sample in samples)
        {
            var sampleMap = new HashSet<string>(dataMap[sample].ConvertAll(m => m.Value.Value));
            sw.WriteLine("{0}\t{1}", sample, (from p in platforms select sampleMap.Contains(p) ? "+" : "").Merge("\t"));
        }

        // Summary report: number of samples shared by each pair of platforms.
        swSummary.WriteLine("\t" + platforms.Merge("\t"));
        for (int i = 0; i < platforms.Count; i++)
        {
            swSummary.Write(platforms[i]);
            for (int j = 0; j < platforms.Count; j++)
            {
                swSummary.Write("\t{0}", dMap[platforms[i]].Keys.Intersect(dMap[platforms[j]].Keys).Count());
            }
            swSummary.WriteLine();
        }
    }
}
/// <summary>
/// Builds per-group smallRNA category count files. options.InputFile is read as a
/// tab-separated list of (group name, sample name, smallRNA info file); lines with fewer
/// than three columns are ignored. For every group this writes "{group}.catcount" (long
/// format), "{group}.catcount.tsv" (category x sample table) and, when the R template
/// "smallrna_category_group.r" exists in the template directory, "{group}.catcount.r",
/// which is then executed with R.
/// </summary>
/// <returns>The paths of the .catcount and .catcount.tsv files that were written.</returns>
public override IEnumerable<string> Process()
{
    var entries = (from line in File.ReadAllLines(options.InputFile)
                   let parts = line.Split('\t')
                   where parts.Length >= 3
                   select new { GroupName = parts[0], SampleName = parts[1], SmallRNAFile = parts[2] }).ToList();

    var groups = entries.GroupBy(m => m.GroupName).ToList();

    var result = new List<string>();
    foreach (var group in groups)
    {
        var catfile = Path.Combine(options.OutputDirectory, group.Key + ".catcount");
        result.Add(catfile);

        using (var sw = new StreamWriter(catfile))
        {
            sw.WriteLine("SampleName\tCategory\tLevel\tCount");
            foreach (var entry in group)
            {
                Progress.SetMessage("Reading smallRNA mapped info file " + entry.SmallRNAFile + " ...");
                var map = new MapItemReader(0, 1, hasHeader: false).ReadFromFile(entry.SmallRNAFile);

                var totalReads = Math.Round(double.Parse(map["TotalReads"].Value));
                var mappedReads = Math.Round(double.Parse(map["MappedReads"].Value));
                var smallRNAReads = Math.Round(double.Parse(map["FeatureReads"].Value));

                // Level 0: overall totals.
                sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, TotalReadsKey, totalReads);
                sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, MappedReadsKey, mappedReads);
                sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

                // Level 1: total reads split into unmapped / other mapped / smallRNA.
                sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, UnmappedKey, totalReads - mappedReads);
                sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, OtherMappedKey, mappedReads - smallRNAReads);
                sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

                // Level 2: one row per biotype present in the info file.
                foreach (var biotype in SmallRNAConsts.Biotypes)
                {
                    if (map.ContainsKey(biotype))
                    {
                        sw.WriteLine("{0}\t{1}\t{2}\t{3}", entry.SampleName, biotype, 2, Math.Round(double.Parse(map[biotype].Value)));
                    }
                }
            }
        }

        // Re-read the file just written. The level-1 smallRNA row repeats the level-0 one,
        // so it is dropped to keep (sample, category) unique for the table below.
        // Counts are parsed as long: read totals above int.MaxValue made int.Parse throw.
        var data = (from line in File.ReadAllLines(catfile).Skip(1)
                    where !string.IsNullOrWhiteSpace(line)
                    let parts = line.Split('\t')
                    let level = double.Parse(parts[2])
                    where !(parts[1].Equals(smallRNAKey) && level == 1)
                    select new { SampleName = parts[0], Category = parts[1], Level = level, Count = long.Parse(parts[3]) }).ToList();

        var tablefile = catfile + ".tsv";
        result.Add(tablefile);
        using (var sw = new StreamWriter(tablefile))
        {
            var samples = (from d in data select d.SampleName).Distinct().OrderBy(m => m).ToList();
            sw.WriteLine("Category\t{0}", samples.Merge("\t"));

            var categories = new string[] { TotalReadsKey, MappedReadsKey, UnmappedKey, OtherMappedKey, smallRNAKey }.Union(SmallRNAConsts.Biotypes).ToList();
            Console.WriteLine(categories.Merge("\n"));

            var map = data.ToDoubleDictionary(m => m.SampleName, m => m.Category);
            foreach (var cat in categories)
            {
                sw.WriteLine("{0}\t{1}", cat,
                    (from sample in samples
                     let dic = map[sample]
                     select dic.ContainsKey(cat) ? dic[cat].Count.ToString() : "").Merge("\t"));
            }
        }

        var rfile = new FileInfo(FileUtils.GetTemplateDir() + "/smallrna_category_group.r").FullName;
        if (File.Exists(rfile))
        {
            var targetrfile = catfile + ".r";
            using (var sw = new StreamWriter(targetrfile))
            {
                // Variable definitions that replace the template's own "predefine" section.
                sw.WriteLine("catfile<-\"{0}\"", catfile);
                sw.WriteLine("outputdir<-\"{0}\"", options.OutputDirectory);
                sw.WriteLine("ispdf<-{0}", options.PdfGraph ? "1" : "0");

                // Read the template once. Everything up to and including the first line that
                // contains "#predefine_end" is skipped; without a marker (index -1) the whole
                // template is copied.
                var templateLines = File.ReadAllLines(rfile);
                var markerIndex = Array.FindIndex(templateLines, l => l.Contains("#predefine_end"));
                foreach (var templateLine in templateLines.Skip(markerIndex + 1))
                {
                    sw.WriteLine(templateLine);
                }
            }

            SystemUtils.Execute("R", "--vanilla --slave -f \"" + targetrfile + "\"");
        }
    }

    return result;
}
/// <summary>
/// Copies the reads of options.InputFile that pass <paramref name="accept"/> to
/// <paramref name="outputFile"/>, removing a trailing "CCAA" or "CCA" from the sequence, or a
/// trailing "CC" when the CCA file (options.CCAFile) maps the read name to true. The score
/// string is shortened to match, and the read reference becomes the read name followed by
/// SmallRNAConsts.NTA_TAG and the removed suffix (empty when nothing was removed).
/// Output is written to "outputFile.tmp" first and moved into place at the end.
/// </summary>
/// <param name="accept">Filter; reads for which it returns false are skipped entirely.</param>
/// <param name="map">Supplies the count of each read; when it has a count file a
/// "outputFile.dupcount" table is written as well.</param>
/// <param name="outputFile">Target FASTQ file; a name ending in ".gz" is passed to the writer
/// factory as true for its second argument.</param>
/// <param name="dic">Per-length statistics, keyed by the sequence length before clipping;
/// updated in place with the CCAA / CCA / CC / notNTA counts.</param>
private void DoProcess(Func<FastqSequence, bool> accept, SmallRNACountMap map, string outputFile, Dictionary<int, CountItem> dic)
{
    Progress.SetMessage("Processing " + options.InputFile + " and writing to " + outputFile + "...");

    var ccaMap = new MapItemReader(0, 1).ReadFromFile(options.CCAFile).ToDictionary(m => m.Key, m => bool.Parse(m.Value.Value));

    var parser = new FastqReader();
    var writer = new FastqWriter();

    StreamWriter swCount = null;
    try
    {
        // Created inside the try so the writer is closed even if writing the header fails.
        if (map.HasCountFile)
        {
            swCount = new StreamWriter(outputFile + ".dupcount");
            swCount.WriteLine("Query\tCount\tSequence");
        }

        int readcount = 0;
        var tmpFile = outputFile + ".tmp";
        using (var sr = StreamUtils.GetReader(options.InputFile))
        {
            using (var sw = StreamUtils.GetWriter(tmpFile, outputFile.EndsWith(".gz", StringComparison.OrdinalIgnoreCase)))
            {
                FastqSequence seq;
                while ((seq = parser.Parse(sr)) != null)
                {
                    readcount++;
                    if (readcount % 100000 == 0)
                    {
                        Progress.SetMessage("{0} reads processed", readcount);
                    }

                    if (!accept(seq))
                    {
                        continue;
                    }

                    var name = seq.Name;
                    var sequence = seq.SeqString;
                    var score = seq.Score;

                    var count = map.GetCount(seq.Name);
                    if (swCount != null)
                    {
                        // Row for the read as it was read, before any clipping.
                        swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                    }

                    // Statistics are bucketed by the length of the unclipped sequence.
                    CountItem item;
                    if (!dic.TryGetValue(sequence.Length, out item))
                    {
                        item = new CountItem();
                        dic[sequence.Length] = item;
                    }

                    // Longest suffix first. Ordinal comparison: these are plain nucleotide
                    // letters, culture rules must not take part in the match.
                    string clipped;
                    if (sequence.EndsWith("CCAA", StringComparison.Ordinal))
                    {
                        clipped = "CCAA";
                        sequence = sequence.Substring(0, sequence.Length - 4);
                        item.CCAA += count;
                    }
                    else if (sequence.EndsWith("CCA", StringComparison.Ordinal))
                    {
                        clipped = "CCA";
                        sequence = sequence.Substring(0, sequence.Length - 3);
                        item.CCA += count;
                    }
                    else if (sequence.EndsWith("CC", StringComparison.Ordinal))
                    {
                        // "CC" is only clipped for reads the CCA file marks as true.
                        bool isCCA;
                        if (ccaMap.TryGetValue(name, out isCCA) && isCCA)
                        {
                            clipped = "CC";
                            sequence = sequence.Substring(0, sequence.Length - 2);
                            item.CC += count;
                        }
                        else
                        {
                            clipped = string.Empty;
                            item.notNTA += count;
                        }
                    }
                    else
                    {
                        clipped = string.Empty;
                        item.notNTA += count;
                    }

                    if (!string.IsNullOrEmpty(clipped))
                    {
                        var newlen = sequence.Length;
                        seq.SeqString = sequence;
                        seq.Score = score.Substring(0, newlen);
                        seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
                    }
                    else
                    {
                        seq.Reference = string.Format("{0}{1}", name, SmallRNAConsts.NTA_TAG);
                    }

                    writer.Write(sw, seq);

                    if (swCount != null)
                    {
                        // NOTE(review): second .dupcount row for the same read, written after
                        // Reference/SeqString were updated. Kept as in the original; confirm
                        // that two rows per read are intended.
                        swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                    }
                }
            }
        }

        File.Move(tmpFile, outputFile);
    }
    finally
    {
        if (swCount != null)
        {
            swCount.Close();
        }
    }
}
/// <summary>
/// Builds per-group smallRNA category count files. options.InputFile is read as a
/// tab-separated list of (group name, sample name, smallRNA info file); lines with fewer
/// than three columns are ignored. For every group this writes "{group}.catcount" (long
/// format), "{group}.catcount.tsv" (category x sample table) and, when the R template
/// "smallrna_category_group.r" exists in the template directory, "{group}.catcount.r",
/// which is then executed with R.
/// </summary>
/// <returns>The paths of the .catcount and .catcount.tsv files that were written.</returns>
public override IEnumerable<string> Process()
{
    var entries = (from line in File.ReadAllLines(options.InputFile)
                   let parts = line.Split('\t')
                   where parts.Length >= 3
                   select new { GroupName = parts[0], SampleName = parts[1], SmallRNAFile = parts[2] }).ToList();

    var groups = entries.GroupBy(m => m.GroupName).ToList();

    var result = new List<string>();
    foreach (var group in groups)
    {
        var catfile = Path.Combine(options.OutputDirectory, group.Key + ".catcount");
        result.Add(catfile);

        using (var sw = new StreamWriter(catfile))
        {
            sw.WriteLine("SampleName\tCategory\tLevel\tCount");
            foreach (var entry in group)
            {
                Progress.SetMessage("Reading smallRNA mapped info file " + entry.SmallRNAFile + " ...");
                var map = new MapItemReader(0, 1, hasHeader: false).ReadFromFile(entry.SmallRNAFile);

                var totalReads = Math.Round(double.Parse(map["TotalReads"].Value));
                var mappedReads = Math.Round(double.Parse(map["MappedReads"].Value));
                var smallRNAReads = Math.Round(double.Parse(map["FeatureReads"].Value));

                // Level 0: overall totals.
                sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, TotalReadsKey, totalReads);
                sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, MappedReadsKey, mappedReads);
                sw.WriteLine("{0}\t{1}\t0\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

                // Level 1: total reads split into unmapped / other mapped / smallRNA.
                sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, UnmappedKey, totalReads - mappedReads);
                sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, OtherMappedKey, mappedReads - smallRNAReads);
                sw.WriteLine("{0}\t{1}\t1\t{2}", entry.SampleName, smallRNAKey, smallRNAReads);

                // Level 2: one row per biotype present in the info file.
                foreach (var biotype in SmallRNAConsts.Biotypes)
                {
                    if (map.ContainsKey(biotype))
                    {
                        sw.WriteLine("{0}\t{1}\t{2}\t{3}", entry.SampleName, biotype, 2, Math.Round(double.Parse(map[biotype].Value)));
                    }
                }
            }
        }

        // Re-read the file just written. The level-1 smallRNA row repeats the level-0 one,
        // so it is dropped to keep (sample, category) unique for the table below.
        // Counts are parsed as long: read totals above int.MaxValue made int.Parse throw.
        var data = (from line in File.ReadAllLines(catfile).Skip(1)
                    where !string.IsNullOrWhiteSpace(line)
                    let parts = line.Split('\t')
                    let level = double.Parse(parts[2])
                    where !(parts[1].Equals(smallRNAKey) && level == 1)
                    select new { SampleName = parts[0], Category = parts[1], Level = level, Count = long.Parse(parts[3]) }).ToList();

        var tablefile = catfile + ".tsv";
        result.Add(tablefile);
        using (var sw = new StreamWriter(tablefile))
        {
            var samples = (from d in data select d.SampleName).Distinct().OrderBy(m => m).ToList();
            sw.WriteLine("Category\t{0}", samples.Merge("\t"));

            var categories = new string[] { TotalReadsKey, MappedReadsKey, UnmappedKey, OtherMappedKey, smallRNAKey }.Union(SmallRNAConsts.Biotypes).ToList();
            Console.WriteLine(categories.Merge("\n"));

            var map = data.ToDoubleDictionary(m => m.SampleName, m => m.Category);
            foreach (var cat in categories)
            {
                sw.WriteLine("{0}\t{1}", cat,
                    (from sample in samples
                     let dic = map[sample]
                     select dic.ContainsKey(cat) ? dic[cat].Count.ToString() : "").Merge("\t"));
            }
        }

        var rfile = new FileInfo(FileUtils.GetTemplateDir() + "/smallrna_category_group.r").FullName;
        if (File.Exists(rfile))
        {
            var targetrfile = catfile + ".r";
            using (var sw = new StreamWriter(targetrfile))
            {
                // Variable definitions that replace the template's own "predefine" section.
                sw.WriteLine("catfile<-\"{0}\"", catfile);
                sw.WriteLine("outputdir<-\"{0}\"", options.OutputDirectory);
                sw.WriteLine("ispdf<-{0}", options.PdfGraph ? "1" : "0");

                // Read the template once. Everything up to and including the first line that
                // contains "#predefine_end" is skipped; without a marker (index -1) the whole
                // template is copied.
                var templateLines = File.ReadAllLines(rfile);
                var markerIndex = Array.FindIndex(templateLines, l => l.Contains("#predefine_end"));
                foreach (var templateLine in templateLines.Skip(markerIndex + 1))
                {
                    sw.WriteLine(templateLine);
                }
            }

            SystemUtils.Execute("R", "--vanilla --slave -f \"" + targetrfile + "\"");
        }
    }

    return result;
}
/// <summary>
/// Reads the count table (options.InputFile) and replaces every count, in place, with
/// count * 1000000000 / (gene length * sample read total).
/// Gene lengths come from the "length" column (case-insensitive header match) of
/// options.GeneLengthFile. Sample read totals come from options.SampleReadsFile when that file
/// exists; otherwise each sample's total is the sum of its counts in the table.
/// When options.KeyRegex is set, its first capture group replaces the gene key in both the
/// gene length map and every row of the count table before they are matched.
/// </summary>
/// <param name="sampleCounts">Read total used for each sample, in counts.Samples order.</param>
/// <param name="geneLengths">Length used for each gene, in counts.GeneValues order.</param>
/// <returns>The count table that was read, now holding the normalized values.</returns>
/// <exception cref="Exception">The length column is missing, or a sample / gene of the count
/// table has no entry in the sample reads / gene length data.</exception>
public GeneCountTable CalculateFPKM(out double[] sampleCounts, out double[] geneLengths)
{
    Progress.SetMessage("Reading gene length from {0} ...", options.GeneLengthFile);
    var columnNames = FileUtils.ReadColumnNames(options.GeneLengthFile);
    var lengthIndex = columnNames.ToList().FindIndex(m => m.ToLower().Equals("length"));
    if (lengthIndex < 0)
    {
        throw new Exception("Cannot find length column in file " + options.GeneLengthFile);
    }

    var geneLengthMap = new MapItemReader(0, lengthIndex).ReadFromFile(options.GeneLengthFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));

    Progress.SetMessage("Reading count table from {0} ...", options.InputFile);
    var counts = new GeneCountTableFormat().ReadFromFile(options.InputFile);

    if (!string.IsNullOrEmpty(options.KeyRegex))
    {
        var reg = new Regex(options.KeyRegex);
        geneLengthMap = geneLengthMap.ToDictionary(l => reg.Match(l.Key).Groups[1].Value, l => l.Value);

        // Normalize the key of EVERY gene row. Previously only GeneValues[0][0] was rewritten,
        // so all other genes kept their raw names and no longer matched the rewritten
        // gene-length keys.
        foreach (var row in counts.GeneValues)
        {
            row[0] = reg.Match(row[0]).Groups[1].Value;
        }
    }

    Dictionary<string, double> sampleReads;
    if (File.Exists(options.SampleReadsFile))
    {
        Progress.SetMessage("Reading sample reads from {0} ...", options.SampleReadsFile);
        sampleReads = new MapItemReader(0, 1).ReadFromFile(options.SampleReadsFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));
    }
    else // use total mapped reads as total reads
    {
        sampleReads = new Dictionary<string, double>();
        for (int iSample = 0; iSample < counts.Samples.Length; iSample++)
        {
            double itotal = 0.0;
            for (int iGene = 0; iGene < counts.GeneValues.Count; iGene++)
            {
                itotal += counts.Count[iGene, iSample];
            }

            sampleReads[counts.Samples[iSample]] = itotal;
        }
    }

    // Fail early, with the offending name, before any value is overwritten.
    foreach (var sample in counts.Samples)
    {
        if (!sampleReads.ContainsKey(sample))
        {
            throw new Exception(string.Format("No sample {0} found at sample reads file {1}", sample, options.SampleReadsFile));
        }
    }

    foreach (var geneValues in counts.GeneValues)
    {
        if (!geneLengthMap.ContainsKey(geneValues[0]))
        {
            throw new Exception(string.Format("No gene {0} found at gene length file {1}", geneValues[0], options.GeneLengthFile));
        }
    }

    sampleCounts = (from sample in counts.Samples select sampleReads[sample]).ToArray();
    geneLengths = (from geneValues in counts.GeneValues select geneLengthMap[geneValues[0]]).ToArray();

    for (int iGene = 0; iGene < geneLengths.Length; iGene++)
    {
        for (int iSample = 0; iSample < sampleCounts.Length; iSample++)
        {
            counts.Count[iGene, iSample] = counts.Count[iGene, iSample] * 1000000000 / (geneLengths[iGene] * sampleCounts[iSample]);
        }
    }

    return counts;
}
/// <summary>
/// Copies the reads of options.InputFile that pass <paramref name="accept"/> to
/// <paramref name="outputFile"/>, removing a trailing "CCAA" or "CCA" from the sequence, or a
/// trailing "CC" when the CCA file (options.CCAFile) maps the read name to true. The score
/// string is shortened to match, and the read reference becomes the read name followed by
/// SmallRNAConsts.NTA_TAG and the removed suffix (empty when nothing was removed).
/// Output is written to "outputFile.tmp" first and moved into place at the end.
/// </summary>
/// <param name="accept">Filter; reads for which it returns false are skipped entirely.</param>
/// <param name="map">Supplies the count of each read; when it has a count file a
/// "outputFile.dupcount" table is written as well.</param>
/// <param name="outputFile">Target FASTQ file; a name ending in ".gz" is passed to the writer
/// factory as true for its second argument.</param>
/// <param name="dic">Per-length statistics, keyed by the sequence length before clipping;
/// updated in place with the CCAA / CCA / CC / notNTA counts.</param>
private void DoProcess(Func<FastqSequence, bool> accept, SmallRNACountMap map, string outputFile, Dictionary<int, CountItem> dic)
{
    Progress.SetMessage("Processing " + options.InputFile + " and writing to " + outputFile + "...");

    var ccaMap = new MapItemReader(0, 1).ReadFromFile(options.CCAFile).ToDictionary(m => m.Key, m => bool.Parse(m.Value.Value));

    var parser = new FastqReader();
    var writer = new FastqWriter();

    StreamWriter swCount = null;
    try
    {
        // Created inside the try so the writer is closed even if writing the header fails.
        if (map.HasCountFile)
        {
            swCount = new StreamWriter(outputFile + ".dupcount");
            swCount.WriteLine("Query\tCount\tSequence");
        }

        int readcount = 0;
        var tmpFile = outputFile + ".tmp";
        using (var sr = StreamUtils.GetReader(options.InputFile))
        {
            using (var sw = StreamUtils.GetWriter(tmpFile, outputFile.EndsWith(".gz", StringComparison.OrdinalIgnoreCase)))
            {
                FastqSequence seq;
                while ((seq = parser.Parse(sr)) != null)
                {
                    readcount++;
                    if (readcount % 100000 == 0)
                    {
                        Progress.SetMessage("{0} reads processed", readcount);
                    }

                    if (!accept(seq))
                    {
                        continue;
                    }

                    var name = seq.Name;
                    var sequence = seq.SeqString;
                    var score = seq.Score;

                    var count = map.GetCount(seq.Name);
                    if (swCount != null)
                    {
                        // Row for the read as it was read, before any clipping.
                        swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                    }

                    // Statistics are bucketed by the length of the unclipped sequence.
                    CountItem item;
                    if (!dic.TryGetValue(sequence.Length, out item))
                    {
                        item = new CountItem();
                        dic[sequence.Length] = item;
                    }

                    // Longest suffix first. Ordinal comparison: these are plain nucleotide
                    // letters, culture rules must not take part in the match.
                    string clipped;
                    if (sequence.EndsWith("CCAA", StringComparison.Ordinal))
                    {
                        clipped = "CCAA";
                        sequence = sequence.Substring(0, sequence.Length - 4);
                        item.CCAA += count;
                    }
                    else if (sequence.EndsWith("CCA", StringComparison.Ordinal))
                    {
                        clipped = "CCA";
                        sequence = sequence.Substring(0, sequence.Length - 3);
                        item.CCA += count;
                    }
                    else if (sequence.EndsWith("CC", StringComparison.Ordinal))
                    {
                        // "CC" is only clipped for reads the CCA file marks as true.
                        bool isCCA;
                        if (ccaMap.TryGetValue(name, out isCCA) && isCCA)
                        {
                            clipped = "CC";
                            sequence = sequence.Substring(0, sequence.Length - 2);
                            item.CC += count;
                        }
                        else
                        {
                            clipped = string.Empty;
                            item.notNTA += count;
                        }
                    }
                    else
                    {
                        clipped = string.Empty;
                        item.notNTA += count;
                    }

                    if (!string.IsNullOrEmpty(clipped))
                    {
                        var newlen = sequence.Length;
                        seq.SeqString = sequence;
                        seq.Score = score.Substring(0, newlen);
                        seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
                    }
                    else
                    {
                        seq.Reference = string.Format("{0}{1}", name, SmallRNAConsts.NTA_TAG);
                    }

                    writer.Write(sw, seq);

                    if (swCount != null)
                    {
                        // NOTE(review): second .dupcount row for the same read, written after
                        // Reference/SeqString were updated. Kept as in the original; confirm
                        // that two rows per read are intended.
                        swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                    }
                }
            }
        }

        File.Move(tmpFile, outputFile);
    }
    finally
    {
        if (swCount != null)
        {
            swCount.Close();
        }
    }
}
/// <summary>
/// Merges the count files returned by options.GetCountFiles() (sorted by name) into a single
/// table at options.OutputFile. Items that share the same first name are combined by
/// appending their queries, and every query is tagged with the name of the file it came from.
/// Optional extra outputs, each named by inserting a tag before the output file's extension:
/// ".category" when options.CategoryMapFile exists, ".read" when options.OutputReadTable is
/// set, and ".contig" plus ".contig...details" when options.OutputReadContigTable is set.
/// </summary>
/// <returns>Paths of all files written, in the order they were produced.</returns>
public override IEnumerable<string> Process()
{
    var outputs = new List<string>();

    var sampleFiles = options.GetCountFiles();
    sampleFiles.Sort((left, right) => left.Name.CompareTo(right.Name));

    var xmlFormat = new ChromosomeCountSlimItemXmlFormat(outputSample: true);
    var mergedByName = new Dictionary<string, ChromosomeCountSlimItem>();

    int position = 0;
    foreach (var sampleFile in sampleFiles)
    {
        position++;
        Progress.SetMessage("Reading {0}/{1}: {2} ...", position, sampleFiles.Count, sampleFile.File);

        var loaded = xmlFormat.ReadFromFile(sampleFile.File);

        // Sanity check on the very first query only: warn when its sequence is missing.
        if (loaded.Count > 0)
        {
            var firstQuery = loaded[0].Queries[0];
            if (string.IsNullOrEmpty(firstQuery.Sequence))
            {
                Console.WriteLine("Didn't read in the sequence of query " + firstQuery.Qname);
            }
        }

        // Tag every query with the file it came from.
        foreach (var loadedItem in loaded)
        {
            foreach (var query in loadedItem.Queries)
            {
                query.Sample = sampleFile.Name;
            }
        }

        // Merge into the running map, keyed by the first name of each item.
        foreach (var loadedItem in loaded)
        {
            var key = loadedItem.Names.First();
            ChromosomeCountSlimItem known;
            if (!mergedByName.TryGetValue(key, out known))
            {
                mergedByName[key] = loadedItem;
            }
            else
            {
                known.Queries.AddRange(loadedItem.Queries);
            }
        }
    }

    var counts = mergedByName.Values.ToList();
    WriteOutput(options.OutputFile, sampleFiles, xmlFormat, counts);
    outputs.Add(options.OutputFile);

    if (File.Exists(options.CategoryMapFile))
    {
        Progress.SetMessage("Reading category map ...");
        var categoryOf = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile);

        var distinctQueries = new HashSet<SAMChromosomeItem>(counts.SelectMany(c => c.Queries));
        var byCategory = new Dictionary<string, ChromosomeCountSlimItem>();
        foreach (var query in distinctQueries)
        {
            // Replace the chromosome names of the query by their (distinct, sorted) categories.
            query.Chromosomes = query.Chromosomes
                .Select(chrom => categoryOf[chrom].Value)
                .Distinct()
                .OrderBy(m => m)
                .ToList();

            foreach (var category in query.Chromosomes)
            {
                ChromosomeCountSlimItem bucket;
                if (!byCategory.TryGetValue(category, out bucket))
                {
                    bucket = new ChromosomeCountSlimItem();
                    bucket.Names.Add(category);
                    byCategory[category] = bucket;
                }

                bucket.Queries.Add(query);
            }
        }

        var categoryFile = Path.ChangeExtension(options.OutputFile, ".category" + Path.GetExtension(options.OutputFile));
        WriteOutput(categoryFile, sampleFiles, xmlFormat, byCategory.Values.ToList());
        outputs.Add(categoryFile);
    }

    bool needReads = options.OutputReadTable || options.OutputReadContigTable;
    if (needReads)
    {
        Progress.SetMessage("Building sequence map...");
        var reads = SmallRNASequenceUtils.ConvertFrom(counts);

        if (options.OutputReadTable)
        {
            Progress.SetMessage("Saving read file...");
            var readFile = Path.ChangeExtension(options.OutputFile, ".read" + Path.GetExtension(options.OutputFile));
            new SmallRNASequenceFormat(int.MaxValue, false).WriteToFile(readFile, reads);
            outputs.Add(readFile);
        }

        if (options.OutputReadContigTable)
        {
            Progress.SetMessage("Building sequence contig by similarity ...");
            var contigs = SmallRNASequenceUtils.BuildContigByIdenticalSimilarity(reads, options.MinimumOverlapRate, options.MaximumExtensionBase, progress: Progress);
            Progress.SetMessage("Contig number = {0}", contigs.Count);

            Progress.SetMessage("Saving contig file...");
            var contigFile = Path.ChangeExtension(options.OutputFile, ".contig" + Path.GetExtension(options.OutputFile));
            new SmallRNASequenceContigFormat().WriteToFile(contigFile, contigs);
            outputs.Add(contigFile);

            Progress.SetMessage("Saving sequence contig details...");
            var detailFile = contigFile + ".details";
            new SmallRNASequenceContigDetailFormat().WriteToFile(detailFile, contigs);
            outputs.Add(detailFile);
        }
    }

    Progress.End();

    return outputs;
}
/// <summary>
/// Reads the count table (options.InputFile) and replaces every count, in place, with
/// count * 1000000000 / (gene length * sample read total).
/// Gene lengths come from the "length" column (case-insensitive header match) of
/// options.GeneLengthFile. Sample read totals come from options.SampleReadsFile when that file
/// exists; otherwise each sample's total is the sum of its counts in the table.
/// When options.KeyRegex is set, its first capture group replaces the gene key in both the
/// gene length map and every row of the count table before they are matched.
/// </summary>
/// <param name="sampleCounts">Read total used for each sample, in counts.Samples order.</param>
/// <param name="geneLengths">Length used for each gene, in counts.GeneValues order.</param>
/// <returns>The count table that was read, now holding the normalized values.</returns>
/// <exception cref="Exception">The length column is missing, or a sample / gene of the count
/// table has no entry in the sample reads / gene length data.</exception>
public GeneCountTable CalculateFPKM(out double[] sampleCounts, out double[] geneLengths)
{
    Progress.SetMessage("Reading gene length from {0} ...", options.GeneLengthFile);
    var columnNames = FileUtils.ReadColumnNames(options.GeneLengthFile);
    var lengthIndex = columnNames.ToList().FindIndex(m => m.ToLower().Equals("length"));
    if (lengthIndex < 0)
    {
        throw new Exception("Cannot find length column in file " + options.GeneLengthFile);
    }

    var geneLengthMap = new MapItemReader(0, lengthIndex).ReadFromFile(options.GeneLengthFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));

    Progress.SetMessage("Reading count table from {0} ...", options.InputFile);
    var counts = new GeneCountTableFormat().ReadFromFile(options.InputFile);

    if (!string.IsNullOrEmpty(options.KeyRegex))
    {
        var reg = new Regex(options.KeyRegex);
        geneLengthMap = geneLengthMap.ToDictionary(l => reg.Match(l.Key).Groups[1].Value, l => l.Value);

        // Normalize the key of EVERY gene row. Previously only GeneValues[0][0] was rewritten,
        // so all other genes kept their raw names and no longer matched the rewritten
        // gene-length keys.
        foreach (var row in counts.GeneValues)
        {
            row[0] = reg.Match(row[0]).Groups[1].Value;
        }
    }

    Dictionary<string, double> sampleReads;
    if (File.Exists(options.SampleReadsFile))
    {
        Progress.SetMessage("Reading sample reads from {0} ...", options.SampleReadsFile);
        sampleReads = new MapItemReader(0, 1).ReadFromFile(options.SampleReadsFile).ToDictionary(m => m.Key, m => double.Parse(m.Value.Value));
    }
    else // use total mapped reads as total reads
    {
        sampleReads = new Dictionary<string, double>();
        for (int iSample = 0; iSample < counts.Samples.Length; iSample++)
        {
            double itotal = 0.0;
            for (int iGene = 0; iGene < counts.GeneValues.Count; iGene++)
            {
                itotal += counts.Count[iGene, iSample];
            }

            sampleReads[counts.Samples[iSample]] = itotal;
        }
    }

    // Fail early, with the offending name, before any value is overwritten.
    foreach (var sample in counts.Samples)
    {
        if (!sampleReads.ContainsKey(sample))
        {
            throw new Exception(string.Format("No sample {0} found at sample reads file {1}", sample, options.SampleReadsFile));
        }
    }

    foreach (var geneValues in counts.GeneValues)
    {
        if (!geneLengthMap.ContainsKey(geneValues[0]))
        {
            throw new Exception(string.Format("No gene {0} found at gene length file {1}", geneValues[0], options.GeneLengthFile));
        }
    }

    sampleCounts = (from sample in counts.Samples select sampleReads[sample]).ToArray();
    geneLengths = (from geneValues in counts.GeneValues select geneLengthMap[geneValues[0]]).ToArray();

    for (int iGene = 0; iGene < geneLengths.Length; iGene++)
    {
        for (int iSample = 0; iSample < sampleCounts.Length; iSample++)
        {
            counts.Count[iGene, iSample] = counts.Count[iGene, iSample] * 1000000000 / (geneLengths[iGene] * sampleCounts[iSample]);
        }
    }

    return counts;
}
/// <summary>
/// Builds a sequence-by-sample count table. For every count file (sorted by name) the SAM
/// file is scanned for the names of queries that are not flagged as unmapped; each such
/// query is then looked up in the file's additional (count) file, and the parsed count is
/// stored under the entry's Information field. The table written to options.OutputFile has
/// one row per distinct Information value, ordered by its total across all files
/// (descending), with 0 for files that do not contain it.
/// </summary>
/// <returns>A single-element array holding the full path of options.OutputFile.</returns>
/// <exception cref="UserTerminatedException">Cancellation was requested while reading a SAM
/// file (checked every 1000 lines).</exception>
public override IEnumerable<string> Process()
{
    var sampleFiles = options.GetCountFiles();
    sampleFiles.Sort((left, right) => left.Name.CompareTo(right.Name));

    var countsBySample = new Dictionary<string, Dictionary<string, int>>();

    int position = 0;
    foreach (var sampleFile in sampleFiles)
    {
        position++;
        Progress.SetMessage("Reading {0}/{1}: {2} ...", position, sampleFiles.Count, sampleFile.File);

        // Names of all queries with at least one record not flagged as unmapped.
        var mappedNames = new HashSet<string>();
        using (var sr = SAMFactory.GetReader(sampleFile.File, true))
        {
            int lineNumber = 0;
            string samLine;
            while ((samLine = sr.ReadLine()) != null)
            {
                lineNumber++;
                if (lineNumber % 1000 == 0 && Progress.IsCancellationPending())
                {
                    throw new UserTerminatedException();
                }

                var fields = samLine.Split('\t');
                var flag = (SAMFlags)int.Parse(fields[SAMFormatConst.FLAG_INDEX]);
                if (!flag.HasFlag(SAMFlags.UnmappedQuery))
                {
                    mappedNames.Add(fields[SAMFormatConst.QNAME_INDEX]);
                }
            }
        }

        var sampleCounts = new Dictionary<string, int>();
        countsBySample[sampleFile.Name] = sampleCounts;

        var countEntries = new MapItemReader(0, 1, informationIndex: 2).ReadFromFile(sampleFile.AdditionalFile);
        foreach (var queryName in mappedNames)
        {
            var entry = countEntries[queryName];
            sampleCounts[entry.Information] = int.Parse(entry.Value);
        }

        Progress.SetMessage("{0} reads mapped.", mappedNames.Count);
    }

    var uniques = countsBySample.Values
        .SelectMany(sample => sample.Keys)
        .Distinct()
        .ToArray();

    // Total of each sequence over all samples, largest first.
    var ranked = uniques
        .Select(sequence => new
        {
            Sequence = sequence,
            Count = countsBySample.Values
                .Where(sample => sample.ContainsKey(sequence))
                .Select(sample => sample[sequence])
                .Sum()
        })
        .OrderByDescending(m => m.Count)
        .ToArray();

    using (var sw = new StreamWriter(options.OutputFile))
    {
        sw.WriteLine("Sequence\t" + sampleFiles.Select(cf => cf.Name).Merge("\t"));

        foreach (var row in ranked)
        {
            sw.Write(row.Sequence);
            foreach (var cf in sampleFiles)
            {
                int found;
                var cell = countsBySample[cf.Name].TryGetValue(row.Sequence, out found) ? found : 0;
                sw.Write("\t{0}", cell);
            }

            sw.WriteLine();
        }
    }

    Progress.End();

    return new string[] { Path.GetFullPath(options.OutputFile) };
}
/// <summary>
/// Merges the count files returned by options.GetCountFiles() (sorted by name) into a single
/// table at options.OutputFile. Items that share the same first name are combined by
/// appending their queries, and every query is tagged with the name of the file it came from.
/// Optional extra outputs, each named by inserting a tag before the output file's extension:
/// ".category" when options.CategoryMapFile exists, ".read" when options.OutputReadTable is
/// set, and ".contig" plus ".contig...details" when options.OutputReadContigTable is set.
/// </summary>
/// <returns>Paths of all files written, in the order they were produced.</returns>
public override IEnumerable<string> Process()
{
    var outputs = new List<string>();

    var sampleFiles = options.GetCountFiles();
    sampleFiles.Sort((left, right) => left.Name.CompareTo(right.Name));

    var xmlFormat = new ChromosomeCountSlimItemXmlFormat(outputSample: true);
    var mergedByName = new Dictionary<string, ChromosomeCountSlimItem>();

    int position = 0;
    foreach (var sampleFile in sampleFiles)
    {
        position++;
        Progress.SetMessage("Reading {0}/{1}: {2} ...", position, sampleFiles.Count, sampleFile.File);

        var loaded = xmlFormat.ReadFromFile(sampleFile.File);

        // Sanity check on the very first query only: warn when its sequence is missing.
        if (loaded.Count > 0)
        {
            var firstQuery = loaded[0].Queries[0];
            if (string.IsNullOrEmpty(firstQuery.Sequence))
            {
                Console.WriteLine("Didn't read in the sequence of query " + firstQuery.Qname);
            }
        }

        // Tag every query with the file it came from.
        foreach (var loadedItem in loaded)
        {
            foreach (var query in loadedItem.Queries)
            {
                query.Sample = sampleFile.Name;
            }
        }

        // Merge into the running map, keyed by the first name of each item.
        foreach (var loadedItem in loaded)
        {
            var key = loadedItem.Names.First();
            ChromosomeCountSlimItem known;
            if (!mergedByName.TryGetValue(key, out known))
            {
                mergedByName[key] = loadedItem;
            }
            else
            {
                known.Queries.AddRange(loadedItem.Queries);
            }
        }
    }

    var counts = mergedByName.Values.ToList();
    WriteOutput(options.OutputFile, sampleFiles, xmlFormat, counts);
    outputs.Add(options.OutputFile);

    if (File.Exists(options.CategoryMapFile))
    {
        Progress.SetMessage("Reading category map ...");
        var categoryOf = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile);

        var distinctQueries = new HashSet<SAMChromosomeItem>(counts.SelectMany(c => c.Queries));
        var byCategory = new Dictionary<string, ChromosomeCountSlimItem>();
        foreach (var query in distinctQueries)
        {
            // Replace the chromosome names of the query by their (distinct, sorted) categories.
            query.Chromosomes = query.Chromosomes
                .Select(chrom => categoryOf[chrom].Value)
                .Distinct()
                .OrderBy(m => m)
                .ToList();

            foreach (var category in query.Chromosomes)
            {
                ChromosomeCountSlimItem bucket;
                if (!byCategory.TryGetValue(category, out bucket))
                {
                    bucket = new ChromosomeCountSlimItem();
                    bucket.Names.Add(category);
                    byCategory[category] = bucket;
                }

                bucket.Queries.Add(query);
            }
        }

        var categoryFile = Path.ChangeExtension(options.OutputFile, ".category" + Path.GetExtension(options.OutputFile));
        WriteOutput(categoryFile, sampleFiles, xmlFormat, byCategory.Values.ToList());
        outputs.Add(categoryFile);
    }

    bool needReads = options.OutputReadTable || options.OutputReadContigTable;
    if (needReads)
    {
        Progress.SetMessage("Building sequence map...");
        var reads = SmallRNASequenceUtils.ConvertFrom(counts);

        if (options.OutputReadTable)
        {
            Progress.SetMessage("Saving read file...");
            var readFile = Path.ChangeExtension(options.OutputFile, ".read" + Path.GetExtension(options.OutputFile));
            new SmallRNASequenceFormat(int.MaxValue, false).WriteToFile(readFile, reads);
            outputs.Add(readFile);
        }

        if (options.OutputReadContigTable)
        {
            Progress.SetMessage("Building sequence contig by similarity ...");
            var contigs = SmallRNASequenceUtils.BuildContigByIdenticalSimilarity(reads, options.MinimumOverlapRate, options.MaximumExtensionBase, progress: Progress);
            Progress.SetMessage("Contig number = {0}", contigs.Count);

            Progress.SetMessage("Saving contig file...");
            var contigFile = Path.ChangeExtension(options.OutputFile, ".contig" + Path.GetExtension(options.OutputFile));
            new SmallRNASequenceContigFormat().WriteToFile(contigFile, contigs);
            outputs.Add(contigFile);

            Progress.SetMessage("Saving sequence contig details...");
            var detailFile = contigFile + ".details";
            new SmallRNASequenceContigDetailFormat().WriteToFile(detailFile, contigs);
            outputs.Add(detailFile);
        }
    }

    Progress.End();

    return outputs;
}