/// <summary>
/// Reads the mapped miRNA groups from <c>options.InputFile</c> and writes one
/// tab-separated row per group to <c>options.OutputFile</c>. Each row holds the
/// group's display name and location, the number of aligned locations, the number
/// with zero mismatches, the numbers whose mismatch string is accepted by
/// <c>reg5</c> and by <c>reg3</c>, and the remainder.
/// </summary>
/// <param name="useless">Not used; every path is taken from <c>options</c>.</param>
/// <returns>An array containing only <c>options.OutputFile</c>.</returns>
public override IEnumerable<string> Process(string useless)
{
    var groups = new MappedMirnaGroupXmlFileFormat().ReadFromFile(options.InputFile);

    using (var sw = new StreamWriter(options.OutputFile))
    {
        sw.WriteLine("miRNA\tLocation\tTotalCount\tPerfectMatch\tMiss5_2\tMiss3_3\tMissInternal");

        foreach (var group in groups)
        {
            var items = group.GetAlignedLocations();

            var pmcount = 0;
            var mis5 = 0;
            var mis3 = 0;

            foreach (var loc in items)
            {
                if (loc.NumberOfMismatch == 0)
                {
                    pmcount++;
                    continue;
                }

                var positions = loc.MismatchPositions;
                var reversed = new string(positions.Reverse().ToArray());

                // reg5 is tested against the reversed string on the '-' strand,
                // reg3 against the reversed string on the '+' strand.
                // The two tests are independent: one location may count for both.
                if (reg5.Match(loc.Strand == '-' ? reversed : positions).Success)
                {
                    mis5++;
                }

                if (reg3.Match(loc.Strand == '+' ? reversed : positions).Success)
                {
                    mis3++;
                }
            }

            // Last column: locations that are neither perfect nor counted by reg5/reg3.
            sw.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}",
                group.DisplayName,
                group.DisplayLocation,
                items.Count,
                pmcount,
                mis5,
                mis3,
                items.Count - pmcount - mis5 - mis3);
        }
    }

    return new string[] { options.OutputFile };
}
/// <summary>
/// Loads every count file returned by <c>options.GetCountFiles()</c> and writes four
/// feature-by-sample tables: the plain count table (<c>options.OutputFile</c>), the NTA
/// table (<c>options.NTAFile</c>), the isomiR table (<c>options.IsomirFile</c>) and the
/// isomiR NTA table (<c>options.IsomirNTAFile</c>). Rows are produced by <c>OutputCount</c>.
/// </summary>
/// <returns>The four output file names: count, isomiR, NTA, isomiR NTA.</returns>
public override IEnumerable<string> Process()
{
    // sample name -> (group display name -> mapped group)
    var groupsBySample = new Dictionary<string, Dictionary<string, MappedMirnaGroup>>();
    foreach (var countFile in options.GetCountFiles())
    {
        Progress.SetMessage("Reading miRNA mapped file " + countFile.File + " ...");
        var groups = new MappedMirnaGroupXmlFileFormat().ReadFromFile(countFile.File);
        groupsBySample[countFile.Name] = groups.ToDictionary(m => m.DisplayName);
    }

    // Union of all feature names over all samples, sorted.
    var featureNames = groupsBySample.Values
        .SelectMany(perSample => perSample.Keys)
        .Distinct()
        .OrderBy(name => name)
        .ToList();
    var sampleNames = groupsBySample.Keys.OrderBy(name => name).ToList();

    // All four tables share the same header line.
    var header = string.Format("Feature\tLocation\tSequence\t{0}", sampleNames.Merge("\t"));

    using (StreamWriter countWriter = new StreamWriter(options.OutputFile))
    using (StreamWriter ntaWriter = new StreamWriter(options.NTAFile))
    using (StreamWriter isomirWriter = new StreamWriter(options.IsomirFile))
    using (StreamWriter isomirNtaWriter = new StreamWriter(options.IsomirNTAFile))
    {
        countWriter.WriteLine(header);
        ntaWriter.WriteLine(header);
        isomirWriter.WriteLine(header);
        isomirNtaWriter.WriteLine(header);

        foreach (var featureName in featureNames)
        {
            OutputCount(countWriter, groupsBySample, featureName, sampleNames, MirnaConsts.NO_OFFSET, false, "");
            OutputCount(ntaWriter, groupsBySample, featureName, sampleNames, MirnaConsts.NO_OFFSET, true, "");

            // One row per offset 0..2 in each isomiR table.
            OutputCount(isomirWriter, groupsBySample, featureName, sampleNames, 0, false, "_+_0");
            OutputCount(isomirWriter, groupsBySample, featureName, sampleNames, 1, false, "_+_1");
            OutputCount(isomirWriter, groupsBySample, featureName, sampleNames, 2, false, "_+_2");

            OutputCount(isomirNtaWriter, groupsBySample, featureName, sampleNames, 0, true, "_+_0");
            OutputCount(isomirNtaWriter, groupsBySample, featureName, sampleNames, 1, true, "_+_1");
            OutputCount(isomirNtaWriter, groupsBySample, featureName, sampleNames, 2, true, "_+_2");
        }
    }

    return new List<string>
    {
        options.OutputFile,
        options.IsomirFile,
        options.NTAFile,
        options.IsomirNTAFile
    };
}
/// <summary>
/// Reads mapped miRNA groups from <paramref name="countXmlFile"/> and prunes them in place.
/// An aligned location is kept only when <c>GetNotGsnapMismatch</c> returns a non-null
/// mismatch for which <c>IsMutation('T', 'C')</c> is true. Positions, mapped regions,
/// miRNAs and groups left empty by the pruning are removed.
/// </summary>
/// <param name="countXmlFile">Path of the mapped miRNA XML file to read.</param>
/// <returns>The pruned list of groups.</returns>
public List<MappedMirnaGroup> Build(string countXmlFile)
{
    var groups = new MappedMirnaGroupXmlFileFormat().ReadFromFile(countXmlFile);

    foreach (var mirnaGroup in groups)
    {
        foreach (var mirna in mirnaGroup)
        {
            foreach (var region in mirna.MappedRegions)
            {
                // Iterate over a snapshot of the keys: entries are removed inside the loop.
                foreach (var position in region.Mapped.Keys.ToList())
                {
                    var mapped = region.Mapped[position];

                    mapped.AlignedLocations.RemoveAll(q =>
                    {
                        var snp = q.GetNotGsnapMismatch(q.Parent.Sequence);
                        return snp == null || !snp.IsMutation('T', 'C');
                    });

                    if (mapped.AlignedLocations.Count == 0)
                    {
                        region.Mapped.Remove(position);
                    }
                }
            }

            mirna.MappedRegions.RemoveAll(l => l.Mapped.Count == 0);
        }

        mirnaGroup.RemoveAll(n => n.MappedRegions.Count == 0);
    }

    groups.RemoveAll(m => m.Count == 0);
    return groups;
}
/// <summary>
/// Builds per-group small RNA category count files from the tab-separated list in
/// <c>options.InputFile</c> (columns: group, sample, smallRNA mapped file, optional miRNA
/// mapped file; lines with fewer than three columns are ignored).
/// For each group it writes <c>&lt;group&gt;.catcount</c> (long format), a wide
/// <c>.catcount.tsv</c> table and, when the R template exists, a <c>.catcount.r</c> script
/// that is then executed with R.
/// If none of the listed miRNA files exists, the work is delegated to
/// <c>SmallRNACategoryGroupPlusBuilder</c>.
/// </summary>
/// <returns>The names of the generated <c>.catcount</c> and <c>.catcount.tsv</c> files.</returns>
public override IEnumerable<string> Process()
{
    var entries = (from line in File.ReadAllLines(options.InputFile)
                   let parts = line.Split('\t')
                   where parts.Length >= 3
                   let mirna = parts.Length == 3 ? string.Empty : parts[3]
                   select new
                   {
                       GroupName = parts[0],
                       SampleName = parts[1],
                       SmallRNAFile = parts[2],
                       MiRNAFile = mirna
                   }).ToList();

    if (entries.All(m => !File.Exists(m.MiRNAFile)))
    {
        return new SmallRNACategoryGroupPlusBuilder(options) { Progress = this.Progress }.Process();
    }

    // 2570-KCV-01-19.bam.count.mapped.xml => 2570-KCV-01-19.bam.info
    // (three extensions are stripped, then ".info" is appended, in the same directory).
    Func<string, string> getInfoFile = mappedFile => Path.Combine(
        Path.GetDirectoryName(mappedFile),
        Path.GetFileNameWithoutExtension(Path.GetFileNameWithoutExtension(Path.GetFileNameWithoutExtension(mappedFile))) + ".info");

    // Backslash starts an escape sequence inside an R string literal, so Windows paths
    // must not be written verbatim into the generated script. R accepts '/' on Windows.
    // On platforms where '\' is not the separator the path is left untouched.
    Func<string, string> toRPath = path =>
        Path.DirectorySeparatorChar == '\\' ? path.Replace('\\', '/') : path;

    var groups = entries.GroupBy(m => m.GroupName).ToList();
    var result = new List<string>();

    foreach (var group in groups)
    {
        var catfile = Path.Combine(options.OutputDirectory, group.Key + ".catcount");
        result.Add(catfile);

        using (var sw = new StreamWriter(catfile))
        {
            sw.WriteLine("SampleName\tCategory\tLevel\tCount");

            foreach (var entry in group)
            {
                Progress.SetMessage("Reading smallRNA mapped file " + entry.SmallRNAFile + " ...");
                var others = new MappedItemGroupXmlFileFormat().ReadFromFile(entry.SmallRNAFile);

                // The item name is split at ':' into the three fields passed to QueryRecord.
                var otherQueries = (from g in others
                                    from m in g
                                    from mr in m.MappedRegions
                                    from loc in mr.AlignedLocations
                                    select new QueryRecord(
                                        loc.Parent.Qname,
                                        m.Name.StringBefore(":"),
                                        m.Name.StringAfter(":").StringBefore(":"),
                                        m.Name.StringAfter(":").StringAfter(":"),
                                        loc.Parent.QueryCount)).ToGroupDictionary(m => m.Query);
                Progress.SetMessage("Reading smallRNA mapped file finished, {0} queries mapped.", otherQueries.Count);

                var infofile = getInfoFile(entry.SmallRNAFile);

                if (File.Exists(entry.MiRNAFile))
                {
                    // When a miRNA file is available, its .info file takes precedence.
                    infofile = getInfoFile(entry.MiRNAFile);

                    Progress.SetMessage("Reading miRNA mapped file " + entry.MiRNAFile + " ...");
                    var mirnas = new MappedMirnaGroupXmlFileFormat().ReadFromFile(entry.MiRNAFile);
                    var mirnaQueries = (from g in mirnas
                                        from m in g
                                        from mr in m.MappedRegions
                                        from mapped in mr.Mapped.Values
                                        from loc in mapped.AlignedLocations
                                        select new QueryRecord(
                                            loc.Parent.Qname.StringBefore(":CLIP_"),
                                            "miRNA",
                                            "miRNA",
                                            m.Name,
                                            loc.Parent.QueryCount)).ToGroupDictionary(m => m.Query);
                    Progress.SetMessage("Reading miRNA mapped file finished, {0} queries mapped.", mirnaQueries.Count);

                    // Merge the miRNA records into the smallRNA records, keyed by query.
                    foreach (var q in mirnaQueries)
                    {
                        List<QueryRecord> rec;
                        if (otherQueries.TryGetValue(q.Key, out rec))
                        {
                            rec.AddRange(q.Value);
                        }
                        else
                        {
                            otherQueries[q.Key] = q.Value;
                        }
                    }

                    Progress.SetMessage("Total {0} queries mapped.", otherQueries.Count);
                }

                // Configured categories first, then every biotype present in the data.
                var counts = new List<CategoryCount>();
                FillCounts(counts, options.Categories, otherQueries);

                var othercategories = (from v in otherQueries.Values
                                       from item in v
                                       select item.Biotype).Distinct().OrderBy(m => m).ToList();
                FillCounts(counts, othercategories, otherQueries);

                // The level 0/1 summary rows are written only when the .info file exists.
                if (File.Exists(infofile))
                {
                    var lines = File.ReadAllLines(infofile);

                    Progress.SetMessage("reading mapping information from " + infofile + " ...");

                    int totalReads = 0;
                    int mappedReads = 0;
                    foreach (var infoLine in lines)
                    {
                        if (infoLine.StartsWith("TotalReads"))
                        {
                            totalReads = int.Parse(infoLine.StringAfter("\t"));
                        }
                        else if (infoLine.StartsWith("MappedReads"))
                        {
                            mappedReads = int.Parse(infoLine.StringAfter("\t"));
                        }
                    }

                    var smallRNAReads = counts.Sum(m => m.Count);

                    sw.WriteLine("{0}\tTotal Reads\t0\t{1}", entry.SampleName, totalReads);
                    sw.WriteLine("{0}\tMapped Reads\t0\t{1}", entry.SampleName, mappedReads);
                    sw.WriteLine("{0}\tsmall RNA\t0\t{1}", entry.SampleName, smallRNAReads);

                    sw.WriteLine("{0}\tUnmapped\t1\t{1}", entry.SampleName, totalReads - mappedReads);
                    sw.WriteLine("{0}\tOther Mapped\t1\t{1}", entry.SampleName, mappedReads - smallRNAReads);
                    sw.WriteLine("{0}\tsmall RNA\t1\t{1}", entry.SampleName, smallRNAReads);
                }

                foreach (var rec in counts)
                {
                    sw.WriteLine("{0}\t{1}\t{2}\t{3}", entry.SampleName, rec.Biotype, 2, rec.Count);
                }
            }
        }

        // Re-read the long-format file just written. The level-1 "small RNA" row is
        // dropped, so only the level-0 "small RNA" row remains for that category.
        var data = (from line in File.ReadAllLines(catfile).Skip(1)
                    where !string.IsNullOrWhiteSpace(line)
                    let parts = line.Split('\t')
                    let level = double.Parse(parts[2])
                    where !(parts[1].Equals("small RNA") && level == 1)
                    select new
                    {
                        SampleName = parts[0],
                        Category = parts[1],
                        Level = level,
                        Count = int.Parse(parts[3])
                    }).ToList();

        var tablefile = catfile + ".tsv";
        result.Add(tablefile);
        using (var sw = new StreamWriter(tablefile))
        {
            var samples = (from d in data select d.SampleName).Distinct().OrderBy(m => m).ToList();
            sw.WriteLine("Category\t{0}", samples.Merge("\t"));

            var categories = (from d in data where d.Level == 2 select d.Category).Distinct().OrderBy(m => m).ToList();

            // Inserted at index 0 in reverse, so the final order is:
            // Total Reads, Mapped Reads, Unmapped, Other Mapped, small RNA, <level-2 categories>.
            categories.Insert(0, "small RNA");
            categories.Insert(0, "Other Mapped");
            categories.Insert(0, "Unmapped");
            categories.Insert(0, "Mapped Reads");
            categories.Insert(0, "Total Reads");

            Console.WriteLine(categories.Merge("\n"));

            var map = data.ToDoubleDictionary(m => m.SampleName, m => m.Category);
            foreach (var cat in categories)
            {
                // A sample without a value for this category gets an empty cell.
                sw.WriteLine("{0}\t{1}", cat,
                    (from sample in samples
                     let dic = map[sample]
                     select dic.ContainsKey(cat) ? dic[cat].Count.ToString() : "").Merge("\t"));
            }
        }

        var rfile = new FileInfo(FileUtils.GetTemplateDir() + "/smallrna_category_group.r").FullName;
        if (File.Exists(rfile))
        {
            var targetrfile = catfile + ".r";
            using (var sw = new StreamWriter(targetrfile))
            {
                sw.WriteLine("catfile<-\"{0}\"", toRPath(catfile));
                sw.WriteLine("outputdir<-\"{0}\"", toRPath(options.OutputDirectory));
                sw.WriteLine("ispdf<-{0}", options.PdfGraph ? "1" : "0");

                // Copy the template, skipping everything up to and including the first line
                // that contains "#predefine_end". Without that marker the whole template is
                // copied (FindIndex returns -1, so nothing is skipped).
                var templateLines = File.ReadAllLines(rfile);
                var markerIndex = Array.FindIndex(templateLines, t => t.Contains("#predefine_end"));
                foreach (var templateLine in templateLines.Skip(markerIndex + 1))
                {
                    sw.WriteLine(templateLine);
                }
            }

            SystemUtils.Execute("R", "--vanilla --slave -f \"" + targetrfile + "\"");
        }
    }

    return result;
}
/// <summary>
/// Builds per-group small RNA category count files from the tab-separated list in
/// <c>options.InputFile</c> (columns: group, sample, smallRNA mapped file, optional miRNA
/// mapped file; lines with fewer than three columns are ignored).
/// For each group it writes <c>&lt;group&gt;.catcount</c> (long format), a wide
/// <c>.catcount.tsv</c> table and, when the R template exists, a <c>.catcount.r</c> script
/// that is then executed with R.
/// If none of the listed miRNA files exists, the work is delegated to
/// <c>SmallRNACategoryGroupPlusBuilder</c>.
/// </summary>
/// <returns>The names of the generated <c>.catcount</c> and <c>.catcount.tsv</c> files.</returns>
public override IEnumerable<string> Process()
{
    var entries = (from line in File.ReadAllLines(options.InputFile)
                   let parts = line.Split('\t')
                   where parts.Length >= 3
                   let mirna = parts.Length == 3 ? string.Empty : parts[3]
                   select new
                   {
                       GroupName = parts[0],
                       SampleName = parts[1],
                       SmallRNAFile = parts[2],
                       MiRNAFile = mirna
                   }).ToList();

    if (entries.All(m => !File.Exists(m.MiRNAFile)))
    {
        return new SmallRNACategoryGroupPlusBuilder(options) { Progress = this.Progress }.Process();
    }

    // 2570-KCV-01-19.bam.count.mapped.xml => 2570-KCV-01-19.bam.info
    // (three extensions are stripped, then ".info" is appended, in the same directory).
    Func<string, string> getInfoFile = mappedFile => Path.Combine(
        Path.GetDirectoryName(mappedFile),
        Path.GetFileNameWithoutExtension(Path.GetFileNameWithoutExtension(Path.GetFileNameWithoutExtension(mappedFile))) + ".info");

    // Backslash starts an escape sequence inside an R string literal, so Windows paths
    // must not be written verbatim into the generated script. R accepts '/' on Windows.
    // On platforms where '\' is not the separator the path is left untouched.
    Func<string, string> toRPath = path =>
        Path.DirectorySeparatorChar == '\\' ? path.Replace('\\', '/') : path;

    var groups = entries.GroupBy(m => m.GroupName).ToList();
    var result = new List<string>();

    foreach (var group in groups)
    {
        var catfile = Path.Combine(options.OutputDirectory, group.Key + ".catcount");
        result.Add(catfile);

        using (var sw = new StreamWriter(catfile))
        {
            sw.WriteLine("SampleName\tCategory\tLevel\tCount");

            foreach (var entry in group)
            {
                Progress.SetMessage("Reading smallRNA mapped file " + entry.SmallRNAFile + " ...");
                var others = new MappedItemGroupXmlFileFormat().ReadFromFile(entry.SmallRNAFile);

                // The item name is split at ':' into the three fields passed to QueryRecord.
                var otherQueries = (from g in others
                                    from m in g
                                    from mr in m.MappedRegions
                                    from loc in mr.AlignedLocations
                                    select new QueryRecord(
                                        loc.Parent.Qname,
                                        m.Name.StringBefore(":"),
                                        m.Name.StringAfter(":").StringBefore(":"),
                                        m.Name.StringAfter(":").StringAfter(":"),
                                        loc.Parent.QueryCount)).ToGroupDictionary(m => m.Query);
                Progress.SetMessage("Reading smallRNA mapped file finished, {0} queries mapped.", otherQueries.Count);

                var infofile = getInfoFile(entry.SmallRNAFile);

                if (File.Exists(entry.MiRNAFile))
                {
                    // When a miRNA file is available, its .info file takes precedence.
                    infofile = getInfoFile(entry.MiRNAFile);

                    Progress.SetMessage("Reading miRNA mapped file " + entry.MiRNAFile + " ...");
                    var mirnas = new MappedMirnaGroupXmlFileFormat().ReadFromFile(entry.MiRNAFile);
                    var mirnaQueries = (from g in mirnas
                                        from m in g
                                        from mr in m.MappedRegions
                                        from mapped in mr.Mapped.Values
                                        from loc in mapped.AlignedLocations
                                        select new QueryRecord(
                                            loc.Parent.Qname.StringBefore(":CLIP_"),
                                            "miRNA",
                                            "miRNA",
                                            m.Name,
                                            loc.Parent.QueryCount)).ToGroupDictionary(m => m.Query);
                    Progress.SetMessage("Reading miRNA mapped file finished, {0} queries mapped.", mirnaQueries.Count);

                    // Merge the miRNA records into the smallRNA records, keyed by query.
                    foreach (var q in mirnaQueries)
                    {
                        List<QueryRecord> rec;
                        if (otherQueries.TryGetValue(q.Key, out rec))
                        {
                            rec.AddRange(q.Value);
                        }
                        else
                        {
                            otherQueries[q.Key] = q.Value;
                        }
                    }

                    Progress.SetMessage("Total {0} queries mapped.", otherQueries.Count);
                }

                // Configured categories first, then every biotype present in the data.
                var counts = new List<CategoryCount>();
                FillCounts(counts, options.Categories, otherQueries);

                var othercategories = (from v in otherQueries.Values
                                       from item in v
                                       select item.Biotype).Distinct().OrderBy(m => m).ToList();
                FillCounts(counts, othercategories, otherQueries);

                // The level 0/1 summary rows are written only when the .info file exists.
                if (File.Exists(infofile))
                {
                    var lines = File.ReadAllLines(infofile);

                    Progress.SetMessage("reading mapping information from " + infofile + " ...");

                    int totalReads = 0;
                    int mappedReads = 0;
                    foreach (var infoLine in lines)
                    {
                        if (infoLine.StartsWith("TotalReads"))
                        {
                            totalReads = int.Parse(infoLine.StringAfter("\t"));
                        }
                        else if (infoLine.StartsWith("MappedReads"))
                        {
                            mappedReads = int.Parse(infoLine.StringAfter("\t"));
                        }
                    }

                    var smallRNAReads = counts.Sum(m => m.Count);

                    sw.WriteLine("{0}\tTotal Reads\t0\t{1}", entry.SampleName, totalReads);
                    sw.WriteLine("{0}\tMapped Reads\t0\t{1}", entry.SampleName, mappedReads);
                    sw.WriteLine("{0}\tsmall RNA\t0\t{1}", entry.SampleName, smallRNAReads);

                    sw.WriteLine("{0}\tUnmapped\t1\t{1}", entry.SampleName, totalReads - mappedReads);
                    sw.WriteLine("{0}\tOther Mapped\t1\t{1}", entry.SampleName, mappedReads - smallRNAReads);
                    sw.WriteLine("{0}\tsmall RNA\t1\t{1}", entry.SampleName, smallRNAReads);
                }

                foreach (var rec in counts)
                {
                    sw.WriteLine("{0}\t{1}\t{2}\t{3}", entry.SampleName, rec.Biotype, 2, rec.Count);
                }
            }
        }

        // Re-read the long-format file just written. The level-1 "small RNA" row is
        // dropped, so only the level-0 "small RNA" row remains for that category.
        var data = (from line in File.ReadAllLines(catfile).Skip(1)
                    where !string.IsNullOrWhiteSpace(line)
                    let parts = line.Split('\t')
                    let level = double.Parse(parts[2])
                    where !(parts[1].Equals("small RNA") && level == 1)
                    select new
                    {
                        SampleName = parts[0],
                        Category = parts[1],
                        Level = level,
                        Count = int.Parse(parts[3])
                    }).ToList();

        var tablefile = catfile + ".tsv";
        result.Add(tablefile);
        using (var sw = new StreamWriter(tablefile))
        {
            var samples = (from d in data select d.SampleName).Distinct().OrderBy(m => m).ToList();
            sw.WriteLine("Category\t{0}", samples.Merge("\t"));

            var categories = (from d in data where d.Level == 2 select d.Category).Distinct().OrderBy(m => m).ToList();

            // Inserted at index 0 in reverse, so the final order is:
            // Total Reads, Mapped Reads, Unmapped, Other Mapped, small RNA, <level-2 categories>.
            categories.Insert(0, "small RNA");
            categories.Insert(0, "Other Mapped");
            categories.Insert(0, "Unmapped");
            categories.Insert(0, "Mapped Reads");
            categories.Insert(0, "Total Reads");

            Console.WriteLine(categories.Merge("\n"));

            var map = data.ToDoubleDictionary(m => m.SampleName, m => m.Category);
            foreach (var cat in categories)
            {
                // A sample without a value for this category gets an empty cell.
                sw.WriteLine("{0}\t{1}", cat,
                    (from sample in samples
                     let dic = map[sample]
                     select dic.ContainsKey(cat) ? dic[cat].Count.ToString() : "").Merge("\t"));
            }
        }

        var rfile = new FileInfo(FileUtils.GetTemplateDir() + "/smallrna_category_group.r").FullName;
        if (File.Exists(rfile))
        {
            var targetrfile = catfile + ".r";
            using (var sw = new StreamWriter(targetrfile))
            {
                sw.WriteLine("catfile<-\"{0}\"", toRPath(catfile));
                sw.WriteLine("outputdir<-\"{0}\"", toRPath(options.OutputDirectory));
                sw.WriteLine("ispdf<-{0}", options.PdfGraph ? "1" : "0");

                // Copy the template, skipping everything up to and including the first line
                // that contains "#predefine_end". Without that marker the whole template is
                // copied (FindIndex returns -1, so nothing is skipped).
                var templateLines = File.ReadAllLines(rfile);
                var markerIndex = Array.FindIndex(templateLines, t => t.Contains("#predefine_end"));
                foreach (var templateLine in templateLines.Skip(markerIndex + 1))
                {
                    sw.WriteLine(templateLine);
                }
            }

            SystemUtils.Execute("R", "--vanilla --slave -f \"" + targetrfile + "\"");
        }
    }

    return result;
}
/// <summary>
/// Reads the mapped miRNA groups from <c>options.ReferenceFile</c> and
/// <c>options.SampleFile</c> and passes both to <c>GetPairedMiRNA</c>.
/// </summary>
/// <remarks>
/// NOTE(review): no code visible in this method writes <c>options.OutputFile</c>, yet that
/// name is returned. The report writer that used to follow was commented out; confirm
/// whether callers expect the file to exist.
/// </remarks>
/// <returns>An array containing only <c>options.OutputFile</c>.</returns>
public override IEnumerable<string> Process()
{
    var format = new MappedMirnaGroupXmlFileFormat();

    Progress.SetMessage("reading mapped reads from " + options.ReferenceFile + " ...");
    var refitems = format.ReadFromFile(options.ReferenceFile);

    Progress.SetMessage("reading mapped reads from " + options.SampleFile + " ...");
    var samitems = format.ReadFromFile(options.SampleFile);

    // The result is not consumed here. The call is kept because this method cannot
    // tell whether GetPairedMiRNA has side effects - TODO confirm.
    GetPairedMiRNA(refitems, samitems);

    // NOTE(review): a disabled report writer used to live here. It wrote a tab-separated
    // file with the header
    //   microRNA, {0}_sequence, {1}_sequence, bp_difference, query_sequence,
    //   {0}_count, {0}_estimate_count, {1}_count, {1}_estimate_count
    // followed by one block per miRNA key with per-query-sequence count rows.
    // It accessed refitems/samitems as dictionaries (Keys, ContainsKey) and used
    // ConvertToMap and MirnaUtils.GetCombinedSequence, which does not match the indexed
    // list access the removed locals used (refitems[0][0]). It has to be rewritten on
    // top of the paired result before being re-enabled.

    return new string[] { options.OutputFile };
}