/// <summary>
/// Copies reads from <c>options.InputFile</c> to <c>options.OutputFile</c>, leaving out every read whose
/// name is in the exclusion set (built from <c>options.XmlFile</c> and <c>options.ExcludeFile</c>) or that
/// is rejected by <c>Accept</c> when that predicate is set. Each written name is added to the exclusion
/// set, so a later read with the same name is skipped. When <c>options.CountFile</c> exists, a
/// "name TAB count" line is written to <c>options.OutputFile + ".dupcount"</c> for every read kept.
/// </summary>
/// <returns>The local result list; this method never adds anything to it.</returns>
/// <exception cref="UserTerminatedException">Cancellation was requested (polled every 100000 reads).</exception>
/// <exception cref="Exception">A kept read has no entry in the count map while the dupcount file is being written.</exception>
public override IEnumerable<string> Process() {
    var result = new List<string>();

    // Read names, cut at the NTA tag, that must not be written to the output.
    var excluded = new HashSet<string>();

    if (File.Exists(options.XmlFile)) {
        // Any read mapped to a feature is excluded, regardless of its mismatch count.
        var mappedGroups = new FeatureItemGroupXmlFormat().ReadFromFile(options.XmlFile);
        excluded.UnionWith(
            from featureGroup in mappedGroups
            from feature in featureGroup
            from location in feature.Locations
            from samLoc in location.SamLocations
            select samLoc.SamLocation.Parent.Qname.StringBefore(SmallRNAConsts.NTA_TAG));
    }

    if (File.Exists(options.ExcludeFile)) {
        excluded.UnionWith(
            File.ReadAllLines(options.ExcludeFile)
                .Select(line => line.StringBefore(SmallRNAConsts.NTA_TAG)));
    }

    // Make every NTA-tagged count entry reachable through its untagged name as well.
    SmallRNACountMap countMap = options.GetCountMap();
    var taggedKeys = countMap.Counts.Keys.Where(k => k.Contains(SmallRNAConsts.NTA_TAG)).ToArray();
    for (int i = 0; i < taggedKeys.Length; i++) {
        var tagged = taggedKeys[i];
        countMap.Counts[tagged.StringBefore(SmallRNAConsts.NTA_TAG)] = countMap.Counts[tagged];
    }

    // The dupcount file is produced only when a count file exists on disk.
    StreamWriter swCount = File.Exists(options.CountFile)
        ? new StreamWriter(options.OutputFile + ".dupcount")
        : null;

    Progress.SetMessage("output unmapped query...");
    try {
        using (var sw = StreamUtils.GetWriter(options.OutputFile, options.OutputFile.ToLower().EndsWith(".gz")))
        using (var sr = StreamUtils.GetReader(options.InputFile)) {
            var reader = new FastqReader();
            var writer = new FastqWriter();
            var processed = 0;

            for (FastqSequence read = reader.Parse(sr); read != null; read = reader.Parse(sr)) {
                processed++;
                if (processed % 100000 == 0) {
                    Progress.SetMessage("{0} reads", processed);
                    if (Progress.IsCancellationPending()) {
                        throw new UserTerminatedException();
                    }
                }

                // Rebuild the header line from the untagged name plus the original description.
                read.Reference = read.Name.StringBefore(SmallRNAConsts.NTA_TAG) + " " + read.Description;

                bool skip = excluded.Contains(read.Name) || (Accept != null && !Accept(read));
                if (skip) {
                    continue;
                }

                // Remember the name so a later read carrying it is not written again.
                excluded.Add(read.Name);
                writer.Write(sw, read);

                if (swCount == null) {
                    continue;
                }

                int dupCount;
                if (!countMap.Counts.TryGetValue(read.Name, out dupCount)) {
                    throw new Exception(string.Format("Cannot find {0} in count map", read.Name));
                }
                swCount.WriteLine("{0}\t{1}", read.Name, dupCount);
            }
        }
    } finally {
        if (swCount != null) {
            swCount.Close();
        }
    }

    Progress.End();
    return result;
}
/// <summary>
/// Reads every FASTQ record from <c>options.InputFile</c>, keeps those for which <paramref name="accept"/>
/// returns true, strips a trailing "CCAA", "CCA" or (only when the CCA map marks the read as true) "CC"
/// from the sequence and score, rewrites the record's reference as name + NTA tag + clipped suffix, and
/// writes the record to <paramref name="outputFile"/>. Output goes to <c>outputFile + ".tmp"</c> first and
/// is moved into place once all reads have been written.
/// </summary>
/// <param name="accept">Predicate deciding whether a read is processed at all.</param>
/// <param name="map">Count map supplying the per-read count; when <c>map.HasCountFile</c> is true a
/// <c>outputFile + ".dupcount"</c> table is written as well.</param>
/// <param name="outputFile">Destination FASTQ path; a name ending in ".gz" is passed to the writer as the gzip flag.</param>
/// <param name="dic">Per-length tallies, keyed by the read length before clipping; updated in place.</param>
private void DoProcess(Func<FastqSequence, bool> accept, SmallRNACountMap map, string outputFile, Dictionary<int, CountItem> dic) {
    Progress.SetMessage("Processing " + options.InputFile + " and writing to " + outputFile + "...");

    // Read name -> flag consulted only for reads that end in a bare "CC".
    var ccaMap = new MapItemReader(0, 1).ReadFromFile(options.CCAFile).ToDictionary(m => m.Key, m => bool.Parse(m.Value.Value));

    var parser = new FastqReader();
    var writer = new FastqWriter();

    StreamWriter swCount = null;
    try {
        // Opened inside the try so the handle is released even if writing the header fails.
        if (map.HasCountFile) {
            swCount = new StreamWriter(outputFile + ".dupcount");
            swCount.WriteLine("Query\tCount\tSequence");
        }

        int readcount = 0;
        var tmpFile = outputFile + ".tmp";
        using (var sr = StreamUtils.GetReader(options.InputFile))
        using (var sw = StreamUtils.GetWriter(tmpFile, outputFile.ToLower().EndsWith(".gz"))) {
            FastqSequence seq;
            while ((seq = parser.Parse(sr)) != null) {
                readcount++;
                if (readcount % 100000 == 0) {
                    Progress.SetMessage("{0} reads processed", readcount);
                }

                if (!accept(seq)) {
                    continue;
                }

                var name = seq.Name;
                var sequence = seq.SeqString;
                var score = seq.Score;
                var count = map.GetCount(seq.Name);

                // NOTE(review): a dupcount row is written here (before clipping) and again after the
                // record is written below, so each accepted read yields two rows. Kept as-is because
                // the intent cannot be confirmed from this method alone; verify against consumers.
                if (swCount != null) {
                    swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                }

                // Tallies are bucketed by the length of the read before anything is clipped.
                CountItem item;
                if (!dic.TryGetValue(sequence.Length, out item)) {
                    item = new CountItem();
                    dic[sequence.Length] = item;
                }

                // Longest suffix first: "CCAA", then "CCA", then "CC" (the latter only when flagged).
                string clipped;
                if (sequence.EndsWith("CCAA")) {
                    clipped = "CCAA";
                    sequence = sequence.Substring(0, sequence.Length - 4);
                    item.CCAA += count;
                } else if (sequence.EndsWith("CCA")) {
                    clipped = "CCA";
                    sequence = sequence.Substring(0, sequence.Length - 3);
                    item.CCA += count;
                } else if (sequence.EndsWith("CC")) {
                    bool isCCA;
                    if (ccaMap.TryGetValue(name, out isCCA) && isCCA) {
                        clipped = "CC";
                        sequence = sequence.Substring(0, sequence.Length - 2);
                        item.CC += count;
                    } else {
                        clipped = string.Empty;
                        item.notNTA += count;
                    }
                } else {
                    clipped = string.Empty;
                    item.notNTA += count;
                }

                if (!string.IsNullOrEmpty(clipped)) {
                    // Keep the quality string the same length as the shortened sequence.
                    var newlen = sequence.Length;
                    seq.SeqString = sequence;
                    seq.Score = score.Substring(0, newlen);
                    seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
                } else {
                    seq.Reference = string.Format("{0}{1}", name, SmallRNAConsts.NTA_TAG);
                }

                writer.Write(sw, seq);

                if (swCount != null) {
                    swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                }
            }
        }

        // File.Move throws if the destination already exists; remove a stale output from a
        // previous run so a re-run does not fail after all reads have been processed.
        if (File.Exists(outputFile)) {
            File.Delete(outputFile);
        }
        File.Move(tmpFile, outputFile);
    } finally {
        if (swCount != null) {
            swCount.Close();
        }
    }
}
/// <summary>
/// For every FASTQ record in <c>options.InputFile</c>, writes the full-length read plus the variants
/// obtained by removing the last 1, 2 and 3 bases, stopping as soon as a variant would be shorter than
/// <c>options.MinimumReadLength</c>. Each variant's reference is the original name, the NTA tag and the
/// removed suffix (empty for the full-length read). When the count map has a count file, a
/// "Query / Count / Sequence" row per written variant goes to <c>options.OutputFile + ".dupcount"</c>.
/// </summary>
/// <returns>A list containing <c>options.OutputFile</c>.</returns>
public override IEnumerable<string> Process() {
    // Largest number of trailing bases removed from a read.
    const int maxClippedBases = 3;

    var result = new List<string>();
    var gzipped = options.OutputFile.ToLower().EndsWith(".gz");
    result.Add(options.OutputFile);

    Progress.SetMessage("Processing " + options.InputFile + " and writing to " + options.OutputFile + "...");

    var parser = new FastqReader();
    var writer = new FastqWriter();
    var map = options.GetCountMap();

    StreamWriter swCount = null;
    try {
        // Opened inside the try so the dupcount handle is released on any failure below.
        if (map.HasCountFile) {
            swCount = new StreamWriter(options.OutputFile + ".dupcount");
            swCount.WriteLine("Query\tCount\tSequence");
        }

        int readcount = 0;
        using (var sr = StreamUtils.GetReader(options.InputFile))
        using (var sw = StreamUtils.GetWriter(options.OutputFile, gzipped)) {
            FastqSequence seq;
            while ((seq = parser.Parse(sr)) != null) {
                readcount++;
                if (readcount % 100000 == 0) {
                    Progress.SetMessage("{0} reads processed", readcount);
                }

                // Capture the original values; seq is mutated and re-written once per variant.
                var name = seq.Name;
                var sequence = seq.SeqString;
                var score = seq.Score;
                var len = sequence.Length;
                var count = map.GetCount(seq.Name);

                for (int i = 0; i <= maxClippedBases; i++) {
                    var newlen = len - i;

                    // The negative check protects Substring when the read is shorter than the
                    // clip amount and no positive minimum length is configured.
                    if (newlen < 0 || newlen < options.MinimumReadLength) {
                        break;
                    }

                    string clipped = i == 0 ? string.Empty : sequence.Substring(newlen);

                    seq.SeqString = sequence.Substring(0, newlen);
                    seq.Score = score.Substring(0, newlen);
                    seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
                    writer.Write(sw, seq);

                    if (swCount != null) {
                        swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                    }
                }
            }
        }
    } finally {
        if (swCount != null) {
            swCount.Close();
        }
    }

    Progress.End();
    return result;
}