/// <summary>
/// Filters two paired fastq files in lock-step: a read pair is kept only when
/// both mates pass the configured filter; surviving pairs are written to the
/// two configured output files.
/// </summary>
/// <param name="fileName">Not used; both inputs come from options.FastqFiles.</param>
/// <returns>options.OutputFiles (the two files written).</returns>
public override IEnumerable<string> Process(string fileName)
{
  IFilter<FastqSequence> filter = options.GetFilter();

  using (GzipTextReader gz1 = new GzipTextReader(options.Gzip, options.FastqFiles[0]))
  using (GzipTextReader gz2 = new GzipTextReader(options.Gzip, options.FastqFiles[1]))
  using (StreamWriter sw1 = new StreamWriter(options.OutputFiles[0]))
  using (StreamWriter sw2 = new StreamWriter(options.OutputFiles[1]))
  {
    var reader = new FastqReader();
    var writer = new FastqWriter();
    var pairCount = 0;

    // Read both files in parallel; stop as soon as either stream is exhausted.
    // NOTE(review): unequal file lengths are silently truncated here — confirm
    // inputs are always properly paired.
    for (FastqSequence read1 = reader.Parse(gz1.Reader), read2 = reader.Parse(gz2.Reader);
         read1 != null && read2 != null;
         read1 = reader.Parse(gz1.Reader), read2 = reader.Parse(gz2.Reader))
    {
      pairCount++;
      if (pairCount % 100000 == 0)
      {
        Progress.SetMessage("{0} reads", pairCount);
        if (Progress.IsCancellationPending())
        {
          throw new UserTerminatedException();
        }
      }

      // Both mates must pass; Accept(read2) is only evaluated when read1 passed.
      if (filter.Accept(read1) && filter.Accept(read2))
      {
        writer.Write(sw1, read1);
        writer.Write(sw2, read2);
      }
    }
  }

  return options.OutputFiles;
}
/// <summary>
/// Paired-end fastq filtering: reads both inputs in step and writes a pair to
/// the outputs only when both mates are accepted by the configured filter.
/// </summary>
/// <param name="fileName">Not used; inputs are taken from options.FastqFiles.</param>
/// <returns>The two output file names (options.OutputFiles).</returns>
public override IEnumerable<string> Process(string fileName)
{
  IFilter<FastqSequence> filter = options.GetFilter();

  using (GzipTextReader firstIn = new GzipTextReader(options.Gzip, options.FastqFiles[0]))
  using (GzipTextReader secondIn = new GzipTextReader(options.Gzip, options.FastqFiles[1]))
  using (StreamWriter firstOut = new StreamWriter(options.OutputFiles[0]))
  using (StreamWriter secondOut = new StreamWriter(options.OutputFiles[1]))
  {
    FastqReader reader = new FastqReader();
    FastqWriter writer = new FastqWriter();
    int processed = 0;

    while (true)
    {
      FastqSequence first = reader.Parse(firstIn.Reader);
      FastqSequence second = reader.Parse(secondIn.Reader);
      // Stop when either file runs out of reads.
      if (first == null || second == null)
      {
        break;
      }

      processed++;
      if (processed % 100000 == 0)
      {
        Progress.SetMessage("{0} reads", processed);
        if (Progress.IsCancellationPending())
        {
          throw new UserTerminatedException();
        }
      }

      // Reject the pair as soon as one mate fails; second mate is only
      // tested when the first one passed (same short-circuit as `&&`).
      if (!filter.Accept(first))
      {
        continue;
      }
      if (!filter.Accept(second))
      {
        continue;
      }

      writer.Write(firstOut, first);
      writer.Write(secondOut, second);
    }
  }

  return options.OutputFiles;
}
/// <summary>
/// Collapses duplicate reads: each distinct sequence is written once to the
/// (optionally gzipped) output fastq, a ".dupcount" file records how often each
/// sequence occurred, and when options.OutputScores is set a ".scores" file
/// summarizes the per-position quality characters over all duplicates.
/// </summary>
/// <returns>The files written: fastq, dupcount, and optionally scores.</returns>
public override IEnumerable <string> Process() {
  var result = new List <string>();
  var fastqFile = options.OutputFile;
  // Force a .gz suffix unless the user explicitly asked for uncompressed output.
  if (!options.Gunzipped && !fastqFile.ToLower().EndsWith(".gz")) { fastqFile = fastqFile + ".gz"; }
  result.Add(fastqFile);
  // sequence string -> first FastqSequence seen with that sequence
  Dictionary <string, FastqSequence> queries = new Dictionary <string, FastqSequence>();
  Progress.SetMessage("Processing " + options.InputFile + " and writing to " + fastqFile + "...");
  var parser = new FastqReader();
  var writer = new FastqWriter();
  // Write to a temp file first; it is renamed over fastqFile only on success.
  var tmpFile = fastqFile + ".tmp";
  int readcount = 0;
  using (var sr = StreamUtils.GetReader(options.InputFile)) {
    using (var sw = StreamUtils.GetWriter(tmpFile, !options.Gunzipped)) {
      FastqSequence seq;
      while ((seq = parser.Parse(sr)) != null) {
        readcount++;
        if (readcount % 100000 == 0) { Progress.SetMessage("{0} reads processed", readcount); }
        // Drop reads below the minimum length.
        if (seq.SeqString.Length < options.MinimumReadLength) { continue; }
        FastqSequence count;
        if (queries.TryGetValue(seq.SeqString, out count)) {
          // Duplicate: bump the representative's count, keep its score trail.
          count.RepeatCount++;
          if (options.OutputScores) { count.RepeatScores.Add(seq.Score); }
          continue;
        }
        // First occurrence: remember it and write it through.
        queries[seq.SeqString] = seq;
        if (options.OutputScores) { seq.RepeatScores.Add(seq.Score); }
        writer.Write(sw, seq);
      }
    }
  }
  Progress.End();
  var countFile = Path.ChangeExtension(fastqFile, ".dupcount");
  result.Add(countFile);
  Progress.SetMessage("sort queries ...");
  // Sort by descending repeat count, ties broken by sequence for determinism.
  var seqs = queries.Values.ToList();
  seqs.Sort((m1, m2) => { var res = m2.RepeatCount.CompareTo(m1.RepeatCount); if (res == 0) { res = m1.SeqString.CompareTo(m2.SeqString); } return(res); });
  Progress.SetMessage("writing duplicate count ...");
  using (StreamWriter sw = new StreamWriter(countFile)) {
    sw.WriteLine("Query\tCount\tSequence");
    foreach (var seq in seqs) { sw.WriteLine("{0}\t{1}\t{2}", seq.Name, seq.RepeatCount, seq.SeqString); }
  }
  // Atomically replace any previous output with the completed temp file.
  if (File.Exists(fastqFile)) { File.Delete(fastqFile); }
  File.Move(tmpFile, fastqFile);
  if (options.OutputScores) {
    Progress.SetMessage("writing score ...");
    var scoreFile = Path.ChangeExtension(fastqFile, ".scores");
    result.Add(scoreFile);
    using (StreamWriter sw = new StreamWriter(scoreFile)) {
      sw.WriteLine("Query\tSequence\tPosition\tScores");
      foreach (var seq in seqs) {
        sw.WriteLine("{0}\t{1}", seq.Name, seq.SeqString);
        // For each base position, histogram the quality characters over all
        // duplicates of this sequence.
        // NOTE(review): score[i] assumes every duplicate's score string is at
        // least SeqString.Length long — confirm for this fastq source.
        for (int i = 0; i < seq.SeqString.Length; i++) {
          Dictionary <char, int> count = new Dictionary <char, int>();
          foreach (var score in seq.RepeatScores) {
            int oldcount;
            if (count.TryGetValue(score[i], out oldcount)) { count[score[i]] = oldcount + 1; } else { count[score[i]] = 1; }
          }
          sw.Write("\t\t{0}\t", i + 1);
          var keys = (from c in count.Keys orderby c select c).ToList();
          foreach (var key in keys) { sw.Write("{0}({1})", key, count[key]); }
          sw.WriteLine();
        }
      }
    }
  }
  return(result);
}
/// <summary>
/// Expands each read into its 3'-NTA (non-templated addition) candidates: for
/// 0..3 clipped terminal bases, a copy of the read is written whose name is
/// tagged with SmallRNAConsts.NTA_TAG plus the clipped suffix, stopping once
/// the clipped read would fall below options.MinimumReadLength. When the count
/// map has a count file, a ".dupcount" companion file is also written.
/// </summary>
/// <returns>A list containing the single output fastq file.</returns>
public override IEnumerable<string> Process()
{
  var result = new List<string>();
  var gzipped = options.OutputFile.ToLower().EndsWith(".gz");
  result.Add(options.OutputFile);

  Progress.SetMessage("Processing " + options.InputFile + " and writing to " + options.OutputFile + "...");

  var parser = new FastqReader();
  var writer = new FastqWriter();
  var map = options.GetCountMap();

  StreamWriter swCount = null;
  try
  {
    if (map.HasCountFile)
    {
      var of = options.OutputFile + ".dupcount";
      swCount = new StreamWriter(of);
      swCount.WriteLine("Query\tCount\tSequence");
    }

    int readcount = 0;
    using (var sr = StreamUtils.GetReader(options.InputFile))
    {
      using (var sw = StreamUtils.GetWriter(options.OutputFile, gzipped))
      {
        FastqSequence seq;
        while ((seq = parser.Parse(sr)) != null)
        {
          readcount++;
          if (readcount % 100000 == 0)
          {
            Progress.SetMessage("{0} reads processed", readcount);
          }

          var name = seq.Name;
          var sequence = seq.SeqString;
          var score = seq.Score;
          var len = sequence.Length;
          var count = map.GetCount(seq.Name);

          // Emit the read with 0, 1, 2 and 3 terminal bases clipped.
          for (int i = 0; i < 4; i++)
          {
            var newlen = len - i;
            if (newlen < options.MinimumReadLength)
            {
              break;
            }

            string clipped;
            if (i == 0)
            {
              clipped = string.Empty;
            }
            else
            {
              clipped = sequence.Substring(newlen);
            }

            seq.SeqString = sequence.Substring(0, newlen);
            seq.Score = score.Substring(0, newlen);
            // Tag the query name with the clipped bases so downstream tools
            // can identify which NTA variant this record is.
            seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
            writer.Write(sw, seq);

            if (map.HasCountFile)
            {
              swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
            }
          }
        }
      }
    }
  }
  finally
  {
    // Fix: close the dupcount writer even when parsing/writing throws
    // (previously it leaked on any exception before the trailing Close()).
    if (swCount != null)
    {
      swCount.Close();
    }
  }

  Progress.End();
  return result;
}
/// <summary>
/// Collapses duplicate reads: each distinct sequence is written once to the
/// (optionally gzipped) output fastq, a ".dupcount" file records how often each
/// sequence occurred, and when options.OutputScores is set a ".scores" file
/// summarizes the per-position quality characters over all duplicates.
/// </summary>
/// <returns>The files written: fastq, dupcount, and optionally scores.</returns>
public override IEnumerable<string> Process() {
  var result = new List<string>();
  var fastqFile = options.OutputFile;
  // Force a .gz suffix unless the user explicitly asked for uncompressed output.
  if (!options.Gunzipped && !fastqFile.ToLower().EndsWith(".gz")) { fastqFile = fastqFile + ".gz"; }
  result.Add(fastqFile);
  // sequence string -> first FastqSequence seen with that sequence
  Dictionary<string, FastqSequence> queries = new Dictionary<string, FastqSequence>();
  Progress.SetMessage("Processing " + options.InputFile + " and writing to " + fastqFile + "...");
  var parser = new FastqReader();
  var writer = new FastqWriter();
  // Write to a temp file first; it is renamed over fastqFile only on success.
  var tmpFile = fastqFile + ".tmp";
  int readcount = 0;
  using (var sr = StreamUtils.GetReader(options.InputFile)) {
    using (var sw = StreamUtils.GetWriter(tmpFile, !options.Gunzipped)) {
      FastqSequence seq;
      while ((seq = parser.Parse(sr)) != null) {
        readcount++;
        if (readcount % 100000 == 0) { Progress.SetMessage("{0} reads processed", readcount); }
        // Drop reads below the minimum length.
        if (seq.SeqString.Length < options.MinimumReadLength) { continue; }
        FastqSequence count;
        if (queries.TryGetValue(seq.SeqString, out count)) {
          // Duplicate: bump the representative's count, keep its score trail.
          count.RepeatCount++;
          if (options.OutputScores) { count.RepeatScores.Add(seq.Score); }
          continue;
        }
        // First occurrence: remember it and write it through.
        queries[seq.SeqString] = seq;
        if (options.OutputScores) { seq.RepeatScores.Add(seq.Score); }
        writer.Write(sw, seq);
      }
    }
  }
  Progress.End();
  var countFile = Path.ChangeExtension(fastqFile, ".dupcount");
  result.Add(countFile);
  Progress.SetMessage("sort queries ...");
  // Sort by descending repeat count, ties broken by sequence for determinism.
  var seqs = queries.Values.ToList();
  seqs.Sort((m1, m2) => { var res = m2.RepeatCount.CompareTo(m1.RepeatCount); if (res == 0) { res = m1.SeqString.CompareTo(m2.SeqString); } return res; });
  Progress.SetMessage("writing duplicate count ...");
  using (StreamWriter sw = new StreamWriter(countFile)) {
    sw.WriteLine("Query\tCount\tSequence");
    foreach (var seq in seqs) { sw.WriteLine("{0}\t{1}\t{2}", seq.Name, seq.RepeatCount, seq.SeqString); }
  }
  // Atomically replace any previous output with the completed temp file.
  if (File.Exists(fastqFile)) { File.Delete(fastqFile); }
  File.Move(tmpFile, fastqFile);
  if (options.OutputScores) {
    Progress.SetMessage("writing score ...");
    var scoreFile = Path.ChangeExtension(fastqFile, ".scores");
    result.Add(scoreFile);
    using (StreamWriter sw = new StreamWriter(scoreFile)) {
      sw.WriteLine("Query\tSequence\tPosition\tScores");
      foreach (var seq in seqs) {
        sw.WriteLine("{0}\t{1}", seq.Name, seq.SeqString);
        // For each base position, histogram the quality characters over all
        // duplicates of this sequence.
        // NOTE(review): score[i] assumes every duplicate's score string is at
        // least SeqString.Length long — confirm for this fastq source.
        for (int i = 0; i < seq.SeqString.Length; i++) {
          Dictionary<char, int> count = new Dictionary<char, int>();
          foreach (var score in seq.RepeatScores) {
            int oldcount;
            if (count.TryGetValue(score[i], out oldcount)) { count[score[i]] = oldcount + 1; } else { count[score[i]] = 1; }
          }
          sw.Write("\t\t{0}\t", i + 1);
          var keys = (from c in count.Keys orderby c select c).ToList();
          foreach (var key in keys) { sw.Write("{0}({1})", key, count[key]); }
          sw.WriteLine();
        }
      }
    }
  }
  return result;
}
/// <summary>
/// Demultiplexes a fastq file by barcode. A tab/space-separated mapping file
/// (barcode -> output filename) routes each read, matched by the last token of
/// its header line, into its per-barcode output file. Unless
/// options.UntrimTerminalN is set, leading/trailing 'N' bases are trimmed.
/// Reads with unmapped barcodes are only counted. A summary file with
/// per-barcode and unmapped counts is written at the end.
/// </summary>
/// <returns>The per-barcode output file names.</returns>
public override IEnumerable<string> Process() {
  var result = new List<string>();
  Console.WriteLine("Read mapping file " + options.MappingFile + "...");
  var lines = File.ReadAllLines(options.MappingFile).Where(m => !string.IsNullOrWhiteSpace(m)).ToList();
  // Parse "barcode filename" pairs; extra columns are ignored, short lines skipped.
  var map = (from l in lines let parts = (from p in l.Split('\t', ' ') let pp = p.Trim() where pp.Length > 0 select pp).ToArray() where parts.Length > 1 select new BarFile() { Barcode = parts[0], Filename = parts[1] }).ToList();
  // barcode -> output descriptor; streams are opened lazily on first use.
  var dic = (from k in map select new BarFile() { Barcode = k.Barcode, Filename = Path.Combine(options.OutputDirectory, k.Filename), Stream = null, Count = 0 }).ToDictionary(m => m.Barcode);
  Console.WriteLine("There are " + dic.Count.ToString() + " indecies.");
  foreach (var barcode in dic.Keys.OrderBy(m => m)) { Console.WriteLine("{0}\t{1}", barcode, dic[barcode].Filename); }
  try {
    result.AddRange(from d in dic select d.Value.Filename);
    var parser = new FastqReader();
    var writer = new FastqWriter();
    // barcode -> number of reads whose barcode was not in the mapping file
    var unfound = new Dictionary<string, int>();
    int readcount = 0;
    // The barcode is the last non-space token after the final ':' of the header.
    var reg = new Regex(@".+:\s*(\S+?)\s*$");
    Progress.SetMessage("Processing " + Path.GetFullPath(options.InputFile) + " ...");
    using (var sr = StreamUtils.GetReader(options.InputFile)) {
      FastqSequence seq;
      while((seq = parser.Parse(sr)) != null) {
        //Console.WriteLine("seq = " + seq.Reference);
        readcount++;
        if (readcount % 100000 == 0) { Progress.SetMessage("{0} reads processed", readcount); }
        var m = reg.Match(seq.Reference);
        if (!m.Success) { throw new Exception("Cannot find index from " + seq.Reference); }
        var barcode = m.Groups[1].Value;
        //Console.WriteLine("barcode = " + barcode);
        BarFile file;
        if (dic.TryGetValue(barcode, out file)) {
          // Lazily open the output stream; gzip when the target name ends in .gz.
          if (file.Stream == null) { file.Stream = StreamUtils.GetWriter(file.Filename, file.Filename.ToLower().EndsWith(".gz")); }
          if (!options.UntrimTerminalN) {
            // Trim trailing then leading 'N' bases, keeping scores in sync.
            while (seq.SeqString.Length > 0 && seq.SeqString.Last() == 'N') { seq.SeqString = seq.SeqString.Substring(0, seq.SeqString.Length - 1); seq.Score = seq.Score.Substring(0, seq.Score.Length - 1); }
            while (seq.SeqString.Length > 0 && seq.SeqString.First() == 'N') { seq.SeqString = seq.SeqString.Substring(1); seq.Score = seq.Score.Substring(1); }
          }
          writer.Write(file.Stream, seq);
          file.Count++;
        } else {
          // Barcode not in mapping file: count it for the summary only.
          int count;
          if (unfound.TryGetValue(barcode, out count)) { unfound[barcode] = count + 1; } else { unfound[barcode] = 1;
            //Console.WriteLine("Barcode " + barcode + " is not defined in map file, ignored.");
          }
        }
      }
    }
    // Summary: one line per known barcode, then unmapped barcodes by frequency.
    using (var sw = new StreamWriter(Path.Combine(options.OutputDirectory, options.SummaryFile))) {
      sw.WriteLine("Type\tIndex\tCount");
      foreach (var d in dic.Keys.OrderBy(m => m)) { sw.WriteLine("Sample\t{0}\t{1}", dic[d].Barcode, dic[d].Count); }
      foreach (var d in unfound.OrderByDescending(m => m.Value)) { sw.WriteLine("Unmapped\t{0}\t{1}", d.Key, d.Value); }
    }
  } finally {
    // Close whichever per-barcode streams were actually opened.
    foreach (var d in dic) { if (null != d.Value.Stream) { d.Value.Stream.Close(); } }
  }
  Progress.End();
  return result;
}
/// <summary>
/// Clips tRNA-style 3' additions (CCAA / CCA / conditional CC) from accepted
/// reads, accumulates per-read-length counts into <paramref name="dic"/>, tags
/// each written read name with the clipped suffix, and finally replaces
/// <paramref name="outputFile"/> with the freshly written ".tmp" file. When the
/// count map has a count file, a ".dupcount" companion file is also written.
/// </summary>
/// <param name="accept">Predicate selecting which reads participate.</param>
/// <param name="map">Count map used to weight each read.</param>
/// <param name="outputFile">Destination fastq file.</param>
/// <param name="dic">Per-length count accumulator, keyed by original read length.</param>
private void DoProcess(Func<FastqSequence, bool> accept, SmallRNACountMap map, string outputFile, Dictionary<int, CountItem> dic)
{
  Progress.SetMessage("Processing " + options.InputFile + " and writing to " + outputFile + "...");

  // name -> "originally ended with CCA" flag from the CCA file; used to decide
  // whether a bare trailing "CC" should be clipped.
  var ccaMap = new MapItemReader(0, 1).ReadFromFile(options.CCAFile).ToDictionary(m => m.Key, m => bool.Parse(m.Value.Value));

  var parser = new FastqReader();
  var writer = new FastqWriter();

  StreamWriter swCount = null;
  if (map.HasCountFile)
  {
    swCount = new StreamWriter(outputFile + ".dupcount");
    swCount.WriteLine("Query\tCount\tSequence");
  }

  try
  {
    int readcount = 0;
    var tmpFile = outputFile + ".tmp";
    using (var sr = StreamUtils.GetReader(options.InputFile))
    {
      using (var sw = StreamUtils.GetWriter(tmpFile, outputFile.ToLower().EndsWith(".gz")))
      {
        FastqSequence seq;
        while ((seq = parser.Parse(sr)) != null)
        {
          readcount++;
          if (readcount % 100000 == 0)
          {
            Progress.SetMessage("{0} reads processed", readcount);
          }

          if (!accept(seq))
          {
            continue;
          }

          var name = seq.Name;
          var sequence = seq.SeqString;
          var score = seq.Score;
          var count = map.GetCount(seq.Name);

          CountItem item;
          if (!dic.TryGetValue(sequence.Length, out item))
          {
            item = new CountItem();
            dic[sequence.Length] = item;
          }

          // Decide what (if anything) to clip off the 3' end.
          string clipped;
          if (sequence.EndsWith("CCAA"))
          {
            clipped = "CCAA";
            sequence = sequence.Substring(0, sequence.Length - 4);
            item.CCAA += count;
          }
          else if (sequence.EndsWith("CCA"))
          {
            clipped = "CCA";
            sequence = sequence.Substring(0, sequence.Length - 3);
            item.CCA += count;
          }
          else if (sequence.EndsWith("CC"))
          {
            // A bare "CC" tail is only clipped when the CCA file marks this
            // read as originally CCA-terminated.
            bool isCCA;
            if (ccaMap.TryGetValue(name, out isCCA) && isCCA)
            {
              clipped = "CC";
              sequence = sequence.Substring(0, sequence.Length - 2);
              item.CC += count;
            }
            else
            {
              clipped = string.Empty;
              item.notNTA += count;
            }
          }
          else
          {
            clipped = string.Empty;
            item.notNTA += count;
          }

          if (!string.IsNullOrEmpty(clipped))
          {
            var newlen = sequence.Length;
            seq.SeqString = sequence;
            seq.Score = score.Substring(0, newlen);
            seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
          }
          else
          {
            seq.Reference = string.Format("{0}{1}", name, SmallRNAConsts.NTA_TAG);
          }

          writer.Write(sw, seq);

          // Fix: the dupcount line used to be written twice per read (once
          // before and once after clipping); keep only the post-clip record,
          // which matches the query name/sequence actually written to output.
          if (map.HasCountFile)
          {
            swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
          }
        }
      }
    }

    // Fix: File.Move throws IOException when the destination exists; remove
    // any previous output first (same delete-then-move pattern as the other
    // processors in this file).
    if (File.Exists(outputFile))
    {
      File.Delete(outputFile);
    }
    File.Move(tmpFile, outputFile);
  }
  finally
  {
    if (map.HasCountFile)
    {
      swCount.Close();
    }
  }
}
/// <summary>
/// Copies reads from sourceFile to targetFile, skipping queries listed in
/// exceptQueryNames (and any query already emitted) and reads rejected by
/// Filter. When countFile exists, also writes a ".dupcount" companion file
/// using counts keyed by the NTA-stripped query name.
/// </summary>
/// <returns>The number of reads written to targetFile.</returns>
public int Extract(string sourceFile, string targetFile, IEnumerable <string> exceptQueryNames, string countFile) {
  int result = 0;
  var except = new HashSet <string>(exceptQueryNames);
  SmallRNACountMap cm = new SmallRNACountMap();
  StreamWriter swCount = null;
  if (File.Exists(countFile)) {
    // Re-key the existing counts by the query name without its NTA suffix.
    var oldCm = new SmallRNACountMap(countFile);
    foreach (var c in oldCm.Counts) {
      cm.Counts[c.Key.StringBefore(SmallRNAConsts.NTA_TAG)] = c.Value;
    }
    swCount = new StreamWriter(targetFile + ".dupcount");
  }
  try {
    using (var sw = StreamUtils.GetWriter(targetFile, targetFile.ToLower().EndsWith(".gz"))) {
      using (var sr = StreamUtils.GetReader(sourceFile)) {
        FastqReader reader = new FastqReader();
        FastqWriter writer = new FastqWriter();
        FastqSequence ss;
        var count = 0;
        while ((ss = reader.Parse(sr)) != null) {
          count++;
          if (count % 100000 == 0) {
            Progress.SetMessage("{0} reads", count);
            if (Progress.IsCancellationPending()) { throw new UserTerminatedException(); }
          }
          // Rewrite the header to the NTA-stripped name BEFORE the duplicate
          // check, so all NTA variants of one query collapse to a single read.
          ss.Reference = ss.Name.StringBefore(SmallRNAConsts.NTA_TAG) + " " + ss.Description;
          if (except.Contains(ss.Name)) { continue; }
          if (Filter != null && !Filter.Accept(ss)) { continue; }
          except.Add(ss.Name);
          writer.Write(sw, ss);
          if (swCount != null) {
            // NOTE(review): direct indexing throws KeyNotFoundException when
            // the query is absent from the count file — confirm inputs match.
            swCount.WriteLine("{0}\t{1}", ss.Name, cm.Counts[ss.Name]);
          }
          result++;
        }
      }
    }
  } finally {
    if (swCount != null) { swCount.Close(); }
  }
  return(result);
}
/// <summary>
/// Writes the reads that were NOT mapped to any feature: queries listed in the
/// feature XML file or the exclude file are skipped, every other (accepted,
/// not-yet-seen) read is written to options.OutputFile with its NTA-stripped
/// name; optionally a ".dupcount" companion file is written.
/// </summary>
public override IEnumerable<string> Process() {
  var result = new List<string>();
  var except = new HashSet<string>();
  if (File.Exists(options.XmlFile)) {
    // Exclude reads already mapped to features, regardless of mismatch count.
    var allmapped = new FeatureItemGroupXmlFormat().ReadFromFile(options.XmlFile);
    except.UnionWith(from g in allmapped from f in g from l in f.Locations from sl in l.SamLocations select sl.SamLocation.Parent.Qname.StringBefore(SmallRNAConsts.NTA_TAG));
  }
  if (File.Exists(options.ExcludeFile)) {
    // Additional explicit exclusions, one query name per line.
    except.UnionWith(from l in File.ReadAllLines(options.ExcludeFile) select l.StringBefore(SmallRNAConsts.NTA_TAG));
  }
  SmallRNACountMap cm = options.GetCountMap();
  // Make each NTA-tagged count reachable via its NTA-stripped query name.
  var keys = cm.Counts.Keys.Where(m => m.Contains(SmallRNAConsts.NTA_TAG)).ToArray();
  foreach (var key in keys) {
    cm.Counts[key.StringBefore(SmallRNAConsts.NTA_TAG)] = cm.Counts[key];
  }
  StreamWriter swCount = null;
  if (File.Exists(options.CountFile)) {
    swCount = new StreamWriter(options.OutputFile + ".dupcount");
  }
  Progress.SetMessage("output unmapped query...");
  try {
    using (var sw = StreamUtils.GetWriter(options.OutputFile, options.OutputFile.ToLower().EndsWith(".gz"))) {
      using (var sr = StreamUtils.GetReader(options.InputFile)) {
        FastqReader reader = new FastqReader();
        FastqWriter writer = new FastqWriter();
        FastqSequence ss;
        var count = 0;
        while ((ss = reader.Parse(sr)) != null) {
          count++;
          if (count % 100000 == 0) {
            Progress.SetMessage("{0} reads", count);
            if (Progress.IsCancellationPending()) { throw new UserTerminatedException(); }
          }
          // Rewrite the header to the NTA-stripped name BEFORE the duplicate
          // check, so all NTA variants of one query collapse to a single read.
          ss.Reference = ss.Name.StringBefore(SmallRNAConsts.NTA_TAG) + " " + ss.Description;
          if (except.Contains(ss.Name)) { continue; }
          if (Accept != null && !Accept(ss)) { continue; }
          except.Add(ss.Name);
          writer.Write(sw, ss);
          if (swCount != null) {
            int cmcount;
            if (!cm.Counts.TryGetValue(ss.Name, out cmcount)) {
              throw new Exception(string.Format("Cannot find {0} in count map", ss.Name));
            }
            swCount.WriteLine("{0}\t{1}", ss.Name, cmcount);
          }
        }
      }
    }
  } finally {
    if (swCount != null) { swCount.Close(); }
  }
  Progress.End();
  // NOTE(review): result is returned empty — options.OutputFile is never added,
  // unlike the sibling processors; confirm callers do not rely on the list.
  return result;
}
/// <summary>
/// Copies reads from sourceFile to targetFile, skipping queries in
/// exceptQueryNames (plus any query already written) and reads rejected by
/// Filter. When countFile exists, a ".dupcount" companion file is written with
/// counts keyed by the NTA-stripped query name.
/// </summary>
/// <returns>The number of reads written to targetFile.</returns>
public int Extract(string sourceFile, string targetFile, IEnumerable<string> exceptQueryNames, string countFile)
{
  var written = 0;
  var excluded = new HashSet<string>(exceptQueryNames);
  var countMap = new SmallRNACountMap();

  StreamWriter countWriter = null;
  if (File.Exists(countFile))
  {
    // Re-key the existing counts by the query name without its NTA suffix.
    var previous = new SmallRNACountMap(countFile);
    foreach (var pair in previous.Counts)
    {
      countMap.Counts[pair.Key.StringBefore(SmallRNAConsts.NTA_TAG)] = pair.Value;
    }
    countWriter = new StreamWriter(targetFile + ".dupcount");
  }

  try
  {
    using (var sw = StreamUtils.GetWriter(targetFile, targetFile.ToLower().EndsWith(".gz")))
    using (var sr = StreamUtils.GetReader(sourceFile))
    {
      var reader = new FastqReader();
      var writer = new FastqWriter();
      var processed = 0;
      FastqSequence read;
      while ((read = reader.Parse(sr)) != null)
      {
        processed++;
        if (processed % 100000 == 0)
        {
          Progress.SetMessage("{0} reads", processed);
          if (Progress.IsCancellationPending())
          {
            throw new UserTerminatedException();
          }
        }

        // Rewrite the header to the NTA-stripped name before the duplicate
        // check, so all NTA variants of one query collapse to a single read.
        read.Reference = read.Name.StringBefore(SmallRNAConsts.NTA_TAG) + " " + read.Description;

        if (excluded.Contains(read.Name))
        {
          continue;
        }
        if (Filter != null && !Filter.Accept(read))
        {
          continue;
        }

        excluded.Add(read.Name);
        writer.Write(sw, read);

        if (countWriter != null)
        {
          countWriter.WriteLine("{0}\t{1}", read.Name, countMap.Counts[read.Name]);
        }
        written++;
      }
    }
  }
  finally
  {
    if (countWriter != null)
    {
      countWriter.Close();
    }
  }

  return written;
}
/// <summary>
/// Demultiplexes a fastq file by barcode. A tab/space-separated mapping file
/// (barcode -> output filename) routes each read, matched by the last token of
/// its header line, into its per-barcode output file. Unless
/// options.UntrimTerminalN is set, leading/trailing 'N' bases are trimmed.
/// Reads with unmapped barcodes are only counted. A summary file with
/// per-barcode and unmapped counts is written at the end.
/// </summary>
/// <returns>The per-barcode output file names.</returns>
public override IEnumerable <string> Process() {
  var result = new List <string>();
  Console.WriteLine("Read mapping file " + options.MappingFile + "...");
  var lines = File.ReadAllLines(options.MappingFile).Where(m => !string.IsNullOrWhiteSpace(m)).ToList();
  // Parse "barcode filename" pairs; extra columns are ignored, short lines skipped.
  var map = (from l in lines let parts = (from p in l.Split('\t', ' ') let pp = p.Trim() where pp.Length > 0 select pp).ToArray() where parts.Length > 1 select new BarFile() { Barcode = parts[0], Filename = parts[1] }).ToList();
  // barcode -> output descriptor; streams are opened lazily on first use.
  var dic = (from k in map select new BarFile() { Barcode = k.Barcode, Filename = Path.Combine(options.OutputDirectory, k.Filename), Stream = null, Count = 0 }).ToDictionary(m => m.Barcode);
  Console.WriteLine("There are " + dic.Count.ToString() + " indecies.");
  foreach (var barcode in dic.Keys.OrderBy(m => m)) { Console.WriteLine("{0}\t{1}", barcode, dic[barcode].Filename); }
  try {
    result.AddRange(from d in dic select d.Value.Filename);
    var parser = new FastqReader();
    var writer = new FastqWriter();
    // barcode -> number of reads whose barcode was not in the mapping file
    var unfound = new Dictionary <string, int>();
    int readcount = 0;
    // The barcode is the last non-space token after the final ':' of the header.
    var reg = new Regex(@".+:\s*(\S+?)\s*$");
    Progress.SetMessage("Processing " + Path.GetFullPath(options.InputFile) + " ...");
    using (var sr = StreamUtils.GetReader(options.InputFile)) {
      FastqSequence seq;
      while ((seq = parser.Parse(sr)) != null) {
        //Console.WriteLine("seq = " + seq.Reference);
        readcount++;
        if (readcount % 100000 == 0) { Progress.SetMessage("{0} reads processed", readcount); }
        var m = reg.Match(seq.Reference);
        if (!m.Success) { throw new Exception("Cannot find index from " + seq.Reference); }
        var barcode = m.Groups[1].Value;
        //Console.WriteLine("barcode = " + barcode);
        BarFile file;
        if (dic.TryGetValue(barcode, out file)) {
          // Lazily open the output stream; gzip when the target name ends in .gz.
          if (file.Stream == null) { file.Stream = StreamUtils.GetWriter(file.Filename, file.Filename.ToLower().EndsWith(".gz")); }
          if (!options.UntrimTerminalN) {
            // Trim trailing then leading 'N' bases, keeping scores in sync.
            while (seq.SeqString.Length > 0 && seq.SeqString.Last() == 'N') { seq.SeqString = seq.SeqString.Substring(0, seq.SeqString.Length - 1); seq.Score = seq.Score.Substring(0, seq.Score.Length - 1); }
            while (seq.SeqString.Length > 0 && seq.SeqString.First() == 'N') { seq.SeqString = seq.SeqString.Substring(1); seq.Score = seq.Score.Substring(1); }
          }
          writer.Write(file.Stream, seq);
          file.Count++;
        } else {
          // Barcode not in mapping file: count it for the summary only.
          int count;
          if (unfound.TryGetValue(barcode, out count)) { unfound[barcode] = count + 1; } else { unfound[barcode] = 1;
            //Console.WriteLine("Barcode " + barcode + " is not defined in map file, ignored.");
          }
        }
      }
    }
    // Summary: one line per known barcode, then unmapped barcodes by frequency.
    using (var sw = new StreamWriter(Path.Combine(options.OutputDirectory, options.SummaryFile))) {
      sw.WriteLine("Type\tIndex\tCount");
      foreach (var d in dic.Keys.OrderBy(m => m)) { sw.WriteLine("Sample\t{0}\t{1}", dic[d].Barcode, dic[d].Count); }
      foreach (var d in unfound.OrderByDescending(m => m.Value)) { sw.WriteLine("Unmapped\t{0}\t{1}", d.Key, d.Value); }
    }
  } finally {
    // Close whichever per-barcode streams were actually opened.
    foreach (var d in dic) { if (null != d.Value.Stream) { d.Value.Stream.Close(); } }
  }
  Progress.End();
  return(result);
}
/// <summary>
/// Trims one or more (paired) fastq files in lock-step: optional tail cut
/// (options.Last), optional head cut (options.Start), optional terminal-N
/// trimming, and a minimum-length filter applied across all mates. Verifies
/// that multi-file inputs stay properly paired by read name.
/// </summary>
/// <returns>options.OutputFiles.</returns>
public override IEnumerable<string> Process() {
  var result = new List<string>();
  result.AddRange(options.OutputFiles);
  Progress.SetMessage("Processing " + options.InputFiles.Merge(",") + " and writing to " + options.OutputFiles.Merge(",") + "...");
  var writer = new FastqWriter();
  int readcount = 0;
  var srs = new List<StreamReader>();
  var sws = new List<StreamWriter>();
  var parsers = new FastqReader();
  try {
    Progress.SetMessage("Opening input files ...");
    srs.AddRange(from input in options.InputFiles select StreamUtils.GetReader(input));
    Progress.SetMessage("Opening output files ...");
    sws.AddRange(from output in options.OutputFiles select StreamUtils.GetWriter(output, output.ToLower().EndsWith(".gz") || options.Gzipped));
    Progress.SetMessage("Reading sequences ...");
    while (true) {
      // One read from each input per iteration; all files must end together.
      var seqs = (from sr in srs select parsers.Parse(sr)).ToArray();
      if (seqs.All(m => m == null)) { break; }
      if (seqs.Any(m => m == null)) { throw new Exception("The data is not properly paired :" + (from s in seqs where s != null select s.Name).Merge(" ! ")); }
      if (seqs.Length > 1) {
        // Mate names must agree once read-pair suffixes (/1, /2) are stripped.
        var names = (from seq in seqs select seq.Name.StringBefore(" ").StringBefore("/1").StringBefore("/2")).ToArray();
        if (names.Any(m => !m.Equals(names[0]))) { throw new Exception("The data is not properly paired: " + names.Merge(" ! ")); }
      }
      readcount++;
      if (readcount % 100000 == 0) { Progress.SetMessage("{0} reads processed", readcount); }
      // Keep only the first options.Last characters.
      // NOTE(review): Substring(0, options.Last) throws when a read is shorter
      // than options.Last — confirm inputs are always long enough.
      if (options.Last > 0) {
        seqs.ForEach(seq => { seq.SeqString = seq.SeqString.Substring(0, options.Last); seq.Score = seq.Score.Substring(0, options.Last); });
      }
      // Drop bases before the 1-based start position.
      if (options.Start > 1) {
        seqs.ForEach(seq => { seq.SeqString = seq.SeqString.Substring(options.Start - 1); seq.Score = seq.Score.Substring(options.Start - 1); });
      }
      // Strip leading then trailing 'N' bases, keeping scores in sync.
      if (options.TrimN) {
        seqs.ForEach(seq => {
          while (seq.SeqString.StartsWith("N")) { seq.SeqString = seq.SeqString.Substring(1); seq.Score = seq.Score.Substring(1); }
          while (seq.SeqString.EndsWith("N")) { seq.SeqString = seq.SeqString.Substring(0, seq.SeqString.Length - 1); seq.Score = seq.Score.Substring(0, seq.Score.Length - 1); }
        });
      }
      // Drop the whole pair when any mate fell below the minimum length.
      if (options.MinimumLength > 0 && seqs.Any(m => m.SeqString.Length < options.MinimumLength)) { continue; }
      for (int i = 0; i < seqs.Length; i++) { writer.Write(sws[i], seqs[i]); }
    }
  } finally {
    srs.ForEach(m => m.Close());
    sws.ForEach(m => m.Close());
  }
  Progress.End();
  return result;
}
/// <summary>
/// Detects CCA tails: collects all trimmed reads ending in "CC", then scans the
/// untrimmed file to see whether the base that followed the trimmed sequence
/// was an 'A', writing one "Name\tIsCCA" row per read to options.OutputFile.
/// Throws if any CC-read cannot be located in the untrimmed file (their names
/// are dumped to a ".unfound" file first).
/// </summary>
/// <returns>The single output file.</returns>
public override IEnumerable<string> Process() {
  var parser = new FastqReader();
  Progress.SetMessage("Reading " + options.InputFile + "...");
  // read name -> trimmed sequence, for reads ending with "CC"
  var ccs = new Dictionary<string, string>();
  using (var sr = StreamUtils.GetReader(options.InputFile)) {
    FastqSequence seq;
    int readcount = 0;
    while ((seq = parser.Parse(sr)) != null) {
      readcount++;
      if (readcount % 100000 == 0) { Progress.SetMessage("{0} / {1} reads end with CC found", ccs.Count, readcount); }
      if (seq.SeqString.EndsWith("CC")) { ccs[seq.Name] = seq.SeqString; }
    }
  }
  // NOTE(review): `queries` (and `writer` below) are never used in this method.
  Dictionary<string, FastqSequence> queries = new Dictionary<string, FastqSequence>();
  Progress.SetMessage("Processing " + options.UntrimmedFile + " and writing to " + options.OutputFile + "...");
  var writer = new FastqWriter();
  using (var sr = StreamUtils.GetReader(options.UntrimmedFile)) {
    using (var sw = new StreamWriter(options.OutputFile)) {
      sw.WriteLine("Name\tIsCCA");
      FastqSequence seq;
      int readcount = 0;
      // Only parse untrimmed reads whose name is in the CC set.
      parser.AcceptName = m => ccs.ContainsKey(m);
      while ((seq = parser.Parse(sr)) != null) {
        readcount++;
        if (readcount % 10000 == 0) { Progress.SetMessage("{0} reads end with CC processed", readcount); }
        string sequence = ccs[seq.Name];
        // Locate the trimmed sequence inside the untrimmed read; the very next
        // base decides whether the original tail was CCA.
        var pos = seq.SeqString.IndexOf(sequence);
        if (pos == -1) { throw new Exception(string.Format("Cannot find trimmed sequence {0} in untrimmed sequence {1} of read {2}", sequence, seq.SeqString, seq.Name)); }
        var nextseq = seq.SeqString.Substring(pos + sequence.Length);
        sw.WriteLine("{0}\t{1}", seq.Name, nextseq.StartsWith("A"));
        // Remove handled reads; anything left over was missing upstream.
        ccs.Remove(seq.Name);
      }
    }
  }
  if (ccs.Count != 0) {
    var unfoundFile = options.OutputFile + ".unfound";
    using (var sw = new StreamWriter(unfoundFile)) { ccs.ForEach(m => sw.WriteLine(m.Key)); }
    throw new Exception(string.Format("Cannot find {0} reads in untrimmed file, saved to {1}", ccs.Count, unfoundFile));
  }
  Progress.End();
  return new[] { options.OutputFile };
}