/// <summary>
/// Returns the count map built from <c>CountFile</c>. The map is created on the
/// first call and the same cached instance is returned on later calls.
/// </summary>
/// <returns>The cached <see cref="SmallRNACountMap"/> instance.</returns>
public virtual SmallRNACountMap GetCountMap()
{
    // Already built: hand back the cached instance.
    if (cm != null)
    {
        return cm;
    }

    cm = new SmallRNACountMap(this.CountFile);
    return cm;
}
/// <summary>
/// Gets the count map for <c>CountFile</c>, constructing it only when no map has
/// been cached yet.
/// </summary>
/// <returns>The cached <see cref="SmallRNACountMap"/> instance.</returns>
public virtual SmallRNACountMap GetCountMap()
{
    // Null-coalescing keeps the cached map, or builds and stores one on first use.
    return cm ?? (cm = new SmallRNACountMap(this.CountFile));
}
/// <summary>
/// Returns the cached count map, building it from <c>CountFile</c> on first use.
/// When the map is built, every key containing <c>SmallRNAConsts.NTA_TAG</c> is also
/// registered under the part of the key that precedes the tag, with the same count.
/// </summary>
/// <returns>The cached <see cref="SmallRNACountMap"/> instance.</returns>
public override SmallRNACountMap GetCountMap()
{
    if (cm != null)
    {
        return cm;
    }

    cm = new SmallRNACountMap(this.CountFile);

    // Snapshot the tagged keys first: the dictionary is modified while aliases are added.
    var taggedKeys = (from k in cm.Counts.Keys
                      where k.Contains(SmallRNAConsts.NTA_TAG)
                      select k).ToArray();

    for (int i = 0; i < taggedKeys.Length; i++)
    {
        var taggedKey = taggedKeys[i];
        cm.Counts[taggedKey.StringBefore(SmallRNAConsts.NTA_TAG)] = cm.Counts[taggedKey];
    }

    return cm;
}
/// <summary>
/// Reads FASTQ records from <c>options.InputFile</c>, clips a trailing CCAA / CCA / CC
/// from each accepted read, tags the read reference with <c>SmallRNAConsts.NTA_TAG</c>
/// and writes the result to <paramref name="outputFile"/> (via a ".tmp" file that is
/// moved into place once writing has finished).
/// </summary>
/// <param name="accept">Predicate deciding whether a read is processed; rejected reads are skipped.</param>
/// <param name="map">Count map used to look up the count of each read by name.</param>
/// <param name="outputFile">Target FASTQ file; gzip output is requested when the name ends with ".gz".</param>
/// <param name="dic">Per-read-length counters (keyed by the unclipped sequence length), updated in place.</param>
private void DoProcess(Func<FastqSequence, bool> accept, SmallRNACountMap map, string outputFile, Dictionary<int, CountItem> dic)
{
    Progress.SetMessage("Processing " + options.InputFile + " and writing to " + outputFile + "...");

    // Read name -> flag telling whether a trailing "CC" should be clipped.
    var ccaMap = new MapItemReader(0, 1).ReadFromFile(options.CCAFile).ToDictionary(m => m.Key, m => bool.Parse(m.Value.Value));

    var parser = new FastqReader();
    var writer = new FastqWriter();

    StreamWriter swCount = null;
    if (map.HasCountFile)
    {
        swCount = new StreamWriter(outputFile + ".dupcount");
        swCount.WriteLine("Query\tCount\tSequence");
    }

    try
    {
        int readcount = 0;
        var tmpFile = outputFile + ".tmp";
        var compress = outputFile.EndsWith(".gz", StringComparison.OrdinalIgnoreCase);

        using (var sr = StreamUtils.GetReader(options.InputFile))
        using (var sw = StreamUtils.GetWriter(tmpFile, compress))
        {
            FastqSequence seq;
            while ((seq = parser.Parse(sr)) != null)
            {
                readcount++;
                if (readcount % 100000 == 0)
                {
                    Progress.SetMessage("{0} reads processed", readcount);
                }

                if (!accept(seq))
                {
                    continue;
                }

                var name = seq.Name;
                var sequence = seq.SeqString;
                var score = seq.Score;
                var count = map.GetCount(seq.Name);

                // Entry for the read as it appears in the input (before clipping).
                if (swCount != null)
                {
                    swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                }

                CountItem item;
                if (!dic.TryGetValue(sequence.Length, out item))
                {
                    item = new CountItem();
                    dic[sequence.Length] = item;
                }

                // Longest suffix wins; "CC" is clipped only when the CCA map flags the read.
                string clipped;
                bool isCCA;
                if (sequence.EndsWith("CCAA", StringComparison.Ordinal))
                {
                    clipped = "CCAA";
                    item.CCAA += count;
                }
                else if (sequence.EndsWith("CCA", StringComparison.Ordinal))
                {
                    clipped = "CCA";
                    item.CCA += count;
                }
                else if (sequence.EndsWith("CC", StringComparison.Ordinal) && ccaMap.TryGetValue(name, out isCCA) && isCCA)
                {
                    clipped = "CC";
                    item.CC += count;
                }
                else
                {
                    clipped = string.Empty;
                    item.notNTA += count;
                }

                if (clipped.Length > 0)
                {
                    sequence = sequence.Substring(0, sequence.Length - clipped.Length);
                    seq.SeqString = sequence;
                    seq.Score = score.Substring(0, sequence.Length);
                    seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
                }
                else
                {
                    seq.Reference = string.Format("{0}{1}", name, SmallRNAConsts.NTA_TAG);
                }

                writer.Write(sw, seq);

                // Second entry for the read as written to the output.
                // NOTE(review): every accepted read therefore appears twice in the
                // dupcount file - confirm this is intended.
                if (swCount != null)
                {
                    swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                }
            }
        }

        // File.Move fails when the destination exists; drop a stale output first so a
        // re-run does not throw after all reads have already been processed.
        if (File.Exists(outputFile))
        {
            File.Delete(outputFile);
        }
        File.Move(tmpFile, outputFile);
    }
    finally
    {
        // Close based on the writer actually opened, not on re-reading map state.
        if (swCount != null)
        {
            swCount.Close();
        }
    }
}
/// <summary>
/// Copies reads from a SAM/BAM source to a FASTQ target, writing each query name at most
/// once and skipping names listed in <paramref name="exceptQueryNames"/> and reads
/// rejected by <c>Filter</c>. Query names are truncated at <c>SmallRNAConsts.NTA_TAG</c>.
/// When <paramref name="countFile"/> exists, a "<c>targetFile</c>.dupcount" file with the
/// count of every written query is produced as well.
/// </summary>
/// <param name="sourceFile">Alignment file opened through <c>SAMFactory.GetReader</c>.</param>
/// <param name="targetFile">Output FASTQ file; gzip output is requested when the name ends with ".gz".</param>
/// <param name="exceptQueryNames">Query names that must not be written.</param>
/// <param name="countFile">Optional count file; ignored when it does not exist.</param>
/// <returns>The number of reads written to <paramref name="targetFile"/>.</returns>
/// <exception cref="KeyNotFoundException">A written query has no entry in the count file.</exception>
/// <exception cref="UserTerminatedException">Cancellation was requested through <c>Progress</c>.</exception>
public int Extract(string sourceFile, string targetFile, IEnumerable<string> exceptQueryNames, string countFile)
{
    int result = 0;

    var except = new HashSet<string>(exceptQueryNames);

    SmallRNACountMap cm = new SmallRNACountMap();
    StreamWriter swCount = null;
    if (File.Exists(countFile))
    {
        // Re-key the counts by the name part in front of the NTA tag.
        var oldCm = new SmallRNACountMap(countFile);
        foreach (var c in oldCm.Counts)
        {
            cm.Counts[c.Key.StringBefore(SmallRNAConsts.NTA_TAG)] = c.Value;
        }
        swCount = new StreamWriter(targetFile + ".dupcount");
    }

    try
    {
        using (var sw = StreamUtils.GetWriter(targetFile, targetFile.EndsWith(".gz", StringComparison.OrdinalIgnoreCase)))
        using (var sr = SAMFactory.GetReader(sourceFile, true))
        {
            string line;
            var count = 0;
            while ((line = sr.ReadLine()) != null)
            {
                count++;
                if (count % 100000 == 0)
                {
                    Progress.SetMessage("{0} reads", count);
                    if (Progress.IsCancellationPending())
                    {
                        throw new UserTerminatedException();
                    }
                }

                var ss = SAMUtils.Parse<SAMItemSlim>(line);
                ss.Qname = ss.Qname.StringBefore(SmallRNAConsts.NTA_TAG);

                if (except.Contains(ss.Qname))
                {
                    continue;
                }

                if (Filter != null && !Filter.Accept(ss))
                {
                    continue;
                }

                // Remember the name so later alignments of the same query are skipped.
                except.Add(ss.Qname);
                ss.WriteFastq(sw);

                if (swCount != null)
                {
                    int dupCount;
                    if (!cm.Counts.TryGetValue(ss.Qname, out dupCount))
                    {
                        // Same exception type as the plain indexer, but naming the query.
                        throw new KeyNotFoundException(string.Format("Cannot find {0} in count map", ss.Qname));
                    }
                    swCount.WriteLine("{0}\t{1}", ss.Qname, dupCount);
                }

                result++;
            }
        }
    }
    finally
    {
        if (swCount != null)
        {
            swCount.Close();
        }
    }

    return result;
}
/// <summary>
/// Scans <c>options.RootDirectory</c> recursively for "*.bam.info" files and compares the
/// "TotalReads" value of each group's first info file with the total of the count file
/// named on its "#countFile" line. Mismatching groups are reported and, when
/// <c>options.PerformUpdate</c> is set, the "TotalReads" line is rewritten.
/// </summary>
/// <returns>
/// In single directory mode the single parent directory; otherwise the keys of the groups
/// whose totals did not match.
/// </returns>
public override IEnumerable<string> Process()
{
    var baminfofiles = Directory.GetFiles(options.RootDirectory, "*.bam.info", SearchOption.AllDirectories);

    // Group by the grandparent directory of each info file.
    var baminfogroup = baminfofiles.GroupBy(m => Path.GetDirectoryName(Path.GetDirectoryName(m))).ToList();

    bool singleDirectory = false;
    string singleDirectoryPath = "";
    var result = new List<string>();
    if (baminfogroup.Count == 1)
    {
        Progress.SetMessage("Single directory mode");
        singleDirectory = true;
        singleDirectoryPath = baminfogroup.First().Key;
        // With a single parent, regroup by the directory that directly holds each file.
        baminfogroup = baminfofiles.GroupBy(m => Path.GetDirectoryName(m)).ToList();
        result.Add(singleDirectoryPath);
    }
    else
    {
        Progress.SetMessage("Multiple directories mode");
    }

    Progress.SetMessage("Total {0} info files in {1} directory/directories found.", baminfofiles.Length, singleDirectory ? 1 : baminfogroup.Count);

    var files = new List<string>();
    int count = 0;
    foreach (var group in baminfogroup)
    {
        count++;
        var file = group.First();
        Progress.SetMessage("{0}/{1} : Checking {2} ...", count, baminfogroup.Count, file);

        var lines = File.ReadAllLines(file);
        var countfileline = lines.FirstOrDefault(m => m.StartsWith("#countFile"));
        if (string.IsNullOrWhiteSpace(countfileline))
        {
            Progress.SetMessage(" not count file used, ignore.");
            continue;
        }

        var countfile = countfileline.StringAfter("\t");
        if (!File.Exists(countfile))
        {
            Progress.SetMessage(" count file {0} not exist, ignore.", countfile);
            continue;
        }

        var countIndex = Array.FindIndex(lines, m => m.StartsWith("TotalReads"));
        if (countIndex < 0)
        {
            // Without a TotalReads line there is nothing to compare; indexing with -1 would throw.
            Progress.SetMessage(" no TotalReads entry found, ignore.");
            continue;
        }

        var totalCountInInfoFile = int.Parse(lines[countIndex].StringAfter("\t"));
        var totalCountInCountFile = new SmallRNACountMap(countfile).GetTotalCount();
        if (totalCountInInfoFile != totalCountInCountFile)
        {
            Progress.SetMessage(" Failed : {0} : {1} => {2}", file, totalCountInInfoFile, totalCountInCountFile);
            foreach (var f in group)
            {
                files.Add(f);
            }

            if (!singleDirectory)
            {
                result.Add(group.Key);
            }

            // In single directory mode the checked file is corrected right away.
            if (options.PerformUpdate && singleDirectory)
            {
                lines[countIndex] = "TotalReads\t" + totalCountInCountFile.ToString();
                File.WriteAllLines(file, lines);
            }
        }
    }

    if (options.PerformUpdate)
    {
        if (!singleDirectory)
        {
            Progress.SetMessage("Updating {0} info files from {1} groups ...", files.Count, result.Count);
            count = 0;
            foreach (var file in files)
            {
                count++;
                Progress.SetMessage("{0}/{1}: updating {2} ...", count, files.Count, file);

                // Only the first file of each group was validated above, so every file
                // is validated again before its TotalReads line is touched.
                var lines = File.ReadAllLines(file);
                var countfileline = lines.FirstOrDefault(m => m.StartsWith("#countFile"));
                if (string.IsNullOrWhiteSpace(countfileline))
                {
                    Progress.SetMessage(" not count file used, ignore.");
                    continue;
                }

                var countfile = countfileline.StringAfter("\t");
                if (!File.Exists(countfile))
                {
                    Progress.SetMessage(" count file {0} not exist, ignore.", countfile);
                    continue;
                }

                var countIndex = Array.FindIndex(lines, m => m.StartsWith("TotalReads"));
                if (countIndex < 0)
                {
                    Progress.SetMessage(" no TotalReads entry found, ignore.");
                    continue;
                }

                var totalCountInInfoFile = int.Parse(lines[countIndex].StringAfter("\t"));
                var totalCountInCountFile = new SmallRNACountMap(countfile).GetTotalCount();
                if (totalCountInInfoFile != totalCountInCountFile)
                {
                    lines[countIndex] = "TotalReads\t" + totalCountInCountFile.ToString();
                    File.WriteAllLines(file, lines);
                }
            }

            Progress.SetMessage("Please redo the category analysis which uses the information from following directoris :");
            foreach (var dir in result)
            {
                Progress.SetMessage(" " + dir);
            }
        }
        else
        {
            Progress.SetMessage("Please redo the category analysis which uses the information from following directory :");
            Progress.SetMessage(" " + singleDirectoryPath);
        }
    }
    else
    {
        if (files.Count > 0)
        {
            Progress.SetMessage("Total {0} info files from {1} groups need to be updated.", files.Count, result.Count);
            foreach (var dir in result)
            {
                Progress.SetMessage(" " + dir);
            }
            Progress.SetMessage("Please redo the smallrna_baminfo_fix with option --update in each directory and redo corresponding category analysis");
        }
        else
        {
            Progress.SetMessage("No failed counting found.");
        }
    }

    return result;
}
/// <summary>
/// Lazily builds the count map from <c>CountFile</c> and caches it. On construction,
/// each key that contains <c>SmallRNAConsts.NTA_TAG</c> gets an alias entry, keyed by the
/// text in front of the tag, that carries the same count.
/// </summary>
/// <returns>The cached <see cref="SmallRNACountMap"/> instance.</returns>
public override SmallRNACountMap GetCountMap()
{
    bool needsBuild = cm == null;
    if (needsBuild)
    {
        cm = new SmallRNACountMap(this.CountFile);

        // Materialise the tagged keys up front; aliases are inserted while looping.
        var ntaKeys = cm.Counts.Keys
            .Where(m => m.Contains(SmallRNAConsts.NTA_TAG))
            .ToList();

        foreach (var ntaKey in ntaKeys)
        {
            var bareKey = ntaKey.StringBefore(SmallRNAConsts.NTA_TAG);
            var ntaCount = cm.Counts[ntaKey];
            cm.Counts[bareKey] = ntaCount;
        }
    }

    return cm;
}
/// <summary>
/// Writes the FASTQ reads of <c>options.InputFile</c> that are not excluded to
/// <c>options.OutputFile</c>. Reads are excluded when their name is listed in the
/// feature XML file or the exclude file, when <c>Accept</c> rejects them, or when a read
/// with the same name was already written. When <c>options.CountFile</c> exists, the
/// count of every written read goes to "<c>OutputFile</c>.dupcount".
/// </summary>
/// <returns>The <c>result</c> list, which this method never adds to.</returns>
/// <exception cref="UserTerminatedException">Cancellation was requested through <c>Progress</c>.</exception>
/// <exception cref="Exception">A written read has no entry in the count map.</exception>
public override IEnumerable<string> Process()
{
    // NOTE(review): nothing is ever added to this list - confirm callers expect it empty.
    var result = new List<string>();

    var excluded = new HashSet<string>();

    if (File.Exists(options.XmlFile))
    {
        // Exclude the reads mapped to features, no matter how many mismatches they have.
        var mappedGroups = new FeatureItemGroupXmlFormat().ReadFromFile(options.XmlFile);
        foreach (var featureGroup in mappedGroups)
        {
            foreach (var feature in featureGroup)
            {
                foreach (var location in feature.Locations)
                {
                    foreach (var samLocation in location.SamLocations)
                    {
                        excluded.Add(samLocation.SamLocation.Parent.Qname.StringBefore(SmallRNAConsts.NTA_TAG));
                    }
                }
            }
        }
    }

    if (File.Exists(options.ExcludeFile))
    {
        foreach (var excludeLine in File.ReadAllLines(options.ExcludeFile))
        {
            excluded.Add(excludeLine.StringBefore(SmallRNAConsts.NTA_TAG));
        }
    }

    // Make every NTA-tagged count reachable through the untagged name as well.
    SmallRNACountMap cm = options.GetCountMap();
    var taggedKeys = cm.Counts.Keys.Where(m => m.Contains(SmallRNAConsts.NTA_TAG)).ToArray();
    for (int i = 0; i < taggedKeys.Length; i++)
    {
        cm.Counts[taggedKeys[i].StringBefore(SmallRNAConsts.NTA_TAG)] = cm.Counts[taggedKeys[i]];
    }

    StreamWriter swCount = File.Exists(options.CountFile)
        ? new StreamWriter(options.OutputFile + ".dupcount")
        : null;

    Progress.SetMessage("output unmapped query...");

    try
    {
        using (var sw = StreamUtils.GetWriter(options.OutputFile, options.OutputFile.ToLower().EndsWith(".gz")))
        using (var sr = StreamUtils.GetReader(options.InputFile))
        {
            FastqReader reader = new FastqReader();
            FastqWriter writer = new FastqWriter();

            var processed = 0;
            for (FastqSequence read = reader.Parse(sr); read != null; read = reader.Parse(sr))
            {
                processed++;
                if (processed % 100000 == 0)
                {
                    Progress.SetMessage("{0} reads", processed);
                    if (Progress.IsCancellationPending())
                    {
                        throw new UserTerminatedException();
                    }
                }

                // Rebuild the reference line from the untagged name plus the description.
                read.Reference = read.Name.StringBefore(SmallRNAConsts.NTA_TAG) + " " + read.Description;

                if (excluded.Contains(read.Name) || (Accept != null && !Accept(read)))
                {
                    continue;
                }

                // Record the name so a later read with the same name is not written again.
                excluded.Add(read.Name);
                writer.Write(sw, read);

                if (swCount == null)
                {
                    continue;
                }

                int cmcount;
                if (!cm.Counts.TryGetValue(read.Name, out cmcount))
                {
                    throw new Exception(string.Format("Cannot find {0} in count map", read.Name));
                }
                swCount.WriteLine("{0}\t{1}", read.Name, cmcount);
            }
        }
    }
    finally
    {
        if (swCount != null)
        {
            swCount.Close();
        }
    }

    Progress.End();

    return result;
}
/// <summary>
/// Reads FASTQ records from <c>options.InputFile</c>, clips a trailing CCAA / CCA / CC
/// from each accepted read, tags the read reference with <c>SmallRNAConsts.NTA_TAG</c>
/// and writes the result to <paramref name="outputFile"/> (via a ".tmp" file that is
/// moved into place once writing has finished).
/// </summary>
/// <param name="accept">Predicate deciding whether a read is processed; rejected reads are skipped.</param>
/// <param name="map">Count map used to look up the count of each read by name.</param>
/// <param name="outputFile">Target FASTQ file; gzip output is requested when the name ends with ".gz".</param>
/// <param name="dic">Per-read-length counters (keyed by the unclipped sequence length), updated in place.</param>
private void DoProcess(Func<FastqSequence, bool> accept, SmallRNACountMap map, string outputFile, Dictionary<int, CountItem> dic)
{
    Progress.SetMessage("Processing " + options.InputFile + " and writing to " + outputFile + "...");

    // Read name -> flag telling whether a trailing "CC" should be clipped.
    var ccaMap = new MapItemReader(0, 1).ReadFromFile(options.CCAFile).ToDictionary(m => m.Key, m => bool.Parse(m.Value.Value));

    var parser = new FastqReader();
    var writer = new FastqWriter();

    StreamWriter swCount = null;
    if (map.HasCountFile)
    {
        swCount = new StreamWriter(outputFile + ".dupcount");
        swCount.WriteLine("Query\tCount\tSequence");
    }

    try
    {
        int readcount = 0;
        var tmpFile = outputFile + ".tmp";
        var compress = outputFile.EndsWith(".gz", StringComparison.OrdinalIgnoreCase);

        using (var sr = StreamUtils.GetReader(options.InputFile))
        using (var sw = StreamUtils.GetWriter(tmpFile, compress))
        {
            FastqSequence seq;
            while ((seq = parser.Parse(sr)) != null)
            {
                readcount++;
                if (readcount % 100000 == 0)
                {
                    Progress.SetMessage("{0} reads processed", readcount);
                }

                if (!accept(seq))
                {
                    continue;
                }

                var name = seq.Name;
                var sequence = seq.SeqString;
                var score = seq.Score;
                var count = map.GetCount(seq.Name);

                // Entry for the read as it appears in the input (before clipping).
                if (swCount != null)
                {
                    swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                }

                CountItem item;
                if (!dic.TryGetValue(sequence.Length, out item))
                {
                    item = new CountItem();
                    dic[sequence.Length] = item;
                }

                // Longest suffix wins; "CC" is clipped only when the CCA map flags the read.
                string clipped;
                bool isCCA;
                if (sequence.EndsWith("CCAA", StringComparison.Ordinal))
                {
                    clipped = "CCAA";
                    item.CCAA += count;
                }
                else if (sequence.EndsWith("CCA", StringComparison.Ordinal))
                {
                    clipped = "CCA";
                    item.CCA += count;
                }
                else if (sequence.EndsWith("CC", StringComparison.Ordinal) && ccaMap.TryGetValue(name, out isCCA) && isCCA)
                {
                    clipped = "CC";
                    item.CC += count;
                }
                else
                {
                    clipped = string.Empty;
                    item.notNTA += count;
                }

                if (clipped.Length > 0)
                {
                    sequence = sequence.Substring(0, sequence.Length - clipped.Length);
                    seq.SeqString = sequence;
                    seq.Score = score.Substring(0, sequence.Length);
                    seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
                }
                else
                {
                    seq.Reference = string.Format("{0}{1}", name, SmallRNAConsts.NTA_TAG);
                }

                writer.Write(sw, seq);

                // Second entry for the read as written to the output.
                // NOTE(review): every accepted read therefore appears twice in the
                // dupcount file - confirm this is intended.
                if (swCount != null)
                {
                    swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
                }
            }
        }

        // File.Move fails when the destination exists; drop a stale output first so a
        // re-run does not throw after all reads have already been processed.
        if (File.Exists(outputFile))
        {
            File.Delete(outputFile);
        }
        File.Move(tmpFile, outputFile);
    }
    finally
    {
        // Close based on the writer actually opened, not on re-reading map state.
        if (swCount != null)
        {
            swCount.Close();
        }
    }
}
/// <summary>
/// Scans <c>options.RootDirectory</c> recursively for "*.bam.info" files and compares the
/// "TotalReads" value of each group's first info file with the total of the count file
/// named on its "#countFile" line. Mismatching groups are reported and, when
/// <c>options.PerformUpdate</c> is set, the "TotalReads" line is rewritten.
/// </summary>
/// <returns>
/// In single directory mode the single parent directory; otherwise the keys of the groups
/// whose totals did not match.
/// </returns>
public override IEnumerable<string> Process()
{
    var baminfofiles = Directory.GetFiles(options.RootDirectory, "*.bam.info", SearchOption.AllDirectories);

    // Group by the grandparent directory of each info file.
    var baminfogroup = baminfofiles.GroupBy(m => Path.GetDirectoryName(Path.GetDirectoryName(m))).ToList();

    bool singleDirectory = false;
    string singleDirectoryPath = "";
    var result = new List<string>();
    if (baminfogroup.Count == 1)
    {
        Progress.SetMessage("Single directory mode");
        singleDirectory = true;
        singleDirectoryPath = baminfogroup.First().Key;
        // With a single parent, regroup by the directory that directly holds each file.
        baminfogroup = baminfofiles.GroupBy(m => Path.GetDirectoryName(m)).ToList();
        result.Add(singleDirectoryPath);
    }
    else
    {
        Progress.SetMessage("Multiple directories mode");
    }

    Progress.SetMessage("Total {0} info files in {1} directory/directories found.", baminfofiles.Length, singleDirectory ? 1 : baminfogroup.Count);

    var files = new List<string>();
    int count = 0;
    foreach (var group in baminfogroup)
    {
        count++;
        var file = group.First();
        Progress.SetMessage("{0}/{1} : Checking {2} ...", count, baminfogroup.Count, file);

        var lines = File.ReadAllLines(file);
        var countfileline = lines.FirstOrDefault(m => m.StartsWith("#countFile"));
        if (string.IsNullOrWhiteSpace(countfileline))
        {
            Progress.SetMessage(" not count file used, ignore.");
            continue;
        }

        var countfile = countfileline.StringAfter("\t");
        if (!File.Exists(countfile))
        {
            Progress.SetMessage(" count file {0} not exist, ignore.", countfile);
            continue;
        }

        var countIndex = Array.FindIndex(lines, m => m.StartsWith("TotalReads"));
        if (countIndex < 0)
        {
            // Without a TotalReads line there is nothing to compare; indexing with -1 would throw.
            Progress.SetMessage(" no TotalReads entry found, ignore.");
            continue;
        }

        var totalCountInInfoFile = int.Parse(lines[countIndex].StringAfter("\t"));
        var totalCountInCountFile = new SmallRNACountMap(countfile).GetTotalCount();
        if (totalCountInInfoFile != totalCountInCountFile)
        {
            Progress.SetMessage(" Failed : {0} : {1} => {2}", file, totalCountInInfoFile, totalCountInCountFile);
            foreach (var f in group)
            {
                files.Add(f);
            }

            if (!singleDirectory)
            {
                result.Add(group.Key);
            }

            // In single directory mode the checked file is corrected right away.
            if (options.PerformUpdate && singleDirectory)
            {
                lines[countIndex] = "TotalReads\t" + totalCountInCountFile.ToString();
                File.WriteAllLines(file, lines);
            }
        }
    }

    if (options.PerformUpdate)
    {
        if (!singleDirectory)
        {
            Progress.SetMessage("Updating {0} info files from {1} groups ...", files.Count, result.Count);
            count = 0;
            foreach (var file in files)
            {
                count++;
                Progress.SetMessage("{0}/{1}: updating {2} ...", count, files.Count, file);

                // Only the first file of each group was validated above, so every file
                // is validated again before its TotalReads line is touched.
                var lines = File.ReadAllLines(file);
                var countfileline = lines.FirstOrDefault(m => m.StartsWith("#countFile"));
                if (string.IsNullOrWhiteSpace(countfileline))
                {
                    Progress.SetMessage(" not count file used, ignore.");
                    continue;
                }

                var countfile = countfileline.StringAfter("\t");
                if (!File.Exists(countfile))
                {
                    Progress.SetMessage(" count file {0} not exist, ignore.", countfile);
                    continue;
                }

                var countIndex = Array.FindIndex(lines, m => m.StartsWith("TotalReads"));
                if (countIndex < 0)
                {
                    Progress.SetMessage(" no TotalReads entry found, ignore.");
                    continue;
                }

                var totalCountInInfoFile = int.Parse(lines[countIndex].StringAfter("\t"));
                var totalCountInCountFile = new SmallRNACountMap(countfile).GetTotalCount();
                if (totalCountInInfoFile != totalCountInCountFile)
                {
                    lines[countIndex] = "TotalReads\t" + totalCountInCountFile.ToString();
                    File.WriteAllLines(file, lines);
                }
            }

            Progress.SetMessage("Please redo the category analysis which uses the information from following directoris :");
            foreach (var dir in result)
            {
                Progress.SetMessage(" " + dir);
            }
        }
        else
        {
            Progress.SetMessage("Please redo the category analysis which uses the information from following directory :");
            Progress.SetMessage(" " + singleDirectoryPath);
        }
    }
    else
    {
        if (files.Count > 0)
        {
            Progress.SetMessage("Total {0} info files from {1} groups need to be updated.", files.Count, result.Count);
            foreach (var dir in result)
            {
                Progress.SetMessage(" " + dir);
            }
            Progress.SetMessage("Please redo the smallrna_baminfo_fix with option --update in each directory and redo corresponding category analysis");
        }
        else
        {
            Progress.SetMessage("No failed counting found.");
        }
    }

    return result;
}
/// <summary>
/// Piles up the aligned reads of <c>options.InputFile</c>, weighting each read by its count
/// from <c>options.CountFile</c>, and writes every completed position that passes the
/// Fisher exact test and the minimum alternative allele frequency as a VCF record to
/// <c>options.OutputFile</c>. Positions inside a "miRNA" region of
/// <c>options.CoordinateFile</c> get filter "PASS" and the region names; all others get
/// "notMiRNA". When <c>options.ExportIgvScript</c> is set, an IGV batch script
/// ("<c>OutputFile</c>.igv") with one snapshot per miRNA position is written too.
/// </summary>
/// <returns>An array containing <c>options.OutputFile</c>.</returns>
/// <exception cref="UserTerminatedException">Cancellation was requested through <c>Progress</c>.</exception>
public override IEnumerable<string> Process()
{
    PileupCountList pc = new PileupCountList();

    var format = options.GetSAMFormat();

    var cm = new SmallRNACountMap(options.CountFile);

    // miRNA regions keyed by sequence name with any "chr" prefix removed, matching
    // the way read locations are named below.
    var srItems = SequenceRegionUtils.GetSequenceRegions(options.CoordinateFile, "miRNA", options.BedAsGtf);
    srItems.ForEach(m => { m.Seqname = m.Seqname.StringAfter("chr"); });
    var srmap = srItems.GroupBy(m => m.Seqname).ToDictionary(m => m.Key, m => m.ToList());

    StreamWriter swScript = null;
    try
    {
        if (options.ExportIgvScript)
        {
            swScript = new StreamWriter(options.OutputFile + ".igv");
            swScript.WriteLine("snapshotDirectory {0}", Path.GetDirectoryName(options.OutputFile).Replace('\\', '/'));
        }

        using (StreamWriter sw = new StreamWriter(options.OutputFile))
        {
            // VCF meta-information: one "##" entry per line, then the tab-delimited
            // column header ending with the sample name.
            sw.WriteLine("##fileformat=VCFv4.2");
            sw.WriteLine("##fileDate={0:yyyyMMdd}", DateTime.Now);
            sw.WriteLine("##source={0}", "PileupCountBuilder");
            sw.WriteLine("##phasing=partial");
            sw.WriteLine("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">");
            sw.WriteLine("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">");
            sw.WriteLine("##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">");
            sw.WriteLine("##INFO=<ID=FP,Number=1,Type=Float,Description=\"Fisher Exact Test P-Value\">");
            sw.WriteLine("##INFO=<ID=MN,Number=.,Type=String,Description=\"miRNA name contains this position\">");
            sw.WriteLine("##FILTER=<ID=FisherET,Description=\"Fisher exact test Pvalue less than {0}\">", options.FisherPValue);
            sw.WriteLine("##FILTER=<ID=AltAlleFreq,Description=\"Alternative allele frequency less than {0}\">", options.MinimumAlternativeAlleleFrequency);
            sw.WriteLine("##FILTER=<ID=notMiRNA,Description=\"Position not located in miRNA locus\">");
            sw.WriteLine("##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">");
            sw.WriteLine("##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Allelic Depth\">");
            sw.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{0}", Path.GetFileNameWithoutExtension(options.InputFile));

            using (var sr = SAMFactory.GetReader(options.InputFile, true))
            {
                int count = 0;
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    count++;

                    if (count % 100 == 0)
                    {
                        if (Progress.IsCancellationPending())
                        {
                            throw new UserTerminatedException();
                        }
                    }

                    if (count % 100000 == 0)
                    {
                        Progress.SetMessage("{0} reads processed", count);
                    }

                    var parts = line.Split('\t');

                    var qname = parts[SAMFormatConst.QNAME_INDEX];
                    var seq = parts[SAMFormatConst.SEQ_INDEX];

                    // Too short.
                    if (seq.Length < options.MinimumReadLength)
                    {
                        continue;
                    }

                    SAMFlags flag = (SAMFlags)int.Parse(parts[SAMFormatConst.FLAG_INDEX]);

                    // Unmapped.
                    if (flag.HasFlag(SAMFlags.UnmappedQuery))
                    {
                        continue;
                    }

                    var cigar = parts[SAMFormatConst.CIGAR_INDEX];

                    // Reads with an insertion or deletion are skipped.
                    if (cigar.Any(m => m == 'I' || m == 'D'))
                    {
                        continue;
                    }

                    var sam = new SAMAlignedItem()
                    {
                        Qname = qname,
                    };

                    bool isReversed = flag.HasFlag(SAMFlags.QueryOnReverseStrand);
                    char strand;
                    if (isReversed)
                    {
                        strand = '-';
                        sam.Sequence = SequenceUtils.GetReverseComplementedSequence(seq);
                    }
                    else
                    {
                        strand = '+';
                        sam.Sequence = seq;
                    }

                    var loc = new SAMAlignedLocation(sam)
                    {
                        Seqname = parts[SAMFormatConst.RNAME_INDEX].StringAfter("chr"),
                        Start = int.Parse(parts[SAMFormatConst.POS_INDEX]),
                        Strand = strand,
                        Cigar = cigar,
                        MismatchPositions = format.GetMismatchPositions(parts),
                        NumberOfMismatch = format.GetNumberOfMismatch(parts),
                        Sequence = seq
                    };

                    loc.ParseEnd(sam.Sequence);
                    sam.AddLocation(loc);

                    if (format.HasAlternativeHits)
                    {
                        format.ParseAlternativeHits(parts, sam);
                    }

                    // Adding a read hands back the positions the pileup has completed.
                    var finished = pc.Add(sam, cm.GetCount(sam.Qname));
                    if (null == finished || 0 == finished.Count)
                    {
                        continue;
                    }

                    foreach (var fin in finished)
                    {
                        var ft = fin.FisherExactTest();
                        if (ft.PValue > options.FisherPValue)
                        {
                            continue;
                        }

                        var total = fin.Sum(m => m.Value);
                        var minallele = total * options.MinimumAlternativeAlleleFrequency;
                        if (ft.Sample2.Failed < minallele)
                        {
                            continue;
                        }

                        // Look the regions up by the chromosome of the finished position,
                        // not by the read that triggered its completion: that read may
                        // already be on another chromosome.
                        // NOTE(review): assumes fin.Chromosome uses the same "chr"-stripped
                        // naming as loc.Seqname - confirm against PileupCountList.
                        List<GtfItem> srs;
                        List<string> ranges = new List<string>();
                        if (srmap.TryGetValue(fin.Chromosome, out srs))
                        {
                            foreach (var seqr in srs)
                            {
                                if (seqr.Contains(fin.Position))
                                {
                                    ranges.Add(seqr.GetNameLocation());
                                }
                            }
                        }

                        // Alternative alleles: everything except the reference base, ordered by base.
                        var alter = (from r in fin
                                     where r.Key != fin.Reference
                                     orderby r.Key
                                     select r).ToList();

                        var str = string.Format("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\tNS={7};DP={8};AF={9};FP={10:0.##E0}{11}\tDP:AD\t{12}:{13},{14}",
                            fin.Chromosome,
                            fin.Position,
                            ".",
                            fin.Reference,
                            (from r in alter select r.Key.ToString()).Merge(","),
                            0,
                            ranges.Count == 0 ? "notMiRNA" : "PASS",
                            1,
                            total,
                            (from r in alter select string.Format("{0:0.###}", r.Value * 1.0 / total)).Merge(","),
                            ft.PValue,
                            ranges.Count == 0 ? "" : ";" + ranges.Merge(","),
                            total,
                            ft.Sample2.Succeed,
                            (from r in alter select r.Value.ToString()).Merge(","));

                        sw.WriteLine(str);

                        if (swScript != null && ranges.Count > 0)
                        {
                            // IGV batch files take one command per line.
                            var snapshotName = ranges[0].Replace('(', '_').Replace(')', '_').Replace(':', '_');
                            swScript.WriteLine("goto {0}:{1}", fin.Chromosome, fin.Position);
                            swScript.WriteLine("sort position");
                            swScript.WriteLine("snapshot {0}_{1}_{2}.png", fin.Chromosome, snapshotName, fin.Position);
                        }
                    }

                    finished.Clear();
                }
            }
        }
    }
    finally
    {
        if (swScript != null)
        {
            swScript.Close();
        }
    }

    return new string[] { options.OutputFile };
}