/// <summary>
/// Reads an alignment file line by line and groups every mapped query by the
/// reference name it aligned to.
/// </summary>
/// <param name="fileName">Path of the alignment file handed to <c>SAMFactory.GetReader</c>.</param>
/// <returns>
/// One <c>ChromosomeCountSlimItem</c> per reference name, in the order the names were
/// first seen. When <c>options.PreferPrefix</c> is set, items left without any query
/// are dropped.
/// </returns>
/// <exception cref="UserTerminatedException">Cancellation was requested through <c>Progress</c>.</exception>
public List<ChromosomeCountSlimItem> Build(string fileName)
{
    // NOTE(review): nameMap is presumably what GetName consults; it is only refreshed
    // when the map file exists, otherwise its previous value is kept.
    if (File.Exists(options.CategoryMapFile))
    {
        Progress.SetMessage("Reading name map file " + options.CategoryMapFile + " ...");
        nameMap = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile).ToDictionary(m => m.Key, m => m.Value.Value);
    }

    var result = new List<ChromosomeCountSlimItem>();
    var queries = new Dictionary<string, SAMChromosomeItem>();
    var chromosomes = new Dictionary<string, ChromosomeCountSlimItem>();

    Progress.SetMessage("Parsing alignment file " + fileName + " ...");
    using (var sr = SAMFactory.GetReader(fileName, true))
    {
        int count = 0;
        int waitingcount = 0;
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            // Poll for cancellation once every 1000 lines.
            if (count % 1000 == 0 && Progress.IsCancellationPending())
            {
                throw new UserTerminatedException();
            }

            if (count % 100000 == 0 && count > 0)
            {
                Progress.SetMessage("{0} candidates from {1} reads", waitingcount, count);
            }

            count++;
            var parts = line.Split('\t');
            SAMFlags flag = (SAMFlags)int.Parse(parts[SAMFormatConst.FLAG_INDEX]);

            // Unmapped queries carry no reference hit.
            if (flag.HasFlag(SAMFlags.UnmappedQuery))
            {
                continue;
            }

            var qname = parts[SAMFormatConst.QNAME_INDEX];
            SAMChromosomeItem query;
            if (!queries.TryGetValue(qname, out query))
            {
                query = new SAMChromosomeItem();
                query.Qname = qname;
                queries[qname] = query;

                // The sequence is captured only from the first line seen for a query name.
                if (options.KeepSequence)
                {
                    query.Sequence = parts[SAMFormatConst.SEQ_INDEX];
                    if (flag.HasFlag(SAMFlags.QueryOnReverseStrand))
                    {
                        query.Sequence = SequenceUtils.GetReverseComplementedSequence(query.Sequence);
                    }
                }
            }

            var seqname = GetName(parts[SAMFormatConst.RNAME_INDEX]);
            query.Chromosomes.Add(seqname);

            ChromosomeCountSlimItem item;
            if (!chromosomes.TryGetValue(seqname, out item))
            {
                item = new ChromosomeCountSlimItem();
                item.Names.Add(seqname);
                chromosomes[seqname] = item;
                result.Add(item);
            }

            item.Queries.Add(query);
            waitingcount++;
        }

        Progress.SetMessage("Finally, there are {0} candidates from {1} reads", waitingcount, count);
    }

    // A query can hit the same reference on several lines; collapse the repeats.
    foreach (var query in queries.Values)
    {
        query.Chromosomes = query.Chromosomes.Distinct().OrderBy(m => m).ToList();
    }

    foreach (var sam in chromosomes.Values)
    {
        sam.Queries = sam.Queries.Distinct().OrderBy(m => m.Qname).ToList();
    }

    if (!string.IsNullOrEmpty(options.PreferPrefix))
    {
        var prefix = options.PreferPrefix;
        foreach (var query in queries.Values)
        {
            if (!query.Chromosomes.Any(l => l.StartsWith(prefix, StringComparison.Ordinal)))
            {
                continue;
            }

            // The query hits at least one preferred reference: keep those hits and drop
            // the others. (Previously the preferred hits themselves were removed, which
            // erased queries that mapped only to preferred references.)
            var others = query.Chromosomes.Where(l => !l.StartsWith(prefix, StringComparison.Ordinal)).ToArray();
            foreach (var chrom in others)
            {
                chromosomes[chrom].Queries.Remove(query);
                query.Chromosomes.Remove(chrom);
            }
        }

        result.RemoveAll(l => l.Queries.Count == 0);
    }

    return result;
}
/// <summary>
/// Merges the per-sample count files into one table keyed by the first name of each
/// item, writes it out, and then optionally writes a category-level table, a read
/// table and a contig table.
/// </summary>
/// <returns>The output paths handed to the writers, in the order they were produced.</returns>
public override IEnumerable<string> Process()
{
    var result = new List<string>();

    var countFiles = options.GetCountFiles();
    countFiles.Sort((m1, m2) => m1.Name.CompareTo(m2.Name));

    var format = new ChromosomeCountSlimItemXmlFormat(outputSample: true);
    var countMap = new Dictionary<string, ChromosomeCountSlimItem>();

    int fileIndex = 0;
    foreach (var file in countFiles)
    {
        fileIndex++;
        Progress.SetMessage("Reading {0}/{1}: {2} ...", fileIndex, countFiles.Count, file.File);

        var curcounts = format.ReadFromFile(file.File);

        // Spot-check the first query only. The Queries.Count guard keeps an item that
        // carries no query from raising ArgumentOutOfRangeException here.
        if (curcounts.Count > 0
            && curcounts[0].Queries.Count > 0
            && string.IsNullOrEmpty(curcounts[0].Queries[0].Sequence))
        {
            Console.WriteLine("Didn't read in the sequence of query " + curcounts[0].Queries[0].Qname);
        }

        // Tag every query with the sample it came from.
        foreach (var m in curcounts)
        {
            foreach (var q in m.Queries)
            {
                q.Sample = file.Name;
            }
        }

        // Items sharing the same first name are merged into the item seen first.
        foreach (var c in curcounts)
        {
            var name = c.Names.First();
            ChromosomeCountSlimItem item;
            if (countMap.TryGetValue(name, out item))
            {
                item.Queries.AddRange(c.Queries);
            }
            else
            {
                countMap[name] = c;
            }
        }
    }

    var counts = countMap.Values.ToList();
    WriteOutput(options.OutputFile, countFiles, format, counts);
    result.Add(options.OutputFile);

    if (File.Exists(options.CategoryMapFile))
    {
        Progress.SetMessage("Reading category map ...");
        var categoryMap = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile);

        var queries = new HashSet<SAMChromosomeItem>(from c in counts from q in c.Queries select q);
        var dic = new Dictionary<string, ChromosomeCountSlimItem>();
        foreach (var q in queries)
        {
            // NOTE(review): this rewrites q.Chromosomes in place, so the query objects
            // reachable from `counts` carry category names from here on - confirm the
            // read/contig steps below expect that.
            // NOTE(review): the indexer presumably throws for a name missing from the
            // category map - verify against MapItemReader's return type.
            q.Chromosomes = (from chrom in q.Chromosomes select categoryMap[chrom].Value).Distinct().OrderBy(m => m).ToList();

            foreach (var chrom in q.Chromosomes)
            {
                ChromosomeCountSlimItem item;
                if (!dic.TryGetValue(chrom, out item))
                {
                    item = new ChromosomeCountSlimItem();
                    item.Names.Add(chrom);
                    dic[chrom] = item;
                }

                item.Queries.Add(q);
            }
        }

        var catFile = Path.ChangeExtension(options.OutputFile, ".category" + Path.GetExtension(options.OutputFile));
        WriteOutput(catFile, countFiles, format, dic.Values.ToList());
        result.Add(catFile);
    }

    if (options.OutputReadTable || options.OutputReadContigTable)
    {
        Progress.SetMessage("Building sequence map...");
        var reads = SmallRNASequenceUtils.ConvertFrom(counts);

        if (options.OutputReadTable)
        {
            Progress.SetMessage("Saving read file...");
            var readOutput = Path.ChangeExtension(options.OutputFile, ".read" + Path.GetExtension(options.OutputFile));
            new SmallRNASequenceFormat(int.MaxValue, false).WriteToFile(readOutput, reads);
            result.Add(readOutput);
        }

        if (options.OutputReadContigTable)
        {
            Progress.SetMessage("Building sequence contig by similarity ...");
            var contigs = SmallRNASequenceUtils.BuildContigByIdenticalSimilarity(reads, options.MinimumOverlapRate, options.MaximumExtensionBase, progress: Progress);

            Progress.SetMessage("Contig number = {0}", contigs.Count);

            Progress.SetMessage("Saving contig file...");
            var contigOutput = Path.ChangeExtension(options.OutputFile, ".contig" + Path.GetExtension(options.OutputFile));
            new SmallRNASequenceContigFormat().WriteToFile(contigOutput, contigs);
            result.Add(contigOutput);

            Progress.SetMessage("Saving sequence contig details...");
            new SmallRNASequenceContigDetailFormat().WriteToFile(contigOutput + ".details", contigs);
            result.Add(contigOutput + ".details");
        }
    }

    Progress.End();

    return result;
}
/// <summary>
/// Reads an alignment file line by line and groups every mapped query by the
/// reference name it aligned to, optionally restricted to reference names matching
/// <c>options.ChromosomePattern</c>.
/// </summary>
/// <param name="fileName">Path of the alignment file handed to <c>SAMFactory.GetReader</c>.</param>
/// <returns>
/// One <c>ChromosomeCountSlimItem</c> per accepted reference name, in the order the
/// names were first seen. When <c>options.PreferPrefix</c> is set, items left without
/// any query are dropped.
/// </returns>
/// <exception cref="UserTerminatedException">Cancellation was requested through <c>Progress</c>.</exception>
public List<ChromosomeCountSlimItem> Build(string fileName)
{
    // NOTE(review): nameMap is presumably what GetName consults; it is only refreshed
    // when the map file exists, otherwise its previous value is kept.
    if (File.Exists(options.CategoryMapFile))
    {
        Progress.SetMessage("Reading name map file " + options.CategoryMapFile + " ...");
        nameMap = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile).ToDictionary(m => m.Key, m => m.Value.Value);
    }

    var result = new List<ChromosomeCountSlimItem>();
    var queries = new Dictionary<string, SAMChromosomeItem>();
    var chromosomes = new Dictionary<string, ChromosomeCountSlimItem>();

    // An empty pattern accepts every reference name; otherwise the name returned by
    // GetName must match the pattern.
    Func<string, bool> acceptChromosome;
    if (string.IsNullOrEmpty(options.ChromosomePattern))
    {
        acceptChromosome = m => true;
    }
    else
    {
        var chromosomeRegex = new Regex(options.ChromosomePattern);
        acceptChromosome = m => chromosomeRegex.IsMatch(m);
    }

    Progress.SetMessage("Parsing alignment file " + fileName + " ...");
    using (var sr = SAMFactory.GetReader(fileName, true))
    {
        int count = 0;
        int waitingcount = 0;
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            // Poll for cancellation once every 1000 lines.
            if (count % 1000 == 0 && Progress.IsCancellationPending())
            {
                throw new UserTerminatedException();
            }

            if (count % 100000 == 0 && count > 0)
            {
                Progress.SetMessage("{0} candidates from {1} reads", waitingcount, count);
            }

            count++;
            var parts = line.Split('\t');
            SAMFlags flag = (SAMFlags)int.Parse(parts[SAMFormatConst.FLAG_INDEX]);

            // Unmapped queries carry no reference hit.
            if (flag.HasFlag(SAMFlags.UnmappedQuery))
            {
                continue;
            }

            // Filter before touching the query table so rejected hits leave no trace.
            var seqname = GetName(parts[SAMFormatConst.RNAME_INDEX]);
            if (!acceptChromosome(seqname))
            {
                continue;
            }

            var qname = parts[SAMFormatConst.QNAME_INDEX];
            SAMChromosomeItem query;
            if (!queries.TryGetValue(qname, out query))
            {
                query = new SAMChromosomeItem();
                query.Qname = qname;
                queries[qname] = query;

                // The sequence is captured only from the first accepted line of a query.
                if (options.KeepSequence)
                {
                    query.Sequence = parts[SAMFormatConst.SEQ_INDEX];
                    if (flag.HasFlag(SAMFlags.QueryOnReverseStrand))
                    {
                        query.Sequence = SequenceUtils.GetReverseComplementedSequence(query.Sequence);
                    }
                }
            }

            query.Chromosomes.Add(seqname);

            ChromosomeCountSlimItem item;
            if (!chromosomes.TryGetValue(seqname, out item))
            {
                item = new ChromosomeCountSlimItem();
                item.Names.Add(seqname);
                chromosomes[seqname] = item;
                result.Add(item);
            }

            item.Queries.Add(query);
            waitingcount++;
        }

        Progress.SetMessage("Finally, there are {0} candidates from {1} reads", waitingcount, count);
    }

    // A query can hit the same reference on several lines; collapse the repeats.
    foreach (var query in queries.Values)
    {
        query.Chromosomes = query.Chromosomes.Distinct().OrderBy(m => m).ToList();
    }

    foreach (var sam in chromosomes.Values)
    {
        sam.Queries = sam.Queries.Distinct().OrderBy(m => m.Qname).ToList();
    }

    if (!string.IsNullOrEmpty(options.PreferPrefix))
    {
        var prefix = options.PreferPrefix;
        foreach (var query in queries.Values)
        {
            if (!query.Chromosomes.Any(l => l.StartsWith(prefix, StringComparison.Ordinal)))
            {
                continue;
            }

            // The query hits at least one preferred reference: keep those hits and drop
            // the others. (Previously the preferred hits themselves were removed, which
            // erased queries that mapped only to preferred references.)
            var others = query.Chromosomes.Where(l => !l.StartsWith(prefix, StringComparison.Ordinal)).ToArray();
            foreach (var chrom in others)
            {
                chromosomes[chrom].Queries.Remove(query);
                query.Chromosomes.Remove(chrom);
            }
        }

        result.RemoveAll(l => l.Queries.Count == 0);
    }

    return result;
}
/// <summary>
/// Merges the per-sample count files into one table keyed by the first name of each
/// item, writes it out, and then optionally writes a category-level table, a read
/// table and a contig table.
/// </summary>
/// <returns>The output paths handed to the writers, in the order they were produced.</returns>
public override IEnumerable<string> Process()
{
    var result = new List<string>();

    var countFiles = options.GetCountFiles();
    countFiles.Sort((m1, m2) => m1.Name.CompareTo(m2.Name));

    var format = new ChromosomeCountSlimItemXmlFormat(outputSample: true);
    var countMap = new Dictionary<string, ChromosomeCountSlimItem>();

    int fileIndex = 0;
    foreach (var file in countFiles)
    {
        fileIndex++;
        Progress.SetMessage("Reading {0}/{1}: {2} ...", fileIndex, countFiles.Count, file.File);

        var curcounts = format.ReadFromFile(file.File);

        // Only the first query of the first item is inspected. Checking Queries.Count
        // first avoids an ArgumentOutOfRangeException when that item has no queries.
        bool hasFirstQuery = curcounts.Count > 0 && curcounts[0].Queries.Count > 0;
        if (hasFirstQuery && string.IsNullOrEmpty(curcounts[0].Queries[0].Sequence))
        {
            Console.WriteLine("Didn't read in the sequence of query " + curcounts[0].Queries[0].Qname);
        }

        foreach (var c in curcounts)
        {
            // Tag every query with the sample it came from.
            foreach (var q in c.Queries)
            {
                q.Sample = file.Name;
            }
        }

        foreach (var c in curcounts)
        {
            // Items sharing the same first name are merged into the item seen first.
            var name = c.Names.First();
            ChromosomeCountSlimItem existing;
            if (countMap.TryGetValue(name, out existing))
            {
                existing.Queries.AddRange(c.Queries);
            }
            else
            {
                countMap[name] = c;
            }
        }
    }

    var counts = countMap.Values.ToList();
    WriteOutput(options.OutputFile, countFiles, format, counts);
    result.Add(options.OutputFile);

    if (File.Exists(options.CategoryMapFile))
    {
        Progress.SetMessage("Reading category map ...");
        var categoryMap = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile);

        var queries = new HashSet<SAMChromosomeItem>(from c in counts from q in c.Queries select q);
        var dic = new Dictionary<string, ChromosomeCountSlimItem>();
        foreach (var q in queries)
        {
            // NOTE(review): this rewrites q.Chromosomes in place, so the query objects
            // reachable from `counts` carry category names from here on - confirm the
            // read/contig steps below expect that.
            // NOTE(review): the indexer presumably throws for a name missing from the
            // category map - verify against MapItemReader's return type.
            q.Chromosomes = (from chrom in q.Chromosomes select categoryMap[chrom].Value).Distinct().OrderBy(m => m).ToList();

            foreach (var chrom in q.Chromosomes)
            {
                ChromosomeCountSlimItem categoryItem;
                if (!dic.TryGetValue(chrom, out categoryItem))
                {
                    categoryItem = new ChromosomeCountSlimItem();
                    categoryItem.Names.Add(chrom);
                    dic[chrom] = categoryItem;
                }

                categoryItem.Queries.Add(q);
            }
        }

        var catFile = Path.ChangeExtension(options.OutputFile, ".category" + Path.GetExtension(options.OutputFile));
        WriteOutput(catFile, countFiles, format, dic.Values.ToList());
        result.Add(catFile);
    }

    if (options.OutputReadTable || options.OutputReadContigTable)
    {
        Progress.SetMessage("Building sequence map...");
        var reads = SmallRNASequenceUtils.ConvertFrom(counts);

        if (options.OutputReadTable)
        {
            Progress.SetMessage("Saving read file...");
            var readOutput = Path.ChangeExtension(options.OutputFile, ".read" + Path.GetExtension(options.OutputFile));
            new SmallRNASequenceFormat(int.MaxValue, false).WriteToFile(readOutput, reads);
            result.Add(readOutput);
        }

        if (options.OutputReadContigTable)
        {
            Progress.SetMessage("Building sequence contig by similarity ...");
            var contigs = SmallRNASequenceUtils.BuildContigByIdenticalSimilarity(reads, options.MinimumOverlapRate, options.MaximumExtensionBase, progress: Progress);

            Progress.SetMessage("Contig number = {0}", contigs.Count);

            Progress.SetMessage("Saving contig file...");
            var contigOutput = Path.ChangeExtension(options.OutputFile, ".contig" + Path.GetExtension(options.OutputFile));
            new SmallRNASequenceContigFormat().WriteToFile(contigOutput, contigs);
            result.Add(contigOutput);

            Progress.SetMessage("Saving sequence contig details...");
            new SmallRNASequenceContigDetailFormat().WriteToFile(contigOutput + ".details", contigs);
            result.Add(contigOutput + ".details");
        }
    }

    Progress.End();

    return result;
}