/// <summary>
/// Builds the read-level summary (total, feature, excluded, genome and too-short
/// read counts) for a finished mapping run.
/// </summary>
/// <param name="allmapped">Feature groups whose SAM locations define which queries count as feature reads.</param>
/// <param name="excludeQueries">Query names whose counts are summed into <c>ExcludeRead</c>.</param>
/// <param name="reads">Aligned reads; not consulted by this method.</param>
/// <param name="totalQueries">All parsed queries; used for the total when no count file exists and for the genome read count.</param>
/// <returns>A newly created <c>ReadSummary</c> with all five counters assigned.</returns>
protected ReadSummary GetReadSummary(List<FeatureItemGroup> allmapped, HashSet<string> excludeQueries, List<SAMAlignedItem> reads, List<QueryInfo> totalQueries)
{
    var summary = new ReadSummary();

    // Prefer the total from the count data when a count file is present on disk;
    // otherwise fall back to the number of parsed queries.
    summary.TotalRead = File.Exists(options.CountFile)
        ? Counts.GetTotalCount()
        : totalQueries.Count;

    // Original query names of every read attached to any location of any mapped feature.
    var featureQnames = new HashSet<string>(
        allmapped
            .SelectMany(featureGroup => featureGroup)
            .SelectMany(feature => feature.Locations)
            .SelectMany(location => location.SamLocations)
            .Select(samLocation => samLocation.SamLocation.Parent.OriginalQname));

    summary.FeatureRead = featureQnames.Sum(qname => Counts.GetCount(qname));
    summary.ExcludeRead = excludeQueries.Sum(qname => Counts.GetCount(qname));

    // Genome reads: queries that either carry no NTA tag or end with it, are not
    // feature reads, have zero mismatches and are at least TooShortReadLength long.
    // Names are reduced to the part before the NTA tag and de-duplicated before summing.
    summary.GenomeRead = totalQueries
        .Where(q => !q.Name.Contains(SmallRNAConsts.NTA_TAG) || q.Name.EndsWith(SmallRNAConsts.NTA_TAG))
        .Select(q => new { Query = q, OriginalQname = q.Name.StringBefore(SmallRNAConsts.NTA_TAG) })
        .Where(x => !featureQnames.Contains(x.OriginalQname)
                    && x.Query.Mismatch == 0
                    && x.Query.Length >= options.TooShortReadLength)
        .Select(x => x.OriginalQname)
        .Distinct()
        .Sum(qname => Counts.GetCount(qname));

    if (Counts.ItemMap == null)
    {
        summary.TooShortRead = 0;
    }
    else
    {
        // NOTE(review): the length cut-off here is the literal 20, not options.TooShortReadLength.
        summary.TooShortRead = Counts.ItemMap.Values
            .Where(item => !featureQnames.Contains(item.Qname) && item.SequenceLength < 20)
            .Select(item => item.Count)
            .Sum();
    }

    return summary;
}
/// <summary>
/// Maps the parsed reads onto the configured sequence regions, writes the count
/// table and the optional companion files (mapping details, length distribution,
/// sequence counts, unmapped FASTQ) and finally an ".info" summary file.
/// </summary>
/// <returns>The paths of the files registered as outputs of this run.</returns>
/// <exception cref="Exception">Thrown when no coordinate entry is available.</exception>
public override IEnumerable<string> Process()
{
    var outputFiles = new List<string>();

    // Load the regions reads will be counted against.
    var regions = options.GetSequenceRegions();
    Progress.SetMessage("There are {0} coordinate entries", regions.Count);
    if (regions.Count == 0)
    {
        var message = string.Format("No coordinate found for {0} in file {1}", options.GtfFeatureName, options.CoordinateFile);
        throw new Exception(message);
    }

    // Index the regions by sequence (chromosome) name.
    featureMap = regions.ToGroupDictionary(region => region.Seqname);

    var countFile = options.OutputFile;
    outputFiles.Add(countFile);

    // Parse the alignment input.
    List<QueryInfo> allQueries;
    var alignedReads = ParseCandidates(options.InputFile, countFile, out allQueries);

    // Total is computed over distinct query names with the NTA suffix stripped.
    int totalReadCount = allQueries
        .Select(query => query.Name.StringBefore(SmallRNAConsts.NTA_TAG))
        .Distinct()
        .Sum(name => Counts.GetCount(name));

    // When the input carries NTA tags but NTA is not requested, keep only the
    // reads whose name ends with the tag.
    if (alignedReads.Count > 0 && alignedReads[0].Qname.Contains(SmallRNAConsts.NTA_TAG) && !options.NTA)
    {
        alignedReads.RemoveAll(read => !read.Qname.EndsWith(SmallRNAConsts.NTA_TAG));
    }

    int mappedReadCount = alignedReads.Sum(read => Counts.GetCount(read.Qname));

    Progress.SetMessage("mapping reads to sequence regions...");
    MapReadToSequenceRegion(regions, alignedReads);

    var featureReadCount = alignedReads
        .Where(read => read.Locations.Any(location => location.Features.Count > 0))
        .Sum(read => read.QueryCount);
    Console.WriteLine("feature reads = {0}", featureReadCount);

    // Collapse regions by name, drop the ones without any estimated count and
    // merge their locations before grouping features that share identical queries.
    var namedFeatures = regions.GroupByName();
    namedFeatures.RemoveAll(feature => feature.GetEstimatedCount() == 0);
    foreach (var feature in namedFeatures)
    {
        feature.CombineLocations();
    }

    var featureGroups = namedFeatures.GroupByIdenticalQuery();

    if (!options.NoMappedFile)
    {
        Progress.SetMessage("output mapping details...");
        var mappedDetailFile = countFile + ".mapped.xml";
        new FeatureItemGroupXmlFormat().WriteToFile(mappedDetailFile, featureGroups);
        outputFiles.Add(mappedDetailFile);
    }

    Progress.SetMessage("write result ...");
    new FeatureItemGroupCountWriter().WriteToFile(countFile, featureGroups);

    if (options.ExportLengthDistribution)
    {
        var lengthFile = countFile + ".length";
        new FeatureItemGroupReadLengthWriter().WriteToFile(lengthFile, featureGroups);
        outputFiles.Add(lengthFile);
    }

    if (options.ExportSequenceCount)
    {
        var sequenceCountFile = countFile + ".seqcount";
        new FeatureItemGroupSequenceWriter().WriteToFile(sequenceCountFile, featureGroups);
        outputFiles.Add(sequenceCountFile);
    }

    if (options.UnmappedFastq)
    {
        Progress.SetMessage("output unmapped query...");
        var unmappedFastqFile = Path.ChangeExtension(countFile, ".unmapped.fastq.gz");

        // Names of reads that have at least one location; passed to the extractor as the exclusion set.
        var mappedNames = new HashSet<string>(
            alignedReads
                .Where(read => read.Locations.Count > 0)
                .Select(read => read.Qname));

        // Extract from the FASTQ file when it exists, otherwise from the alignment input.
        if (File.Exists(options.FastqFile))
        {
            var fastqExtractor = new FastqExtractorFromFastq { Progress = Progress };
            fastqExtractor.Extract(options.FastqFile, unmappedFastqFile, mappedNames, options.CountFile);
        }
        else
        {
            var bamExtractor = new FastqExtractorFromBam() { Progress = Progress };
            bamExtractor.Extract(options.InputFile, unmappedFastqFile, mappedNames, options.CountFile);
        }

        outputFiles.Add(unmappedFastqFile);
    }

    Progress.SetMessage("summarizing ...");
    var summaryFile = Path.ChangeExtension(countFile, ".info");
    using (var writer = new StreamWriter(summaryFile))
    {
        writer.WriteLine("#file\t{0}", options.InputFile);
        writer.WriteLine("#coordinate\t{0}", options.CoordinateFile);
        writer.WriteLine("#minLength\t{0}", options.MinimumReadLength);
        writer.WriteLine("#maxMismatchCount\t{0}", options.MaximumMismatch);
        if (File.Exists(options.CountFile))
        {
            writer.WriteLine("#countFile\t{0}", options.CountFile);
        }

        writer.WriteLine("TotalReads\t{0}", totalReadCount);
        writer.WriteLine("MappedReads\t{0}", mappedReadCount);

        var multipleMappedCount = alignedReads
            .Where(read => read.Locations.Count > 1)
            .Sum(read => read.QueryCount);
        writer.WriteLine("MultipleMappedReads\t{0}", multipleMappedCount);

        writer.WriteLine("FeatureReads\t{0}", featureReadCount);
    }

    outputFiles.Add(summaryFile);

    Progress.End();

    return outputFiles;
}
/// <summary>
/// Counts small RNA reads in two passes: reads from the tRNA alignment are mapped
/// to tRNA regions first; reads from the other alignment that were not already
/// mapped to a tRNA are then mapped to the remaining regions (miRNA and others).
/// Writes per-category count files, the combined count file, the mapping detail
/// XML and an ".info" summary file.
/// </summary>
/// <remarks>
/// When the ".mapped.xml" file already exists and <c>options.NotOverwrite</c> is set,
/// the feature groups are read back from that file and the mapping and count-writing
/// steps are skipped.
/// </remarks>
/// <returns>The paths of the files registered as outputs of this run.</returns>
/// <exception cref="Exception">Thrown when the coordinate file yields no entries.</exception>
public override IEnumerable<string> Process()
{
    var result = new List<string>();

    // Read regions.
    var featureLocations = options.GetSequenceRegions();
    Progress.SetMessage("There are {0} coordinate entries", featureLocations.Count);
    if (featureLocations.Count == 0)
    {
        // Only one argument is supplied, so the placeholder must be {0}; the former {1}
        // made string.Format itself throw a FormatException and hid the real problem.
        throw new Exception(string.Format("No coordinate found in file {0}", options.CoordinateFile));
    }

    var trnaLocations = featureLocations.Where(l => l.Category.Equals(SmallRNAConsts.tRNA)).ToList();
    var mirnaLocations = featureLocations.Where(l => l.Category.Equals(SmallRNAConsts.miRNA)).ToList();
    var notTrnaLocations = featureLocations.Where(l => !l.Category.Equals(SmallRNAConsts.tRNA)).ToList();

    var resultFilename = options.OutputFile;
    result.Add(resultFilename);

    Progress.SetMessage("Parsing tRNA alignment result ...");

    // Parse reads of the tRNA alignment.
    List<QueryInfo> trnaQueries;
    var trnaReads = ParseCandidates(options.InputFiles, resultFilename, out trnaQueries);
    SmallRNAUtils.InitializeSmallRnaNTA(trnaReads);

    // NTA handling in the mapping steps is driven by whether any tRNA read carries an NTA.
    var hasNTA = trnaReads.Any(l => l.NTA.Length > 0);

    // Parse reads of the other (non-tRNA) alignment.
    List<QueryInfo> otherrnaQueries;
    var otherRNAReads = ParseCandidates(options.OtherFile, resultFilename + ".other", out otherrnaQueries);
    SmallRNAUtils.InitializeSmallRnaNTA(otherRNAReads);

    var featureGroups = new List<FeatureItemGroup>();
    var mappedfile = resultFilename + ".mapped.xml";
    if (File.Exists(mappedfile) && options.NotOverwrite)
    {
        Progress.SetMessage("Reading mapped feature items...");
        featureGroups = new FeatureItemGroupXmlFormat().ReadFromFile(mappedfile);
    }
    else
    {
        Progress.SetMessage("Mapping to tRNA...");

        // Draw tRNA mapping position graph.
        Progress.SetMessage("Drawing tRNA position pictures...");
        var tRNAPositionFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".position");
        if (!options.NotOverwrite || !File.Exists(tRNAPositionFile))
        {
            DrawPositionImage(trnaReads, trnaLocations, "tRNA", tRNAPositionFile);
        }

        // Map reads to tRNA.
        MapReadToSequenceRegion(trnaLocations, trnaReads, hasNTA);

        var trnaMapped = trnaLocations.GroupByName();
        trnaMapped.RemoveAll(m => m.GetEstimatedCount() == 0);
        trnaMapped.ForEach(m => m.CombineLocations());

        var trnaGroups = trnaMapped.GroupByIdenticalQuery();
        if (trnaGroups.Count > 0)
        {
            Progress.SetMessage("Writing tRNA count ...");
            var trnaCountFile = Path.ChangeExtension(resultFilename, "." + SmallRNAConsts.tRNA + ".count");

            OrderFeatureItemGroup(trnaGroups);
            new FeatureItemGroupTIGRTCountWriter().WriteToFile(trnaCountFile, trnaGroups);
            result.Add(trnaCountFile);

            featureGroups.AddRange(trnaGroups);
        }

        // Get all queries mapped to tRNA.
        var tRNAreads = new HashSet<string>(from read in SmallRNAUtils.GetMappedReads(trnaLocations)
                                            select read.OriginalQname);

        // Remove all reads mapped to tRNA so they are not counted again below.
        otherRNAReads.RemoveAll(m => tRNAreads.Contains(m.OriginalQname));

        // Draw miRNA mapping position graph.
        Progress.SetMessage("Drawing miRNA position pictures...");
        var miRNAPositionFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.miRNA + ".position");
        if (!options.NotOverwrite || !File.Exists(miRNAPositionFile))
        {
            DrawPositionImage(otherRNAReads, mirnaLocations, "miRNA", miRNAPositionFile);
        }

        // Map the remaining reads to the non-tRNA regions.
        MapReadToSequenceRegion(notTrnaLocations, otherRNAReads, hasNTA);

        var notTrnaMapped = notTrnaLocations.GroupByName();
        notTrnaMapped.RemoveAll(m => m.GetEstimatedCount() == 0);
        notTrnaMapped.ForEach(m => m.CombineLocations());

        // miRNA features (selected by name prefix) are grouped by sequence.
        var mirnaGroups = notTrnaMapped.Where(m => m.Name.StartsWith(SmallRNAConsts.miRNA)).GroupBySequence();
        if (mirnaGroups.Count > 0)
        {
            Progress.SetMessage("writing miRNA count ...");
            OrderFeatureItemGroup(mirnaGroups);

            var mirnaCountFile = Path.ChangeExtension(resultFilename, "." + SmallRNAConsts.miRNA + ".count");
            new SmallRNACountMicroRNAWriter(options.Offsets).WriteToFile(mirnaCountFile, mirnaGroups);
            result.Add(mirnaCountFile);

            featureGroups.AddRange(mirnaGroups);
        }

        // Every other non-tRNA feature is grouped by identical query set.
        var otherGroups = notTrnaMapped.Where(m => !m.Name.StartsWith(SmallRNAConsts.miRNA)).GroupByIdenticalQuery();
        if (otherGroups.Count > 0)
        {
            Progress.SetMessage("writing other smallRNA count ...");
            var otherCountFile = Path.ChangeExtension(resultFilename, ".other.count");

            OrderFeatureItemGroup(otherGroups);
            new FeatureItemGroupTIGRTCountWriter().WriteToFile(otherCountFile, otherGroups);
            result.Add(otherCountFile);

            featureGroups.AddRange(otherGroups);
        }

        Progress.SetMessage("writing all smallRNA count ...");
        new FeatureItemGroupTIGRTCountWriter().WriteToFile(resultFilename, featureGroups);
        // NOTE(review): resultFilename was already registered above; the second entry
        // is kept so the returned list is unchanged for existing callers.
        result.Add(resultFilename);

        Progress.SetMessage("writing mapping details...");
        new FeatureItemGroupXmlFormat().WriteToFile(mappedfile, featureGroups);
        result.Add(mappedfile);
    }

    // Summarize over the union of both read sets and both query sets.
    var readSummary = GetReadSummary(
        featureGroups,
        new HashSet<string>(),
        trnaReads.Union(otherRNAReads).ToList(),
        trnaQueries.Union(otherrnaQueries).ToList());

    var infoFile = Path.ChangeExtension(resultFilename, ".info");
    WriteSummaryFile(infoFile, readSummary, featureGroups);
    result.Add(infoFile);

    Progress.End();

    return result;
}