/// <summary>
/// Builds a <c>ReadSummary</c> holding the total, feature, excluded, genome and
/// too-short read counts for the current run.
/// </summary>
/// <param name="allmapped">Mapped feature groups; the original query names of their SAM locations define the "feature" reads.</param>
/// <param name="excludeQueries">Query names whose counts are summed into <c>ExcludeRead</c>.</param>
/// <param name="reads">Not used by this method; kept so existing callers keep compiling.</param>
/// <param name="totalQueries">All queries; used for <c>TotalRead</c> (when no count file exists) and for <c>GenomeRead</c>.</param>
/// <returns>The populated summary.</returns>
protected ReadSummary GetReadSummary(List<FeatureItemGroup> allmapped, HashSet<string> excludeQueries, List<SAMAlignedItem> reads, List<QueryInfo> totalQueries)
{
    var result = new ReadSummary();

    // With a count file the total comes from the count table, otherwise from the number of queries.
    if (File.Exists(options.CountFile))
    {
        result.TotalRead = Counts.GetTotalCount();
    }
    else
    {
        result.TotalRead = totalQueries.Count;
    }

    // Original query names of every SAM location attached to a mapped feature.
    var featureQueries = new HashSet<string>(
        from fig in allmapped
        from fi in fig
        from loc in fi.Locations
        from sl in loc.SamLocations
        select sl.SamLocation.Parent.OriginalQname);

    result.FeatureRead = featureQueries.Sum(l => Counts.GetCount(l));
    result.ExcludeRead = excludeQueries.Sum(l => Counts.GetCount(l));

    // Genome reads: queries that either carry no NTA tag or end with the tag, have zero
    // mismatches, are not shorter than the too-short threshold, and whose name before the
    // tag is not already a feature read. Names are de-duplicated before their counts are summed.
    result.GenomeRead = (from query in totalQueries
                         where (!query.Name.Contains(SmallRNAConsts.NTA_TAG) || query.Name.EndsWith(SmallRNAConsts.NTA_TAG))
                         let originalQname = query.Name.StringBefore(SmallRNAConsts.NTA_TAG)
                         where !featureQueries.Contains(originalQname) && query.Mismatch == 0 && query.Length >= options.TooShortReadLength
                         select originalQname).Distinct().Sum(m => Counts.GetCount(m));

    if (Counts.ItemMap != null)
    {
        // Use the same configurable threshold as the GenomeRead filter above (previously a
        // hard-coded 20), so the two categories stay complementary when the option is changed.
        result.TooShortRead = (from read in Counts.ItemMap.Values
                               where !featureQueries.Contains(read.Qname) && read.SequenceLength < options.TooShortReadLength
                               select read.Count).Sum();
    }
    else
    {
        result.TooShortRead = 0;
    }

    return result;
}
/// <summary>
/// Maps the parsed reads onto the configured sequence regions, writes the count file and the
/// optional companion files (mapping details, length distribution, sequence counts, unmapped
/// fastq), and finally writes an ".info" summary file.
/// </summary>
/// <returns>The names of all files added to the result list during the run.</returns>
/// <exception cref="Exception">Thrown when no coordinate entries are available.</exception>
public override IEnumerable<string> Process()
{
    var outputFiles = new List<string>();

    // Load the regions the reads will be assigned to.
    var regions = this.MappedOptions.GetSequenceRegions();
    Progress.SetMessage("There are {0} coordinate entries", regions.Count);
    if (regions.Count == 0)
    {
        throw new Exception(string.Format("No coordinate found for {0} in file {1}", options.GtfFeatureName, options.CoordinateFile));
    }

    var countFilename = options.OutputFile;
    outputFiles.Add(countFilename);

    // Parse candidate reads; totalQueries receives every query seen by the parser.
    List<QueryInfo> totalQueries;
    var reads = ParseCandidates(options.InputFile, countFilename, out totalQueries);

    // When every query produced a read and a count file exists, take the total from the
    // count table; otherwise sum the counts of the distinct tag-stripped query names.
    bool useCountTableTotal = reads.Count == totalQueries.Count && File.Exists(options.CountFile);
    int totalQueryCount = useCountTableTotal
        ? Counts.GetTotalCount()
        : totalQueries
            .Select(query => query.Name.StringBefore(SmallRNAConsts.NTA_TAG))
            .Distinct()
            .Sum(name => Counts.GetCount(name));

    // If the reads carry NTA tags but NTA is not requested, keep only the names ending with the tag.
    if (reads.Count > 0 && reads[0].Qname.Contains(SmallRNAConsts.NTA_TAG) && !options.NTA)
    {
        reads.RemoveAll(read => !read.Qname.EndsWith(SmallRNAConsts.NTA_TAG));
    }

    var totalMappedCount = reads
        .Select(read => read.Qname.StringBefore(SmallRNAConsts.NTA_TAG))
        .Distinct()
        .Sum(name => Counts.GetCount(name));

    Progress.SetMessage("mapping reads to sequence regions...");
    MapReadToSequenceRegion(regions, reads);

    // Reads with at least one location that has a feature attached.
    var featureReadCount = reads
        .Where(read => read.Locations.Any(location => location.Features.Count > 0))
        .Sum(read => read.QueryCount);
    Console.WriteLine("feature reads = {0}", featureReadCount);

    // Group regions by name, drop groups with a zero estimated count, merge their locations.
    var namedItems = regions.GroupByName();
    namedItems.RemoveAll(item => item.GetEstimatedCount() == 0);
    namedItems.ForEach(item => item.CombineLocations());

    var groups = namedItems.GroupByIdenticalQuery();

    if (!options.NoMappedFile)
    {
        Progress.SetMessage("output mapping details...");
        var mappedDetailFile = countFilename + ".mapped.xml";
        new FeatureItemGroupXmlFormat().WriteToFile(mappedDetailFile, groups);
        outputFiles.Add(mappedDetailFile);
    }

    Progress.SetMessage("write result ...");

    // Descending by estimated count.
    groups.Sort((left, right) => right.GetEstimatedCount().CompareTo(left.GetEstimatedCount()));
    new FeatureItemGroupCountWriter().WriteToFile(countFilename, groups);

    if (options.ExportLengthDistribution)
    {
        var lengthFile = countFilename + ".length";
        new FeatureItemGroupReadLengthWriter().WriteToFile(lengthFile, groups);
        outputFiles.Add(lengthFile);
    }

    if (options.ExportSequenceCount)
    {
        var sequenceCountFile = countFilename + ".seqcount";
        new FeatureItemGroupSequenceWriter().WriteToFile(sequenceCountFile, groups);
        outputFiles.Add(sequenceCountFile);
    }

    if (options.UnmappedFastq)
    {
        Progress.SetMessage("output unmapped query...");
        var unmappedFile = Path.ChangeExtension(countFilename, ".unmapped.fastq.gz");

        // Names of reads that have at least one location; passed to the extractor together with the count file.
        var mappedNames = new HashSet<string>(
            reads.Where(read => read.Locations.Count > 0).Select(read => read.Qname));

        if (File.Exists(options.FastqFile))
        {
            var fastqExtractor = new FastqExtractorFromFastq { Progress = Progress };
            fastqExtractor.Extract(options.FastqFile, unmappedFile, mappedNames, options.CountFile);
        }
        else
        {
            var bamExtractor = new FastqExtractorFromBam() { Progress = Progress };
            bamExtractor.Extract(options.InputFile, unmappedFile, mappedNames, options.CountFile);
        }

        outputFiles.Add(unmappedFile);
    }

    Progress.SetMessage("summarizing ...");
    var infoFile = Path.ChangeExtension(countFilename, ".info");
    using (var writer = new StreamWriter(infoFile))
    {
        // Header lines describing the run parameters.
        writer.WriteLine("#file\t{0}", options.InputFile);
        writer.WriteLine("#coordinate\t{0}", options.CoordinateFile);
        writer.WriteLine("#minLength\t{0}", options.MinimumReadLength);
        writer.WriteLine("#maxMismatchCount\t{0}", options.MaximumMismatch);
        if (File.Exists(options.CountFile))
        {
            writer.WriteLine("#countFile\t{0}", options.CountFile);
        }

        // Summary counts.
        writer.WriteLine("TotalReads\t{0}", totalQueryCount);
        writer.WriteLine("MappedReads\t{0}", totalMappedCount);
        writer.WriteLine("MultipleMappedReads\t{0}", reads.Where(read => read.Locations.Count > 1).Sum(read => read.QueryCount));
        writer.WriteLine("FeatureReads\t{0}", featureReadCount);
    }
    outputFiles.Add(infoFile);

    Progress.End();

    return outputFiles;
}