/// <summary>
/// Builds a <see cref="ReadSummary"/> holding the total, feature, excluded, genome and too-short read counts.
/// </summary>
/// <param name="allmapped">Feature groups; the original query names of every SAM location under them form the set of feature reads.</param>
/// <param name="excludeQueries">Query names whose counts are summed into <c>ExcludeRead</c>.</param>
/// <param name="reads">Not referenced by this method; kept so existing callers keep compiling.</param>
/// <param name="totalQueries">All parsed queries; used for the total (when no count file exists) and for the genome read count.</param>
/// <returns>The populated summary.</returns>
protected ReadSummary GetReadSummary(List<FeatureItemGroup> allmapped, HashSet<string> excludeQueries, List<SAMAlignedItem> reads, List<QueryInfo> totalQueries)
        {
            var result = new ReadSummary();

            // When a count file is present the total comes from Counts; otherwise each query counts once.
            if (File.Exists(options.CountFile))
            {
                result.TotalRead = Counts.GetTotalCount();
            }
            else
            {
                result.TotalRead = totalQueries.Count;
            }

            // Distinct original query names of every read attached to any feature location.
            var featureQueries = new HashSet<string>(from fig in allmapped
                                                     from fi in fig
                                                     from loc in fi.Locations
                                                     from sl in loc.SamLocations
                                                     select sl.SamLocation.Parent.OriginalQname);

            result.FeatureRead = featureQueries.Sum(l => Counts.GetCount(l));

            result.ExcludeRead = excludeQueries.Sum(l => Counts.GetCount(l));

            // Genome reads: queries whose name either lacks the NTA tag or ends with it, that are not
            // feature reads, have zero mismatches and are at least TooShortReadLength long.
            // The name before the NTA tag is used so each original query is counted once.
            result.GenomeRead = (from query in totalQueries
                                 where (!query.Name.Contains(SmallRNAConsts.NTA_TAG) || query.Name.EndsWith(SmallRNAConsts.NTA_TAG))
                                 let originalQname = query.Name.StringBefore(SmallRNAConsts.NTA_TAG)
                                 where !featureQueries.Contains(originalQname) && query.Mismatch == 0 && query.Length >= options.TooShortReadLength
                                 select originalQname).Distinct().Sum(m => Counts.GetCount(m));

            if (Counts.ItemMap != null)
            {
                // Use the same configurable threshold as the genome-read filter above instead of a
                // hard-coded 20, so the "too short" and "genome" buckets stay complementary.
                result.TooShortRead = (from read in Counts.ItemMap.Values
                                       where !featureQueries.Contains(read.Qname) && read.SequenceLength < options.TooShortReadLength
                                       select read.Count).Sum();
            }
            else
            {
                // Without per-read count items there is no sequence length to test.
                result.TooShortRead = 0;
            }

            return result;
        }
Beispiel #2
0
        /// <summary>
        /// Maps the parsed reads to the configured sequence regions, writes the count file and the
        /// optional companion files (mapping details, length distribution, sequence counts,
        /// unmapped FASTQ) and finishes with an ".info" summary file.
        /// </summary>
        /// <returns>Paths of the files recorded during this run, in the order they were added.</returns>
        public override IEnumerable<string> Process()
        {
            var outputFiles = new List<string>();

            // Load the coordinate entries; an empty set is treated as a fatal configuration error.
            var regions = options.GetSequenceRegions();
            Progress.SetMessage("There are {0} coordinate entries", regions.Count);
            if (regions.Count == 0)
            {
                var message = string.Format("No coordinate found for {0} in file {1}", options.GtfFeatureName,
                                            options.CoordinateFile);
                throw new Exception(message);
            }

            // Index the regions by sequence name.
            featureMap = regions.ToGroupDictionary(region => region.Seqname);

            var countFile = options.OutputFile;
            outputFiles.Add(countFile);

            // Parse the candidate reads; the total is summed over distinct names with the NTA suffix stripped.
            List<QueryInfo> allQueries;
            var candidates = ParseCandidates(options.InputFile, countFile, out allQueries);
            int totalReadCount = allQueries
                                 .Select(query => query.Name.StringBefore(SmallRNAConsts.NTA_TAG))
                                 .Distinct()
                                 .Sum(name => Counts.GetCount(name));

            // If the first read name carries the NTA tag and NTA is disabled, keep only reads whose name ends with the tag.
            if (candidates.Count > 0 && candidates[0].Qname.Contains(SmallRNAConsts.NTA_TAG) && !options.NTA)
            {
                candidates.RemoveAll(read => !read.Qname.EndsWith(SmallRNAConsts.NTA_TAG));
            }

            int mappedReadCount = candidates.Sum(read => Counts.GetCount(read.Qname));

            Progress.SetMessage("mapping reads to sequence regions...");
            MapReadToSequenceRegion(regions, candidates);

            // Reads having at least one location that overlaps a feature.
            var featureReadCount = (from read in candidates
                                    where read.Locations.Any(location => location.Features.Count > 0)
                                    select read.QueryCount).Sum();

            Console.WriteLine("feature reads = {0}", featureReadCount);

            // Group by name, drop groups with no estimated count, then merge locations within each group.
            var namedItems = regions.GroupByName();
            namedItems.RemoveAll(item => item.GetEstimatedCount() == 0);
            namedItems.ForEach(item => item.CombineLocations());

            var groups = namedItems.GroupByIdenticalQuery();

            if (!options.NoMappedFile)
            {
                Progress.SetMessage("output mapping details...");
                var detailFile = countFile + ".mapped.xml";
                new FeatureItemGroupXmlFormat().WriteToFile(detailFile, groups);
                outputFiles.Add(detailFile);
            }

            Progress.SetMessage("write result ...");
            new FeatureItemGroupCountWriter().WriteToFile(countFile, groups);

            if (options.ExportLengthDistribution)
            {
                var lengthFile = countFile + ".length";
                new FeatureItemGroupReadLengthWriter().WriteToFile(lengthFile, groups);
                outputFiles.Add(lengthFile);
            }

            if (options.ExportSequenceCount)
            {
                var sequenceFile = countFile + ".seqcount";
                new FeatureItemGroupSequenceWriter().WriteToFile(sequenceFile, groups);
                outputFiles.Add(sequenceFile);
            }

            if (options.UnmappedFastq)
            {
                Progress.SetMessage("output unmapped query...");
                var unmappedFastq = Path.ChangeExtension(countFile, ".unmapped.fastq.gz");

                // Names of reads that have at least one location; passed to the extractor as the exclusion set.
                var mappedNames = new HashSet<string>(candidates
                                                      .Where(read => read.Locations.Count > 0)
                                                      .Select(read => read.Qname));

                // Extract from the FASTQ file when it exists, otherwise from the alignment input.
                if (File.Exists(options.FastqFile))
                {
                    var fastqExtractor = new FastqExtractorFromFastq { Progress = Progress };
                    fastqExtractor.Extract(options.FastqFile, unmappedFastq, mappedNames, options.CountFile);
                }
                else
                {
                    var bamExtractor = new FastqExtractorFromBam { Progress = Progress };
                    bamExtractor.Extract(options.InputFile, unmappedFastq, mappedNames, options.CountFile);
                }

                outputFiles.Add(unmappedFastq);
            }

            Progress.SetMessage("summarizing ...");
            var summaryFile = Path.ChangeExtension(countFile, ".info");

            using (var writer = new StreamWriter(summaryFile))
            {
                writer.WriteLine("#file\t{0}", options.InputFile);
                writer.WriteLine("#coordinate\t{0}", options.CoordinateFile);
                writer.WriteLine("#minLength\t{0}", options.MinimumReadLength);
                writer.WriteLine("#maxMismatchCount\t{0}", options.MaximumMismatch);
                if (File.Exists(options.CountFile))
                {
                    writer.WriteLine("#countFile\t{0}", options.CountFile);
                }

                writer.WriteLine("TotalReads\t{0}", totalReadCount);
                writer.WriteLine("MappedReads\t{0}", mappedReadCount);
                writer.WriteLine("MultipleMappedReads\t{0}", candidates.Where(read => read.Locations.Count > 1).Sum(read => read.QueryCount));
                writer.WriteLine("FeatureReads\t{0}", featureReadCount);
            }

            outputFiles.Add(summaryFile);

            Progress.End();

            return outputFiles;
        }
        /// <summary>
        /// Counts small RNA reads in two passes: reads from <c>options.InputFiles</c> are mapped to tRNA
        /// regions, then reads from <c>options.OtherFile</c> that were not mapped to tRNA are mapped to
        /// the remaining regions. Writes per-category count files, a combined count file, the mapping
        /// details XML and an ".info" summary file.
        /// </summary>
        /// <remarks>
        /// When the ".mapped.xml" file already exists and <c>options.NotOverwrite</c> is set, the feature
        /// groups are read back from that file and the mapping and count-writing steps are skipped.
        /// </remarks>
        /// <returns>Paths of the files recorded during this run, in the order they were added.</returns>
        /// <exception cref="Exception">Thrown when <c>options.GetSequenceRegions()</c> returns no entries.</exception>
        public override IEnumerable<string> Process()
        {
            var result = new List<string>();

            //read regions
            var featureLocations = options.GetSequenceRegions();

            Progress.SetMessage("There are {0} coordinate entries", featureLocations.Count);
            if (featureLocations.Count == 0)
            {
                // The placeholder must be {0}: with a single argument, {1} makes string.Format throw
                // FormatException and hides the real error.
                throw new Exception(string.Format("No coordinate found in file {0}", options.CoordinateFile));
            }

            // Partition the regions by category; miRNA regions are a subset of the non-tRNA regions.
            var trnaLocations    = featureLocations.Where(l => l.Category.Equals(SmallRNAConsts.tRNA)).ToList();
            var mirnaLocations   = featureLocations.Where(l => l.Category.Equals(SmallRNAConsts.miRNA)).ToList();
            var notTrnaLocations = featureLocations.Where(l => !l.Category.Equals(SmallRNAConsts.tRNA)).ToList();

            var resultFilename = options.OutputFile;

            result.Add(resultFilename);

            Progress.SetMessage("Parsing tRNA alignment result ...");

            //Parsing reads
            List<QueryInfo> trnaQueries;
            var trnaReads = ParseCandidates(options.InputFiles, resultFilename, out trnaQueries);

            SmallRNAUtils.InitializeSmallRnaNTA(trnaReads);

            // True when at least one tRNA read has a non-empty NTA; forwarded to both mapping calls below.
            var hasNTA = trnaReads.Any(l => l.NTA.Length > 0);

            List<QueryInfo> otherrnaQueries;
            var otherRNAReads = ParseCandidates(options.OtherFile, resultFilename + ".other", out otherrnaQueries);

            SmallRNAUtils.InitializeSmallRnaNTA(otherRNAReads);

            var featureGroups = new List<FeatureItemGroup>();
            var mappedfile    = resultFilename + ".mapped.xml";

            if (File.Exists(mappedfile) && options.NotOverwrite)
            {
                // Reuse the mapping details from a previous run instead of mapping again.
                Progress.SetMessage("Reading mapped feature items...");
                featureGroups = new FeatureItemGroupXmlFormat().ReadFromFile(mappedfile);
            }
            else
            {
                Progress.SetMessage("Mapping to tRNA...");

                //Draw tRNA mapping position graph
                Progress.SetMessage("Drawing tRNA position pictures...");
                var tRNAPositionFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".position");
                if (!options.NotOverwrite || !File.Exists(tRNAPositionFile))
                {
                    DrawPositionImage(trnaReads, trnaLocations, "tRNA", tRNAPositionFile);
                }

                //Map reads to tRNA
                MapReadToSequenceRegion(trnaLocations, trnaReads, hasNTA);

                // Group by name, drop groups with no estimated count, then merge locations within each group.
                var trnaMapped = trnaLocations.GroupByName();
                trnaMapped.RemoveAll(m => m.GetEstimatedCount() == 0);
                trnaMapped.ForEach(m => m.CombineLocations());

                var trnaGroups = trnaMapped.GroupByIdenticalQuery();
                if (trnaGroups.Count > 0)
                {
                    Progress.SetMessage("Writing tRNA count ...");
                    var trnaCountFile = Path.ChangeExtension(resultFilename, "." + SmallRNAConsts.tRNA + ".count");

                    OrderFeatureItemGroup(trnaGroups);
                    new FeatureItemGroupTIGRTCountWriter().WriteToFile(trnaCountFile, trnaGroups);
                    result.Add(trnaCountFile);

                    featureGroups.AddRange(trnaGroups);
                }

                //Get all queries mapped to tRNA
                var tRNAreads = new HashSet<string>(from read in SmallRNAUtils.GetMappedReads(trnaLocations)
                                                    select read.OriginalQname);

                //Remove all reads mapped to tRNA
                otherRNAReads.RemoveAll(m => tRNAreads.Contains(m.OriginalQname));

                //Draw miRNA mapping position graph
                Progress.SetMessage("Drawing miRNA position pictures...");
                var miRNAPositionFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.miRNA + ".position");
                if (!options.NotOverwrite || !File.Exists(miRNAPositionFile))
                {
                    DrawPositionImage(otherRNAReads, mirnaLocations, "miRNA", miRNAPositionFile);
                }

                //Map reads to not tRNA
                MapReadToSequenceRegion(notTrnaLocations, otherRNAReads, hasNTA);

                var notTrnaMapped = notTrnaLocations.GroupByName();
                notTrnaMapped.RemoveAll(m => m.GetEstimatedCount() == 0);
                notTrnaMapped.ForEach(m => m.CombineLocations());

                // Items whose name starts with the miRNA prefix are grouped by sequence; the rest by identical query.
                var mirnaGroups = notTrnaMapped.Where(m => m.Name.StartsWith(SmallRNAConsts.miRNA)).GroupBySequence();
                if (mirnaGroups.Count > 0)
                {
                    Progress.SetMessage("writing miRNA count ...");
                    OrderFeatureItemGroup(mirnaGroups);

                    var mirnaCountFile = Path.ChangeExtension(resultFilename, "." + SmallRNAConsts.miRNA + ".count");
                    new SmallRNACountMicroRNAWriter(options.Offsets).WriteToFile(mirnaCountFile, mirnaGroups);
                    result.Add(mirnaCountFile);
                    featureGroups.AddRange(mirnaGroups);
                }

                var otherGroups = notTrnaMapped.Where(m => !m.Name.StartsWith(SmallRNAConsts.miRNA)).GroupByIdenticalQuery();
                if (otherGroups.Count > 0)
                {
                    Progress.SetMessage("writing other smallRNA count ...");
                    var otherCountFile = Path.ChangeExtension(resultFilename, ".other.count");

                    OrderFeatureItemGroup(otherGroups);
                    new FeatureItemGroupTIGRTCountWriter().WriteToFile(otherCountFile, otherGroups);
                    result.Add(otherCountFile);

                    featureGroups.AddRange(otherGroups);
                }

                Progress.SetMessage("writing all smallRNA count ...");
                new FeatureItemGroupTIGRTCountWriter().WriteToFile(resultFilename, featureGroups);
                // NOTE(review): resultFilename was already added near the top of this method, so it
                // appears twice in the returned list on this path - confirm whether callers rely on that.
                result.Add(resultFilename);

                Progress.SetMessage("writing mapping details...");
                new FeatureItemGroupXmlFormat().WriteToFile(mappedfile, featureGroups);
                result.Add(mappedfile);
            }

            // Summarize over the union of both read sets and both query sets; no queries are excluded.
            var readSummary = GetReadSummary(featureGroups, new HashSet<string>(), trnaReads.Union(otherRNAReads).ToList(), trnaQueries.Union(otherrnaQueries).ToList());

            var infoFile = Path.ChangeExtension(resultFilename, ".info");

            WriteSummaryFile(infoFile, readSummary, featureGroups);
            result.Add(infoFile);

            Progress.End();

            return result;
        }