예제 #1
0
        /// <summary>
        /// Maps reads to features through the base implementation, then filters the mapped
        /// locations according to the NTA suffix found in each query name (the text after
        /// <c>SmallRNAConsts.NTA_TAG</c>).
        /// </summary>
        /// <param name="features">Features to map against; any feature left without a mapped location is removed from this list.</param>
        /// <param name="chrStrandReadMap">Read lookup handed unchanged to the base implementation.</param>
        public override void MapReadToFeature(List <FeatureLocation> features, Dictionary <string, Dictionary <char, List <SAMAlignedLocation> > > chrStrandReadMap)
        {
            base.MapReadToFeature(features, chrStrandReadMap);

            if (!allowNTA)
            {
                // NTA is not allowed: every mapped location whose query name carries a
                // non-empty NTA suffix is dropped.
                features.RemoveAll(feature =>
                {
                    feature.SamLocations.RemoveAll(mapped =>
                    {
                        var qname = mapped.SamLocation.Parent.Qname;
                        return qname.Contains(SmallRNAConsts.NTA_TAG) &&
                               qname.StringAfter(SmallRNAConsts.NTA_TAG).Length > 0;
                    });
                    return feature.SamLocations.Count == 0;
                });
                return;
            }

            // NTA is allowed, but it has to sit at the end of the tRNA.
            features.RemoveAll(feature =>
            {
                feature.SamLocations.RemoveAll(mapped =>
                {
                    var sam   = mapped.SamLocation;
                    var qname = sam.Parent.Qname;

                    // Query names without the tag are always kept.
                    if (!qname.Contains(SmallRNAConsts.NTA_TAG))
                    {
                        return false;
                    }

                    // A tag followed by nothing is treated like "no NTA" and kept.
                    var suffix = qname.StringAfter(SmallRNAConsts.NTA_TAG);
                    if (suffix.Length == 0)
                    {
                        return false;
                    }

                    // Drop the location when the read does not end where the feature ends,
                    // or when the suffix is not one of the allowed NTAs.
                    var endMismatch = sam.End != mapped.FeatureLocation.End;
                    if (endMismatch || !allowedNTAs.Contains(suffix))
                    {
                        return true;
                    }

                    // A "CC" suffix is kept only when the untagged query name is listed in cca.
                    if (suffix.Equals("CC"))
                    {
                        return !cca.Contains(qname.StringBefore(SmallRNAConsts.NTA_TAG));
                    }

                    return false;
                });
                return feature.SamLocations.Count == 0;
            });

            SmallRNAUtils.SelectBestMatchedNTA(features);
        }
예제 #2
0
        /// <summary>
        /// Maps reads to the features accepted by <c>Accept</c> and then removes the reads
        /// that were mapped from <paramref name="chrStrandReadMap"/>.
        /// </summary>
        /// <param name="allFeatures">Candidate features; only those for which <c>Accept</c> returns true are used.</param>
        /// <param name="chrStrandReadMap">Read lookup passed to <c>MapReadToFeature</c> and then to <c>SmallRNAUtils.RemoveReadsFromMap</c>.</param>
        public virtual void MapReadToFeatureAndRemoveFromMap(List <FeatureLocation> allFeatures, Dictionary <string, Dictionary <char, List <SAMAlignedLocation> > > chrStrandReadMap)
        {
            var accepted = allFeatures.Where(candidate => Accept(candidate)).ToList();
            Progress.SetMessage("Mapping reads to {0} {1} entries.", accepted.Count, MapperName);

            // Nothing accepted: report zero and leave the read map untouched.
            if (accepted.Count == 0)
            {
                Progress.SetMessage("There are 0 SAM entries mapped to {0} entries.", MapperName);
                return;
            }

            MapReadToFeature(accepted, chrStrandReadMap);

            var mappedReads = SmallRNAUtils.GetMappedReads(accepted);
            Progress.SetMessage("There are {0} SAM entries mapped to {1} entries.", mappedReads.Count, MapperName);

            SmallRNAUtils.RemoveReadsFromMap(chrStrandReadMap, mappedReads);
        }
예제 #3
0
        /// <summary>
        /// Maps reads to features through the base implementation, keeps for every query only
        /// the mapped locations whose offset ranks first in <c>Options.Offsets</c>, and, when
        /// <c>hasNTA</c> is set, removes locations whose NTA suffix is "CCAA" before selecting
        /// the best matched NTA.
        /// </summary>
        /// <param name="features">Features to map against; when <c>hasNTA</c> is set, features left without a mapped location are removed from this list.</param>
        /// <param name="chrStrandReadMap">Read lookup handed unchanged to the base implementation.</param>
        public override void MapReadToFeature(List <FeatureLocation> features, Dictionary <string, Dictionary <char, List <SAMAlignedLocation> > > chrStrandReadMap)
        {
            base.MapReadToFeature(features, chrStrandReadMap);

            // Bucket every mapped location by the query it belongs to; each bucket is
            // snapshotted into an array so the lists can be edited while we walk it.
            var perQuery = features.SelectMany(feature => feature.SamLocations)
                                   .GroupBy(mapped => mapped.SamLocation.Parent)
                                   .Select(bucket => bucket.ToArray())
                                   .ToList();

            // Offset priority: a lower index in Options.Offsets wins.
            foreach (var candidates in perQuery)
            {
                if (candidates.Length != 1)
                {
                    var bestRank = candidates.Min(c => Options.Offsets.IndexOf(c.Offset));
                    foreach (var candidate in candidates)
                    {
                        if (Options.Offsets.IndexOf(candidate.Offset) == bestRank)
                        {
                            continue;
                        }

                        // Unlink the losing location from both sides of the relation.
                        candidate.FeatureLocation.SamLocations.Remove(candidate);
                        candidate.SamLocation.Features.Remove(candidate.FeatureLocation);
                    }
                }
            }

            if (!hasNTA)
            {
                return;
            }

            // The CCAA NTA is designed for tRNA, so it is removed here.
            features.RemoveAll(feature =>
            {
                feature.SamLocations.RemoveAll(mapped => mapped.SamLocation.Parent.Qname.StringAfter(SmallRNAConsts.NTA_TAG).Equals("CCAA"));
                return feature.SamLocations.Count == 0;
            });

            SmallRNAUtils.SelectBestMatchedNTA(features);
        }
        /// <summary>
        /// Reads every count file returned by <c>options.GetCountFiles()</c>, merges the features
        /// they contain by feature name, and writes count tables (per category unless
        /// <c>options.NoCategory</c> is set) plus one table for all groups.
        /// </summary>
        /// <returns>
        /// The values collected from the writers' <c>WriteToFile</c> calls (presumably the paths
        /// of the files written; confirm against the writer classes).
        /// </returns>
        public override IEnumerable <string> Process()
        {
            var countfiles = options.GetCountFiles();

            // Feature name -> merged feature across all samples read so far.
            Dictionary <string, FeatureItem> featureMap = new Dictionary <string, FeatureItem>();
            List <string> samples = new List <string>();

            for (int i = 0; i < countfiles.Count; i++)
            {
                var file = countfiles[i];
                samples.Add(file.Name);

                Progress.SetMessage("Reading {0}/{1} {2}...", i + 1, countfiles.Count, file.File);
                var mapped = new FeatureItemGroupXmlFormat().ReadFromFile(file.File);
                // Tag every query from this file with the sample name it came from.
                mapped.GetQueries().ForEach(m => m.Sample = file.Name);

                //merge data by feature
                foreach (var group in mapped)
                {
                    foreach (var curFeature in group)
                    {
                        FeatureItem existFeature;
                        if (featureMap.TryGetValue(curFeature.Name, out existFeature))
                        {
                            // Feature already seen: merge location by location, keyed by GetLocation().
                            var existLocationMap = existFeature.Locations.ToDictionary(l => l.GetLocation());
                            foreach (var curLocation in curFeature.Locations)
                            {
                                FeatureLocation existLocation;
                                if (existLocationMap.TryGetValue(curLocation.GetLocation(), out existLocation))
                                {
                                    // Same location: append this sample's mapped reads.
                                    existLocation.SamLocations.AddRange(curLocation.SamLocations);
                                }
                                else
                                {
                                    // New location for an existing feature.
                                    existFeature.Locations.Add(curLocation);
                                }
                            }
                        }
                        else // add to feature map
                        {
                            featureMap[curFeature.Name] = curFeature;
                        }
                    }
                }
            }

            var features = featureMap.Values.ToList();

            samples.Sort();

            var allGroups = new List <FeatureItemGroup>();
            var result    = new List <string>();

            // True when every merged feature name starts with the tRNA prefix
            // (also true when there are no features at all).
            var allTRNA = features.All(l => l.Name.StartsWith(SmallRNAConsts.tRNA));

            if (!options.NoCategory)
            {
                // miRNA output is skipped when the data contains nothing but tRNA.
                if (!allTRNA)
                {
                    //output miRNA
                    Progress.SetMessage("Grouping microRNA by sequence ...");
                    var miRNAGroup = features.Where(m => m.Name.StartsWith(SmallRNAConsts.miRNA)).GroupBySequence().OrderByDescending(m => m.GetEstimatedCount()).ThenBy(m => m.Name).ToList();

                    //Progress.SetMessage("Writing microRNA xml file ...");
                    //new FeatureItemGroupXmlFormat().WriteToFile(options.OutputFile + ".miRNA.xml", miRNAGroup);

                    Progress.SetMessage("Writing microRNA ...");
                    var miRNAFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.miRNA + ".count");
                    result.AddRange(new MirnaNTACountTableWriter().WriteToFile(miRNAFile, miRNAGroup, samples, SmallRNAConsts.miRNA + ":"));
                    new SmallRNAPositionWriter().WriteToFile(miRNAFile + ".position", miRNAGroup);
                    allGroups.AddRange(miRNAGroup);
                }

                //output tRNA
                Progress.SetMessage("Grouping tRNA by anticodon ...");
                var tRNAs     = features.Where(m => m.Name.StartsWith(SmallRNAConsts.tRNA)).ToList();
                var tRNAGroup = tRNAs.GroupByFunction(SmallRNAUtils.GetTrnaAnticodon).OrderByDescending(m => m.GetEstimatedCount()).ThenBy(m => m.Name).ToList();
                var tRNAFile  = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".count");
                Progress.SetMessage("Writing tRNA anticodon ...");
                result.AddRange(new TrnaNTACountTableWriter().WriteToFile(tRNAFile, tRNAGroup, samples, SmallRNAConsts.tRNA + ":"));
                Progress.SetMessage("Writing tRNA anticodon position ...");
                new SmallRNAPositionWriter(m => SmallRNAUtils.GetTrnaAnticodon(m[0]), positionByPercentage: true).WriteToFile(tRNAFile + ".position", tRNAGroup);
                new SmallRNAStartPositionWriter(m => SmallRNAUtils.GetTrnaAnticodon(m[0])).WriteToFile(tRNAFile + ".startpos", tRNAGroup);
                // Only the anticodon grouping is added to allGroups; the amino acid
                // grouping below is written to its own files but not added.
                allGroups.AddRange(tRNAGroup);

                //output tRNA aminoacid
                Progress.SetMessage("Grouping tRNA by amino acid ...");
                tRNAGroup = tRNAs.GroupByFunction(SmallRNAUtils.GetTrnaAminoacid, true).OrderByDescending(m => m.GetEstimatedCount()).ThenBy(m => m.Name).ToList();
                tRNAFile  = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".aminoacid.count");
                Progress.SetMessage("Writing tRNA amino acid ...");
                result.AddRange(new SmallRNACountTableWriter().WriteToFile(tRNAFile, tRNAGroup, samples, SmallRNAConsts.tRNA + ":"));
                Progress.SetMessage("Writing tRNA aminoacid position ...");
                new SmallRNAPositionWriter(m => SmallRNAUtils.GetTrnaAminoacid(m[0]), positionByPercentage: true).WriteToFile(tRNAFile + ".position", tRNAGroup);

                if (!allTRNA)
                {
                    // One OutputBiotype call per configured biotype; the last two flags are
                    // false only for rRNA. NOTE(review): their meaning is defined by
                    // OutputBiotype, which is not visible here.
                    var exportBiotypes = SmallRNAUtils.GetOutputBiotypes(options);
                    foreach (var biotype in exportBiotypes)
                    {
                        OutputBiotype(samples, features, allGroups, result, biotype, m => m.StartsWith(biotype), !biotype.Equals(SmallRNABiotype.rRNA.ToString()), !biotype.Equals(SmallRNABiotype.rRNA.ToString()));
                    }

                    // Everything that is neither miRNA, tRNA nor one of the exported biotypes
                    // goes through a final call with an empty biotype name.
                    var biotypes = new[] { SmallRNAConsts.miRNA, SmallRNAConsts.tRNA }.Union(exportBiotypes).ToList();
                    OutputBiotype(samples, features, allGroups, result, "", m => !biotypes.Any(l => m.StartsWith(l)), false, false);
                }
            }
            else
            {
                // No categories: a single grouping by identical query replaces allGroups.
                Progress.SetMessage("Grouping features by identical query ...");
                allGroups = features.GroupByIdenticalQuery().OrderByDescending(m => m.GetEstimatedCount()).ThenBy(m => m.Name).ToList();
            }

            //output all smallRNA
            Progress.SetMessage("Writing all smallRNA ...");
            result.AddRange(new SmallRNACountTableWriter().WriteToFile(options.OutputFile, allGroups, samples, ""));

            Progress.SetMessage("Done ...");
            return(result);
        }
        /// <summary>
        /// Loads the coordinate regions and candidate reads, maps the reads to the regions
        /// (or reloads an existing mapping when <c>options.NotOverwrite</c> is set and the
        /// mapped xml file exists), writes the count files and finally the info file.
        /// </summary>
        /// <returns>
        /// The list of file names added to <c>result</c> along the way: the main output file,
        /// the per-category count files and the mapped xml file.
        /// </returns>
        /// <exception cref="Exception">No coordinate entry was returned by <c>options.GetSequenceRegions()</c>.</exception>
        /// <exception cref="ArgumentException">No read was parsed from <c>options.InputFiles</c>.</exception>
        public override IEnumerable <string> Process()
        {
            var result = new List <string>();

            //read regions
            var featureLocations = options.GetSequenceRegions();

            Progress.SetMessage("There are {0} coordinate entries", featureLocations.Count);
            if (featureLocations.Count == 0)
            {
                throw new Exception(string.Format("No coordinate found in file {0}", options.CoordinateFile));
            }

            // Print the number of regions per category, largest category first.
            var fGroups = featureLocations.GroupBy(l => l.Category).OrderByDescending(l => l.Count()).ToList();

            foreach (var fg in fGroups)
            {
                Console.WriteLine("{0} = {1}", fg.Key, fg.Count());
            }

            // NOTE(review): featureChroms is not read again anywhere in this method.
            var featureChroms = new HashSet <string>(from feature in featureLocations
                                                     select feature.Seqname);

            var resultFilename = options.OutputFile;

            result.Add(resultFilename);

            // Optional CCA list: one entry per line of options.CCAFile when that file exists.
            HashSet <string> cca = new HashSet <string>();

            if (File.Exists(options.CCAFile))
            {
                cca = new HashSet <string>(File.ReadAllLines(options.CCAFile));
            }

            //parsing reads
            List <QueryInfo> totalQueries;
            var reads = ParseCandidates(options.InputFiles, resultFilename, out totalQueries);

            if (reads.Count == 0)
            {
                throw new ArgumentException("No read found in file " + options.InputFiles.Merge(","));
            }

            HashSet <string> excludeQueries = new HashSet <string>();

            if (!string.IsNullOrEmpty(options.ExcludeXml))
            {
                // Drop every read whose query name (text before the NTA tag) appears in the exclude xml.
                Progress.SetMessage("Excluding queries in {0} ...", options.ExcludeXml);
                excludeQueries = new HashSet <string>(from q in MappedItemGroupXmlFileFormat.ReadQueries(options.ExcludeXml)
                                                      select q.StringBefore(SmallRNAConsts.NTA_TAG));
                reads.RemoveAll(m => excludeQueries.Contains(m.Locations.First().Parent.Qname.StringBefore(SmallRNAConsts.NTA_TAG)));
                Progress.SetMessage("Total candidate {0} for mapping ...", reads.Count);
            }

            // miRNA NTA handling is on when any remaining read has a non-empty NTA.
            var hasMicroRnaNTA = reads.Any(l => l.NTA.Length > 0);

            // tRNA NTA handling is additionally on whenever the CCA file exists.
            var hasTrnaNTA = hasMicroRnaNTA || File.Exists(options.CCAFile);

            if (!options.NoCategory)
            {
                //First of all, draw candidate mapping position graph
                var miRNAPositionFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.miRNA + ".candidates.position");
                if (!options.NotOverwrite || !File.Exists(miRNAPositionFile))
                {
                    Progress.SetMessage("Drawing microRNA candidates position pictures...");
                    // Only reads without an NTA are drawn when NTA reads are present.
                    var notNTAreads = hasMicroRnaNTA ? reads.Where(m => m.NTA.Length == 0).ToList() : reads;
                    DrawPositionImage(notNTAreads, featureLocations.Where(m => m.Category.Equals(SmallRNAConsts.miRNA)).ToList(), SmallRNABiotype.miRNA.ToString(), miRNAPositionFile);
                }
            }

            var featureGroups = new List <FeatureItemGroup>();
            var mappedfile    = resultFilename + ".mapped.xml";

            if (File.Exists(mappedfile) && options.NotOverwrite)
            {
                // Reuse the previous mapping instead of mapping again; no count file is rewritten on this path.
                Progress.SetMessage("Reading mapped feature items...");
                featureGroups = new FeatureItemGroupXmlFormat().ReadFromFile(mappedfile);
            }
            else
            {
                Progress.SetMessage("Mapping feature items...");

                //mapping reads to features based on miRNA, tRNA, mt_tRNA and other smallRNA priority
                MapReadToSequenceRegion(featureLocations, reads, cca, hasMicroRnaNTA, hasTrnaNTA);

                // Group locations by feature name, drop features with an estimated count of 0,
                // then combine each feature's locations.
                var featureMapped = featureLocations.GroupByName();
                featureMapped.RemoveAll(m => m.GetEstimatedCount() == 0);
                featureMapped.ForEach(m => m.CombineLocations());

                if (options.NoCategory)
                {
                    featureGroups = featureMapped.GroupByIdenticalQuery();
                }
                else
                {
                    // miRNA: grouped by sequence, written with the miRNA-specific writer.
                    var mirnaGroups = featureMapped.Where(m => m.Name.StartsWith(SmallRNAConsts.miRNA)).GroupBySequence();
                    if (mirnaGroups.Count > 0)
                    {
                        OrderFeatureItemGroup(mirnaGroups);

                        Progress.SetMessage("writing miRNA count ...");

                        var mirnaCountFile = Path.ChangeExtension(resultFilename, "." + SmallRNAConsts.miRNA + ".count");
                        new SmallRNACountMicroRNAWriter(options.Offsets).WriteToFile(mirnaCountFile, mirnaGroups);
                        result.Add(mirnaCountFile);
                        featureGroups.AddRange(mirnaGroups);

                        var positionFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.miRNA + ".position");
                        SmallRNAMappedPositionBuilder.Build(mirnaGroups, Path.GetFileNameWithoutExtension(options.OutputFile), positionFile, m => m[0].Name.StringAfter(":"));
                    }
                    // The groups were already copied into featureGroups above; only this local list is emptied.
                    mirnaGroups.Clear();

                    // tRNA: grouped by anticodon.
                    var trnaCodeGroups = featureMapped.Where(m => m.Name.StartsWith(SmallRNAConsts.tRNA)).GroupByFunction(SmallRNAUtils.GetTrnaAnticodon, false);
                    if (trnaCodeGroups.Count > 0)
                    {
                        OrderFeatureItemGroup(trnaCodeGroups);

                        Progress.SetMessage("writing tRNA code count ...");
                        var trnaCodeCountFile = Path.ChangeExtension(resultFilename, "." + SmallRNAConsts.tRNA + ".count");

                        new FeatureItemGroupCountWriter(m => m.DisplayNameWithoutCategory).WriteToFile(trnaCodeCountFile, trnaCodeGroups);
                        result.Add(trnaCodeCountFile);

                        featureGroups.AddRange(trnaCodeGroups);

                        var positionFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".position");
                        // NOTE(review): the miRNA call above passes GetFileNameWithoutExtension while this one
                        // passes GetFileName; confirm the difference is intended.
                        SmallRNAMappedPositionBuilder.Build(trnaCodeGroups, Path.GetFileName(options.OutputFile), positionFile, m => m[0].Name.StringAfter(":"));
                    }
                    trnaCodeGroups.Clear();

                    // Everything that is neither miRNA nor tRNA: one WriteGroups call per exported
                    // biotype, then one call (biotype null) for features matching none of them.
                    var otherFeatures  = featureMapped.Where(m => !m.Name.StartsWith(SmallRNAConsts.miRNA) && !m.Name.StartsWith(SmallRNAConsts.tRNA)).ToList();
                    var exportBiotypes = SmallRNAUtils.GetOutputBiotypes(options);
                    foreach (var biotype in exportBiotypes)
                    {
                        WriteGroups(result, resultFilename, featureGroups, otherFeatures, biotype);
                    }

                    var leftFeatures = otherFeatures.Where(l => !exportBiotypes.Any(b => l.Name.StartsWith(b))).ToList();
                    WriteGroups(result, resultFilename, featureGroups, leftFeatures, null);
                }

                Progress.SetMessage("writing all smallRNA count ...");
                new FeatureItemGroupCountWriter().WriteToFile(resultFilename, featureGroups);
                // resultFilename was already added near the top, so it appears twice in result on this path.
                result.Add(resultFilename);

                Progress.SetMessage("writing mapping details...");
                new FeatureItemGroupXmlFormatHand().WriteToFile(mappedfile, featureGroups);
            }

            var readSummary = GetReadSummary(featureGroups, excludeQueries, reads, totalQueries);

            WriteInfoFile(result, resultFilename, readSummary, featureGroups);
            result.Add(mappedfile);
            Progress.End();

            return(result);
        }
        /// <summary>
        /// Maps one set of reads to the tRNA regions first, removes the reads that mapped to
        /// tRNA from the second read set, maps what is left to the non-tRNA regions, and writes
        /// the count files, the mapped xml file and the info file. When
        /// <c>options.NotOverwrite</c> is set and the mapped xml file already exists, the
        /// previous mapping is loaded instead.
        /// </summary>
        /// <returns>The file names added to <c>result</c> along the way.</returns>
        /// <exception cref="Exception">No coordinate entry was returned by <c>options.GetSequenceRegions()</c>.</exception>
        public override IEnumerable <string> Process()
        {
            var result = new List <string>();

            //read regions
            var featureLocations = options.GetSequenceRegions();

            Progress.SetMessage("There are {0} coordinate entries", featureLocations.Count);
            if (featureLocations.Count == 0)
            {
                // The placeholder must be {0}: with a single argument, {1} makes string.Format
                // throw FormatException and the intended message is never produced.
                throw new Exception(string.Format("No coordinate found in file {0}", options.CoordinateFile));
            }

            // Split the regions by category: tRNA, miRNA (only used for drawing) and everything that is not tRNA.
            var trnaLocations    = featureLocations.Where(l => l.Category.Equals(SmallRNAConsts.tRNA)).ToList();
            var mirnaLocations   = featureLocations.Where(l => l.Category.Equals(SmallRNAConsts.miRNA)).ToList();
            var notTrnaLocations = featureLocations.Where(l => !l.Category.Equals(SmallRNAConsts.tRNA)).ToList();

            var resultFilename = options.OutputFile;

            result.Add(resultFilename);

            Progress.SetMessage("Parsing tRNA alignment result ...");

            //Parsing reads
            List <QueryInfo> trnaQueries;
            var trnaReads = ParseCandidates(options.InputFiles, resultFilename, out trnaQueries);

            SmallRNAUtils.InitializeSmallRnaNTA(trnaReads);

            // NTA handling is decided from the tRNA read set only and reused for the other reads below.
            var hasNTA = trnaReads.Any(l => l.NTA.Length > 0);

            List <QueryInfo> otherrnaQueries;
            var otherRNAReads = ParseCandidates(options.OtherFile, resultFilename + ".other", out otherrnaQueries);

            SmallRNAUtils.InitializeSmallRnaNTA(otherRNAReads);

            var featureGroups = new List <FeatureItemGroup>();
            var mappedfile    = resultFilename + ".mapped.xml";

            if (File.Exists(mappedfile) && options.NotOverwrite)
            {
                // Reuse the previous mapping; no count file is rewritten on this path.
                Progress.SetMessage("Reading mapped feature items...");
                featureGroups = new FeatureItemGroupXmlFormat().ReadFromFile(mappedfile);
            }
            else
            {
                Progress.SetMessage("Mapping to tRNA...");

                //Draw tRNA mapping position graph
                Progress.SetMessage("Drawing tRNA position pictures...");
                var tRNAPositionFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".position");
                if (!options.NotOverwrite || !File.Exists(tRNAPositionFile))
                {
                    DrawPositionImage(trnaReads, trnaLocations, "tRNA", tRNAPositionFile);
                }

                //Map reads to tRNA
                MapReadToSequenceRegion(trnaLocations, trnaReads, hasNTA);

                // Group by feature name, drop features with an estimated count of 0, combine locations.
                var trnaMapped = trnaLocations.GroupByName();
                trnaMapped.RemoveAll(m => m.GetEstimatedCount() == 0);
                trnaMapped.ForEach(m => m.CombineLocations());

                var trnaGroups = trnaMapped.GroupByIdenticalQuery();
                if (trnaGroups.Count > 0)
                {
                    Progress.SetMessage("Writing tRNA count ...");
                    var trnaCountFile = Path.ChangeExtension(resultFilename, "." + SmallRNAConsts.tRNA + ".count");

                    OrderFeatureItemGroup(trnaGroups);
                    new FeatureItemGroupTIGRTCountWriter().WriteToFile(trnaCountFile, trnaGroups);
                    result.Add(trnaCountFile);

                    featureGroups.AddRange(trnaGroups);
                }

                //Get all queries mapped to tRNA
                var tRNAreads = new HashSet <string>(from read in SmallRNAUtils.GetMappedReads(trnaLocations)
                                                     select read.OriginalQname);

                //Remove all reads mapped to tRNA
                otherRNAReads.RemoveAll(m => tRNAreads.Contains(m.OriginalQname));

                //Draw miRNA mapping position graph
                Progress.SetMessage("Drawing miRNA position pictures...");
                var miRNAPositionFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.miRNA + ".position");
                if (!options.NotOverwrite || !File.Exists(miRNAPositionFile))
                {
                    DrawPositionImage(otherRNAReads, mirnaLocations, "miRNA", miRNAPositionFile);
                }

                //Map reads to not tRNA
                MapReadToSequenceRegion(notTrnaLocations, otherRNAReads, hasNTA);

                var notTrnaMapped = notTrnaLocations.GroupByName();
                notTrnaMapped.RemoveAll(m => m.GetEstimatedCount() == 0);
                notTrnaMapped.ForEach(m => m.CombineLocations());

                // miRNA: grouped by sequence, written with the miRNA-specific writer.
                var mirnaGroups = notTrnaMapped.Where(m => m.Name.StartsWith(SmallRNAConsts.miRNA)).GroupBySequence();
                if (mirnaGroups.Count > 0)
                {
                    Progress.SetMessage("writing miRNA count ...");
                    OrderFeatureItemGroup(mirnaGroups);

                    var mirnaCountFile = Path.ChangeExtension(resultFilename, "." + SmallRNAConsts.miRNA + ".count");
                    new SmallRNACountMicroRNAWriter(options.Offsets).WriteToFile(mirnaCountFile, mirnaGroups);
                    result.Add(mirnaCountFile);
                    featureGroups.AddRange(mirnaGroups);
                }

                // Remaining non-tRNA, non-miRNA features: grouped by identical query.
                var otherGroups = notTrnaMapped.Where(m => !m.Name.StartsWith(SmallRNAConsts.miRNA)).GroupByIdenticalQuery();
                if (otherGroups.Count > 0)
                {
                    Progress.SetMessage("writing other smallRNA count ...");
                    var otherCountFile = Path.ChangeExtension(resultFilename, ".other.count");

                    OrderFeatureItemGroup(otherGroups);
                    new FeatureItemGroupTIGRTCountWriter().WriteToFile(otherCountFile, otherGroups);
                    result.Add(otherCountFile);

                    featureGroups.AddRange(otherGroups);
                }

                Progress.SetMessage("writing all smallRNA count ...");
                new FeatureItemGroupTIGRTCountWriter().WriteToFile(resultFilename, featureGroups);
                // resultFilename was already added near the top, so it appears twice in result on this path.
                result.Add(resultFilename);

                Progress.SetMessage("writing mapping details...");
                new FeatureItemGroupXmlFormat().WriteToFile(mappedfile, featureGroups);
                result.Add(mappedfile);
            }

            var readSummary = GetReadSummary(featureGroups, new HashSet <string>(), trnaReads.Union(otherRNAReads).ToList(), trnaQueries.Union(otherrnaQueries).ToList());

            // NOTE(review): neither total is read afterwards in this method. They are kept because
            // Counts.GetCount is defined elsewhere and may not be side-effect free; confirm and remove.
            var totalQueryCount  = (from q in trnaQueries.Union(otherrnaQueries) select q.Name.StringBefore(SmallRNAConsts.NTA_TAG)).Distinct().Sum(m => Counts.GetCount(m));
            var totalMappedCount = (from q in trnaReads select q.OriginalQname).Union(from q in otherRNAReads select q.OriginalQname).Distinct().Sum(m => Counts.GetCount(m));

            var infoFile = Path.ChangeExtension(resultFilename, ".info");

            WriteSummaryFile(infoFile, readSummary, featureGroups);
            result.Add(infoFile);

            Progress.End();

            return(result);
        }