/// <summary>
/// Maps reads to the features through the base implementation and then filters
/// the alignments whose query name carries an NTA suffix (the text following
/// <c>SmallRNAConsts.NTA_TAG</c>).
/// </summary>
/// <param name="features">Features to map; entries left without any alignment are removed from this list.</param>
/// <param name="chrStrandReadMap">Reads indexed by chromosome and strand; passed to the base implementation unchanged.</param>
public override void MapReadToFeature(List<FeatureLocation> features, Dictionary<string, Dictionary<char, List<SAMAlignedLocation>>> chrStrandReadMap)
{
    base.MapReadToFeature(features, chrStrandReadMap);

    var tag = SmallRNAConsts.NTA_TAG;

    if (!allowNTA)
    {
        // NTA is not allowed: every alignment whose query has a non-empty NTA is dropped.
        foreach (var feature in features)
        {
            feature.SamLocations.RemoveAll(fsl =>
            {
                var qname = fsl.SamLocation.Parent.Qname;
                return qname.Contains(tag) && qname.StringAfter(tag).Length > 0;
            });
        }

        features.RemoveAll(feature => feature.SamLocations.Count == 0);
        return;
    }

    // NTA is allowed, but only when the alignment ends exactly at the feature end.
    foreach (var feature in features)
    {
        feature.SamLocations.RemoveAll(fsl =>
        {
            var aligned = fsl.SamLocation;
            var qname = aligned.Parent.Qname;

            // Queries without a tag, or with an empty NTA, are always kept.
            if (!qname.Contains(tag))
            {
                return false;
            }

            var nta = qname.StringAfter(tag);
            if (nta.Length == 0)
            {
                return false;
            }

            // Drop when the read does not end at the feature end or the NTA is not in the allowed set.
            var endsAtFeatureEnd = aligned.End == fsl.FeatureLocation.End;
            if (!(endsAtFeatureEnd && allowedNTAs.Contains(nta)))
            {
                return true;
            }

            // A "CC" NTA is kept only when the untagged query name is listed in the cca set.
            return nta.Equals("CC") && !cca.Contains(qname.StringBefore(tag));
        });
    }

    features.RemoveAll(feature => feature.SamLocations.Count == 0);

    SmallRNAUtils.SelectBestMatchedNTA(features);
}
/// <summary>
/// Maps reads to the features accepted by this mapper and then removes the
/// mapped reads from <paramref name="chrStrandReadMap"/>.
/// </summary>
/// <param name="allFeatures">All candidate features; only those for which <c>Accept</c> returns true are used.</param>
/// <param name="chrStrandReadMap">Reads indexed by chromosome and strand; mapped reads are removed from it.</param>
public virtual void MapReadToFeatureAndRemoveFromMap(List<FeatureLocation> allFeatures, Dictionary<string, Dictionary<char, List<SAMAlignedLocation>>> chrStrandReadMap)
{
    var accepted = new List<FeatureLocation>();
    foreach (var candidate in allFeatures)
    {
        if (Accept(candidate))
        {
            accepted.Add(candidate);
        }
    }

    Progress.SetMessage("Mapping reads to {0} {1} entries.", accepted.Count, MapperName);

    // Nothing accepted: report zero and leave the read map untouched.
    if (accepted.Count == 0)
    {
        Progress.SetMessage("There are 0 SAM entries mapped to {0} entries.", MapperName);
        return;
    }

    MapReadToFeature(accepted, chrStrandReadMap);

    var mappedReads = SmallRNAUtils.GetMappedReads(accepted);
    Progress.SetMessage("There are {0} SAM entries mapped to {1} entries.", mappedReads.Count, MapperName);

    SmallRNAUtils.RemoveReadsFromMap(chrStrandReadMap, mappedReads);
}
/// <summary>
/// Maps reads through the base implementation, then for every query mapped to
/// more than one feature location keeps only the alignments whose offset comes
/// first in <c>Options.Offsets</c>. When <c>hasNTA</c> is set, alignments whose
/// NTA is "CCAA" are removed and the best matched NTA is selected.
/// </summary>
/// <param name="features">Features to map; with <c>hasNTA</c>, entries left without alignments are removed.</param>
/// <param name="chrStrandReadMap">Reads indexed by chromosome and strand; passed to the base implementation unchanged.</param>
public override void MapReadToFeature(List<FeatureLocation> features, Dictionary<string, Dictionary<char, List<SAMAlignedLocation>>> chrStrandReadMap)
{
    base.MapReadToFeature(features, chrStrandReadMap);

    // Snapshot all alignments grouped by their parent query before any list is modified.
    var alignmentsByQuery = features
        .SelectMany(feature => feature.SamLocations)
        .GroupBy(fsl => fsl.SamLocation.Parent)
        .Select(g => g.ToArray())
        .ToList();

    // Filter offsets by priority: a lower index in Options.Offsets wins.
    foreach (var alignments in alignmentsByQuery)
    {
        if (alignments.Length == 1)
        {
            continue;
        }

        var bestOffset = alignments.Min(a => Options.Offsets.IndexOf(a.Offset));

        foreach (var alignment in alignments)
        {
            if (Options.Offsets.IndexOf(alignment.Offset) == bestOffset)
            {
                continue;
            }

            // Unlink the alignment from both the feature side and the read side.
            alignment.FeatureLocation.SamLocations.Remove(alignment);
            alignment.SamLocation.Features.Remove(alignment.FeatureLocation);
        }
    }

    if (!hasNTA)
    {
        return;
    }

    // Remove all CCAA NTA, which is designed for tRNA.
    foreach (var feature in features)
    {
        feature.SamLocations.RemoveAll(s => s.SamLocation.Parent.Qname.StringAfter(SmallRNAConsts.NTA_TAG).Equals("CCAA"));
    }

    features.RemoveAll(feature => feature.SamLocations.Count == 0);

    SmallRNAUtils.SelectBestMatchedNTA(features);
}
/// <summary>
/// Reads the per-sample mapped feature files, merges them by feature name and
/// location, and writes the count tables.
/// </summary>
/// <remarks>
/// Unless <c>options.NoCategory</c> is set, separate tables are written for
/// miRNA, tRNA by anticodon, tRNA by amino acid, the exported biotypes and the
/// remaining features; the miRNA part and the biotype part are skipped when
/// every feature name starts with the tRNA prefix. Otherwise features are
/// grouped by identical query. The combined table is always written to
/// <c>options.OutputFile</c>.
/// </remarks>
/// <returns>The file names reported by the count table writers.</returns>
public override IEnumerable<string> Process()
{
    var countfiles = options.GetCountFiles();

    Dictionary<string, FeatureItem> featureMap = new Dictionary<string, FeatureItem>();
    List<string> samples = new List<string>();

    for (int index = 0; index < countfiles.Count; index++)
    {
        var countFile = countfiles[index];
        var sampleName = countFile.Name;
        samples.Add(sampleName);

        Progress.SetMessage("Reading {0}/{1} {2}...", index + 1, countfiles.Count, countFile.File);
        var mapped = new FeatureItemGroupXmlFormat().ReadFromFile(countFile.File);
        mapped.GetQueries().ForEach(q => q.Sample = sampleName);

        // Merge data by feature.
        foreach (var group in mapped)
        {
            foreach (var curFeature in group)
            {
                FeatureItem known;
                if (!featureMap.TryGetValue(curFeature.Name, out known))
                {
                    // First time this feature name is seen.
                    featureMap[curFeature.Name] = curFeature;
                    continue;
                }

                // Same feature seen before: merge alignments of identical locations, append new locations.
                var knownLocations = known.Locations.ToDictionary(l => l.GetLocation());
                foreach (var curLocation in curFeature.Locations)
                {
                    FeatureLocation match;
                    if (knownLocations.TryGetValue(curLocation.GetLocation(), out match))
                    {
                        match.SamLocations.AddRange(curLocation.SamLocations);
                    }
                    else
                    {
                        known.Locations.Add(curLocation);
                    }
                }
            }
        }
    }

    var features = featureMap.Values.ToList();
    samples.Sort();

    var allGroups = new List<FeatureItemGroup>();
    var result = new List<string>();

    var allTRNA = features.All(f => f.Name.StartsWith(SmallRNAConsts.tRNA));

    if (options.NoCategory)
    {
        Progress.SetMessage("Grouping features by identical query ...");
        allGroups = features.GroupByIdenticalQuery()
            .OrderByDescending(g => g.GetEstimatedCount())
            .ThenBy(g => g.Name)
            .ToList();
    }
    else
    {
        if (!allTRNA)
        {
            // Output miRNA.
            Progress.SetMessage("Grouping microRNA by sequence ...");
            var miRNAs = features.Where(f => f.Name.StartsWith(SmallRNAConsts.miRNA));
            var miRNAGroup = miRNAs.GroupBySequence()
                .OrderByDescending(g => g.GetEstimatedCount())
                .ThenBy(g => g.Name)
                .ToList();

            Progress.SetMessage("Writing microRNA ...");
            var miRNAFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.miRNA + ".count");
            var miRNAWritten = new MirnaNTACountTableWriter().WriteToFile(miRNAFile, miRNAGroup, samples, SmallRNAConsts.miRNA + ":");
            result.AddRange(miRNAWritten);
            new SmallRNAPositionWriter().WriteToFile(miRNAFile + ".position", miRNAGroup);

            allGroups.AddRange(miRNAGroup);
        }

        // Output tRNA grouped by anticodon.
        Progress.SetMessage("Grouping tRNA by anticodon ...");
        var tRNAs = features.Where(f => f.Name.StartsWith(SmallRNAConsts.tRNA)).ToList();
        var anticodonGroup = tRNAs.GroupByFunction(SmallRNAUtils.GetTrnaAnticodon)
            .OrderByDescending(g => g.GetEstimatedCount())
            .ThenBy(g => g.Name)
            .ToList();
        var anticodonFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".count");

        Progress.SetMessage("Writing tRNA anticodon ...");
        var anticodonWritten = new TrnaNTACountTableWriter().WriteToFile(anticodonFile, anticodonGroup, samples, SmallRNAConsts.tRNA + ":");
        result.AddRange(anticodonWritten);

        Progress.SetMessage("Writing tRNA anticodon position ...");
        new SmallRNAPositionWriter(m => SmallRNAUtils.GetTrnaAnticodon(m[0]), positionByPercentage: true).WriteToFile(anticodonFile + ".position", anticodonGroup);
        new SmallRNAStartPositionWriter(m => SmallRNAUtils.GetTrnaAnticodon(m[0])).WriteToFile(anticodonFile + ".startpos", anticodonGroup);

        allGroups.AddRange(anticodonGroup);

        // Output tRNA grouped by amino acid; these groups are not added to allGroups.
        Progress.SetMessage("Grouping tRNA by amino acid ...");
        var aminoacidGroup = tRNAs.GroupByFunction(SmallRNAUtils.GetTrnaAminoacid, true)
            .OrderByDescending(g => g.GetEstimatedCount())
            .ThenBy(g => g.Name)
            .ToList();
        var aminoacidFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".aminoacid.count");

        Progress.SetMessage("Writing tRNA amino acid ...");
        var aminoacidWritten = new SmallRNACountTableWriter().WriteToFile(aminoacidFile, aminoacidGroup, samples, SmallRNAConsts.tRNA + ":");
        result.AddRange(aminoacidWritten);

        Progress.SetMessage("Writing tRNA aminoacid position ...");
        new SmallRNAPositionWriter(m => SmallRNAUtils.GetTrnaAminoacid(m[0]), positionByPercentage: true).WriteToFile(aminoacidFile + ".position", aminoacidGroup);

        if (!allTRNA)
        {
            var exportBiotypes = SmallRNAUtils.GetOutputBiotypes(options);
            foreach (var biotype in exportBiotypes)
            {
                // Both trailing flags are false only for the rRNA biotype.
                var notRibosomal = !biotype.Equals(SmallRNABiotype.rRNA.ToString());
                OutputBiotype(samples, features, allGroups, result, biotype, m => m.StartsWith(biotype), notRibosomal, notRibosomal);
            }

            // Everything that is neither miRNA, tRNA nor one of the exported biotypes.
            var biotypes = new[] { SmallRNAConsts.miRNA, SmallRNAConsts.tRNA }.Union(exportBiotypes).ToList();
            OutputBiotype(samples, features, allGroups, result, "", m => biotypes.All(prefix => !m.StartsWith(prefix)), false, false);
        }
    }

    // Output all smallRNA.
    Progress.SetMessage("Writing all smallRNA ...");
    var allWritten = new SmallRNACountTableWriter().WriteToFile(options.OutputFile, allGroups, samples, "");
    result.AddRange(allWritten);

    Progress.SetMessage("Done ...");
    return result;
}
/// <summary>
/// Loads the coordinate entries and candidate reads, maps the reads to the
/// features, and writes the count files, the mapping detail XML and the info
/// file.
/// </summary>
/// <remarks>
/// When the mapped XML file already exists and <c>options.NotOverwrite</c> is
/// set, the mapping step is skipped and the feature groups are read back from
/// that file instead.
/// </remarks>
/// <returns>The names of the output files.</returns>
/// <exception cref="Exception">No coordinate entry was returned by <c>options.GetSequenceRegions()</c>.</exception>
/// <exception cref="ArgumentException">No candidate read was parsed from the input files.</exception>
public override IEnumerable<string> Process()
{
    var result = new List<string>();

    // Read regions.
    var featureLocations = options.GetSequenceRegions();
    Progress.SetMessage("There are {0} coordinate entries", featureLocations.Count);
    if (featureLocations.Count == 0)
    {
        throw new Exception(string.Format("No coordinate found in file {0}", options.CoordinateFile));
    }

    // Report the number of entries per category, largest category first.
    var fGroups = featureLocations.GroupBy(l => l.Category).OrderByDescending(l => l.Count()).ToList();
    foreach (var fg in fGroups)
    {
        Console.WriteLine("{0} = {1}", fg.Key, fg.Count());
    }

    var resultFilename = options.OutputFile;
    result.Add(resultFilename);

    // Optional list of query names loaded from the CCA file; empty when the file does not exist.
    HashSet<string> cca = new HashSet<string>();
    if (File.Exists(options.CCAFile))
    {
        cca = new HashSet<string>(File.ReadAllLines(options.CCAFile));
    }

    // Parsing reads.
    List<QueryInfo> totalQueries;
    var reads = ParseCandidates(options.InputFiles, resultFilename, out totalQueries);
    if (reads.Count == 0)
    {
        throw new ArgumentException("No read found in file " + options.InputFiles.Merge(","));
    }

    // Drop reads whose untagged query name appears in the exclude XML.
    HashSet<string> excludeQueries = new HashSet<string>();
    if (!string.IsNullOrEmpty(options.ExcludeXml))
    {
        Progress.SetMessage("Excluding queries in {0} ...", options.ExcludeXml);
        excludeQueries = new HashSet<string>(from q in MappedItemGroupXmlFileFormat.ReadQueries(options.ExcludeXml)
                                             select q.StringBefore(SmallRNAConsts.NTA_TAG));
        reads.RemoveAll(m => excludeQueries.Contains(m.Locations.First().Parent.Qname.StringBefore(SmallRNAConsts.NTA_TAG)));
        Progress.SetMessage("Total candidate {0} for mapping ...", reads.Count);
    }

    var hasMicroRnaNTA = reads.Any(l => l.NTA.Length > 0);
    var hasTrnaNTA = hasMicroRnaNTA || File.Exists(options.CCAFile);

    if (!options.NoCategory)
    {
        // First of all, draw candidate mapping position graph.
        var miRNAPositionFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.miRNA + ".candidates.position");
        if (!options.NotOverwrite || !File.Exists(miRNAPositionFile))
        {
            Progress.SetMessage("Drawing microRNA candidates position pictures...");
            // Only reads without NTA are drawn when any read carries one.
            var notNTAreads = hasMicroRnaNTA ? reads.Where(m => m.NTA.Length == 0).ToList() : reads;
            DrawPositionImage(notNTAreads, featureLocations.Where(m => m.Category.Equals(SmallRNAConsts.miRNA)).ToList(), SmallRNABiotype.miRNA.ToString(), miRNAPositionFile);
        }
    }

    var featureGroups = new List<FeatureItemGroup>();
    var mappedfile = resultFilename + ".mapped.xml";
    if (File.Exists(mappedfile) && options.NotOverwrite)
    {
        Progress.SetMessage("Reading mapped feature items...");
        featureGroups = new FeatureItemGroupXmlFormat().ReadFromFile(mappedfile);
    }
    else
    {
        Progress.SetMessage("Mapping feature items...");

        // Mapping reads to features based on miRNA, tRNA, mt_tRNA and other smallRNA priority.
        MapReadToSequenceRegion(featureLocations, reads, cca, hasMicroRnaNTA, hasTrnaNTA);

        var featureMapped = featureLocations.GroupByName();
        featureMapped.RemoveAll(m => m.GetEstimatedCount() == 0);
        featureMapped.ForEach(m => m.CombineLocations());

        if (options.NoCategory)
        {
            featureGroups = featureMapped.GroupByIdenticalQuery();
        }
        else
        {
            var mirnaGroups = featureMapped.Where(m => m.Name.StartsWith(SmallRNAConsts.miRNA)).GroupBySequence();
            if (mirnaGroups.Count > 0)
            {
                OrderFeatureItemGroup(mirnaGroups);

                Progress.SetMessage("writing miRNA count ...");
                var mirnaCountFile = Path.ChangeExtension(resultFilename, "." + SmallRNAConsts.miRNA + ".count");
                new SmallRNACountMicroRNAWriter(options.Offsets).WriteToFile(mirnaCountFile, mirnaGroups);
                result.Add(mirnaCountFile);
                featureGroups.AddRange(mirnaGroups);

                var positionFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.miRNA + ".position");
                // NOTE(review): miRNA passes GetFileNameWithoutExtension while tRNA below passes GetFileName - confirm the difference is intended.
                SmallRNAMappedPositionBuilder.Build(mirnaGroups, Path.GetFileNameWithoutExtension(options.OutputFile), positionFile, m => m[0].Name.StringAfter(":"));
            }
            // The groups were already copied into featureGroups (when non-empty), so the local list can be released.
            mirnaGroups.Clear();

            var trnaCodeGroups = featureMapped.Where(m => m.Name.StartsWith(SmallRNAConsts.tRNA)).GroupByFunction(SmallRNAUtils.GetTrnaAnticodon, false);
            if (trnaCodeGroups.Count > 0)
            {
                OrderFeatureItemGroup(trnaCodeGroups);

                Progress.SetMessage("writing tRNA code count ...");
                var trnaCodeCountFile = Path.ChangeExtension(resultFilename, "." + SmallRNAConsts.tRNA + ".count");
                new FeatureItemGroupCountWriter(m => m.DisplayNameWithoutCategory).WriteToFile(trnaCodeCountFile, trnaCodeGroups);
                result.Add(trnaCodeCountFile);
                featureGroups.AddRange(trnaCodeGroups);

                var positionFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".position");
                SmallRNAMappedPositionBuilder.Build(trnaCodeGroups, Path.GetFileName(options.OutputFile), positionFile, m => m[0].Name.StringAfter(":"));
            }
            trnaCodeGroups.Clear();

            // Everything that is neither miRNA nor tRNA is written per exported biotype, then the leftovers.
            var otherFeatures = featureMapped.Where(m => !m.Name.StartsWith(SmallRNAConsts.miRNA) && !m.Name.StartsWith(SmallRNAConsts.tRNA)).ToList();
            var exportBiotypes = SmallRNAUtils.GetOutputBiotypes(options);
            foreach (var biotype in exportBiotypes)
            {
                WriteGroups(result, resultFilename, featureGroups, otherFeatures, biotype);
            }

            var leftFeatures = otherFeatures.Where(l => !exportBiotypes.Any(b => l.Name.StartsWith(b))).ToList();
            WriteGroups(result, resultFilename, featureGroups, leftFeatures, null);
        }

        Progress.SetMessage("writing all smallRNA count ...");
        new FeatureItemGroupCountWriter().WriteToFile(resultFilename, featureGroups);
        // NOTE(review): resultFilename was already added near the top; the second entry is kept so the returned list is unchanged.
        result.Add(resultFilename);

        Progress.SetMessage("writing mapping details...");
        new FeatureItemGroupXmlFormatHand().WriteToFile(mappedfile, featureGroups);
    }

    var readSummary = GetReadSummary(featureGroups, excludeQueries, reads, totalQueries);
    WriteInfoFile(result, resultFilename, readSummary, featureGroups);
    result.Add(mappedfile);

    Progress.End();

    return result;
}
/// <summary>
/// Maps the reads of <c>options.InputFiles</c> to the tRNA coordinates first,
/// then maps the reads of <c>options.OtherFile</c> that were not mapped to
/// tRNA to the remaining coordinates, and writes the count files, the mapping
/// detail XML and the info file.
/// </summary>
/// <remarks>
/// When the mapped XML file already exists and <c>options.NotOverwrite</c> is
/// set, the mapping step is skipped and the feature groups are read back from
/// that file instead.
/// </remarks>
/// <returns>The names of the output files.</returns>
/// <exception cref="Exception">No coordinate entry was returned by <c>options.GetSequenceRegions()</c>.</exception>
public override IEnumerable<string> Process()
{
    var result = new List<string>();

    // Read regions.
    var featureLocations = options.GetSequenceRegions();
    Progress.SetMessage("There are {0} coordinate entries", featureLocations.Count);
    if (featureLocations.Count == 0)
    {
        // The placeholder must be {0}: only one argument is supplied, so {1} would raise
        // a FormatException and hide the real error.
        throw new Exception(string.Format("No coordinate found in file {0}", options.CoordinateFile));
    }

    var trnaLocations = featureLocations.Where(l => l.Category.Equals(SmallRNAConsts.tRNA)).ToList();
    var mirnaLocations = featureLocations.Where(l => l.Category.Equals(SmallRNAConsts.miRNA)).ToList();
    var notTrnaLocations = featureLocations.Where(l => !l.Category.Equals(SmallRNAConsts.tRNA)).ToList();

    var resultFilename = options.OutputFile;
    result.Add(resultFilename);

    Progress.SetMessage("Parsing tRNA alignment result ...");

    // Parsing reads.
    List<QueryInfo> trnaQueries;
    var trnaReads = ParseCandidates(options.InputFiles, resultFilename, out trnaQueries);
    SmallRNAUtils.InitializeSmallRnaNTA(trnaReads);

    var hasNTA = trnaReads.Any(l => l.NTA.Length > 0);

    List<QueryInfo> otherrnaQueries;
    var otherRNAReads = ParseCandidates(options.OtherFile, resultFilename + ".other", out otherrnaQueries);
    SmallRNAUtils.InitializeSmallRnaNTA(otherRNAReads);

    var featureGroups = new List<FeatureItemGroup>();
    var mappedfile = resultFilename + ".mapped.xml";
    if (File.Exists(mappedfile) && options.NotOverwrite)
    {
        Progress.SetMessage("Reading mapped feature items...");
        featureGroups = new FeatureItemGroupXmlFormat().ReadFromFile(mappedfile);
    }
    else
    {
        Progress.SetMessage("Mapping to tRNA...");

        // Draw tRNA mapping position graph.
        Progress.SetMessage("Drawing tRNA position pictures...");
        var tRNAPositionFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".position");
        if (!options.NotOverwrite || !File.Exists(tRNAPositionFile))
        {
            DrawPositionImage(trnaReads, trnaLocations, "tRNA", tRNAPositionFile);
        }

        // Map reads to tRNA.
        MapReadToSequenceRegion(trnaLocations, trnaReads, hasNTA);

        var trnaMapped = trnaLocations.GroupByName();
        trnaMapped.RemoveAll(m => m.GetEstimatedCount() == 0);
        trnaMapped.ForEach(m => m.CombineLocations());

        var trnaGroups = trnaMapped.GroupByIdenticalQuery();
        if (trnaGroups.Count > 0)
        {
            Progress.SetMessage("Writing tRNA count ...");
            var trnaCountFile = Path.ChangeExtension(resultFilename, "." + SmallRNAConsts.tRNA + ".count");

            OrderFeatureItemGroup(trnaGroups);
            new FeatureItemGroupTIGRTCountWriter().WriteToFile(trnaCountFile, trnaGroups);
            result.Add(trnaCountFile);

            featureGroups.AddRange(trnaGroups);
        }

        // Get all queries mapped to tRNA.
        var tRNAreads = new HashSet<string>(from read in SmallRNAUtils.GetMappedReads(trnaLocations)
                                            select read.OriginalQname);

        // Remove all reads mapped to tRNA so they are not mapped to other features as well.
        otherRNAReads.RemoveAll(m => tRNAreads.Contains(m.OriginalQname));

        // Draw miRNA mapping position graph.
        Progress.SetMessage("Drawing miRNA position pictures...");
        var miRNAPositionFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.miRNA + ".position");
        if (!options.NotOverwrite || !File.Exists(miRNAPositionFile))
        {
            DrawPositionImage(otherRNAReads, mirnaLocations, "miRNA", miRNAPositionFile);
        }

        // Map reads to not tRNA.
        MapReadToSequenceRegion(notTrnaLocations, otherRNAReads, hasNTA);

        var notTrnaMapped = notTrnaLocations.GroupByName();
        notTrnaMapped.RemoveAll(m => m.GetEstimatedCount() == 0);
        notTrnaMapped.ForEach(m => m.CombineLocations());

        var mirnaGroups = notTrnaMapped.Where(m => m.Name.StartsWith(SmallRNAConsts.miRNA)).GroupBySequence();
        if (mirnaGroups.Count > 0)
        {
            Progress.SetMessage("writing miRNA count ...");
            OrderFeatureItemGroup(mirnaGroups);

            var mirnaCountFile = Path.ChangeExtension(resultFilename, "." + SmallRNAConsts.miRNA + ".count");
            new SmallRNACountMicroRNAWriter(options.Offsets).WriteToFile(mirnaCountFile, mirnaGroups);
            result.Add(mirnaCountFile);
            featureGroups.AddRange(mirnaGroups);
        }

        var otherGroups = notTrnaMapped.Where(m => !m.Name.StartsWith(SmallRNAConsts.miRNA)).GroupByIdenticalQuery();
        if (otherGroups.Count > 0)
        {
            Progress.SetMessage("writing other smallRNA count ...");
            var otherCountFile = Path.ChangeExtension(resultFilename, ".other.count");

            OrderFeatureItemGroup(otherGroups);
            new FeatureItemGroupTIGRTCountWriter().WriteToFile(otherCountFile, otherGroups);
            result.Add(otherCountFile);

            featureGroups.AddRange(otherGroups);
        }

        Progress.SetMessage("writing all smallRNA count ...");
        new FeatureItemGroupTIGRTCountWriter().WriteToFile(resultFilename, featureGroups);
        // NOTE(review): resultFilename was already added near the top; the second entry is kept so the returned list is unchanged.
        result.Add(resultFilename);

        Progress.SetMessage("writing mapping details...");
        new FeatureItemGroupXmlFormat().WriteToFile(mappedfile, featureGroups);
        result.Add(mappedfile);
    }

    var readSummary = GetReadSummary(featureGroups, new HashSet<string>(), trnaReads.Union(otherRNAReads).ToList(), trnaQueries.Union(otherrnaQueries).ToList());

    var infoFile = Path.ChangeExtension(resultFilename, ".info");
    WriteSummaryFile(infoFile, readSummary, featureGroups);
    result.Add(infoFile);

    Progress.End();

    return result;
}