/// <summary>
/// Reads every count file supplied by the options, merges the features they contain
/// by feature name (and, within a feature, by location), then writes the combined
/// count tables.
/// </summary>
/// <returns>
/// The entries handed back by the count table writers and collected by
/// <c>OutputBiotype</c> (presumably the written file names; confirm against the writers).
/// </returns>
public override IEnumerable<string> Process()
{
    var inputs = options.GetCountFiles();

    var samples = new List<string>();
    var mergedByName = new Dictionary<string, FeatureItem>();

    for (int index = 0; index < inputs.Count; index++)
    {
        var input = inputs[index];
        samples.Add(input.Name);

        Progress.SetMessage("Reading {0}/{1} {2}...", index + 1, inputs.Count, input.File);
        var loaded = new FeatureItemGroupXmlFormat().ReadFromFile(input.File);

        // Tag every query with the sample it was read from.
        loaded.GetQueries().ForEach(q => q.Sample = input.Name);

        // Merge the features of this file into the cross-sample map.
        foreach (var featureGroup in loaded)
        {
            foreach (var incoming in featureGroup)
            {
                FeatureItem known;
                if (!mergedByName.TryGetValue(incoming.Name, out known))
                {
                    // First time this feature name is seen: it becomes the merge target.
                    mergedByName[incoming.Name] = incoming;
                    continue;
                }

                // Lookup is built from the locations the known feature has right now;
                // locations appended below are not added to it.
                var knownLocations = known.Locations.ToDictionary(loc => loc.GetLocation());
                foreach (var incomingLocation in incoming.Locations)
                {
                    FeatureLocation match;
                    if (knownLocations.TryGetValue(incomingLocation.GetLocation(), out match))
                    {
                        match.SamLocations.AddRange(incomingLocation.SamLocations);
                        continue;
                    }

                    known.Locations.Add(incomingLocation);
                }
            }
        }
    }

    samples.Sort();
    var features = mergedByName.Values.ToList();

    var result = new List<string>();
    var allGroups = new List<FeatureItemGroup>();

    // True when every merged feature name carries the tRNA prefix (also true for an empty list).
    var onlyTRNA = features.All(f => f.Name.StartsWith(SmallRNAConsts.tRNA));

    if (options.NoCategory)
    {
        Progress.SetMessage("Grouping features by identical query ...");
        allGroups = features
            .GroupByIdenticalQuery()
            .OrderByDescending(g => g.GetEstimatedCount())
            .ThenBy(g => g.Name)
            .ToList();
    }
    else
    {
        if (!onlyTRNA)
        {
            // miRNA tables
            Progress.SetMessage("Grouping microRNA by sequence ...");
            var miRNAGroups = features
                .Where(f => f.Name.StartsWith(SmallRNAConsts.miRNA))
                .GroupBySequence()
                .OrderByDescending(g => g.GetEstimatedCount())
                .ThenBy(g => g.Name)
                .ToList();

            Progress.SetMessage("Writing microRNA ...");
            var miRNACountFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.miRNA + ".count");
            var miRNAWritten = new MirnaNTACountTableWriter().WriteToFile(miRNACountFile, miRNAGroups, samples, SmallRNAConsts.miRNA + ":");
            result.AddRange(miRNAWritten);
            new SmallRNAPositionWriter().WriteToFile(miRNACountFile + ".position", miRNAGroups);

            allGroups.AddRange(miRNAGroups);
        }

        // tRNA tables grouped by anticodon
        Progress.SetMessage("Grouping tRNA by anticodon ...");
        var tRNAFeatures = features.Where(f => f.Name.StartsWith(SmallRNAConsts.tRNA)).ToList();
        var anticodonGroups = tRNAFeatures
            .GroupByFunction(SmallRNAUtils.GetTrnaAnticodon)
            .OrderByDescending(g => g.GetEstimatedCount())
            .ThenBy(g => g.Name)
            .ToList();
        var anticodonFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".count");

        Progress.SetMessage("Writing tRNA anticodon ...");
        var anticodonWritten = new TrnaNTACountTableWriter().WriteToFile(anticodonFile, anticodonGroups, samples, SmallRNAConsts.tRNA + ":");
        result.AddRange(anticodonWritten);

        Progress.SetMessage("Writing tRNA anticodon position ...");
        new SmallRNAPositionWriter(g => SmallRNAUtils.GetTrnaAnticodon(g[0]), positionByPercentage: true)
            .WriteToFile(anticodonFile + ".position", anticodonGroups);
        new SmallRNAStartPositionWriter(g => SmallRNAUtils.GetTrnaAnticodon(g[0]))
            .WriteToFile(anticodonFile + ".startpos", anticodonGroups);

        allGroups.AddRange(anticodonGroups);

        // tRNA tables grouped by amino acid (these groups are not added to allGroups)
        Progress.SetMessage("Grouping tRNA by amino acid ...");
        var aminoacidGroups = tRNAFeatures
            .GroupByFunction(SmallRNAUtils.GetTrnaAminoacid, true)
            .OrderByDescending(g => g.GetEstimatedCount())
            .ThenBy(g => g.Name)
            .ToList();
        var aminoacidFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".aminoacid.count");

        Progress.SetMessage("Writing tRNA amino acid ...");
        var aminoacidWritten = new SmallRNACountTableWriter().WriteToFile(aminoacidFile, aminoacidGroups, samples, SmallRNAConsts.tRNA + ":");
        result.AddRange(aminoacidWritten);

        Progress.SetMessage("Writing tRNA aminoacid position ...");
        new SmallRNAPositionWriter(g => SmallRNAUtils.GetTrnaAminoacid(g[0]), positionByPercentage: true)
            .WriteToFile(aminoacidFile + ".position", aminoacidGroups);

        if (!onlyTRNA)
        {
            var exportBiotypes = SmallRNAUtils.GetOutputBiotypes(options);
            var rRNAName = SmallRNABiotype.rRNA.ToString();

            foreach (var biotype in exportBiotypes)
            {
                // Both trailing flags are switched off for rRNA only.
                var notRRNA = !biotype.Equals(rRNAName);
                OutputBiotype(samples, features, allGroups, result, biotype, name => name.StartsWith(biotype), notRRNA, notRRNA);
            }

            // Everything whose name matches none of the prefixes handled above.
            var handledPrefixes = new[] { SmallRNAConsts.miRNA, SmallRNAConsts.tRNA }.Union(exportBiotypes).ToList();
            OutputBiotype(samples, features, allGroups, result, "", name => handledPrefixes.All(prefix => !name.StartsWith(prefix)), false, false);
        }
    }

    // Overall table across all collected groups
    Progress.SetMessage("Writing all smallRNA ...");
    var overallWritten = new SmallRNACountTableWriter().WriteToFile(options.OutputFile, allGroups, samples, "");
    result.AddRange(overallWritten);

    Progress.SetMessage("Done ...");
    return result;
}
/// <summary>
/// Loads the sequence regions and candidate reads named by the options, maps the reads
/// onto the regions (or reloads a previous mapping when overwriting is disabled), and
/// writes the count files, the mapping detail XML and the info file.
/// </summary>
/// <returns>The output file names collected during processing; each file is listed once.</returns>
/// <exception cref="ArgumentException">
/// Thrown when no coordinate entry was loaded, or when no candidate read was parsed
/// from the input files.
/// </exception>
public override IEnumerable<string> Process()
{
    var result = new List<string>();

    //read regions
    var featureLocations = options.GetSequenceRegions();
    Progress.SetMessage("There are {0} coordinate entries", featureLocations.Count);
    if (featureLocations.Count == 0)
    {
        // ArgumentException (still an Exception) keeps this guard consistent with the
        // "No read found" guard below.
        throw new ArgumentException(string.Format("No coordinate found in file {0}", options.CoordinateFile));
    }

    // Report how many coordinate entries each category has, largest category first.
    var fGroups = featureLocations.GroupBy(l => l.Category).OrderByDescending(l => l.Count()).ToList();
    foreach (var fg in fGroups)
    {
        Console.WriteLine("{0} = {1}", fg.Key, fg.Count());
    }

    var resultFilename = options.OutputFile;
    result.Add(resultFilename);

    // Check the CCA file once so the loaded set and hasTrnaNTA below always agree.
    var hasCcaFile = File.Exists(options.CCAFile);
    HashSet<string> cca = hasCcaFile
        ? new HashSet<string>(File.ReadAllLines(options.CCAFile))
        : new HashSet<string>();

    //parsing reads
    List<QueryInfo> totalQueries;
    var reads = ParseCandidates(options.InputFiles, resultFilename, out totalQueries);
    if (reads.Count == 0)
    {
        throw new ArgumentException("No read found in file " + options.InputFiles.Merge(","));
    }

    HashSet<string> excludeQueries = new HashSet<string>();
    if (!string.IsNullOrEmpty(options.ExcludeXml))
    {
        Progress.SetMessage("Excluding queries in {0} ...", options.ExcludeXml);

        // Query names are compared on the part before the NTA tag on both sides.
        excludeQueries = new HashSet<string>(from q in MappedItemGroupXmlFileFormat.ReadQueries(options.ExcludeXml)
                                             select q.StringBefore(SmallRNAConsts.NTA_TAG));
        reads.RemoveAll(m => excludeQueries.Contains(m.Locations.First().Parent.Qname.StringBefore(SmallRNAConsts.NTA_TAG)));
        Progress.SetMessage("Total candidate {0} for mapping ...", reads.Count);
    }

    var hasMicroRnaNTA = reads.Any(l => l.NTA.Length > 0);
    var hasTrnaNTA = hasMicroRnaNTA || hasCcaFile;

    if (!options.NoCategory)
    {
        //First of all, draw candidate mapping position graph
        var miRNAPositionFile = Path.ChangeExtension(resultFilename, SmallRNAConsts.miRNA + ".candidates.position");
        if (!options.NotOverwrite || !File.Exists(miRNAPositionFile))
        {
            Progress.SetMessage("Drawing microRNA candidates position pictures...");

            // When NTA reads are present, only the reads with an empty NTA are drawn.
            var notNTAreads = hasMicroRnaNTA ? reads.Where(m => m.NTA.Length == 0).ToList() : reads;
            DrawPositionImage(notNTAreads, featureLocations.Where(m => m.Category.Equals(SmallRNAConsts.miRNA)).ToList(), SmallRNABiotype.miRNA.ToString(), miRNAPositionFile);
        }
    }

    var featureGroups = new List<FeatureItemGroup>();
    var mappedfile = resultFilename + ".mapped.xml";
    if (File.Exists(mappedfile) && options.NotOverwrite)
    {
        // Reuse the existing mapping instead of recomputing it.
        Progress.SetMessage("Reading mapped feature items...");
        featureGroups = new FeatureItemGroupXmlFormat().ReadFromFile(mappedfile);
    }
    else
    {
        Progress.SetMessage("Mapping feature items...");

        //mapping reads to features based on miRNA, tRNA, mt_tRNA and other smallRNA priority
        MapReadToSequenceRegion(featureLocations, reads, cca, hasMicroRnaNTA, hasTrnaNTA);

        var featureMapped = featureLocations.GroupByName();
        featureMapped.RemoveAll(m => m.GetEstimatedCount() == 0);
        featureMapped.ForEach(m => m.CombineLocations());

        if (options.NoCategory)
        {
            featureGroups = featureMapped.GroupByIdenticalQuery();
        }
        else
        {
            var mirnaGroups = featureMapped.Where(m => m.Name.StartsWith(SmallRNAConsts.miRNA)).GroupBySequence();
            if (mirnaGroups.Count > 0)
            {
                OrderFeatureItemGroup(mirnaGroups);

                Progress.SetMessage("writing miRNA count ...");
                var mirnaCountFile = Path.ChangeExtension(resultFilename, "." + SmallRNAConsts.miRNA + ".count");
                new SmallRNACountMicroRNAWriter(options.Offsets).WriteToFile(mirnaCountFile, mirnaGroups);
                result.Add(mirnaCountFile);
                featureGroups.AddRange(mirnaGroups);

                var positionFile = Path.ChangeExtension(resultFilename, SmallRNAConsts.miRNA + ".position");
                SmallRNAMappedPositionBuilder.Build(mirnaGroups, Path.GetFileNameWithoutExtension(resultFilename), positionFile, m => m[0].Name.StringAfter(":"));
            }
            // featureGroups already holds the group references; only the temporary list is emptied.
            mirnaGroups.Clear();

            var trnaCodeGroups = featureMapped.Where(m => m.Name.StartsWith(SmallRNAConsts.tRNA)).GroupByFunction(SmallRNAUtils.GetTrnaAnticodon, false);
            if (trnaCodeGroups.Count > 0)
            {
                OrderFeatureItemGroup(trnaCodeGroups);

                Progress.SetMessage("writing tRNA code count ...");
                var trnaCodeCountFile = Path.ChangeExtension(resultFilename, "." + SmallRNAConsts.tRNA + ".count");
                new FeatureItemGroupCountWriter(m => m.DisplayNameWithoutCategory).WriteToFile(trnaCodeCountFile, trnaCodeGroups);
                result.Add(trnaCodeCountFile);
                featureGroups.AddRange(trnaCodeGroups);

                var positionFile = Path.ChangeExtension(resultFilename, SmallRNAConsts.tRNA + ".position");
                // NOTE(review): the miRNA branch passes GetFileNameWithoutExtension here while this
                // branch passes GetFileName; left unchanged — confirm which title is intended.
                SmallRNAMappedPositionBuilder.Build(trnaCodeGroups, Path.GetFileName(resultFilename), positionFile, m => m[0].Name.StringAfter(":"));
            }
            trnaCodeGroups.Clear();

            var otherFeatures = featureMapped.Where(m => !m.Name.StartsWith(SmallRNAConsts.miRNA) && !m.Name.StartsWith(SmallRNAConsts.tRNA)).ToList();
            var exportBiotypes = SmallRNAUtils.GetOutputBiotypes(options);
            foreach (var biotype in exportBiotypes)
            {
                WriteGroups(result, resultFilename, featureGroups, otherFeatures, biotype);
            }

            // Features whose name matches none of the export biotypes are written with a null biotype.
            var leftFeatures = otherFeatures.Where(l => !exportBiotypes.Any(b => l.Name.StartsWith(b))).ToList();
            WriteGroups(result, resultFilename, featureGroups, leftFeatures, null);
        }

        Progress.SetMessage("writing all smallRNA count ...");
        // resultFilename was already added to result above, so it is not added a second time here.
        new FeatureItemGroupCountWriter().WriteToFile(resultFilename, featureGroups);

        Progress.SetMessage("writing mapping details...");
        new FeatureItemGroupXmlFormatHand().WriteToFile(mappedfile, featureGroups);
    }

    var readSummary = GetReadSummary(featureGroups, excludeQueries, reads, totalQueries);
    WriteInfoFile(result, resultFilename, readSummary, featureGroups);

    result.Add(mappedfile);

    Progress.End();
    return result;
}