/// <summary>
/// Reads small RNA feature groups from a mapped-feature XML file and builds one
/// <c>CoverageRegion</c> per group, using the first location of the first feature in the group.
/// Every position of the region receives the same coverage value: the sum of the query counts
/// of the reads attached to that location.
/// </summary>
/// <param name="mappedFeatureXmlFile">File passed to <c>FeatureItemGroupXmlFormat.ReadFromFile</c>.</param>
/// <param name="includeSmallRNATags">
/// Optional name prefixes. When non-empty, only groups whose <c>Name</c> starts with at least one
/// of them are kept.
/// </param>
/// <param name="excudeSmallRNATags">
/// Optional name prefixes. When non-empty, groups whose <c>Name</c> starts with any of them are dropped.
/// </param>
/// <returns>
/// The coverage regions, in group order. Groups that are empty, or whose first feature has no
/// location with mapped reads, are skipped instead of raising an index error.
/// </returns>
public static List<CoverageRegion> GetSmallRNACoverageRegion(string mappedFeatureXmlFile, string[] includeSmallRNATags = null, string[] excudeSmallRNATags = null)
{
  var smallRNAGroups = new FeatureItemGroupXmlFormat().ReadFromFile(mappedFeatureXmlFile);

  // Both filters test the GROUP name, so a group is always kept or dropped as a whole.
  // NOTE(review): the original predicate also ignored the individual item; confirm that
  // per-item filtering was not intended.
  if (includeSmallRNATags != null && includeSmallRNATags.Length > 0)
  {
    smallRNAGroups.RemoveAll(m => m.Count == 0 || !includeSmallRNATags.Any(k => m.Name.StartsWith(k)));
  }

  if (excudeSmallRNATags != null && excudeSmallRNATags.Length > 0)
  {
    smallRNAGroups.RemoveAll(m => m.Count == 0 || excudeSmallRNATags.Any(k => m.Name.StartsWith(k)));
  }

  var result = new List<CoverageRegion>();
  foreach (var sg in smallRNAGroups)
  {
    // Without tag filters an empty group can still be present; it has nothing to report.
    if (sg.Count == 0)
    {
      continue;
    }

    // Since the items in the same group share the same reads, only the first one is used;
    // it is renamed to the "/"-joined names of all group members.
    var smallRNA = sg[0];
    smallRNA.Name = (from g in sg select g.Name).Merge("/");

    smallRNA.Locations.RemoveAll(m => m.SamLocations.Count == 0);
    if (smallRNA.Locations.Count == 0)
    {
      // No location kept a mapped read, so there is no coverage to describe.
      continue;
    }

    smallRNA.CombineLocationByMappedReads();

    // Only the first location is used.
    var loc = smallRNA.Locations[0];

    var rg = new CoverageRegion();
    rg.Name = smallRNA.Name;
    rg.Seqname = loc.Seqname;
    rg.Start = loc.Start;
    rg.End = loc.End;
    rg.Strand = loc.Strand;
    rg.Sequence = loc.Sequence;

    // Coverage at every position is set to the total query count of the location.
    var coverage = (from sloc in loc.SamLocations select sloc.SamLocation.Parent.QueryCount).Sum();
    for (int i = 0; i < loc.Length; i++)
    {
      rg.Coverages.Add(coverage);
    }

    result.Add(rg);
  }

  return result;
}
/// <summary>
/// Reads target feature groups from <c>options.TargetFile</c> and converts the reads mapped to
/// each location into coverage regions. A location is split into one region per run of
/// consecutive covered positions; each position's coverage is the sum of the query counts of
/// the reads spanning it.
/// </summary>
/// <param name="options">Supplies <c>TargetFile</c>, the XML file to read.</param>
/// <param name="progress">Receives a message reporting how many groups were read.</param>
/// <returns>
/// All coverage regions, in group / location / position order. Empty groups and locations that
/// yield no covered position contribute nothing instead of raising an index error.
/// </returns>
public static List<CoverageRegion> GetTargetCoverageRegionFromXml(ITargetBuilderOptions options, IProgressCallback progress)
{
  var result = new List<CoverageRegion>();

  var groups = new FeatureItemGroupXmlFormat().ReadFromFile(options.TargetFile);
  progress.SetMessage("Total {0} potential target group read from file {1}", groups.Count, options.TargetFile);

  foreach (var group in groups)
  {
    if (group.Count == 0)
    {
      continue;
    }

    // Since the items in the same group share the same reads, only the first one is used.
    // It takes the "/"-joined names of all members and the other members are dropped.
    if (group.Count > 1)
    {
      group[0].Name = string.Join("/", group.Select(g => g.Name).ToArray());
      group.RemoveRange(1, group.Count - 1);
    }

    var utr = group[0];
    utr.Locations.RemoveAll(m => m.SamLocations.Count == 0);
    utr.CombineLocationByMappedReads();

    foreach (var loc in utr.Locations)
    {
      // position -> accumulated query count of every read covering that position
      var map = new Dictionary<long, int>();
      foreach (var sloc in loc.SamLocations)
      {
        var queryCount = sloc.SamLocation.Parent.QueryCount;
        for (long pos = sloc.SamLocation.Start; pos <= sloc.SamLocation.End; pos++)
        {
          int count;
          map.TryGetValue(pos, out count); // count stays 0 when the position is new
          map[pos] = count + queryCount;
        }
      }

      var keys = map.Keys.OrderBy(k => k).ToList();

      // Walk the sorted positions and emit a region whenever the current run ends,
      // i.e. at the last key or when the next key is not adjacent. With no keys the
      // loop body never runs.
      int start = 0;
      for (int end = 1; end <= keys.Count; end++)
      {
        bool runContinues = end < keys.Count && keys[end] == keys[end - 1] + 1;
        if (runContinues)
        {
          continue;
        }

        var rg = new CoverageRegion();
        rg.Name = utr.Name;
        rg.Seqname = loc.Seqname;
        rg.Start = keys[start];
        rg.End = keys[end - 1];
        rg.Strand = loc.Strand;
        for (int i = start; i < end; i++)
        {
          rg.Coverages.Add(map[keys[i]]);
        }
        result.Add(rg);

        start = end;
      }
    }
  }

  return result;
}
/// <summary>
/// Loads feature groups from a count XML file, keeps only the mapped reads flagged with a
/// no-penalty mutation, and assigns each remaining location a p-value computed by
/// <c>CalculateT2CPvalue</c> from its query count before and after that filtering.
/// </summary>
/// <param name="countXmlFile">File passed to <c>FeatureItemGroupXmlFormat.ReadFromFile</c>.</param>
/// <returns>The groups that still contain at least one flagged read.</returns>
public List<FeatureItemGroup> Build(string countXmlFile)
{
  var groups = new FeatureItemGroupXmlFormat().ReadFromFile(countXmlFile);
  Progress.SetMessage("There are {0} groups in {1}", groups.Count, countXmlFile);

  // Remember the unfiltered query count of every location before any read is removed.
  foreach (var grp in groups)
  {
    foreach (var feature in grp)
    {
      foreach (var loc in feature.Locations)
      {
        loc.QueryCountBeforeFilter = loc.QueryCount;
      }
    }
  }

  // If no read carries a no-penalty mutation count, derive it here from a T-to-C check.
  bool anyNoPenaltyMutation = groups.Any(g => g.Any(f => f.Locations.Any(l => l.SamLocations.Any(s => s.NumberOfNoPenaltyMutation != 0))));
  if (!anyNoPenaltyMutation)
  {
    foreach (var grp in groups)
    {
      foreach (var feature in grp)
      {
        feature.Locations.RemoveAll(m => m.SamLocations.Count == 0);
        foreach (var loc in feature.Locations)
        {
          foreach (var sam in loc.SamLocations)
          {
            var snp = sam.SamLocation.GetNotGsnapMismatch(sam.SamLocation.Parent.Sequence);
            if (snp != null && snp.IsMutation('T', 'C'))
            {
              // The T-to-C mismatch is not penalized: one mismatch fewer, one no-penalty mutation.
              sam.NumberOfMismatch = sam.SamLocation.NumberOfMismatch - 1;
              sam.NumberOfNoPenaltyMutation = 1;
            }
            else
            {
              sam.NumberOfMismatch = sam.SamLocation.NumberOfMismatch;
              sam.NumberOfNoPenaltyMutation = 0;
            }
          }
        }
      }
    }
  }

  // Prune bottom-up: reads without a no-penalty mutation, then locations, features and
  // groups that became empty.
  foreach (var grp in groups)
  {
    foreach (var feature in grp)
    {
      foreach (var loc in feature.Locations)
      {
        loc.SamLocations.RemoveAll(s => s.NumberOfNoPenaltyMutation == 0);
      }
      feature.Locations.RemoveAll(k => k.SamLocations.Count == 0);
    }
    grp.RemoveAll(l => l.Locations.Count == 0);
  }
  groups.RemoveAll(m => m.Count == 0);

  Progress.SetMessage("There are {0} groups having T2C mutation", groups.Count);

  foreach (var grp in groups)
  {
    foreach (var feature in grp)
    {
      foreach (var loc in feature.Locations)
      {
        loc.PValue = CalculateT2CPvalue(loc.QueryCountBeforeFilter, loc.QueryCount, this.t2cRate);
      }
    }
  }

  return groups;
}
/// <summary>
/// Merges the features of every count file returned by <c>options.GetCountFiles()</c> by feature
/// name, then writes count tables for miRNA, tRNA (two groupings), the remaining small RNA,
/// and all groups combined.
/// </summary>
/// <returns>The values returned by the table writers, in the order the tables were written.</returns>
public override IEnumerable<string> Process()
{
  var countfiles = options.GetCountFiles();

  var featureMap = new Dictionary<string, FeatureItem>();
  var samples = new List<string>();

  for (int i = 0; i < countfiles.Count; i++)
  {
    var file = countfiles[i];
    samples.Add(file.Name);

    Progress.SetMessage("Reading {0}/{1} {2}...", i + 1, countfiles.Count, file.File);
    var mapped = new FeatureItemGroupXmlFormat().ReadFromFile(file.File);

    // Tag every query with the sample it came from.
    foreach (var query in mapped.GetQueries())
    {
      query.Sample = file.Name;
    }

    // Merge data by feature name.
    foreach (var group in mapped)
    {
      foreach (var curFeature in group)
      {
        FeatureItem known;
        if (!featureMap.TryGetValue(curFeature.Name, out known))
        {
          // First time this feature name is seen: keep the instance as is.
          featureMap[curFeature.Name] = curFeature;
          continue;
        }

        // Known feature: append reads to matching locations, add locations not seen yet.
        var knownLocations = known.Locations.ToDictionary(l => l.GetLocation());
        foreach (var loc in curFeature.Locations)
        {
          FeatureLocation match;
          if (knownLocations.TryGetValue(loc.GetLocation(), out match))
          {
            match.SamLocations.AddRange(loc.SamLocations);
          }
          else
          {
            known.Locations.Add(loc);
          }
        }
      }
    }
  }

  var features = featureMap.Values.ToList();
  samples.Sort();

  var allGroups = new List<FeatureItemGroup>();
  var result = new List<string>();

  // ---- miRNA, grouped by sequence ----
  Progress.SetMessage("Grouping microRNA by sequence ...");
  var miRNAs = features.Where(m => m.Name.StartsWith(SmallRNAConsts.miRNA));
  var miRNAGroup = miRNAs.GroupBySequence()
    .OrderByDescending(m => m.GetEstimatedCount())
    .ThenBy(m => m.Name)
    .ToList();

  //Progress.SetMessage("Writing microRNA xml file ...");
  //new FeatureItemGroupXmlFormat().WriteToFile(options.OutputFile + ".miRNA.xml", miRNAGroup);

  Progress.SetMessage("Writing microRNA ...");
  var miRNAFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.miRNA + ".count");
  result.AddRange(new MirnaNTACountTableWriter().WriteToFile(miRNAFile, miRNAGroup, samples, SmallRNAConsts.miRNA + ":"));
  allGroups.AddRange(miRNAGroup);

  // ---- tRNA, grouped by SmallRNAUtils.GetTRNACode ----
  Progress.SetMessage("Grouping tRNA by amino acid code ...");
  var tRNAs = features.Where(m => m.Name.StartsWith(SmallRNAConsts.tRNA)).ToList();
  var tRNACodeGroup = tRNAs.GroupByFunction(SmallRNAUtils.GetTRNACode)
    .OrderByDescending(m => m.GetEstimatedCount())
    .ThenBy(m => m.Name)
    .ToList();
  var tRNACodeFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".count");
  Progress.SetMessage("Writing tRNA ...");
  result.AddRange(new SmallRNACountTableWriter().WriteToFile(tRNACodeFile, tRNACodeGroup, samples, SmallRNAConsts.tRNA + ":"));
  allGroups.AddRange(tRNACodeGroup);

  // ---- tRNA, grouped by SmallRNAUtils.GetTRNAAminoacid (table only, not added to allGroups) ----
  Progress.SetMessage("Grouping tRNA by amino acid ...");
  var tRNAAminoacidGroup = tRNAs.GroupByFunction(SmallRNAUtils.GetTRNAAminoacid, true)
    .OrderByDescending(m => m.GetEstimatedCount())
    .ThenBy(m => m.Name)
    .ToList();
  var tRNAAminoacidFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".aminoacid.count");
  Progress.SetMessage("Writing tRNA aminoacid...");
  result.AddRange(new SmallRNACountTableWriter().WriteToFile(tRNAAminoacidFile, tRNAAminoacidGroup, samples, SmallRNAConsts.tRNA + ":"));

  //Progress.SetMessage("Grouping tRNA by identical query ...");
  //var tRNAGroup2 = tRNAs.GroupByIdenticalQuery().OrderByDescending(m => m.GetEstimateCount()).ThenBy(m => m.Name).ToList();
  //var tRNAFile2 = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".byquery.count");
  //Progress.SetMessage("Writing tRNA ...");
  //result.AddRange(new SmallRNACountTableWriter().WriteToFile(tRNAFile2, tRNAGroup2, samples, SmallRNAConsts.tRNA + ":"));

  // ---- everything that is neither miRNA nor tRNA, grouped by identical query ----
  Progress.SetMessage("Grouping other smallRNA by identical query ...");
  var others = features.Where(m => !(m.Name.StartsWith(SmallRNAConsts.miRNA) || m.Name.StartsWith(SmallRNAConsts.tRNA)));
  var otherGroups = others.GroupByIdenticalQuery()
    .OrderByDescending(m => m.GetEstimatedCount())
    .ThenBy(m => m.Name)
    .ToList();
  var otherFile = Path.ChangeExtension(options.OutputFile, ".other.count");
  Progress.SetMessage("Writing other smallRNA ...");
  result.AddRange(new SmallRNACountTableWriter().WriteToFile(otherFile, otherGroups, samples, ""));

  var otherSequenceFile = Path.ChangeExtension(options.OutputFile, ".other.sequence.count");
  result.AddRange(new SmallRNACountTableSequenceWriter().WriteToFile(otherSequenceFile, otherGroups, ""));
  allGroups.AddRange(otherGroups);

  //new FeatureItemGroupXmlFormat().WriteToFile(options.OutputFile + ".other.xml", miRNAGroup);

  // ---- combined table ----
  Progress.SetMessage("Writing all smallRNA ...");
  result.AddRange(new SmallRNACountTableWriter().WriteToFile(options.OutputFile, allGroups, samples, ""));

  Progress.SetMessage("Done ...");
  return result;
}
/// <summary>
/// Writes per-group T2C statistics for every count XML file returned by
/// <c>options.GetCountXmlFiles()</c>. Every group is written to the ".unfiltered.tsv" file;
/// groups passing <c>ParclipSmallRNAT2CBuilder.Accept</c> are also written to
/// <c>options.OutputFile</c> and counted in a per-file ".summary" table.
/// </summary>
/// <returns>Full paths of <c>options.OutputFile</c> and of the ".summary" file.</returns>
public override IEnumerable<string> Process()
{
  var sampleInfos = new List<SampleCount>();

  using (var sw = new StreamWriter(options.OutputFile))
  using (var swUnfiltered = new StreamWriter(Path.ChangeExtension(options.OutputFile, ".unfiltered.tsv")))
  {
    var header = "File\tCategory\tName\tUniqueRead\tUniqueT2CRead\tUniqueT2CRate\tAvergeT2CIn10BasesOfUniqueRead\tAvergeT2COfUniqueRead\tTotalRead\tTotalT2CRead\tTotalT2CRate\tT2C_pvalue\tAverageT2CIn10BasesOfTotalRead\tAverageT2COfTotalRead";
    swUnfiltered.WriteLine(header);
    sw.WriteLine(header);

    foreach (var file in options.GetCountXmlFiles())
    {
      var sc = new SampleCount();
      sc.Name = file.Name;
      sampleInfos.Add(sc);

      var subjects = new FeatureItemGroupXmlFormat().ReadFromFile(file.File);

      // The category is the part of the first feature's name before ":".
      var categories = subjects.GroupBy(m => m[0].Name.StringBefore(":")).ToList();
      foreach (var category in categories)
      {
        foreach (var item in category.ToList())
        {
          // Keep a single mapped location per query name: the first one met while walking
          // features -> locations -> mapped reads.
          var pending = new HashSet<string>(item.GetAlignedLocations().ConvertAll(l => l.Parent.Qname));
          var locs = new List<FeatureSamLocation>();
          foreach (var feature in item)
          {
            foreach (var loc in feature.Locations)
            {
              foreach (var sam in loc.SamLocations)
              {
                // Remove succeeds only for names not consumed yet.
                if (pending.Remove(sam.SamLocation.Parent.Qname))
                {
                  locs.Add(sam);
                }
              }
            }
          }

          var t2c = locs.Where(m => m.NumberOfNoPenaltyMutation > 0).ToList();

          double ave_t2c_uniquereads = 0.0;
          double ave_t2c_perread_uniquereads = 0.0;
          double ave_t2c_allreads = 0.0;
          double ave_t2c_perread_allreads = 0.0;
          if (t2c.Count > 0)
          {
            // Unweighted: each distinct read counts once.
            ave_t2c_uniquereads = t2c.Average(m => m.NumberOfNoPenaltyMutation * 10.0 / m.SamLocation.Parent.Sequence.Length);
            ave_t2c_perread_uniquereads = t2c.Average(m => m.NumberOfNoPenaltyMutation);

            // Weighted: each read is repeated QueryCount times before averaging.
            var weighted = new List<double>();
            var weightedPerRead = new List<double>();
            foreach (var t2citem in t2c)
            {
              var per10Bases = t2citem.NumberOfNoPenaltyMutation * 10.0 / t2citem.SamLocation.Parent.Sequence.Length;
              for (int i = 0; i < t2citem.SamLocation.Parent.QueryCount; i++)
              {
                weighted.Add(per10Bases);
                weightedPerRead.Add(t2citem.NumberOfNoPenaltyMutation);
              }
            }
            ave_t2c_allreads = weighted.Average();
            ave_t2c_perread_allreads = weightedPerRead.Average();
          }

          var totalCount = locs.Sum(l => l.SamLocation.Parent.QueryCount);
          var totalT2CCount = t2c.Sum(l => l.SamLocation.Parent.QueryCount);

          var pvalue = SmallRNAT2CMutationBuilder.CalculateT2CPvalue(totalCount, totalT2CCount, options.ExpectRate);

          double t2crate = 0;
          if (totalT2CCount != 0)
          {
            t2crate = totalT2CCount * 1.0 / totalCount;
          }

          var value = string.Format("{0}\t{1}\t{2}\t{3:0.###}\t{4:0.###}\t{5:0.###}\t{6:0.###}\t{7:0.###}\t{8:0.###}\t{9:0.###}\t{10:0.###}\t{11:0.###E+0}\t{12:0.###}\t{13:0.###}",
            file.Name,
            category.Key,
            item.Name,
            locs.Count,
            t2c.Count,
            t2c.Count * 1.0 / locs.Count,
            ave_t2c_uniquereads,
            ave_t2c_perread_uniquereads,
            totalCount,
            totalT2CCount,
            t2crate,
            pvalue,
            ave_t2c_allreads,
            ave_t2c_perread_allreads);

          swUnfiltered.WriteLine(value);

          bool accepted = ParclipSmallRNAT2CBuilder.Accept(pvalue, totalCount, totalT2CCount, options.Pvalue, options.MinimumCount, options.ExpectRate);
          if (accepted)
          {
            sw.WriteLine(value);

            sc.GoodReadCount += totalCount;
            sc.GoodT2CReadCount += totalT2CCount;

            if (category.Key.Equals(SmallRNAConsts.miRNA))
            {
              sc.MiRNACount++;
            }
            else if (category.Key.Equals(SmallRNAConsts.tRNA))
            {
              sc.TRNACount++;
            }
            else
            {
              sc.OtherSmallRNACount++;
            }
          }
        }
      }
    }
  }

  // One summary line per input file.
  using (var sw = new StreamWriter(options.OutputFile + ".summary"))
  {
    sw.WriteLine("File\tTotalRead\tT2CRead\tT2CRate\tSmallRNA\tMicroRNA\ttRNA\tOtherSmallRNA");
    foreach (var si in sampleInfos)
    {
      sw.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}",
        si.Name,
        si.GoodReadCount,
        si.GoodT2CReadCount,
        si.GoodT2CRate,
        si.SmallRNACount,
        si.MiRNACount,
        si.TRNACount,
        si.OtherSmallRNACount);
    }
  }

  return new[] { Path.GetFullPath(options.OutputFile), Path.GetFullPath(options.OutputFile + ".summary") };
}
/// <summary>
/// Copies reads from <c>options.InputFile</c> to <c>options.OutputFile</c>, skipping reads whose
/// name is listed in <c>options.XmlFile</c> or <c>options.ExcludeFile</c>, reads rejected by
/// <c>Accept</c>, and names already written. When <c>options.CountFile</c> exists, a ".dupcount"
/// file with the count of every written read is produced as well.
/// </summary>
/// <returns>
/// The <c>result</c> list. NOTE(review): nothing is ever added to it in this method, so callers
/// receive an empty list — confirm whether the output paths were meant to be reported.
/// </returns>
/// <exception cref="UserTerminatedException">Cancellation was requested through <c>Progress</c>.</exception>
/// <exception cref="Exception">A written read has no entry in the count map.</exception>
public override IEnumerable<string> Process()
{
  var result = new List<string>();

  // Names to skip; every name is stored with the part from SmallRNAConsts.NTA_TAG onward removed.
  var except = new HashSet<string>();

  if (File.Exists(options.XmlFile))
  {
    // Exclude the reads mapped to features no matter how many mismatches they have.
    var allmapped = new FeatureItemGroupXmlFormat().ReadFromFile(options.XmlFile);
    foreach (var g in allmapped)
    {
      foreach (var f in g)
      {
        foreach (var l in f.Locations)
        {
          foreach (var sl in l.SamLocations)
          {
            except.Add(sl.SamLocation.Parent.Qname.StringBefore(SmallRNAConsts.NTA_TAG));
          }
        }
      }
    }
  }

  if (File.Exists(options.ExcludeFile))
  {
    foreach (var line in File.ReadAllLines(options.ExcludeFile))
    {
      except.Add(line.StringBefore(SmallRNAConsts.NTA_TAG));
    }
  }

  // Make every tagged count also reachable under its untagged name. The keys are copied
  // to an array first because the map is modified inside the loop.
  SmallRNACountMap cm = options.GetCountMap();
  var taggedKeys = cm.Counts.Keys.Where(m => m.Contains(SmallRNAConsts.NTA_TAG)).ToArray();
  foreach (var taggedKey in taggedKeys)
  {
    cm.Counts[taggedKey.StringBefore(SmallRNAConsts.NTA_TAG)] = cm.Counts[taggedKey];
  }

  StreamWriter swCount = File.Exists(options.CountFile)
    ? new StreamWriter(options.OutputFile + ".dupcount")
    : null;

  Progress.SetMessage("output unmapped query...");
  try
  {
    using (var sw = StreamUtils.GetWriter(options.OutputFile, options.OutputFile.ToLower().EndsWith(".gz")))
    using (var sr = StreamUtils.GetReader(options.InputFile))
    {
      var reader = new FastqReader();
      var writer = new FastqWriter();

      var readCount = 0;
      for (FastqSequence ss = reader.Parse(sr); ss != null; ss = reader.Parse(sr))
      {
        readCount++;

        // Report progress and honor cancellation every 100000 reads.
        if (readCount % 100000 == 0)
        {
          Progress.SetMessage("{0} reads", readCount);
          if (Progress.IsCancellationPending())
          {
            throw new UserTerminatedException();
          }
        }

        ss.Reference = ss.Name.StringBefore(SmallRNAConsts.NTA_TAG) + " " + ss.Description;

        bool excluded = except.Contains(ss.Name);
        if (excluded || (Accept != null && !Accept(ss)))
        {
          continue;
        }

        // Remember the name so that a later read with the same name is not written again.
        except.Add(ss.Name);
        writer.Write(sw, ss);

        if (swCount == null)
        {
          continue;
        }

        int cmcount;
        if (!cm.Counts.TryGetValue(ss.Name, out cmcount))
        {
          throw new Exception(string.Format("Cannot find {0} in count map", ss.Name));
        }
        swCount.WriteLine("{0}\t{1}", ss.Name, cmcount);
      }
    }
  }
  finally
  {
    if (swCount != null)
    {
      swCount.Close();
    }
  }

  Progress.End();

  return result;
}