/// <summary>
/// Loads the features listed in <c>CoordinateFile</c> and converts them to
/// <see cref="FeatureLocation"/> objects.
/// </summary>
/// <remarks>
/// A sequence name that starts with "chr" is replaced by the result of <c>StringAfter("chr")</c>.
/// When <c>FastaFile</c> is set, features whose name starts with the miRNA or tRNA prefix receive
/// the FASTA sequence stored under the same name (a missing entry is only reported on the
/// console); every other feature gets an empty sequence. Each resulting location is assigned the
/// last entry of <c>SmallRNAConsts.Biotypes</c> that its name starts with.
/// </remarks>
/// <returns>One <see cref="FeatureLocation"/> per feature read from the coordinate file.</returns>
public virtual List<FeatureLocation> GetSequenceRegions()
{
    // Read sequence regions.
    var regions = SequenceRegionUtils.GetSequenceRegions(CoordinateFile);
    foreach (var region in regions)
    {
        if (region.Seqname.StartsWith("chr"))
        {
            region.Seqname = region.Seqname.StringAfter("chr");
        }
    }

    // Fill sequence information; only miRNA and tRNA entries are filled.
    if (!string.IsNullOrEmpty(this.FastaFile))
    {
        Console.WriteLine("Reading sequence from {0} ...", this.FastaFile);
        var fastaByName = SequenceUtils.Read(new FastaFormat(), this.FastaFile).ToDictionary(m => m.Name);
        foreach (var region in regions)
        {
            var isFilled = region.Name.StartsWith(SmallRNAConsts.miRNA) || region.Name.StartsWith(SmallRNAConsts.tRNA);
            if (!isFilled)
            {
                region.Sequence = string.Empty;
            }
            else if (fastaByName.ContainsKey(region.Name))
            {
                region.Sequence = fastaByName[region.Name].SeqString;
            }
            else
            {
                Console.WriteLine("Missing sequence: " + region.Name);
            }
        }

        fastaByName.Clear();
    }

    var locations = regions.Select(m => new FeatureLocation(m)).ToList();
    foreach (var location in locations)
    {
        // No early exit: when several biotype prefixes match, the last one in the list wins.
        foreach (var biotype in SmallRNAConsts.Biotypes)
        {
            if (location.Name.StartsWith(biotype))
            {
                location.Category = biotype;
            }
        }
    }

    return locations;
}
/// <summary>
/// Returns the entry of <c>MappedRegions</c> whose region location equals <paramref name="loc"/>,
/// or creates, registers and returns a new entry parsed from <paramref name="loc"/> when none matches.
/// </summary>
/// <param name="loc">Location string compared against <c>Region.GetLocation()</c> of each entry.</param>
/// <returns>The existing or newly added <see cref="MappedMirnaRegion"/>.</returns>
public MappedMirnaRegion FindOrCreateRegion(string loc)
{
    foreach (var existing in MappedRegions)
    {
        if (!existing.Region.GetLocation().Equals(loc))
        {
            continue;
        }

        return existing;
    }

    var created = new MappedMirnaRegion
    {
        Region = SequenceRegionUtils.ParseLocation<SequenceRegion>(loc)
    };
    this.MappedRegions.Add(created);
    return created;
}
/// <summary>
/// Loads the features of <c>CoordinateFile</c> (restricted by <c>GtfFeatureName</c> /
/// <c>BedAsGtf</c>) and converts them to <see cref="FeatureLocation"/> objects.
/// </summary>
/// <remarks>
/// Every sequence name is replaced by the result of <c>StringAfter("chr")</c>. If the first
/// feature name containing ":" has a suffix made up only of characters found in <c>MIRNA</c>,
/// all names are split at ":" into name (before) and sequence (after). When <c>FastaFile</c>
/// is set, sequences found there by name overwrite the feature sequence; names without a FASTA
/// entry are reported on the console and left unchanged.
/// </remarks>
/// <returns>One <see cref="FeatureLocation"/> per feature read from the coordinate file.</returns>
public virtual List<FeatureLocation> GetSequenceRegions()
{
    // Read sequence regions.
    var regions = SequenceRegionUtils.GetSequenceRegions(CoordinateFile, GtfFeatureName, BedAsGtf);
    foreach (var region in regions)
    {
        region.Seqname = region.Seqname.StringAfter("chr");
    }

    // Fill sequence information embedded in the name as "name:sequence".
    var embedded = regions.FirstOrDefault(m => m.Name.Contains(":"));
    if (embedded != null)
    {
        var candidate = embedded.Name.StringAfter(":");
        if (candidate.All(c => MIRNA.Contains(c)))
        {
            // Two separate passes, in this order: sequences are taken from the full name first,
            // then the names are shortened.
            foreach (var region in regions)
            {
                region.Sequence = region.Name.StringAfter(":");
            }

            foreach (var region in regions)
            {
                region.Name = region.Name.StringBefore(":");
            }
        }
    }

    if (!string.IsNullOrEmpty(this.FastaFile))
    {
        Console.WriteLine("Reading sequence from {0} ...", this.FastaFile);
        var fastaByName = SequenceUtils.Read(new FastaFormat(), this.FastaFile).ToDictionary(m => m.Name);
        foreach (var region in regions)
        {
            if (!fastaByName.ContainsKey(region.Name))
            {
                Console.WriteLine("Missing sequence: " + region.Name);
                continue;
            }

            region.Sequence = fastaByName[region.Name].SeqString;
        }

        fastaByName.Clear();
    }

    var locations = regions.ConvertAll(m => new FeatureLocation(m));
    return locations.ToList();
}
/// <summary>
/// Annotates every row of the comma-separated <c>_options.InputFile</c> with the closest
/// feature(s) read from <c>_options.CoordinateFiles</c> and writes the result to
/// <c>_options.OutputFile</c>.
/// </summary>
/// <remarks>
/// Input columns used: 1 = chromosome, 3 = start, 4 = end; column 8 must parse as an integer.
/// Three columns are appended: feature names, feature loci (both ';'-joined when several
/// features tie for the minimum distance) and the distance itself (0 when overlapping).
/// Rows on a chromosome without any feature get three empty columns.
/// </remarks>
/// <returns>A single-element sequence containing the output file name.</returns>
public override IEnumerable<string> Process()
{
    List<GtfItem> items = new List<GtfItem>();
    foreach (var corfile in _options.CoordinateFiles)
    {
        var curitems = SequenceRegionUtils.GetSequenceRegions(corfile);

        // Filter and rename the entries just read. The original code applied both steps to the
        // accumulated list instead, so the last (or only) coordinate file was never processed.
        if (_options.Features != null && _options.Features.Count > 0)
        {
            curitems.RemoveAll(m => !_options.Features.Contains(m.Feature));
        }

        if (!string.IsNullOrEmpty(_options.NameKey))
        {
            var key = _options.NameKey + "=";
            curitems.ForEach(m =>
            {
                if (!string.IsNullOrEmpty(m.Attributes) && m.Attributes.Contains(key))
                {
                    m.Name = m.Attributes.StringAfter(key).StringBefore(";").Trim();
                }
            });
        }

        items.AddRange(curitems);
    }

    // If at least one feature lacks the "chr" prefix, strip it everywhere (features and input rows)
    // so that both sides use the same chromosome naming.
    var removechr = !items.All(m => m.Seqname.StartsWith("chr"));
    if (removechr)
    {
        items.ForEach(m => m.Seqname = m.Seqname.StringAfter("chr"));
    }

    var map = items.GroupBy(m => m.Seqname).ToDictionary(m => m.Key, m => m.ToList());

    using (StreamWriter sw = new StreamWriter(_options.OutputFile))
    {
        using (var sr = new StreamReader(_options.InputFile))
        {
            string line;
            var headers = sr.ReadLine().Split(',').ToList();
            headers.Add("ClosetFeature");
            headers.Add("ClosetFeatureLocus");
            headers.Add("ClosetFeatureDistance");
            sw.WriteLine(headers.Merge(','));

            List<GtfItem> mingtfs = new List<GtfItem>();
            while ((line = sr.ReadLine()) != null)
            {
                var parts = line.Split(',');
                var chr = parts[0];
                if (removechr)
                {
                    chr = chr.StringAfter("chr");
                }

                var start = long.Parse(parts[2]);
                var end = long.Parse(parts[3]);

                // The value is not used, but the column is still parsed so that malformed rows
                // keep failing exactly as before.
                int.Parse(parts[7]);

                List<GtfItem> gtfs;
                if (!map.TryGetValue(chr, out gtfs))
                {
                    sw.WriteLine("{0},,,", line);
                    continue;
                }

                long mindist = long.MaxValue;
                mingtfs.Clear();
                foreach (var gtf in gtfs)
                {
                    long dist;
                    if (gtf.Start > end)
                    {
                        dist = gtf.Start - end;
                    }
                    else if (gtf.End < start)
                    {
                        dist = start - gtf.End;
                    }
                    else
                    {
                        dist = 0;
                    }

                    if (dist < mindist)
                    {
                        mingtfs.Clear();
                        mingtfs.Add(gtf);
                        mindist = dist;
                    }
                    else if (dist == mindist)
                    {
                        mingtfs.Add(gtf);
                    }
                }

                sw.WriteLine("{0},{1},{2},{3}",
                    line,
                    (from m in mingtfs select m.Name).Merge(";"),
                    (from m in mingtfs select m.GetLocation()).Merge(";"),
                    mindist);
            }
        }
    }

    return new string[] { _options.OutputFile };
}
/// <summary>
/// Piles up the aligned reads of <c>options.InputFile</c>, tests each finished position with a
/// Fisher exact test and writes the positions that pass both thresholds to
/// <c>options.OutputFile</c> using the header template below; optionally writes an IGV batch
/// script next to it (<c>OutputFile + ".igv"</c>).
/// </summary>
/// <remarks>
/// Reads are skipped when shorter than <c>options.MinimumReadLength</c>, flagged as unmapped,
/// or when their CIGAR contains an insertion or deletion. Cancellation is polled every 100
/// reads and progress is reported every 100000 reads.
/// </remarks>
/// <returns>A single-element sequence containing the output file name.</returns>
/// <exception cref="UserTerminatedException">Thrown when cancellation is pending.</exception>
public override IEnumerable<string> Process()
{
    var pileup = new PileupCountList();
    var format = options.GetSAMFormat();
    var countMap = new SmallRNACountMap(options.CountFile);

    var srItems = SequenceRegionUtils.GetSequenceRegions(options.CoordinateFile, "miRNA", options.BedAsGtf);
    foreach (var item in srItems)
    {
        item.Seqname = item.Seqname.StringAfter("chr");
    }

    var srmap = srItems.GroupBy(m => m.Seqname).ToDictionary(m => m.Key, m => m.ToList());

    // The IGV script is optional; a using statement accepts a null resource, so the writer is
    // disposed on every exit path only when it was actually created.
    using (StreamWriter swScript = options.ExportIgvScript ? new StreamWriter(options.OutputFile + ".igv") : null)
    {
        if (swScript != null)
        {
            swScript.WriteLine("snapshotDirectory {0}", Path.GetDirectoryName(options.OutputFile).Replace('\\', '/'));
        }

        using (var sw = new StreamWriter(options.OutputFile))
        {
            // NOTE(review): the template is reproduced exactly as it appears in this view, on a
            // single line - confirm the line breaks / tabs of the real literal before merging.
            var vcfHeader = @"##fileformat=VCFv4.2 ##fileDate={0:yyyyMMdd} ##source={1} ##phasing=partial ##INFO=<ID=NS,Number=1,Type=Integer,Description=""Number of Samples With Data""> ##INFO=<ID=DP,Number=1,Type=Integer,Description=""Total Depth""> ##INFO=<ID=AF,Number=A,Type=Float,Description=""Allele Frequency""> ##INFO=<ID=FP,Number=1,Type=Float,Description=""Fisher Exact Test P-Value""> ##INFO=<ID=MN,Number=.,Type=String,Description=""miRNA name contains this position""> ##FILTER=<ID=FisherET,Description=""Fisher exact test Pvalue less than {2}""> ##FILTER=<ID=AltAlleFreq,Description=""Alternative allele frequency less than {3}""> ##FILTER=<ID=notMiRNA,Description=""Position not located in miRNA locus""> ##FORMAT=<ID=DP,Number=1,Type=Integer,Description=""Read Depth""> ##FORMAT=<ID=AD,Number=1,Type=Integer,Description=""Allelic Depth""> #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT {4}";
            sw.WriteLine(vcfHeader, DateTime.Now, "PileupCountBuilder", options.FisherPValue, options.MinimumAlternativeAlleleFrequency, Path.GetFileNameWithoutExtension(options.InputFile));

            using (var sr = SAMFactory.GetReader(options.InputFile, true))
            {
                int count = 0;
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    count++;
                    if (count % 100 == 0 && Progress.IsCancellationPending())
                    {
                        throw new UserTerminatedException();
                    }

                    if (count % 100000 == 0)
                    {
                        Progress.SetMessage("{0} reads processed", count);
                    }

                    var parts = line.Split('\t');
                    var qname = parts[SAMFormatConst.QNAME_INDEX];
                    var seq = parts[SAMFormatConst.SEQ_INDEX];

                    // Too short.
                    if (seq.Length < options.MinimumReadLength)
                    {
                        continue;
                    }

                    // Unmapped.
                    SAMFlags flag = (SAMFlags)int.Parse(parts[SAMFormatConst.FLAG_INDEX]);
                    if (flag.HasFlag(SAMFlags.UnmappedQuery))
                    {
                        continue;
                    }

                    // Insertion / deletion.
                    var cigar = parts[SAMFormatConst.CIGAR_INDEX];
                    if (cigar.Any(c => c == 'I' || c == 'D'))
                    {
                        continue;
                    }

                    var sam = new SAMAlignedItem()
                    {
                        Qname = qname,
                    };

                    bool isReversed = flag.HasFlag(SAMFlags.QueryOnReverseStrand);
                    char strand = isReversed ? '-' : '+';
                    sam.Sequence = isReversed ? SequenceUtils.GetReverseComplementedSequence(seq) : seq;

                    var loc = new SAMAlignedLocation(sam)
                    {
                        Seqname = parts[SAMFormatConst.RNAME_INDEX].StringAfter("chr"),
                        Start = int.Parse(parts[SAMFormatConst.POS_INDEX]),
                        Strand = strand,
                        Cigar = cigar,
                        MismatchPositions = format.GetMismatchPositions(parts),
                        NumberOfMismatch = format.GetNumberOfMismatch(parts),
                        Sequence = seq
                    };
                    loc.ParseEnd(sam.Sequence);
                    sam.AddLocation(loc);

                    if (format.HasAlternativeHits)
                    {
                        format.ParseAlternativeHits(parts, sam);
                    }

                    var finished = pileup.Add(sam, countMap.GetCount(sam.Qname));
                    if (finished == null || finished.Count == 0)
                    {
                        continue;
                    }

                    foreach (var fin in finished)
                    {
                        var ft = fin.FisherExactTest();

                        // Written as negations of the original comparisons so that the outcome
                        // stays the same for every value (including NaN, if these are floating point).
                        if (!(ft.PValue <= options.FisherPValue))
                        {
                            continue;
                        }

                        var total = fin.Sum(m => m.Value);
                        var minallele = total * options.MinimumAlternativeAlleleFrequency;
                        if (!(ft.Sample2.Failed >= minallele))
                        {
                            continue;
                        }

                        // NOTE(review): the lookup key is the chromosome of the read just added,
                        // not fin.Chromosome - confirm they always agree for flushed positions.
                        List<GtfItem> srs;
                        List<string> ranges = new List<string>();
                        if (srmap.TryGetValue(sam.Locations[0].Seqname, out srs))
                        {
                            foreach (var seqr in srs)
                            {
                                if (seqr.Contains(fin.Position))
                                {
                                    ranges.Add(seqr.GetNameLocation());
                                }
                            }
                        }

                        var alter = fin.Where(r => r.Key != fin.Reference).OrderBy(r => r.Key).ToList();

                        // NOTE(review): the header declares INFO ID=MN, but the names are appended
                        // after ';' without an "MN=" key - confirm whether that is intended.
                        var str = string.Format("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\tNS={7};DP={8};AF={9};FP={10:0.##E0}{11}\tDP:AD\t{12}:{13},{14}",
                            fin.Chromosome,
                            fin.Position,
                            ".",
                            fin.Reference,
                            alter.Select(r => r.Key.ToString()).Merge(","),
                            0,
                            ranges.Count == 0 ? "notMiRNA" : "PASS",
                            1,
                            total,
                            alter.Select(r => string.Format("{0:0.###}", r.Value * 1.0 / total)).Merge(","),
                            ft.PValue,
                            ranges.Count == 0 ? "" : ";" + ranges.Merge(","),
                            total,
                            ft.Sample2.Succeed,
                            alter.Select(r => r.Value.ToString()).Merge(","));
                        sw.WriteLine(str);

                        if (swScript != null && ranges.Count > 0)
                        {
                            swScript.WriteLine(@"goto {0}:{1} sort position snapshot {0}_{2}_{1}.png", fin.Chromosome, fin.Position, ranges[0].Replace('(', '_').Replace(')', '_').Replace(':', '_'));
                        }
                    }

                    finished.Clear();
                }
            }
        }
    }

    return new string[] { options.OutputFile };
}
/// <summary>
/// Extracts the sequence of every accepted feature of <c>options.InputFile</c> from the genome
/// FASTA <c>options.GenomeFastaFile</c> and writes the results as FASTA to
/// <c>options.OutputFile</c>.
/// </summary>
/// <remarks>
/// Only the first feature per name is kept. Features on the '-' strand are reverse
/// complemented and all extracted sequences are upper-cased. When a chromosome has no
/// features under its own name, the M/MT (or chrM/chrMT) counterpart is tried.
/// </remarks>
/// <returns>A single-element sequence containing the output file name.</returns>
/// <exception cref="Exception">Thrown when a feature extends past the end of its chromosome.</exception>
public override IEnumerable<string> Process()
{
    var srItems = SequenceRegionUtils.GetSequenceRegions(options.InputFile).Where(m => options.AcceptName(m.Name)).ToList();
    srItems = (from grp in srItems.GroupBy(m => m.Name) select grp.First()).ToList();

    // NOTE(review): this tests the feature Name, while the prefix is stripped from Seqname
    // below - confirm whether Seqname was meant here.
    var keepChrInName = options.KeepChrInName && srItems.Any(m => m.Name.StartsWith("chr"));
    if (!keepChrInName)
    {
        srItems.ForEach(m => m.Seqname = m.Seqname.StringAfter("chr"));
    }

    var srMap = srItems.ToGroupDictionary(m => m.Seqname);
    var ff = new FastaFormat(int.MaxValue);

    using (StreamWriter sw = new StreamWriter(options.OutputFile))
    {
        using (StreamReader sr = new StreamReader(options.GenomeFastaFile))
        {
            Sequence seq;
            while ((seq = ff.ReadSequence(sr)) != null)
            {
                Progress.SetMessage("processing " + seq.Name + " ...");

                var name = seq.Name;
                if (!keepChrInName)
                {
                    name = name.StringAfter("chr");
                }

                List<GtfItem> items;
                if (!srMap.TryGetValue(name, out items))
                {
                    // Fall back to the alternative spelling of the mitochondrial chromosome.
                    string alias = null;
                    switch (name)
                    {
                        case "M":
                            alias = "MT";
                            break;
                        case "chrM":
                            alias = "chrMT";
                            break;
                        case "MT":
                            alias = "M";
                            break;
                        case "chrMT":
                            alias = "chrM";
                            break;
                    }

                    if (alias != null)
                    {
                        name = alias;
                        srMap.TryGetValue(name, out items);
                    }
                }

                if (items == null)
                {
                    continue;
                }

                Progress.SetMessage(" there are {0} entries in {1} ...", items.Count, name);
                foreach (var item in items)
                {
                    // Substring(Start - 1, Length) is valid as long as Start - 1 + Length does not
                    // exceed the sequence length. The previous ">=" wrongly rejected a feature
                    // ending exactly on the last base of the chromosome.
                    if (item.Start - 1 + item.Length > seq.SeqString.Length)
                    {
                        throw new Exception(string.Format("{0} exceed chromosome {1} length {2}", item, name, seq.SeqString.Length));
                    }

                    var newseq = seq.SeqString.Substring((int)item.Start - 1, (int)item.Length);
                    if (item.Strand == '-')
                    {
                        newseq = SequenceUtils.GetReverseComplementedSequence(newseq);
                    }

                    newseq = newseq.ToUpper();

                    var newname = string.Format("{0} {1} {2}", item.Name, item.GetLocationWithoutStrand(), item.Strand);
                    var entry = new Sequence(newname, newseq);
                    ff.WriteSequence(sw, entry);
                }
            }
        }
    }

    return new string[] { options.OutputFile };
}