/// <summary>
/// Reads the tophat_deletions.bed fixture and verifies the record count together with
/// the fields of the records at index 0, 5 and 6 and of the last record.
/// </summary>
public void TestRead()
{
    var records = new BedItemFile<InsertionDeletionItem>().ReadFromFile(@"../../../data/tophat_deletions.bed");

    Assert.AreEqual(12, records.Count);

    var first = records[0];
    Assert.AreEqual("1", first.Seqname);
    Assert.AreEqual(13656, first.Start);
    Assert.AreEqual(13658, first.End);
    Assert.AreEqual("-", first.Name);
    Assert.AreEqual(1, first.Score);

    var sixth = records[5];
    Assert.AreEqual("Y", sixth.Seqname);
    Assert.AreEqual(59359640, sixth.Start);
    Assert.AreEqual(59359642, sixth.End);
    Assert.AreEqual("-", sixth.Name);
    Assert.AreEqual(10, sixth.Score);

    var seventh = records[6];
    Assert.AreEqual("1", seventh.Seqname);
    Assert.AreEqual(12663, seventh.Start);
    Assert.AreEqual(12663, seventh.End);
    Assert.AreEqual("GTC", seventh.Name);
    Assert.AreEqual(1, seventh.Score);

    var final = records.Last();
    Assert.AreEqual("Y", final.Seqname);
    Assert.AreEqual(59359023, final.Start);
    Assert.AreEqual(59359023, final.End);
    Assert.AreEqual("CCCC", final.Name);
    Assert.AreEqual(2, final.Score);
}
/// <summary>
/// Transfer bed format (zero-based) to gff format (one-based): reads every entry of the
/// target bed file and builds one coverage region per entry.
/// </summary>
/// <param name="options">Provides <c>TargetFile</c>, the bed file to read.</param>
/// <param name="progress">Receives a message with the number of entries read.</param>
/// <returns>
/// One <see cref="CoverageRegion"/> per bed entry. <c>Start</c> is shifted by one while
/// <c>End</c> is kept, so the region is the closed interval [Start, End]; the "chr"
/// prefix is stripped from the sequence name via <c>StringAfter</c>.
/// </returns>
public static List<CoverageRegion> GetTargetCoverageRegionFromBed(ITargetBuilderOptions options, IProgressCallback progress)
{
    var result = new List<CoverageRegion>();

    var groups = new BedItemFile<BedItem>().ReadFromFile(options.TargetFile);
    progress.SetMessage("Total {0} potential target group read from file {1}", groups.Count, options.TargetFile);

    foreach (var utr in groups)
    {
        var rg = new CoverageRegion();
        rg.Name = utr.Name;
        rg.Seqname = utr.Seqname.StringAfter("chr");
        rg.Start = utr.Start + 1;
        rg.End = utr.End;
        rg.Strand = utr.Strand;

        // After the shift the region is one-based and End is inclusive, so it spans
        // End - Start + 1 positions. The previous "i < rg.End" bound left the last
        // position without a coverage entry.
        for (var i = rg.Start; i <= rg.End; i++)
        {
            rg.Coverages.Add(new CoverageSite(DEFAULT_COVERAGE));
        }

        result.Add(rg);
    }

    return result;
}
/// <summary>
/// Loads a bed file and turns every record into a <see cref="JunctionItem"/> whose two
/// ranges are taken from the record's first and second block.
/// </summary>
/// <param name="fileName">Path of the bed file to read.</param>
/// <returns>One junction per bed record, in file order.</returns>
public List<JunctionItem> ReadFromFile(string fileName)
{
    var records = new BedItemFile<BedItem>().ReadFromFile(fileName);

    var junctions = new List<JunctionItem>();
    foreach (var record in records)
    {
        var junction = new JunctionItem()
        {
            Chr = record.Seqname,
            Start1 = record.Blocks[0].ChromStart,
            End1 = record.Blocks[0].ChromEnd,
            Start2 = record.Blocks[1].ChromStart,
            End2 = record.Blocks[1].ChromEnd,
            Name = record.Name
        };
        junctions.Add(junction);
    }

    return junctions;
}
/// <summary>
/// Reads the bed file referenced by <c>filename</c> and verifies the record count as well
/// as all six fields of the first record and of the record at index 10.
/// </summary>
public void TestReadSlimData()
{
    var reader = new BedItemFile<BedItem>();
    List<BedItem> records = reader.ReadFromFile(filename);

    Assert.AreEqual(11, records.Count);

    var head = records[0];
    Assert.AreEqual("21", head.Seqname);
    Assert.AreEqual(9827125, head.Start);
    Assert.AreEqual(9827151, head.End);
    Assert.AreEqual("21-1", head.Name);
    Assert.AreEqual(1182877.2, head.Score);
    Assert.AreEqual('+', head.Strand);

    var tail = records[10];
    Assert.AreEqual("12", tail.Seqname);
    Assert.AreEqual(6647087, tail.Start);
    Assert.AreEqual(6647113, tail.End);
    Assert.AreEqual("12-4", tail.Name);
    Assert.AreEqual(49330.1, tail.Score);
    Assert.AreEqual('-', tail.Strand);
}
/// <summary>
/// Names each CNV item after a bed entry on the same chromosome (chosen by
/// <c>FindMaxOverlap</c>) and writes a CSV file with one row per name: the percentage of
/// samples carrying a deletion or a duplication, followed by one column per sample
/// listing that sample's items as <c>type:seqname:start-end</c> joined by "/".
/// </summary>
/// <returns>A single-element array holding <c>options.OutputFile</c>.</returns>
/// <exception cref="Exception">
/// A chromosome present in the CNV input has no entry in the bed file.
/// </exception>
public override IEnumerable<string> Process()
{
    // Peek at the first line to find out whether the bed file starts with a header row.
    // The reader is disposed right away, and an empty file simply means "no header"
    // instead of failing with a NullReferenceException.
    string firstLine;
    using (var headerReader = new StreamReader(options.BedFile))
    {
        firstLine = headerReader.ReadLine();
    }
    var hasheader = firstLine != null && firstLine.Contains("start");

    var beds = new BedItemFile<BedItem>() { HasHeader = hasheader }.ReadFromFile(options.BedFile);
    var items = new CNVItemReader<CNVItem>().ReadFromFile(options.InputFile);

    // Materialized once because the groups are walked twice below.
    var itemsgroup = items.GroupBy(m => m.Seqname.StringAfter("chr")).ToList();
    var bedgroups = beds.GroupBy(m => m.Seqname).ToDictionary(m => m.Key);

    // Validate every chromosome before any item is modified.
    foreach (var ig in itemsgroup)
    {
        if (!bedgroups.ContainsKey(ig.Key))
        {
            throw new Exception(string.Format("Cannot find chromosome {0} in bed file {1}", ig.Key, options.BedFile));
        }
    }

    foreach (var ig in itemsgroup)
    {
        var bg = bedgroups[ig.Key];
        foreach (var item in ig)
        {
            var bgmax = FindMaxOverlap(bg, item);
            if (bgmax != null)
            {
                item.ItemName = bgmax.Name;
            }
        }
    }

    // Items whose name is still blank cannot be reported and are dropped.
    items.RemoveAll(m => string.IsNullOrWhiteSpace(m.ItemName));

    var itemmap = items.GroupBy(m => m.ItemName).ToDictionary(m => m.Key);
    var genes = itemmap.Keys.OrderBy(m => m).ToList();
    var samples = items.ConvertAll(m => m.FileName).Distinct().OrderBy(m => m).ToList();

    using (var sw = new StreamWriter(options.OutputFile))
    {
        sw.WriteLine("gene,DELETION,DUPLICATION," + samples.Merge(","));

        foreach (var gene in genes)
        {
            var sas = itemmap[gene].GroupBy(m => m.FileName).ToDictionary(m => m.Key);

            sw.Write(gene);

            var deletioncount = samples.Count(m => sas.ContainsKey(m) && sas[m].Any(n => n.ItemType == CNVType.DELETION));
            var duplicationcount = samples.Count(m => sas.ContainsKey(m) && sas[m].Any(n => n.ItemType == CNVType.DUPLICATION));
            sw.Write(",{0:0.0}%,{1:0.0}%", deletioncount * 100.0 / samples.Count, duplicationcount * 100.0 / samples.Count);

            foreach (var sample in samples)
            {
                sw.Write(",");
                if (!sas.ContainsKey(sample))
                {
                    continue;
                }

                var sis = sas[sample];

                // Only a console warning: mixed types for one name in one sample are still written.
                var firstType = sis.First().ItemType;
                if (sis.Any(m => !m.ItemType.Equals(firstType)))
                {
                    Console.WriteLine("Different type of gene " + gene + " in sample " + sample);
                }

                sw.Write((from si in sis
                          select string.Format("{0}:{1}:{2}-{3}", si.ItemType, si.Seqname, si.Start, si.End)).Merge("/"));
            }

            sw.WriteLine();
        }
    }

    return new string[] { options.OutputFile };
}
/// <summary>
/// Collects small RNA coordinates from the configured miRBase file, UCSC tRNA bed file,
/// Ensembl gtf file and rRNA fasta file, writes them as a bed file grouped by biotype,
/// runs <c>Bed2FastaProcessor</c> on the result and writes a per-biotype count summary.
/// </summary>
/// <returns>A single-element array holding <c>options.OutputFile</c>.</returns>
public override IEnumerable<string> Process()
{
    // Save the parameters next to the output unless they were loaded from that very file.
    var paramFile = options.OutputFile + ".param";
    var loadedFromParamFile = !string.IsNullOrEmpty(options.ParamFile)
        && Path.GetFullPath(options.ParamFile).Equals(Path.GetFullPath(paramFile));
    if (!loadedFromParamFile)
    {
        options.SaveToFile(paramFile);
    }

    var bedFormat = new BedItemFile<BedItem>(6);

    // ---- miRNA: either a bed file or a gtf/gff file filtered by MiRBaseKey ----
    var mirnaList = new List<BedItem>();
    if (File.Exists(options.MiRBaseFile))
    {
        Progress.SetMessage("Processing {0} ...", options.MiRBaseFile);

        if (options.MiRBaseFile.EndsWith(".bed"))
        {
            mirnaList = bedFormat.ReadFromFile(options.MiRBaseFile);
            foreach (var mirna in mirnaList)
            {
                mirna.Seqname = mirna.Seqname.StringAfter("chr");
                mirna.Name = options.MiRBaseKey + ":" + mirna.Name;
            }
        }
        else
        {
            using (var gtf = new GtfItemFile(options.MiRBaseFile))
            {
                for (GtfItem entry = gtf.Next(options.MiRBaseKey); entry != null; entry = gtf.Next(options.MiRBaseKey))
                {
                    // gtf start is shifted down by one to obtain the bed start.
                    mirnaList.Add(new BedItem()
                    {
                        Seqname = entry.Seqname.StringAfter("chr"),
                        Start = entry.Start - 1,
                        End = entry.End,
                        Name = options.MiRBaseKey + ":" + entry.Attributes.StringAfter("Name=").StringBefore(";"),
                        Score = 1000,
                        Strand = entry.Strand
                    });
                }
            }
        }

        Progress.SetMessage("{0} miRNA read.", mirnaList.Count);
    }

    // ---- tRNA from the UCSC table (mitochondrial tRNA comes from the Ensembl gtf) ----
    List<BedItem> trnaList = new List<BedItem>();
    if (File.Exists(options.UcscTrnaFile))
    {
        Progress.SetMessage("Processing {0} ...", options.UcscTrnaFile);

        trnaList = bedFormat.ReadFromFile(options.UcscTrnaFile);
        foreach (var trna in trnaList)
        {
            trna.Seqname = trna.Seqname.StringAfter("chr");
        }

        // Drop names longer than one character that are not purely numeric.
        trnaList.RemoveAll(m => (m.Seqname.Length > 1) && !m.Seqname.All(n => char.IsDigit(n)));

        // Drop mitochondrial entries.
        trnaList.RemoveAll(m => m.Seqname.Equals("M") || m.Seqname.Equals("MT"));

        foreach (var trna in trnaList)
        {
            trna.Name = SmallRNAConsts.tRNA + ":" + trna.Name;
        }

        Progress.SetMessage("{0} tRNA from ucsc read.", trnaList.Count);
    }

    // ---- remaining small RNA and mitochondrial tRNA from the Ensembl gtf ----
    var otherList = new List<BedItem>();
    if (File.Exists(options.EnsemblGtfFile))
    {
        Progress.SetMessage("Processing {0} ...", options.EnsemblGtfFile);

        using (var gtf = new GtfItemFile(options.EnsemblGtfFile))
        {
            // miRNA is excluded from the accepted biotypes for this file.
            var acceptedBiotypes = new HashSet<string>(SmallRNAConsts.Biotypes);
            acceptedBiotypes.Remove(SmallRNAConsts.miRNA);

            int mitoTrnaIndex = 0;
            for (GtfItem gene = gtf.Next("gene"); gene != null; gene = gtf.Next("gene"))
            {
                string biotype;
                if (gene.Attributes.Contains("gene_biotype"))
                {
                    biotype = gene.Attributes.StringAfter("gene_biotype \"").StringBefore("\"");
                }
                else if (gene.Attributes.Contains("gene_type"))
                {
                    biotype = gene.Attributes.StringAfter("gene_type \"").StringBefore("\"");
                }
                else
                {
                    continue;
                }

                // When a UCSC tRNA file exists, tRNA entries of this file are skipped.
                if (File.Exists(options.UcscTrnaFile) && biotype.Equals(SmallRNAConsts.tRNA))
                {
                    continue;
                }

                if (biotype.Equals("Mt_tRNA"))
                {
                    mitoTrnaIndex++;

                    var mitoGeneName = gene.Attributes.Contains("gene_name")
                        ? gene.Attributes.StringAfter("gene_name \"").StringBefore("\"")
                        : gene.GeneId;

                    trnaList.Add(new BedItem()
                    {
                        Seqname = "MT",
                        Start = gene.Start - 1,
                        End = gene.End,
                        Name = string.Format(SmallRNAConsts.tRNA + ":chrMT.tRNA{0}-{1}", mitoTrnaIndex, mitoGeneName.StringAfter("-")),
                        Score = 1000,
                        Strand = gene.Strand
                    });
                    continue;
                }

                if (!acceptedBiotypes.Contains(biotype))
                {
                    continue;
                }

                string seqName = gene.Seqname.ToLower().StartsWith("chr")
                    ? gene.Seqname.Substring(3)
                    : gene.Seqname;

                if (seqName.Equals("M"))
                {
                    seqName = "MT";
                }

                // Ignore all small RNA coordinates on scaffolds or contigs (names longer than five characters).
                if (seqName.Length > 5)
                {
                    continue;
                }

                var geneName = gene.Attributes.StringAfter("gene_name \"").StringBefore("\"");

                otherList.Add(new BedItem()
                {
                    Seqname = seqName,
                    Start = gene.Start - 1,
                    End = gene.End,
                    Name = biotype + ":" + geneName + ":" + gene.GeneId,
                    Score = 1000,
                    Strand = gene.Strand
                });
            }
        }
    }

    var combined = new List<BedItem>();
    combined.AddRange(mirnaList);
    combined.AddRange(trnaList);
    combined.AddRange(otherList);

    // ---- rRNA: every sequence of the fasta file becomes a full-length '+' entry ----
    if (File.Exists(options.RRNAFile))
    {
        foreach (var rrna in SequenceUtils.Read(options.RRNAFile))
        {
            combined.Add(new BedItem()
            {
                Seqname = rrna.Name,
                Start = 0,
                End = rrna.SeqString.Length,
                Strand = '+',
                Name = "rRNA:" + rrna.Name
            });
        }
    }

    Progress.SetMessage("Saving smallRNA coordinates to " + options.OutputFile + "...");
    using (var writer = new StreamWriter(options.OutputFile))
    {
        // Entries are written biotype by biotype, each group sorted by chromosome and start.
        foreach (var biotypePrefix in SmallRNAConsts.Biotypes)
        {
            var group = combined.Where(m => m.Name.StartsWith(biotypePrefix)).ToList();
            GenomeUtils.SortChromosome(group, m => m.Seqname, m => (int)m.Start);

            foreach (var entry in group)
            {
                writer.WriteLine(bedFormat.GetValue(entry));
            }
        }
    }

    Progress.SetMessage("Extracting sequence from " + options.FastaFile + "...");
    var fastaOptions = new Bed2FastaProcessorOptions()
    {
        GenomeFastaFile = options.FastaFile,
        InputFile = options.OutputFile,
        OutputFile = options.OutputFile + ".fa",
        KeepChrInName = false,
        AcceptName = m => m.StartsWith(SmallRNAConsts.miRNA) || m.StartsWith(SmallRNAConsts.tRNA),
    };
    var fastaProcessor = new Bed2FastaProcessor(fastaOptions) { Progress = this.Progress };
    fastaProcessor.Process();

    var summaryFile = options.OutputFile + ".info";
    Progress.SetMessage("Writing summary to " + summaryFile + "...");
    using (var writer = new StreamWriter(summaryFile))
    {
        writer.WriteLine("Biotype\tCount");

        // Distinct names are counted per prefix before the first ':'; largest group first.
        var countsByBiotype = combined.ConvertAll(m => m.Name)
            .Distinct()
            .GroupBy(m => m.StringBefore(":"))
            .OrderByDescending(m => m.Count())
            .ToList();

        foreach (var biotypeGroup in countsByBiotype)
        {
            writer.WriteLine("{0}\t{1}", biotypeGroup.Key, biotypeGroup.Count());
        }
    }

    return new string[] { options.OutputFile };
}
/// <summary>
/// Transfer bed format (zero-based) to gff format (one-based): reads every entry of the
/// target bed file and builds one coverage region per entry.
/// </summary>
/// <param name="options">Provides <c>TargetFile</c>, the bed file to read.</param>
/// <param name="progress">Receives a message with the number of entries read.</param>
/// <returns>
/// One <see cref="CoverageRegion"/> per bed entry. <c>Start</c> is shifted by one while
/// <c>End</c> is kept, so the region is the closed interval [Start, End]; the "chr"
/// prefix is stripped from the sequence name via <c>StringAfter</c>.
/// </returns>
public static List<CoverageRegion> GetTargetCoverageRegionFromBed(ITargetBuilderOptions options, IProgressCallback progress)
{
    var result = new List<CoverageRegion>();

    var groups = new BedItemFile<BedItem>().ReadFromFile(options.TargetFile);
    progress.SetMessage("Total {0} potential target group read from file {1}", groups.Count, options.TargetFile);

    foreach (var utr in groups)
    {
        var rg = new CoverageRegion();
        rg.Name = utr.Name;
        rg.Seqname = utr.Seqname.StringAfter("chr");
        rg.Start = utr.Start + 1;
        rg.End = utr.End;
        rg.Strand = utr.Strand;

        // After the shift the region is one-based and End is inclusive, so it spans
        // End - Start + 1 positions. The previous "i < rg.End" bound left the last
        // position without a coverage entry.
        for (var i = rg.Start; i <= rg.End; i++)
        {
            rg.Coverages.Add(1000);
        }

        result.Add(rg);
    }

    return result;
}
/// <summary>
/// Builds the small RNA coordinate database: collects miRNA, tRNA, other small RNA and
/// rRNA locations from the configured input files, maps sequence names onto the names
/// used by the genome fasta, and writes the combined bed file, a miRNA-only bed file,
/// the ".miss1" / ".miss0" subsets, a per-biotype summary, an optional merged fasta and
/// the extracted sequences (".fa").
/// </summary>
/// <returns>
/// <c>options.OutputFile</c>, followed by the merged ".fasta" file when mature tRNA or
/// rRNA sequences were appended to the genome.
/// </returns>
public override IEnumerable<string> Process()
{
    var paramFile = options.OutputFile + ".param";
    options.SaveToFile(paramFile);

    var bedfile = new BedItemFile<BedItem>(6);

    Progress.SetMessage("building chromosome name map ...");

    // Sequence names come from the fasta index when it exists, otherwise from the fasta itself.
    var mitoName = "M";
    var chrNameMap = new Dictionary<string, string>();
    var faiFile = options.FastaFile + ".fai";
    if (File.Exists(faiFile))
    {
        using (var sr = new StreamReader(faiFile))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                mitoName = RegisterChromosomeName(chrNameMap, line.Split('\t')[0], mitoName);
            }
        }
    }
    else
    {
        var ff = new FastaFormat(int.MaxValue);
        using (var sr = new StreamReader(options.FastaFile))
        {
            Sequence seq;
            while ((seq = ff.ReadSequence(sr)) != null)
            {
                mitoName = RegisterChromosomeName(chrNameMap, seq.Name, mitoName);
            }
        }
    }

    // A genome without any M/MT sequence has no map entry for the default name; fall back
    // to the short name instead of failing with KeyNotFoundException.
    string longMitoName;
    if (!chrNameMap.TryGetValue(mitoName, out longMitoName))
    {
        longMitoName = mitoName;
    }
    Progress.SetMessage("mitochondral chromosome name = {0}", longMitoName);

    // ---- miRNA: either a bed file or a gtf/gff file filtered by MiRBaseKey ----
    var mirnas = new List<BedItem>();
    if (File.Exists(options.MiRBaseFile))
    {
        Progress.SetMessage("Processing {0} ...", options.MiRBaseFile);

        if (options.MiRBaseFile.EndsWith(".bed"))
        {
            mirnas = bedfile.ReadFromFile(options.MiRBaseFile);
            mirnas.ForEach(m =>
            {
                m.Seqname = m.Seqname.StringAfter("chr");
                m.Name = options.MiRBaseKey + ":" + m.Name;
            });
        }
        else
        {
            using (var gf = new GtfItemFile(options.MiRBaseFile))
            {
                GtfItem item;
                while ((item = gf.Next(options.MiRBaseKey)) != null)
                {
                    BedItem loc = new BedItem();
                    loc.Seqname = item.Seqname.StringAfter("chr");
                    loc.Start = item.Start - 1;
                    loc.End = item.End;
                    loc.Name = options.MiRBaseKey + ":" + item.Attributes.StringAfter("Name=").StringBefore(";");
                    loc.Score = 1000;
                    loc.Strand = item.Strand;
                    mirnas.Add(loc);
                }
            }
        }

        Progress.SetMessage("{0} miRNA readed.", mirnas.Count);
    }

    // ---- tRNA from the UCSC table, plus optional mature tRNA sequences ----
    List<BedItem> trnas = new List<BedItem>();
    if (File.Exists(options.UcscTrnaFile))
    {
        Progress.SetMessage("Processing {0} ...", options.UcscTrnaFile);

        trnas = bedfile.ReadFromFile(options.UcscTrnaFile);
        trnas.ForEach(m => m.Seqname = m.Seqname.StringAfter("chr"));

        // The chromosome filter is only applied when it would leave something behind;
        // if every entry would be dropped, all of them are kept.
        var removedCount = trnas.Count(m => (m.Seqname.Length > 1) && !m.Seqname.All(n => char.IsDigit(n)));
        if (removedCount != trnas.Count)
        {
            // Drop names longer than one character that are not purely numeric.
            trnas.RemoveAll(m => (m.Seqname.Length > 1) && !m.Seqname.All(n => char.IsDigit(n)));

            // Mitochondrial tRNA is taken from the Ensembl gtf file instead.
            trnas.RemoveAll(m => m.Seqname.Equals("M") || m.Seqname.Equals("MT"));
        }

        trnas.ForEach(m => m.Name = GetTRNAName(m.Name));

        Progress.SetMessage("{0} tRNA from ucsc readed.", trnas.Count);

        if (File.Exists(options.UcscMatureTrnaFastaFile))
        {
            // Each mature tRNA sequence becomes its own full-length '+' entry carrying its sequence.
            var seqs = SequenceUtils.Read(options.UcscMatureTrnaFastaFile);
            foreach (var seq in seqs)
            {
                var tRNAName = GetTRNAName(seq.Name);
                trnas.Add(new BedItem()
                {
                    Seqname = seq.Name,
                    Start = 0,
                    End = seq.SeqString.Length,
                    Strand = '+',
                    Name = tRNAName,
                    Sequence = seq.SeqString
                });
            }
        }
    }

    // ---- remaining small RNA and mitochondrial tRNA from the Ensembl gtf ----
    var others = new List<BedItem>();
    if (File.Exists(options.EnsemblGtfFile))
    {
        Progress.SetMessage("Processing {0} ...", options.EnsemblGtfFile);

        using (var gf = new GtfItemFile(options.EnsemblGtfFile))
        {
            // miRNA is excluded from the accepted biotypes for this file.
            var biotypes = new HashSet<string>(SmallRNAConsts.Biotypes);
            biotypes.Remove(SmallRNAConsts.miRNA);

            GtfItem item;
            int count = 0;
            while ((item = gf.Next("gene")) != null)
            {
                string biotype;
                if (item.Attributes.Contains("gene_biotype"))
                {
                    biotype = item.Attributes.StringAfter("gene_biotype \"").StringBefore("\"");
                }
                else if (item.Attributes.Contains("gene_type"))
                {
                    biotype = item.Attributes.StringAfter("gene_type \"").StringBefore("\"");
                }
                else
                {
                    continue;
                }

                // When a UCSC tRNA file exists, tRNA entries of this file are skipped.
                if (File.Exists(options.UcscTrnaFile) && biotype.Equals(SmallRNAConsts.tRNA))
                {
                    continue;
                }

                if (biotype.Equals("Mt_tRNA"))
                {
                    count++;
                    var gene_name = item.Attributes.Contains("gene_name")
                        ? item.Attributes.StringAfter("gene_name \"").StringBefore("\"")
                        : item.GeneId;

                    BedItem loc = new BedItem();
                    loc.Seqname = mitoName;
                    loc.Start = item.Start - 1;
                    loc.End = item.End;
                    // The chromosome name is passed as an argument so that it can never be
                    // interpreted as part of the format string.
                    loc.Name = string.Format("{0}:{1}.tRNA{2}-{3}", SmallRNAConsts.mt_tRNA, longMitoName, count, gene_name.StringAfter("-"));
                    loc.Score = 1000;
                    loc.Strand = item.Strand;
                    trnas.Add(loc);
                }
                else if (biotypes.Contains(biotype))
                {
                    string seqName;
                    if (item.Seqname.ToLower().StartsWith("chr"))
                    {
                        seqName = item.Seqname.Substring(3);
                    }
                    else
                    {
                        seqName = item.Seqname;
                    }

                    if (seqName.Equals("M") || seqName.Equals("MT"))
                    {
                        seqName = mitoName;
                    }

                    // Entries on scaffolds or contigs are kept; no length-based filter is applied here.
                    var gene_name = item.Attributes.StringAfter("gene_name \"").StringBefore("\"");
                    var lowGeneName = gene_name.ToLower();
                    if (lowGeneName.StartsWith("rny") || lowGeneName.Equals("y_rna"))
                    {
                        biotype = "yRNA";
                    }

                    BedItem loc = new BedItem();
                    loc.Seqname = seqName;
                    loc.Start = item.Start - 1;
                    loc.End = item.End;
                    loc.Name = biotype + ":" + gene_name + ":" + item.GeneId;
                    loc.Score = 1000;
                    loc.Strand = item.Strand;
                    others.Add(loc);
                }
            }
        }
    }

    var all = new List<BedItem>();
    all.AddRange(mirnas);
    all.AddRange(trnas);
    all.AddRange(others);

    // Translate sequence names to the spelling used by the genome fasta where a mapping exists.
    foreach (var bi in all)
    {
        string mappedName;
        if (chrNameMap.TryGetValue(bi.Seqname, out mappedName))
        {
            bi.Seqname = mappedName;
        }
    }

    // rRNA database sequences are added after the mapping and keep their own names.
    if (File.Exists(options.RRNAFile))
    {
        var seqs = SequenceUtils.Read(options.RRNAFile);
        foreach (var seq in seqs)
        {
            all.Add(new BedItem()
            {
                Seqname = seq.Name,
                Start = 0,
                End = seq.SeqString.Length,
                Strand = '+',
                Name = "rRNA:" + SmallRNAConsts.rRNADB_KEY + seq.Name
            });
        }
    }

    Progress.SetMessage("Saving smallRNA coordinates to " + options.OutputFile + "...");
    using (var sw = new StreamWriter(options.OutputFile))
    {
        foreach (var pir in SmallRNAConsts.Biotypes)
        {
            var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
            WriteSortedLocations(sw, bedfile, pir, locs);
        }
    }

    var miRNA_bed = FileUtils.ChangeExtension(options.OutputFile, ".miRNA.bed");
    Progress.SetMessage("Saving miRNA coordinates to " + miRNA_bed + "...");
    using (var sw = new StreamWriter(miRNA_bed))
    {
        var pir = SmallRNAConsts.miRNA;
        var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
        WriteSortedLocations(sw, bedfile, pir, locs);
    }

    // ".miss1": every biotype except lincRNA/lncRNA, without the rRNA database entries.
    Progress.SetMessage("Saving smallRNA miss1 coordinates to " + options.OutputFile + ".miss1 ...");
    using (var sw = new StreamWriter(options.OutputFile + ".miss1"))
    {
        foreach (var pir in SmallRNAConsts.Biotypes)
        {
            if (pir == SmallRNABiotype.lincRNA.ToString() || pir == SmallRNABiotype.lncRNA.ToString())
            {
                continue;
            }

            var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
            locs.RemoveAll(l => l.Name.Contains(SmallRNAConsts.rRNADB_KEY));
            WriteSortedLocations(sw, bedfile, pir, locs);
        }
    }

    // ".miss0": only lincRNA, lncRNA and the rRNA database entries.
    Progress.SetMessage("Saving smallRNA miss0 coordinates to " + options.OutputFile + ".miss0 ...");
    using (var sw = new StreamWriter(options.OutputFile + ".miss0"))
    {
        foreach (var pir in SmallRNAConsts.Biotypes)
        {
            if (pir != SmallRNABiotype.lincRNA.ToString() && pir != SmallRNABiotype.lncRNA.ToString() && pir != SmallRNABiotype.rRNA.ToString())
            {
                continue;
            }

            var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
            if (pir == SmallRNABiotype.rRNA.ToString())
            {
                locs.RemoveAll(l => !l.Name.Contains(SmallRNAConsts.rRNADB_KEY));
            }
            WriteSortedLocations(sw, bedfile, pir, locs);
        }
    }

    var summaryFile = options.OutputFile + ".info";
    Progress.SetMessage("Writing summary to " + summaryFile + "...");
    using (var sw = new StreamWriter(summaryFile))
    {
        sw.WriteLine("Biotype\tCount");

        // Distinct names are counted per prefix before the first ':'; largest group first.
        all.ConvertAll(m => m.Name)
            .Distinct()
            .GroupBy(m => m.StringBefore(":"))
            .OrderByDescending(m => m.Count())
            .ToList()
            .ForEach(m => sw.WriteLine("{0}\t{1}", m.Key, m.Count()));
    }

    var result = new List<string>(new[] { options.OutputFile });

    // When extra sequences exist, write a merged fasta: genome first, then mature tRNA, then rRNA.
    var fasta = Path.ChangeExtension(options.OutputFile, ".fasta");
    var hasMatureTrna = File.Exists(options.UcscTrnaFile) && File.Exists(options.UcscMatureTrnaFastaFile);
    if (hasMatureTrna || File.Exists(options.RRNAFile))
    {
        result.Add(fasta);
        using (var sw = new StreamWriter(fasta))
        {
            AppendFileLines(sw, options.FastaFile);

            if (File.Exists(options.UcscTrnaFile) && File.Exists(options.UcscMatureTrnaFastaFile))
            {
                AppendFileLines(sw, options.UcscMatureTrnaFastaFile);
            }

            if (File.Exists(options.RRNAFile))
            {
                AppendFileLines(sw, options.RRNAFile);
            }
        }
    }

    var faFile = options.OutputFile + ".fa";
    Progress.SetMessage("Extracting sequence from " + options.FastaFile + "...");
    var b2foptions = new Bed2FastaProcessorOptions()
    {
        GenomeFastaFile = options.FastaFile,
        InputFile = options.OutputFile,
        OutputFile = faFile,
        KeepChrInName = false,
    };

    // With a mature tRNA fasta the tRNA sequences are appended from that file below,
    // so they are not extracted from the genome.
    if (!File.Exists(options.UcscMatureTrnaFastaFile))
    {
        b2foptions.AcceptName = m => m.StartsWith(SmallRNAConsts.miRNA) || m.StartsWith(SmallRNAConsts.mt_tRNA) || m.StartsWith(SmallRNAConsts.tRNA);
    }
    else
    {
        b2foptions.AcceptName = m => m.StartsWith(SmallRNAConsts.miRNA) || m.StartsWith(SmallRNAConsts.mt_tRNA);
    }

    new Bed2FastaProcessor(b2foptions) { Progress = this.Progress }.Process();

    if (File.Exists(options.UcscMatureTrnaFastaFile))
    {
        Progress.SetMessage("Extracting sequence from " + options.UcscMatureTrnaFastaFile + " ...");

        using (var sw = new StreamWriter(faFile, true))
        {
            foreach (var tRNA in trnas)
            {
                if (!string.IsNullOrEmpty(tRNA.Sequence))
                {
                    sw.WriteLine(">{0}", tRNA.Name);
                    sw.WriteLine("{0}", tRNA.Sequence);
                }
            }
        }
    }

    return result;
}

/// <summary>
/// Registers a genome sequence name in the map under itself and under its variant with or
/// without the "chr" prefix, and returns the updated short mitochondrial name.
/// </summary>
private static string RegisterChromosomeName(Dictionary<string, string> chrNameMap, string name, string mitoName)
{
    chrNameMap[name] = name;

    if (name.StartsWith("chr"))
    {
        chrNameMap[name.StringAfter("chr")] = name;
    }
    else
    {
        chrNameMap["chr" + name] = name;
    }

    if (name.Equals("chrMT") || name.Equals("MT"))
    {
        mitoName = "MT";
    }

    if (name.Equals("chrM") || name.Equals("M"))
    {
        mitoName = "M";
    }

    return mitoName;
}

/// <summary>
/// Reports the number of locations for a biotype, sorts them in place by chromosome and
/// start, and writes them to the writer in bed format.
/// </summary>
private void WriteSortedLocations(StreamWriter sw, BedItemFile<BedItem> bedfile, string biotype, List<BedItem> locs)
{
    Progress.SetMessage("{0} : {1}", biotype, locs.Count);

    GenomeUtils.SortChromosome(locs, m => m.Seqname, m => (int)m.Start);

    foreach (var loc in locs)
    {
        sw.WriteLine(bedfile.GetValue(loc));
    }
}

/// <summary>
/// Copies every line of the given text file to the writer.
/// </summary>
private static void AppendFileLines(StreamWriter sw, string sourceFile)
{
    using (var sr = new StreamReader(sourceFile))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            sw.WriteLine(line);
        }
    }
}
/// <summary>
/// Loads sequence regions with 1-based coordinates from a bed or gtf/gff file.
/// Bed entries have their start shifted by one unless <paramref name="bedAsGtf"/> is set.
/// </summary>
/// <param name="coordinateFile">Source file; its format is detected with <c>IsBedFormat</c>.</param>
/// <param name="gtfFeature">For gtf input: when not empty, only entries with this feature are kept.</param>
/// <param name="bedAsGtf">For bed input: true when the bed coordinates are already 1-based.</param>
/// <returns>The regions as <see cref="GtfItem"/> objects.</returns>
public static List<GtfItem> GetSequenceRegions(string coordinateFile, string gtfFeature = "", bool bedAsGtf = false)
{
    if (IsBedFormat(coordinateFile))
    {
        // bed is zero-based with an exclusive end (https://genome.ucsc.edu/FAQ/FAQformat.html#format1),
        // gtf is 1-based with an inclusive end (http://useast.ensembl.org/info/website/upload/gff.html).
        // Positions in sam format are 1-based (http://samtools.sourceforge.net/SAMv1.pdf),
        // so only the start has to move when converting a bed entry.
        var bedEntries = new BedItemFile<BedItem>().ReadFromFile(coordinateFile);

        if (!bedAsGtf)
        {
            foreach (var bedEntry in bedEntries)
            {
                bedEntry.Start++;
            }
        }

        return bedEntries.ConvertAll(m => new GtfItem(m));
    }

    var regions = GtfItemFile.ReadFromFile(coordinateFile).ToList();

    if (!string.IsNullOrEmpty(gtfFeature))
    {
        regions.RemoveAll(m => !m.Feature.Equals(gtfFeature));
    }

    foreach (var region in regions)
    {
        // Gene id: gtf-style attribute first, gff-style "ID=" second.
        if (region.Attributes.Contains("gene_id \""))
        {
            region.GeneId = region.Attributes.StringAfter("gene_id \"").StringBefore("\"");
        }
        else if (region.Attributes.Contains("ID="))
        {
            region.GeneId = region.Attributes.StringAfter("ID=").StringBefore(";");
        }

        // Name: gtf-style attribute first, gff-style "Name=" second.
        if (region.Attributes.Contains("gene_name \""))
        {
            region.Name = region.Attributes.StringAfter("gene_name \"").StringBefore("\"");
        }
        else if (region.Attributes.Contains("Name="))
        {
            region.Name = region.Attributes.StringAfter("Name=").StringBefore(";");
        }

        // Fill whichever of the two is missing from the other one.
        if (string.IsNullOrEmpty(region.GeneId) && !string.IsNullOrEmpty(region.Name))
        {
            region.GeneId = region.Name;
        }

        if (!string.IsNullOrEmpty(region.GeneId) && string.IsNullOrEmpty(region.Name))
        {
            region.Name = region.GeneId;
        }

        // Neither could be determined: use the raw attribute text for both.
        if (string.IsNullOrEmpty(region.GeneId))
        {
            region.GeneId = region.Attributes;
            region.Name = region.Attributes;
        }
    }

    return regions;
}