/// <summary>
/// Reads the fasta file into a dictionary keyed by sequence name, opens the
/// output file, reads the gff file, and reports the output file path.
/// </summary>
/// <returns>A single-element array containing <c>options.OutputFile</c>.</returns>
public override IEnumerable<string> Process()
{
    Progress.SetMessage("reading fasta file ...");
    var sequences = SequenceUtils.Read(new FastaFormat(), options.FastaFile);
    var faMap = sequences.ToDictionary(seq => seq.Name);
    Progress.SetMessage("{0} sequences read ...", faMap.Count);

    // NOTE(review): the writer is opened (which creates or overwrites the output
    // file) but nothing is written to it, and neither the fasta map nor the gff
    // items are used afterwards. This looks unfinished - confirm intended output.
    using (StreamWriter writer = new StreamWriter(options.OutputFile))
    {
        Progress.SetMessage("reading gff file ...");
        var gffItems = GtfItemFile.ReadFromFile(options.GffFile);
    }

    return new string[] { options.OutputFile };
}
/// <summary>
/// Loads the items of a gtf file, groups them by sequence name, and prepares
/// the column header and the empty placeholder used by this exporter.
/// </summary>
/// <param name="gtfFile">Path of the gtf file to read.</param>
/// <param name="gtfKey">
/// Key passed to <c>GtfItemFile.ReadFromFile</c> and embedded in the header
/// column names; defaults to "exon".
/// </param>
public GtfDistanceExporter(string gtfFile, string gtfKey = "exon")
{
    Console.WriteLine("reading gtf file " + gtfFile + " ...");
    var gtfItems = GtfItemFile.ReadFromFile(gtfFile, gtfKey);
    // Group key is Seqname.StringAfter("chr") - presumably the sequence name
    // without its "chr" prefix; confirm against the StringAfter helper.
    this.maps = CollectionUtils.ToGroupDictionary(gtfItems, item => item.Seqname.StringAfter("chr"));
    Console.WriteLine("reading gtf file " + gtfFile + " done");

    this.header = string.Format("distance_{0}\tdistance_{0}_position\tdistance_gene\tdistance_in_gene", gtfKey);

    // The placeholder holds one tab for every tab found in the header.
    int tabCount = this.header.Count(ch => ch == '\t');
    this.emptyStr = new String('\t', tabCount);

    // Order the items of every group by ascending start position.
    foreach (var items in maps.Values)
    {
        items.Sort((left, right) => left.Start.CompareTo(right.Start));
    }
}
/// <summary>
/// Reads sequence regions from a coordinate file and returns them as gtf items
/// with 1-based coordinates. A bed file has each start incremented by one
/// unless <paramref name="bedAsGtf"/> is set.
/// </summary>
/// <param name="coordinateFile">Source file, in bed format or gtf/gff format.</param>
/// <param name="gtfFeature">
/// For a non-bed file: when not empty, only items whose feature equals this
/// value are kept. Ignored for bed files.
/// </param>
/// <param name="bedAsGtf">
/// True when the bed file already holds 1-based starts, so no shift is applied.
/// </param>
/// <returns>
/// The regions as a list of gtf items. For non-bed input, GeneId and Name are
/// filled from the attributes text, each falling back to the other and finally
/// to the whole attributes text.
/// </returns>
public static List<GtfItem> GetSequenceRegions(string coordinateFile, string gtfFeature = "", bool bedAsGtf = false)
{
    if (IsBedFormat(coordinateFile))
    {
        //bed is zero-based, and the end is not included in the sequence region
        //https://genome.ucsc.edu/FAQ/FAQformat.html#format1
        //gtf is 1-based, and the end is included in the sequence region
        //http://useast.ensembl.org/info/website/upload/gff.html
        //since pos in sam format is 1-based, we need to convert beditem to gtfitem.
        //http://samtools.sourceforge.net/SAMv1.pdf
        var bedItems = new BedItemFile<BedItem>().ReadFromFile(coordinateFile);
        if (!bedAsGtf)
        {
            bedItems.ForEach(bed => bed.Start++);
        }

        return bedItems.ConvertAll(bed => new GtfItem(bed));
    }

    List<GtfItem> regions = GtfItemFile.ReadFromFile(coordinateFile).ToList();

    // Optional filter on the feature column.
    if (!string.IsNullOrEmpty(gtfFeature))
    {
        regions.RemoveAll(region => !region.Feature.Equals(gtfFeature));
    }

    foreach (var region in regions)
    {
        // Gene id: gtf style (gene_id "...") first, then gff style (ID=...;).
        if (region.Attributes.Contains("gene_id \""))
        {
            region.GeneId = region.Attributes.StringAfter("gene_id \"").StringBefore("\"");
        }
        else if (region.Attributes.Contains("ID="))
        {
            region.GeneId = region.Attributes.StringAfter("ID=").StringBefore(";");
        }

        // Name: gtf style (gene_name "...") first, then gff style (Name=...;).
        if (region.Attributes.Contains("gene_name \""))
        {
            region.Name = region.Attributes.StringAfter("gene_name \"").StringBefore("\"");
        }
        else if (region.Attributes.Contains("Name="))
        {
            region.Name = region.Attributes.StringAfter("Name=").StringBefore(";");
        }

        // Fill whichever of GeneId / Name is missing from the other one; when
        // both are missing, use the complete attributes text for both.
        bool idMissing = string.IsNullOrEmpty(region.GeneId);
        bool nameMissing = string.IsNullOrEmpty(region.Name);
        if (idMissing && nameMissing)
        {
            region.GeneId = region.Attributes;
            region.Name = region.Attributes;
        }
        else if (idMissing)
        {
            region.GeneId = region.Name;
        }
        else if (nameMissing)
        {
            region.Name = region.GeneId;
        }
    }

    return regions;
}