/// <summary>
/// Closes the held <c>file</c>, if any, and clears the reference to it.
/// Does nothing when no file is held.
/// </summary>
public virtual void Close()
{
    if (file == null)
    {
        return;
    }

    file.Close();
    file = null;
}
/// <summary>
/// Opens <paramref name="filename"/> on the held <c>GtfItemFile</c>, creating that
/// reader first when none is held yet.
/// </summary>
/// <param name="filename">Path handed to <c>GtfItemFile.Open</c>.</param>
public virtual void Open(string filename)
{
    if (file != null)
    {
        // A reader already exists: it is reused as-is.
        file.Open(filename);
        return;
    }

    file = new GtfItemFile();
    file.Open(filename);
}
/// <summary>
/// Reads every record of <c>options.InputFile</c> and merges the records that share a
/// name into a single <c>BedItem</c>.
/// </summary>
/// <remarks>
/// The grouping name is <c>item.Name</c> when <c>options.ByName</c> is set, otherwise
/// <c>item.GeneId</c>. When that value is blank, the name is taken from the
/// <c>Name=</c> (respectively <c>ID=</c>) entry of the attribute column instead.
/// After reading, <c>Start</c> of every merged item is decremented by one
/// (presumably the 1-based GTF to 0-based BED shift - confirm).
/// </remarks>
/// <returns>The merged items keyed by name.</returns>
public Dictionary<string, BedItem> ReadBedItems()
{
    var map = new Dictionary<string, BedItem>();

    using (var gtf = new GtfItemFile(options.InputFile))
    {
        int count = 0;
        for (GtfItem item = gtf.Next(); item != null; item = gtf.Next())
        {
            count++;
            if ((count % 100000) == 0)
            {
                Progress.SetMessage("{0} gtf item processed", count);
            }

            bool byName = options.ByName;
            string name = byName ? item.Name : item.GeneId;
            if (string.IsNullOrWhiteSpace(name))
            {
                // maybe in gff3 format
                string tag = byName ? "Name=" : "ID=";
                name = item.Attributes.StringAfter(tag).StringBefore(";");
            }

            BedItem loc;
            if (map.TryGetValue(name, out loc))
            {
                // Known name: extend the existing item with this record.
                loc.UnionWith(item);
                continue;
            }

            // First record for this name: seed the item from it.
            loc = new BedItem
            {
                Name = name,
                Seqname = item.Seqname,
                Start = item.Start,
                End = item.End,
                Strand = item.Strand
            };
            map.Add(name, loc);
        }
    }

    foreach (var bed in map.Values)
    {
        bed.Start--;
    }

    return map;
}
/// <summary>
/// Reads all items from the GTF file at <paramref name="filename"/>.
/// </summary>
/// <param name="filename">Path handed to the <c>GtfItemFile</c> constructor.</param>
/// <returns>Every item returned by <c>Next()</c>, in read order.</returns>
public static List<GtfItem> ReadFromFile(string filename)
{
    var items = new List<GtfItem>();

    using (var reader = new GtfItemFile(filename))
    {
        // Next() signals the end of the file by returning null.
        for (GtfItem current = reader.Next(); current != null; current = reader.Next())
        {
            items.Add(current);
        }
    }

    return items;
}
/// <summary>
/// Reads the items of the GTF file at <paramref name="filename"/> that
/// <c>Next(featureName)</c> yields for <paramref name="featureName"/>.
/// </summary>
/// <param name="filename">Path handed to the <c>GtfItemFile</c> constructor.</param>
/// <param name="featureName">Value passed to each <c>Next</c> call.</param>
/// <returns>The returned items, in read order.</returns>
public static List<GtfItem> ReadFromFile(string filename, string featureName)
{
    var items = new List<GtfItem>();

    using (var reader = new GtfItemFile(filename))
    {
        // Next(featureName) signals the end of the file by returning null.
        for (GtfItem current = reader.Next(featureName); current != null; current = reader.Next(featureName))
        {
            items.Add(current);
        }
    }

    return items;
}
/// <summary>
/// Checks every field of the first item returned by <c>NextExon()</c> for the test file.
/// </summary>
public void TestNextExon()
{
    using (var reader = new GtfItemFile(filename))
    {
        GtfItem exon = reader.NextExon();

        Assert.AreEqual("GL000213.1", exon.Seqname);
        Assert.AreEqual("protein_coding", exon.Source);
        Assert.AreEqual("exon", exon.Feature);
        Assert.AreEqual(138767, exon.Start);
        Assert.AreEqual(139339, exon.End);
        Assert.AreEqual(".", exon.Score);
        Assert.AreEqual('-', exon.Strand);
        Assert.AreEqual('.', exon.Frame);
        Assert.AreEqual(
            " gene_id \"ENSG00000237375\"; transcript_id \"ENST00000327822\"; exon_number \"1\"; gene_name \"BX072566.1\"; gene_biotype \"protein_coding\"; transcript_name \"BX072566.1-201\";",
            exon.Attributes);
    }
}
/// <summary>
/// Checks every field of the first item returned by <c>Next()</c> for the test file.
/// </summary>
public void TestNext()
{
    using (var reader = new GtfItemFile(filename))
    {
        GtfItem first = reader.Next();

        Assert.AreEqual("GL000213.1", first.Seqname);
        Assert.AreEqual("miRNA", first.Source);
        Assert.AreEqual("CDS", first.Feature);
        Assert.AreEqual(104742, first.Start);
        Assert.AreEqual(104817, first.End);
        Assert.AreEqual(".", first.Score);
        Assert.AreEqual('+', first.Strand);
        Assert.AreEqual('.', first.Frame);
        Assert.AreEqual(
            " gene_id \"ENSG00000265283\"; transcript_id \"ENST00000578976\"; exon_number \"1\"; gene_name \"MIR3118-5\"; gene_biotype \"miRNA\"; transcript_name \"MIR3118-5-201\";",
            first.Attributes);
    }
}
/// <summary>
/// Reads every record of <c>options.InputFile</c> and merges the records that share a
/// <c>GeneId</c> into a single <c>BedItem</c>.
/// </summary>
/// <remarks>
/// After reading, <c>Start</c> of every merged item is decremented by one
/// (presumably the 1-based GTF to 0-based BED shift - confirm).
/// </remarks>
/// <returns>The merged items keyed by gene id.</returns>
public Dictionary<string, BedItem> ReadBedItems()
{
    var map = new Dictionary<string, BedItem>();

    using (var gtf = new GtfItemFile(options.InputFile))
    {
        int count = 0;
        for (GtfItem item = gtf.Next(); item != null; item = gtf.Next())
        {
            count++;
            if ((count % 100000) == 0)
            {
                Progress.SetMessage("{0} gtf item processed", count);
            }

            BedItem loc;
            if (map.TryGetValue(item.GeneId, out loc))
            {
                // Known gene: extend the existing item with this record.
                loc.UnionWith(item);
                continue;
            }

            // First record for this gene: seed the item from it.
            loc = new BedItem
            {
                Name = item.GeneId,
                Seqname = item.Seqname,
                Start = item.Start,
                End = item.End,
                Strand = item.Strand
            };
            map[item.GeneId] = loc;
        }
    }

    foreach (var bed in map.Values)
    {
        bed.Start--;
    }

    return map;
}
/// <summary>
/// Collects small RNA coordinates from the configured miRBase, UCSC tRNA, Ensembl GTF
/// and rRNA inputs, writes them to <c>options.OutputFile</c> in BED format, extracts the
/// miRNA/tRNA sequences to <c>OutputFile + ".fa"</c> and writes a per-biotype summary to
/// <c>OutputFile + ".info"</c>.
/// </summary>
/// <remarks>
/// Each input is optional: it is used only when the file exists. When a UCSC tRNA file
/// is present, Ensembl records whose biotype equals <c>SmallRNAConsts.tRNA</c> are
/// skipped, and only <c>Mt_tRNA</c> records are taken from the Ensembl file as tRNA.
/// </remarks>
/// <returns>A single-element sequence containing <c>options.OutputFile</c>.</returns>
public override IEnumerable<string> Process()
{
    // Persist the options next to the output unless they were loaded from that very file.
    var paramFile = options.OutputFile + ".param";
    if (string.IsNullOrEmpty(options.ParamFile) || !Path.GetFullPath(options.ParamFile).Equals(Path.GetFullPath(paramFile)))
    {
        options.SaveToFile(paramFile);
    }

    var bedfile = new BedItemFile<BedItem>(6);

    // --- miRNA: either a .bed file or a gtf/gff file filtered by options.MiRBaseKey ---
    var mirnas = new List<BedItem>();
    if (File.Exists(options.MiRBaseFile))
    {
        Progress.SetMessage("Processing {0} ...", options.MiRBaseFile);

        if (options.MiRBaseFile.EndsWith(".bed"))
        {
            mirnas = bedfile.ReadFromFile(options.MiRBaseFile);
            mirnas.ForEach(m =>
            {
                m.Seqname = m.Seqname.StringAfter("chr");
                m.Name = options.MiRBaseKey + ":" + m.Name;
            });
        }
        else
        {
            using (var gf = new GtfItemFile(options.MiRBaseFile))
            {
                GtfItem item;
                while ((item = gf.Next(options.MiRBaseKey)) != null)
                {
                    BedItem loc = new BedItem();
                    loc.Seqname = item.Seqname.StringAfter("chr");
                    loc.Start = item.Start - 1;
                    loc.End = item.End;
                    loc.Name = options.MiRBaseKey + ":" + item.Attributes.StringAfter("Name=").StringBefore(";");
                    loc.Score = 1000;
                    loc.Strand = item.Strand;
                    mirnas.Add(loc);
                }
            }
        }

        Progress.SetMessage("{0} miRNA read.", mirnas.Count);
    }

    // Probe the filesystem once; the answer is needed again for every Ensembl record below.
    bool hasUcscTrnaFile = File.Exists(options.UcscTrnaFile);

    // --- tRNA from the UCSC table, without mitochondrial tRNA ---
    List<BedItem> trnas = new List<BedItem>();
    if (hasUcscTrnaFile)
    {
        Progress.SetMessage("Processing {0} ...", options.UcscTrnaFile);
        trnas = bedfile.ReadFromFile(options.UcscTrnaFile);
        trnas.ForEach(m => m.Seqname = m.Seqname.StringAfter("chr"));

        // Keep only single-character or all-digit sequence names (intended: 1-22, X and Y).
        trnas.RemoveAll(m => (m.Seqname.Length > 1) && !m.Seqname.All(n => char.IsDigit(n)));

        // Mitochondrial tRNA will be extracted from the Ensembl gtf file instead.
        trnas.RemoveAll(m => m.Seqname.Equals("M") || m.Seqname.Equals("MT"));

        trnas.ForEach(m => m.Name = SmallRNAConsts.tRNA + ":" + m.Name);
        Progress.SetMessage("{0} tRNA from ucsc read.", trnas.Count);
    }

    // --- other small RNA (and mitochondrial tRNA) from the Ensembl gtf file ---
    var others = new List<BedItem>();
    if (File.Exists(options.EnsemblGtfFile))
    {
        Progress.SetMessage("Processing {0} ...", options.EnsemblGtfFile);
        using (var gf = new GtfItemFile(options.EnsemblGtfFile))
        {
            // miRNA comes from the miRBase input, so it is not taken from Ensembl.
            var biotypes = new HashSet<string>(SmallRNAConsts.Biotypes);
            biotypes.Remove(SmallRNAConsts.miRNA);

            GtfItem item;
            int count = 0;
            while ((item = gf.Next("gene")) != null)
            {
                string biotype;
                if (item.Attributes.Contains("gene_biotype"))
                {
                    biotype = item.Attributes.StringAfter("gene_biotype \"").StringBefore("\"");
                }
                else if (item.Attributes.Contains("gene_type"))
                {
                    biotype = item.Attributes.StringAfter("gene_type \"").StringBefore("\"");
                }
                else
                {
                    continue;
                }

                // tRNA was already read from the UCSC table.
                if (hasUcscTrnaFile && biotype.Equals(SmallRNAConsts.tRNA))
                {
                    continue;
                }

                if (biotype.Equals("Mt_tRNA"))
                {
                    count++;
                    var gene_name = item.Attributes.Contains("gene_name")
                        ? item.Attributes.StringAfter("gene_name \"").StringBefore("\"")
                        : item.GeneId;

                    BedItem loc = new BedItem();
                    loc.Seqname = "MT";
                    loc.Start = item.Start - 1;
                    loc.End = item.End;
                    loc.Name = string.Format(SmallRNAConsts.tRNA + ":chrMT.tRNA{0}-{1}", count, gene_name.StringAfter("-"));
                    loc.Score = 1000;
                    loc.Strand = item.Strand;
                    trnas.Add(loc);
                }
                else if (biotypes.Contains(biotype))
                {
                    string seqName;
                    if (item.Seqname.ToLower().StartsWith("chr"))
                    {
                        seqName = item.Seqname.Substring(3);
                    }
                    else
                    {
                        seqName = item.Seqname;
                    }

                    if (seqName.Equals("M"))
                    {
                        seqName = "MT";
                    }

                    // Ignore all smallRNA coordinates on scaffold or contig
                    // (recognised here by a sequence name longer than five characters).
                    if (seqName.Length > 5)
                    {
                        continue;
                    }

                    var gene_name = item.Attributes.StringAfter("gene_name \"").StringBefore("\"");

                    BedItem loc = new BedItem();
                    loc.Seqname = seqName;
                    loc.Start = item.Start - 1;
                    loc.End = item.End;
                    loc.Name = biotype + ":" + gene_name + ":" + item.GeneId;
                    loc.Score = 1000;
                    loc.Strand = item.Strand;
                    others.Add(loc);
                }
            }
        }
    }

    var all = new List<BedItem>();
    all.AddRange(mirnas);
    all.AddRange(trnas);
    all.AddRange(others);

    // --- rRNA: every sequence of the file becomes one full-length entry on the '+' strand ---
    if (File.Exists(options.RRNAFile))
    {
        var seqs = SequenceUtils.Read(options.RRNAFile);
        foreach (var seq in seqs)
        {
            all.Add(new BedItem()
            {
                Seqname = seq.Name,
                Start = 0,
                End = seq.SeqString.Length,
                Strand = '+',
                Name = "rRNA:" + seq.Name
            });
        }
    }

    // Entries are written grouped by biotype, in the order of SmallRNAConsts.Biotypes,
    // and sorted by chromosome and start within each group.
    Progress.SetMessage("Saving smallRNA coordinates to " + options.OutputFile + "...");
    using (var sw = new StreamWriter(options.OutputFile))
    {
        foreach (var pir in SmallRNAConsts.Biotypes)
        {
            var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
            GenomeUtils.SortChromosome(locs, m => m.Seqname, m => (int)m.Start);

            foreach (var loc in locs)
            {
                sw.WriteLine(bedfile.GetValue(loc));
            }
        }
    }

    Progress.SetMessage("Extracting sequence from " + options.FastaFile + "...");
    new Bed2FastaProcessor(new Bed2FastaProcessorOptions()
    {
        GenomeFastaFile = options.FastaFile,
        InputFile = options.OutputFile,
        OutputFile = options.OutputFile + ".fa",
        KeepChrInName = false,
        AcceptName = m => m.StartsWith(SmallRNAConsts.miRNA) || m.StartsWith(SmallRNAConsts.tRNA),
    })
    {
        Progress = this.Progress
    }.Process();

    var summaryFile = options.OutputFile + ".info";
    Progress.SetMessage("Writing summary to " + summaryFile + "...");
    using (var sw = new StreamWriter(summaryFile))
    {
        sw.WriteLine("Biotype\tCount");

        // Count distinct names per biotype (the part of the name before the first ':').
        var groups = all.ConvertAll(m => m.Name)
                        .Distinct()
                        .GroupBy(m => m.StringBefore(":"))
                        .OrderByDescending(m => m.Count())
                        .ToList();
        foreach (var group in groups)
        {
            sw.WriteLine("{0}\t{1}", group.Key, group.Count());
        }
    }

    return new string[] { options.OutputFile };
}
/// <summary>
/// Creates an instance with both <c>file</c> and <c>last</c> set to null.
/// </summary>
public GtfTranscriptItemFile()
{
    this.last = null;
    this.file = null;
}
/// <summary>
/// Reads every record of <c>options.InputFile</c> and sets each item's <c>Name</c> to the
/// value of the <c>locus_tag=</c> entry of its attribute column.
/// </summary>
/// <returns>All items, in read order.</returns>
public List<GtfItem> ReadGtfItems()
{
    var items = new List<GtfItem>();

    using (var gtf = new GtfItemFile(options.InputFile))
    {
        int processed = 0;
        for (GtfItem current = gtf.Next(); current != null; current = gtf.Next())
        {
            processed++;
            if ((processed % 100000) == 0)
            {
                Progress.SetMessage("{0} gtf item processed", processed);
            }

            string attributes = current.Attributes;
            current.Name = attributes.StringAfter("locus_tag=").StringBefore(";");
            items.Add(current);
        }
    }

    return items;
}
/// <summary>
/// Groups the records of <c>options.InputFile</c> by gene id and writes one line per
/// gene (id, name, summed length and, when available, biotype) to
/// <c>options.OutputFile</c>.
/// </summary>
/// <remarks>
/// Gene names come from the <c>gene_name</c> attribute; for genes without one, the
/// optional id/name map file is consulted and the gene id is used as the last resort.
/// The input is validated before the output file is opened, so a failing run does not
/// create or truncate the output.
/// </remarks>
/// <returns>A single-element sequence containing <c>options.OutputFile</c>.</returns>
/// <exception cref="Exception">
/// No record carries a <c>gene_name</c> attribute and no map file exists.
/// </exception>
public override IEnumerable<string> Process()
{
    var map = new Dictionary<string, List<GtfItem>>();

    var namemap = new Dictionary<string, string>();
    bool hasMapFile = File.Exists(options.MapFile);
    if (hasMapFile)
    {
        namemap = new MapReader(0, 1, hasHeader: false).ReadFromFile(options.MapFile);
    }

    using (var gtf = new GtfItemFile(options.InputFile))
    {
        GtfItem item;
        int count = 0;
        while ((item = gtf.Next()) != null)
        {
            count++;
            if ((count % 100000) == 0)
            {
                Progress.SetMessage("{0} gtf item processed", count);
            }

            List<GtfItem> oldItems;
            if (!map.TryGetValue(item.GeneId, out oldItems))
            {
                map[item.GeneId] = new List<GtfItem> { item };
            }
            else if (IsExon(item))
            {
                // Exons take precedence: the first exon of a gene evicts every
                // non-exon record collected for it so far.
                oldItems.RemoveAll(m => !IsExon(m));
                oldItems.Add(item);
            }
            else if (oldItems.All(m => !IsExon(m)))
            {
                // Non-exon records are kept only while the gene has no exon yet.
                oldItems.Add(item);
            }
        }
    }

    // Validate before touching the output file.
    bool bHasGeneName = map.Values.Any(l => l.Any(m => m.Attributes.Contains("gene_name")));
    if (!bHasGeneName && !hasMapFile)
    {
        throw new Exception(string.Format("No gene_name found in {0} and no id/name map file defined.", options.InputFile));
    }

    bool bHasGeneBiotype = map.Values.Any(l => l.Any(m => m.Attributes.Contains("gene_biotype")));

    var keys = (from key in map.Keys
                orderby key
                select key).ToList();

    using (StreamWriter sw = new StreamWriter(options.OutputFile))
    {
        if (bHasGeneBiotype)
        {
            sw.WriteLine("gene_id\tgene_name\tlength\tgene_biotype");
        }
        else
        {
            sw.WriteLine("gene_id\tgene_name\tlength");
        }

        foreach (var key in keys)
        {
            var gtfs = map[key];

            // Pick the naming record before CombineCoordinates() runs on the list.
            var gtf = gtfs.FirstOrDefault(m => m.Attributes.Contains("gene_name"));
            gtfs.CombineCoordinates();

            string name;
            string biotype;
            if (gtf == null)
            {
                biotype = string.Empty;
                if (!namemap.TryGetValue(key, out name))
                {
                    name = key;
                }
            }
            else
            {
                biotype = gtf.GetBiotype();
                name = gtf.Attributes.StringAfter("gene_name \"").StringBefore("\"");
            }

            if (bHasGeneBiotype)
            {
                sw.WriteLine("{0}\t{1}\t{2}\t{3}", key, name, gtfs.Sum(m => m.Length), biotype);
            }
            else
            {
                sw.WriteLine("{0}\t{1}\t{2}", key, name, gtfs.Sum(m => m.Length));
            }
        }
    }

    return new string[] { options.OutputFile };
}
/// <summary>
/// Groups the records of <c>options.InputFile</c> by gene id (or by the attribute named
/// in <c>options.Key</c> when it is set) and writes one line per group to
/// <c>options.OutputFile</c> (id, name, summed length, chromosome, start, end and, when
/// available, biotype) plus a coordinate line to <c>OutputFile + ".bed"</c>.
/// </summary>
/// <remarks>
/// Gene names come from the <c>gene_name</c> attribute; for groups without one, the
/// optional id/name map file is consulted and the group id is used as the last resort.
/// The input is validated before either output file is opened, so a failing run does
/// not create or truncate the outputs.
/// </remarks>
/// <returns>A single-element sequence containing <c>options.OutputFile</c>.</returns>
/// <exception cref="Exception">
/// No record carries a <c>gene_name</c> attribute and no map file exists.
/// </exception>
public override IEnumerable<string> Process()
{
    var map = new Dictionary<string, List<GtfItem>>();

    var namemap = new Dictionary<string, string>();
    bool hasMapFile = File.Exists(options.MapFile);
    if (hasMapFile)
    {
        namemap = new MapReader(0, 1, hasHeader: false).ReadFromFile(options.MapFile);
    }

    using (var gtf = new GtfItemFile(options.InputFile))
    {
        GtfItem item;
        int count = 0;
        while ((item = gtf.Next()) != null)
        {
            count++;
            if ((count % 100000) == 0)
            {
                Progress.SetMessage("{0} gtf item processed", count);
            }

            var id = string.IsNullOrEmpty(options.Key) ? item.GeneId : item.GetAttribute(options.Key);

            List<GtfItem> oldItems;
            if (!map.TryGetValue(id, out oldItems))
            {
                map[id] = new List<GtfItem> { item };
            }
            else if (IsExon(item))
            {
                // Exons take precedence: the first exon of a group evicts every
                // non-exon record collected for it so far.
                oldItems.RemoveAll(m => !IsExon(m));
                oldItems.Add(item);
            }
            else if (oldItems.All(m => !IsExon(m)))
            {
                // Non-exon records are kept only while the group has no exon yet.
                oldItems.Add(item);
            }
        }
    }

    // Validate before touching the output files.
    bool bHasGeneName = map.Values.Any(l => l.Any(m => m.Attributes.Contains("gene_name")));
    if (!bHasGeneName && !hasMapFile)
    {
        throw new Exception(string.Format("No gene_name found in {0} and no id/name map file defined.", options.InputFile));
    }

    bool bHasGeneBiotype = map.Values.Any(l => l.Any(m => m.Attributes.Contains("gene_biotype")));
    bool bHasGeneType = map.Values.Any(l => l.Any(m => m.Attributes.Contains("gene_type")));
    bool writeBiotype = bHasGeneBiotype || bHasGeneType;

    var keys = (from key in map.Keys
                orderby key
                select key).ToList();

    using (StreamWriter sw = new StreamWriter(options.OutputFile))
    using (StreamWriter swBed = new StreamWriter(options.OutputFile + ".bed"))
    {
        sw.Write("gene_id\tgene_name\tlength\tchr\tstart\tend");
        if (writeBiotype)
        {
            sw.Write("\tgene_biotype");
        }
        sw.WriteLine();

        foreach (var key in keys)
        {
            var gtfs = map[key];

            // Pick the naming record before CombineCoordinates() runs on the list.
            var gtf = gtfs.FirstOrDefault(m => m.Attributes.Contains("gene_name"));
            gtfs.CombineCoordinates();

            string name;
            string biotype;
            if (gtf == null)
            {
                biotype = string.Empty;
                if (!namemap.TryGetValue(key, out name))
                {
                    name = key;
                }
            }
            else
            {
                biotype = gtf.GetBiotype();
                name = gtf.Attributes.StringAfter("gene_name \"").StringBefore("\"");
            }

            // Computed once and shared by both output lines.
            var seqname = gtfs.First().Seqname;
            var start = gtfs.Min(l => l.Start);
            var end = gtfs.Max(l => l.End);

            sw.Write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}", key, name, gtfs.Sum(m => m.Length), seqname, start, end);
            if (writeBiotype)
            {
                sw.Write("\t{0}", biotype);
            }
            sw.WriteLine();

            // NOTE(review): start is written as read from the GTF records, without the
            // "- 1" shift that BED's 0-based starts usually imply - confirm this is intended.
            swBed.WriteLine("{0}\t{1}\t{2}\t{3}_{4}", seqname, start, end, key.StringBefore("."), name);
        }
    }

    return new string[] { options.OutputFile };
}