/// <summary>
/// Closes the underlying file when one is attached and then clears the reference to it.
/// Does nothing when no file is attached.
/// </summary>
public virtual void Close()
 {
   // Nothing attached: nothing to release.
   if (file == null)
   {
     return;
   }

   file.Close();
   file = null;
 }
Beispiel #2
0
 /// <summary>
 /// Opens <paramref name="filename"/> through the underlying <c>GtfItemFile</c>,
 /// creating that reader first when none is attached yet.
 /// </summary>
 /// <param name="filename">Path handed to the reader's <c>Open</c> call.</param>
 public virtual void Open(string filename)
 {
     var reader = file;
     if (reader == null)
     {
         // First use: create the reader and remember it for later calls.
         reader = new GtfItemFile();
         file = reader;
     }

     reader.Open(filename);
 }
 /// <summary>
 /// Makes sure a <c>GtfItemFile</c> reader is attached, then opens <paramref name="filename"/> with it.
 /// </summary>
 /// <param name="filename">Path handed to the reader's <c>Open</c> call.</param>
 public virtual void Open(string filename)
 {
   // Reuse the attached reader when there is one; otherwise create it.
   file = file ?? new GtfItemFile();
   file.Open(filename);
 }
Beispiel #4
0
 /// <summary>
 /// Closes the attached file, if any, and resets the reference to <c>null</c>.
 /// </summary>
 public virtual void Close()
 {
     var attached = file;
     if (attached != null)
     {
         attached.Close();
         file = null;
     }
 }
        /// <summary>
        /// Reads every GTF item from <c>options.InputFile</c> and merges items that share a key into one <c>BedItem</c>.
        /// </summary>
        /// <remarks>
        /// The key is the item's <c>Name</c> when <c>options.ByName</c> is set, otherwise its <c>GeneId</c>. When that
        /// value is blank, the key is taken from the <c>Name=</c> (respectively <c>ID=</c>) entry of the attribute text.
        /// After all items are read, <c>Start</c> of every merged entry is decremented by one.
        /// </remarks>
        /// <returns>The merged entries, keyed as described above.</returns>
        public Dictionary <string, BedItem> ReadBedItems()
        {
            var result = new Dictionary <string, BedItem>();

            using (var reader = new GtfItemFile(options.InputFile))
            {
                int processed = 0;
                for (GtfItem entry = reader.Next(); entry != null; entry = reader.Next())
                {
                    processed++;
                    if (processed % 100000 == 0)
                    {
                        Progress.SetMessage("{0} gtf item processed", processed);
                    }

                    bool byName = options.ByName;
                    string key  = byName ? entry.Name : entry.GeneId;
                    if (string.IsNullOrWhiteSpace(key))
                    {
                        // Blank value: maybe a gff3 format record, so read the key from the attribute text.
                        string marker = byName ? "Name=" : "ID=";
                        key = entry.Attributes.StringAfter(marker).StringBefore(";");
                    }

                    BedItem merged;
                    if (result.TryGetValue(key, out merged))
                    {
                        // Key already seen: fold this item into the existing entry.
                        merged.UnionWith(entry);
                    }
                    else
                    {
                        result[key] = new BedItem
                        {
                            Name    = key,
                            Seqname = entry.Seqname,
                            Start   = entry.Start,
                            End     = entry.End,
                            Strand  = entry.Strand
                        };
                    }
                }
            }

            foreach (var bed in result.Values)
            {
                bed.Start--;
            }

            return result;
        }
Beispiel #6
0
 /// <summary>
 /// Reads all items from the GTF file <paramref name="filename"/> into a list.
 /// </summary>
 /// <param name="filename">Path passed to the <c>GtfItemFile</c> constructor.</param>
 /// <returns>Every item returned by <c>Next()</c> until it yields <c>null</c>, in reading order.</returns>
 public static List<GtfItem> ReadFromFile(string filename)
 {
   var items = new List<GtfItem>();
   using (var reader = new GtfItemFile(filename))
   {
     for (GtfItem current = reader.Next(); current != null; current = reader.Next())
     {
       items.Add(current);
     }
   }
   return items;
 }
        /// <summary>
        /// Reads the items of the GTF file <paramref name="filename"/> that <c>Next(featureName)</c> yields.
        /// </summary>
        /// <param name="filename">Path passed to the <c>GtfItemFile</c> constructor.</param>
        /// <param name="featureName">Value forwarded to every <c>Next</c> call.</param>
        /// <returns>The items in reading order; reading stops at the first <c>null</c>.</returns>
        public static List <GtfItem> ReadFromFile(string filename, string featureName)
        {
            var items = new List <GtfItem>();

            using (var reader = new GtfItemFile(filename))
            {
                while (true)
                {
                    GtfItem current = reader.Next(featureName);
                    if (current == null)
                    {
                        break;
                    }

                    items.Add(current);
                }
            }

            return items;
        }
Beispiel #8
0
 public void TestNextExon()
 {
   // Checks every field of the record returned by the first NextExon() call on the test file.
   const string expectedAttributes = " gene_id \"ENSG00000237375\"; transcript_id \"ENST00000327822\"; exon_number \"1\"; gene_name \"BX072566.1\"; gene_biotype \"protein_coding\"; transcript_name \"BX072566.1-201\";";

   using (var reader = new GtfItemFile(filename))
   {
     GtfItem exon = reader.NextExon();

     Assert.AreEqual("GL000213.1", exon.Seqname);
     Assert.AreEqual("protein_coding", exon.Source);
     Assert.AreEqual("exon", exon.Feature);
     Assert.AreEqual(138767, exon.Start);
     Assert.AreEqual(139339, exon.End);
     Assert.AreEqual(".", exon.Score);
     Assert.AreEqual('-', exon.Strand);
     Assert.AreEqual('.', exon.Frame);
     Assert.AreEqual(expectedAttributes, exon.Attributes);
   }
 }
Beispiel #9
0
 public void TestNext()
 {
   // Checks every field of the record returned by the first Next() call on the test file.
   const string expectedAttributes = " gene_id \"ENSG00000265283\"; transcript_id \"ENST00000578976\"; exon_number \"1\"; gene_name \"MIR3118-5\"; gene_biotype \"miRNA\"; transcript_name \"MIR3118-5-201\";";

   using (var reader = new GtfItemFile(filename))
   {
     GtfItem record = reader.Next();

     Assert.AreEqual("GL000213.1", record.Seqname);
     Assert.AreEqual("miRNA", record.Source);
     Assert.AreEqual("CDS", record.Feature);
     Assert.AreEqual(104742, record.Start);
     Assert.AreEqual(104817, record.End);
     Assert.AreEqual(".", record.Score);
     Assert.AreEqual('+', record.Strand);
     Assert.AreEqual('.', record.Frame);
     Assert.AreEqual(expectedAttributes, record.Attributes);
   }
 }
    /// <summary>
    /// Reads every GTF item from <c>options.InputFile</c> and merges the items of each gene id into one <c>BedItem</c>.
    /// </summary>
    /// <remarks>
    /// The first item seen for a gene id seeds the entry (name, seqname, start, end, strand); later items are folded
    /// in through <c>UnionWith</c>. After reading, <c>Start</c> of every entry is decremented by one.
    /// </remarks>
    /// <returns>The merged entries keyed by gene id.</returns>
    public Dictionary<string, BedItem> ReadBedItems()
    {
      var byGene = new Dictionary<string, BedItem>();

      using (var reader = new GtfItemFile(options.InputFile))
      {
        int processed = 0;
        while (true)
        {
          GtfItem entry = reader.Next();
          if (entry == null)
          {
            break;
          }

          processed++;
          if (processed % 100000 == 0)
          {
            Progress.SetMessage("{0} gtf item processed", processed);
          }

          BedItem merged;
          if (byGene.TryGetValue(entry.GeneId, out merged))
          {
            // Gene already seen: fold this item into the existing entry.
            merged.UnionWith(entry);
          }
          else
          {
            merged = new BedItem();
            merged.Name = entry.GeneId;
            merged.Seqname = entry.Seqname;
            merged.Start = entry.Start;
            merged.End = entry.End;
            merged.Strand = entry.Strand;
            byGene[entry.GeneId] = merged;
          }
        }
      }

      foreach (var bed in byGene.Values)
      {
        bed.Start--;
      }

      return byGene;
    }
    /// <summary>
    /// Builds a BED file of small RNA coordinates from the configured miRBase, UCSC tRNA, Ensembl GTF and rRNA
    /// inputs, then extracts sequences for it and writes a per-biotype summary.
    /// </summary>
    /// <remarks>
    /// Each input is optional: a source whose file does not exist is skipped. Files written are
    /// <c>OutputFile</c> (the BED records), <c>OutputFile + ".param"</c> (the options, unless the options were
    /// loaded from that very file), <c>OutputFile + ".fa"</c> (through <c>Bed2FastaProcessor</c>) and
    /// <c>OutputFile + ".info"</c> (the summary).
    /// </remarks>
    /// <returns>An array whose only element is <c>options.OutputFile</c>.</returns>
    public override IEnumerable<string> Process()
    {
      var paramFile = options.OutputFile + ".param";
      if (string.IsNullOrEmpty(options.ParamFile) || !Path.GetFullPath(options.ParamFile).Equals(Path.GetFullPath(paramFile)))
      {
        options.SaveToFile(paramFile);
      }

      var bedfile = new BedItemFile<BedItem>(6);

      var mirnas = new List<BedItem>();
      if (File.Exists(options.MiRBaseFile))
      {
        Progress.SetMessage("Processing {0} ...", options.MiRBaseFile);

        if (options.MiRBaseFile.EndsWith(".bed"))
        {
          mirnas = bedfile.ReadFromFile(options.MiRBaseFile);
          mirnas.ForEach(m =>
          {
            m.Seqname = m.Seqname.StringAfter("chr");
            m.Name = options.MiRBaseKey + ":" + m.Name;
          });
        }
        else
        {
          using (var gf = new GtfItemFile(options.MiRBaseFile))
          {
            GtfItem item;
            while ((item = gf.Next(options.MiRBaseKey)) != null)
            {
              BedItem loc = new BedItem();
              loc.Seqname = item.Seqname.StringAfter("chr");
              loc.Start = item.Start - 1;
              loc.End = item.End;
              loc.Name = options.MiRBaseKey + ":" + item.Attributes.StringAfter("Name=").StringBefore(";");
              loc.Score = 1000;
              loc.Strand = item.Strand;
              mirnas.Add(loc);
            }
          }
        }

        Progress.SetMessage("{0} miRNA read.", mirnas.Count);
      }

      // Checked once: the answer is needed here and again for every gene record of the Ensembl GTF below,
      // so the file system is not queried inside that loop.
      bool hasUcscTrnaFile = File.Exists(options.UcscTrnaFile);

      List<BedItem> trnas = new List<BedItem>();
      if (hasUcscTrnaFile)
      {
        // Read tRNA from the UCSC table, excluding mitochondrial tRNA.
        Progress.SetMessage("Processing {0} ...", options.UcscTrnaFile);
        trnas = bedfile.ReadFromFile(options.UcscTrnaFile);
        trnas.ForEach(m => m.Seqname = m.Seqname.StringAfter("chr"));

        // Remove the tRNA not from 1-22, X and Y: multi-character names are kept only when all digits.
        trnas.RemoveAll(m => (m.Seqname.Length > 1) && !m.Seqname.All(n => char.IsDigit(n)));

        // Mitochondrial tRNA will be extracted from the Ensembl GTF file instead.
        trnas.RemoveAll(m => m.Seqname.Equals("M") || m.Seqname.Equals("MT"));

        trnas.ForEach(m => m.Name = SmallRNAConsts.tRNA + ":" + m.Name);

        Progress.SetMessage("{0} tRNA from ucsc read.", trnas.Count);
      }

      var others = new List<BedItem>();
      if (File.Exists(options.EnsemblGtfFile))
      {
        // Read smallRNA/tRNA from the Ensembl GTF file.
        Progress.SetMessage("Processing {0} ...", options.EnsemblGtfFile);
        using (var gf = new GtfItemFile(options.EnsemblGtfFile))
        {
          // miRNA is removed from the accepted biotypes for this source.
          var biotypes = new HashSet<string>(SmallRNAConsts.Biotypes);
          biotypes.Remove(SmallRNAConsts.miRNA);

          GtfItem item;
          int mtTrnaCount = 0;
          while ((item = gf.Next("gene")) != null)
          {
            string biotype;
            if (item.Attributes.Contains("gene_biotype"))
            {
              biotype = item.Attributes.StringAfter("gene_biotype \"").StringBefore("\"");
            }
            else if (item.Attributes.Contains("gene_type"))
            {
              biotype = item.Attributes.StringAfter("gene_type \"").StringBefore("\"");
            }
            else
            {
              // No biotype information at all: the record cannot be classified.
              continue;
            }

            // When a UCSC tRNA table was supplied, tRNA records from this file are skipped.
            if (hasUcscTrnaFile && biotype.Equals(SmallRNAConsts.tRNA))
            {
              continue;
            }

            if (biotype.Equals("Mt_tRNA"))
            {
              mtTrnaCount++;
              var gene_name = item.Attributes.Contains("gene_name") ? item.Attributes.StringAfter("gene_name \"").StringBefore("\"") : item.GeneId;
              BedItem loc = new BedItem();
              loc.Seqname = "MT";
              loc.Start = item.Start - 1;
              loc.End = item.End;
              loc.Name = string.Format(SmallRNAConsts.tRNA + ":chrMT.tRNA{0}-{1}", mtTrnaCount, gene_name.StringAfter("-"));
              loc.Score = 1000;
              loc.Strand = item.Strand;
              trnas.Add(loc);
            }
            else if (biotypes.Contains(biotype))
            {
              // Strip a leading "chr" in any letter case; ordinal comparison keeps this independent of
              // the current culture and avoids allocating a lower-cased copy.
              string seqName;
              if (item.Seqname.StartsWith("chr", System.StringComparison.OrdinalIgnoreCase))
              {
                seqName = item.Seqname.Substring(3);
              }
              else
              {
                seqName = item.Seqname;
              }
              if (seqName.Equals("M"))
              {
                seqName = "MT";
              }

              // Ignore all smallRNA coordinates on scaffold or contig (recognised by a long sequence name).
              if (seqName.Length > 5)
              {
                continue;
              }

              var gene_name = item.Attributes.StringAfter("gene_name \"").StringBefore("\"");

              BedItem loc = new BedItem();
              loc.Seqname = seqName;
              loc.Start = item.Start - 1;
              loc.End = item.End;
              loc.Name = biotype + ":" + gene_name + ":" + item.GeneId;
              loc.Score = 1000;
              loc.Strand = item.Strand;
              others.Add(loc);
            }
          }
        }
      }

      var all = new List<BedItem>();
      all.AddRange(mirnas);
      all.AddRange(trnas);
      all.AddRange(others);

      if (File.Exists(options.RRNAFile))
      {
        // Each rRNA sequence becomes one record spanning the whole sequence on the '+' strand.
        var seqs = SequenceUtils.Read(options.RRNAFile);
        foreach (var seq in seqs)
        {
          all.Add(new BedItem()
          {
            Seqname = seq.Name,
            Start = 0,
            End = seq.SeqString.Length,
            Strand = '+',
            Name = "rRNA:" + seq.Name
          });
        }
      }

      Progress.SetMessage("Saving smallRNA coordinates to " + options.OutputFile + "...");
      using (var sw = new StreamWriter(options.OutputFile))
      {
        // Records are written biotype by biotype, each group sorted by chromosome and start.
        foreach (var pir in SmallRNAConsts.Biotypes)
        {
          var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();

          GenomeUtils.SortChromosome(locs, m => m.Seqname, m => (int)m.Start);

          foreach (var loc in locs)
          {
            sw.WriteLine(bedfile.GetValue(loc));
          }
        }
      }

      Progress.SetMessage("Extracting sequence from " + options.FastaFile + "...");
      new Bed2FastaProcessor(new Bed2FastaProcessorOptions()
      {
        GenomeFastaFile = options.FastaFile,
        InputFile = options.OutputFile,
        OutputFile = options.OutputFile + ".fa",
        KeepChrInName = false,
        AcceptName = m => m.StartsWith(SmallRNAConsts.miRNA) || m.StartsWith(SmallRNAConsts.tRNA),
      })
      {
        Progress = this.Progress
      }.Process();

      var summaryFile = options.OutputFile + ".info";
      Progress.SetMessage("Writing summary to " + summaryFile + "...");
      using (var sw = new StreamWriter(summaryFile))
      {
        sw.WriteLine("Biotype\tCount");

        // Count distinct record names per biotype (the text before the first ':'), largest group first.
        // Each group is counted once and the number reused for both ordering and output.
        var biotypeCounts = all.ConvertAll(m => m.Name)
          .Distinct()
          .GroupBy(m => m.StringBefore(":"))
          .Select(g => new { Biotype = g.Key, Count = g.Count() })
          .OrderByDescending(g => g.Count);

        foreach (var entry in biotypeCounts)
        {
          sw.WriteLine("{0}\t{1}", entry.Biotype, entry.Count);
        }
      }

      return new string[] { options.OutputFile };
    }
 /// <summary>
 /// Creates an instance with both <c>file</c> and <c>last</c> set to <c>null</c>.
 /// </summary>
 public GtfTranscriptItemFile()
 {
   this.last = null;
   this.file = null;
 }
Beispiel #13
0
    /// <summary>
    /// Reads every GTF item from <c>options.InputFile</c> and sets each item's <c>Name</c> from the
    /// <c>locus_tag=</c> entry of its attribute text (up to the next <c>;</c>).
    /// </summary>
    /// <returns>All items in reading order.</returns>
    public List<GtfItem> ReadGtfItems()
    {
      var items = new List<GtfItem>();

      using (var reader = new GtfItemFile(options.InputFile))
      {
        int processed = 0;
        for (GtfItem current = reader.Next(); current != null; current = reader.Next())
        {
          processed++;
          if (processed % 100000 == 0)
          {
            Progress.SetMessage("{0} gtf item processed", processed);
          }

          var locusTag = current.Attributes.StringAfter("locus_tag=").StringBefore(";");
          current.Name = locusTag;
          items.Add(current);
        }
      }

      return items;
    }
    /// <summary>
    /// Writes a tab-separated table with one row per gene id found in <c>options.InputFile</c>:
    /// gene id, gene name, summed length and, when any record carries <c>gene_biotype</c>, the biotype.
    /// </summary>
    /// <remarks>
    /// For each gene id, exon records replace any non-exon records collected so far; non-exon records are only
    /// kept while no exon has been seen. The gene name comes from the <c>gene_name</c> attribute, falling back
    /// to the id/name map file and finally to the gene id itself.
    /// </remarks>
    /// <returns>An array whose only element is <c>options.OutputFile</c>.</returns>
    /// <exception cref="Exception">
    /// No record has a <c>gene_name</c> attribute and <c>options.MapFile</c> does not exist. This is raised
    /// before the output file is opened, so an existing output file is left untouched.
    /// </exception>
    public override IEnumerable<string> Process()
    {
      Dictionary<string, List<GtfItem>> map = new Dictionary<string, List<GtfItem>>();

      // Checked once; used both for loading the map and for the validation below.
      bool hasMapFile = File.Exists(options.MapFile);

      var namemap = new Dictionary<string, string>();
      if (hasMapFile)
      {
        namemap = new MapReader(0, 1, hasHeader: false).ReadFromFile(options.MapFile);
      }

      using (var gtf = new GtfItemFile(options.InputFile))
      {
        GtfItem item;
        int count = 0;
        while ((item = gtf.Next()) != null)
        {
          count++;
          if ((count % 100000) == 0)
          {
            Progress.SetMessage("{0} gtf item processed", count);
          }
          List<GtfItem> oldItems;
          if (!map.TryGetValue(item.GeneId, out oldItems))
          {
            map[item.GeneId] = new[] { item }.ToList();
          }
          else if (IsExon(item))
          {
            // Exons take precedence: drop whatever non-exon records were collected so far.
            oldItems.RemoveAll(m => !IsExon(m));
            oldItems.Add(item);
          }
          else if (oldItems.All(m => !IsExon(m)))
          {
            // Non-exon records are only kept while the gene has no exon yet.
            oldItems.Add(item);
          }
        }
      }

      // Validate before the output file is created, so a failure does not leave behind an empty
      // (or truncated) output file.
      bool bHasGeneName = map.Values.Any(l => l.Any(m => m.Attributes.Contains("gene_name")));
      if (!bHasGeneName && !hasMapFile)
      {
        throw new Exception(string.Format("No gene_name found in {0} and no id/name map file defined.", options.InputFile));
      }

      bool bHasGeneBiotype = map.Values.Any(l => l.Any(m => m.Attributes.Contains("gene_biotype")));

      var keys = (from key in map.Keys
                  orderby key
                  select key).ToList();

      using (StreamWriter sw = new StreamWriter(options.OutputFile))
      {
        if (bHasGeneBiotype)
        {
          sw.WriteLine("gene_id\tgene_name\tlength\tgene_biotype");
        }
        else
        {
          sw.WriteLine("gene_id\tgene_name\tlength");
        }

        foreach (var key in keys)
        {
          var gtfs = map[key];
          string name;

          // Pick the naming record before CombineCoordinates() runs on the list.
          var gtf = gtfs.FirstOrDefault(m => m.Attributes.Contains("gene_name"));
          gtfs.CombineCoordinates();
          string biotype;
          if (gtf == null)
          {
            biotype = string.Empty;
            if (!namemap.TryGetValue(key, out name))
            {
              name = key;
            }
          }
          else
          {
            biotype = gtf.GetBiotype();
            name = gtf.Attributes.StringAfter("gene_name \"").StringBefore("\"");
          }

          var length = gtfs.Sum(m => m.Length);
          if (bHasGeneBiotype)
          {
            sw.WriteLine("{0}\t{1}\t{2}\t{3}", key, name, length, biotype);
          }
          else
          {
            sw.WriteLine("{0}\t{1}\t{2}", key, name, length);
          }
        }
      }

      return new string[] { options.OutputFile };
    }
Beispiel #15
0
 /// <summary>
 /// Initializes the instance with no file and no last item: both fields start as <c>null</c>.
 /// </summary>
 public GtfTranscriptItemFile()
 {
     this.file = null;
     this.last = null;
 }
Beispiel #16
0
        /// <summary>
        /// Writes a tab-separated gene table to <c>options.OutputFile</c> and a matching BED-style file to
        /// <c>options.OutputFile + ".bed"</c>, one row per key found in <c>options.InputFile</c>.
        /// </summary>
        /// <remarks>
        /// Records are grouped by gene id, or by the attribute named in <c>options.Key</c> when that is set.
        /// Within a group, exon records replace any non-exon records collected so far; non-exon records are
        /// only kept while no exon has been seen. The gene name comes from the <c>gene_name</c> attribute,
        /// falling back to the id/name map file and finally to the key itself. A biotype column is written
        /// when any record carries <c>gene_biotype</c> or <c>gene_type</c>.
        /// </remarks>
        /// <returns>An array whose only element is <c>options.OutputFile</c>.</returns>
        /// <exception cref="Exception">
        /// No record has a <c>gene_name</c> attribute and <c>options.MapFile</c> does not exist. This is raised
        /// before either output file is opened, so existing output files are left untouched.
        /// </exception>
        public override IEnumerable <string> Process()
        {
            Dictionary<string, List<GtfItem>> map = new Dictionary<string, List<GtfItem>>();

            // Checked once; used both for loading the map and for the validation below.
            bool hasMapFile = File.Exists(options.MapFile);

            var namemap = new Dictionary<string, string>();
            if (hasMapFile)
            {
                namemap = new MapReader(0, 1, hasHeader: false).ReadFromFile(options.MapFile);
            }

            using (var gtf = new GtfItemFile(options.InputFile))
            {
                GtfItem item;
                int count = 0;
                while ((item = gtf.Next()) != null)
                {
                    count++;
                    if ((count % 100000) == 0)
                    {
                        Progress.SetMessage("{0} gtf item processed", count);
                    }

                    var id = string.IsNullOrEmpty(options.Key) ? item.GeneId : item.GetAttribute(options.Key);

                    List<GtfItem> oldItems;
                    if (!map.TryGetValue(id, out oldItems))
                    {
                        map[id] = new[] { item }.ToList();
                    }
                    else if (IsExon(item))
                    {
                        // Exons take precedence: drop whatever non-exon records were collected so far.
                        oldItems.RemoveAll(m => !IsExon(m));
                        oldItems.Add(item);
                    }
                    else if (oldItems.All(m => !IsExon(m)))
                    {
                        // Non-exon records are only kept while the group has no exon yet.
                        oldItems.Add(item);
                    }
                }
            }

            // Validate before the output files are created, so a failure does not leave behind empty
            // (or truncated) output files.
            bool bHasGeneName = map.Values.Any(l => l.Any(m => m.Attributes.Contains("gene_name")));
            if (!bHasGeneName && !hasMapFile)
            {
                throw new Exception(string.Format("No gene_name found in {0} and no id/name map file defined.", options.InputFile));
            }

            bool bHasGeneBiotype = map.Values.Any(l => l.Any(m => m.Attributes.Contains("gene_biotype")));
            bool bHasGeneType    = map.Values.Any(l => l.Any(m => m.Attributes.Contains("gene_type")));
            bool writeBiotype    = bHasGeneBiotype || bHasGeneType;

            var keys = (from key in map.Keys
                        orderby key
                        select key).ToList();

            using (StreamWriter sw = new StreamWriter(options.OutputFile))
            using (StreamWriter swBed = new StreamWriter(options.OutputFile + ".bed"))
            {
                sw.Write("gene_id\tgene_name\tlength\tchr\tstart\tend");
                if (writeBiotype)
                {
                    sw.Write("\tgene_biotype");
                }
                sw.WriteLine();

                foreach (var key in keys)
                {
                    var gtfs = map[key];
                    string name;

                    // Pick the naming record before CombineCoordinates() runs on the list.
                    var gtf = gtfs.FirstOrDefault(m => m.Attributes.Contains("gene_name"));
                    gtfs.CombineCoordinates();
                    string biotype;
                    if (gtf == null)
                    {
                        biotype = string.Empty;
                        if (!namemap.TryGetValue(key, out name))
                        {
                            name = key;
                        }
                    }
                    else
                    {
                        biotype = gtf.GetBiotype();
                        name    = gtf.Attributes.StringAfter("gene_name \"").StringBefore("\"");
                    }

                    // Computed once and shared by the table row and the BED row.
                    var length  = gtfs.Sum(m => m.Length);
                    var seqname = gtfs.First().Seqname;
                    var start   = gtfs.Min(l => l.Start);
                    var end     = gtfs.Max(l => l.End);

                    sw.Write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}", key, name, length, seqname, start, end);
                    if (writeBiotype)
                    {
                        sw.Write("\t{0}", biotype);
                    }
                    sw.WriteLine();

                    swBed.WriteLine("{0}\t{1}\t{2}\t{3}_{4}", seqname, start, end, key.StringBefore("."), name);
                }
            }

            return new string[] { options.OutputFile };
        }