예제 #1
0
        /// <summary>
        /// Builds one <see cref="CoverageRegion"/> per BED record of the target file,
        /// converting BED coordinates (zero-based start, exclusive end) into
        /// GFF-style coordinates (one-based start, inclusive end).
        /// </summary>
        /// <param name="options">Options whose <c>TargetFile</c> names the BED file to read.</param>
        /// <param name="progress">Callback that receives a message with the number of records read.</param>
        /// <returns>
        /// One region per record, in the order returned by the BED reader. Each region
        /// holds one <c>DEFAULT_COVERAGE</c> coverage site per base of the region.
        /// </returns>
        public static List<CoverageRegion> GetTargetCoverageRegionFromBed(ITargetBuilderOptions options, IProgressCallback progress)
        {
            var result = new List<CoverageRegion>();

            var groups = new BedItemFile<BedItem>().ReadFromFile(options.TargetFile);

            progress.SetMessage("Total {0} potential target group read from file {1}", groups.Count, options.TargetFile);

            foreach (var utr in groups)
            {
                var rg = new CoverageRegion();
                rg.Name    = utr.Name;
                // NOTE(review): StringAfter("chr") presumably strips a leading "chr" prefix - confirm against the extension method.
                rg.Seqname = utr.Seqname.StringAfter("chr");
                // Zero-based BED start -> one-based start.
                rg.Start   = utr.Start + 1;
                // The exclusive BED end is numerically the one-based inclusive end.
                rg.End     = utr.End;
                rg.Strand  = utr.Strand;

                // Start and End are both inclusive here, so the region covers
                // End - Start + 1 bases; the bound must be "<=" to give every base
                // a coverage site (the previous "<" left the last base without one).
                for (var i = rg.Start; i <= rg.End; i++)
                {
                    rg.Coverages.Add(new CoverageSite(DEFAULT_COVERAGE));
                }
                result.Add(rg);
            }

            return result;
        }
예제 #2
0
    /// <summary>
    /// Cuts a seed of <paramref name="seedLength"/> bases out of a coverage region,
    /// starting at <paramref name="offset"/> within the region's sequence.
    /// </summary>
    /// <param name="cr">Region supplying the sequence, the per-base coverages and the annotation copied to the seed.</param>
    /// <param name="offset">Zero-based index into <c>cr.Sequence</c> / <c>cr.Coverages</c> where the seed starts.</param>
    /// <param name="seedLength">Number of bases in the seed.</param>
    /// <param name="minCoverage">Lowest acceptable average coverage over the seed window.</param>
    /// <returns>
    /// The seed, or <c>null</c> when the window runs past the end of the sequence or
    /// its average coverage is below <paramref name="minCoverage"/>. For a region on
    /// the '+' strand the seed sequence is the result of
    /// <c>SequenceUtils.GetReverseComplementedSequence</c> on the extracted window.
    /// </returns>
    public static SeedItem GetSeed(CoverageRegion cr, int offset, int seedLength, double minCoverage)
    {
      // The window [offset, offset + seedLength) must fit inside the sequence.
      if (offset + seedLength > cr.Sequence.Length)
      {
        return null;
      }

      var meanCoverage = cr.Coverages.Skip(offset).Take(seedLength).Average();
      if (meanCoverage < minCoverage)
      {
        return null;
      }

      var window = cr.Sequence.Substring(offset, seedLength);
      var seedSequence = cr.Strand == '+'
        ? SequenceUtils.GetReverseComplementedSequence(window)
        : window;

      // Region coordinates of the first and last base of the window (both inclusive).
      var firstBase = cr.Start + offset;
      var lastBase = cr.Start + offset + seedLength - 1;

      var seed = new SeedItem();
      seed.Seqname = cr.Seqname;
      seed.Start = firstBase;
      seed.End = lastBase;
      seed.Strand = cr.Strand;
      seed.Coverage = meanCoverage;
      seed.Name = cr.Name;
      seed.Sequence = seedSequence;
      seed.Source = cr;
      seed.SourceOffset = offset;
      seed.GeneSymbol = cr.GeneSymbol;
      return seed;
    }
예제 #3
0
        /// <summary>
        /// Extends the given seeds with <c>ExtendToLongestTarget</c> and keeps only the
        /// results whose <c>Length</c> equals the greatest length found.
        /// </summary>
        /// <param name="target">Seeds to extend.</param>
        /// <param name="t2c">Optional coverage region forwarded to the extension step; may be <c>null</c>.</param>
        /// <param name="seq">Sequence the extended seeds are compared against.</param>
        /// <param name="offset">Zero-based start of the seed window inside <paramref name="seq"/>.</param>
        /// <param name="minimumSeedLength">Seed length the extension starts from.</param>
        /// <param name="maximumSeedLength">Seed length at which the extension stops.</param>
        /// <param name="minimumCoverage">Lowest acceptable average coverage, forwarded to the extension step.</param>
        /// <returns>The longest extended seeds; an empty list when the extension produced nothing.</returns>
        public static List<SeedItem> FindLongestTarget(List<SeedItem> target, CoverageRegion t2c, string seq, int offset, int minimumSeedLength, int maximumSeedLength, double minimumCoverage)
        {
            var result = ExtendToLongestTarget(target, t2c, seq, offset, minimumSeedLength, maximumSeedLength, minimumCoverage);

            // Enumerable.Max throws InvalidOperationException on an empty sequence,
            // so an empty extension result is returned as-is.
            if (result.Count == 0)
            {
                return result;
            }

            var maxLength = result.Max(l => l.Length);

            result.RemoveAll(l => l.Length < maxLength);
            return result;
        }
예제 #4
0
        /// <summary>
        /// Reads feature groups from a feature XML file and builds one coverage region
        /// per group, based on the first item and its first location that has mapped reads.
        /// </summary>
        /// <param name="featureFile">File passed to <c>FeatureItemGroupXmlFormat.ReadFromFile</c>.</param>
        /// <param name="includeSmallRNATags">
        /// Optional name prefixes. When non-empty, groups whose <c>Name</c> starts with
        /// none of them are dropped.
        /// </param>
        /// <param name="excudeSmallRNATags">
        /// Optional name prefixes. When non-empty, groups whose <c>Name</c> starts with
        /// any of them are dropped.
        /// </param>
        /// <returns>
        /// One region per remaining group that has at least one location with mapped
        /// reads. Every position of the region carries the same coverage (the sum of
        /// <c>QueryCount</c> over the location's reads) and the same read-name list.
        /// </returns>
        public static List<CoverageRegion> GetSmallRNACoverageRegionFromXml(string featureFile, string[] includeSmallRNATags = null, string[] excudeSmallRNATags = null)
        {
            var smallRNAGroups = new FeatureItemGroupXmlFormat().ReadFromFile(featureFile);

            // NOTE(review): both predicates test the group's Name (m), not the item's (l),
            // so a group is either emptied completely or left untouched - confirm intent.
            if (includeSmallRNATags != null && includeSmallRNATags.Length > 0)
            {
                smallRNAGroups.ForEach(m => m.RemoveAll(l => includeSmallRNATags.All(k => !m.Name.StartsWith(k))));
                smallRNAGroups.RemoveAll(m => m.Count == 0);
            }

            if (excudeSmallRNATags != null && excudeSmallRNATags.Length > 0)
            {
                smallRNAGroups.ForEach(m => m.RemoveAll(l => excudeSmallRNATags.Any(k => m.Name.StartsWith(k))));
                smallRNAGroups.RemoveAll(m => m.Count == 0);
            }

            var result = new List<CoverageRegion>();

            foreach (var sg in smallRNAGroups)
            {
                //since the items in same group shared same reads, only the first one will be used.
                var smallRNA = sg[0];
                smallRNA.Name = (from g in sg select g.Name).Merge("/");

                smallRNA.Locations.RemoveAll(m => m.SamLocations.Count == 0);
                smallRNA.CombineLocationByMappedReads();

                // Nothing is left when no location had mapped reads; indexing
                // Locations[0] would throw, and there is no coverage to report.
                if (smallRNA.Locations.Count == 0)
                {
                    continue;
                }

                //only first location will be used.
                var loc = smallRNA.Locations[0];

                //coverage in all position will be set as same as total query count
                var rg = new CoverageRegion();
                rg.Name     = smallRNA.Name;
                rg.Seqname  = loc.Seqname;
                rg.Start    = loc.Start;
                rg.End      = loc.End;
                rg.Strand   = loc.Strand;
                rg.Sequence = loc.Sequence;

                var coverage   = (from sloc in loc.SamLocations select sloc.SamLocation.Parent.QueryCount).Sum();
                var uniqueRead = (from sloc in loc.SamLocations select sloc.SamLocation.Parent.Qname).Distinct().ToList();

                // The same uniqueRead list instance is handed to every site.
                for (int i = 0; i < loc.Length; i++)
                {
                    rg.Coverages.Add(new CoverageSite(coverage, uniqueRead));
                }
                result.Add(rg);
            }
            return result;
        }
예제 #5
0
    /// <summary>
    /// Reads feature groups from a mapped-feature XML file and builds one coverage
    /// region per group, based on the first item and its first location that has
    /// mapped reads.
    /// </summary>
    /// <param name="mappedFeatureXmlFile">File passed to <c>FeatureItemGroupXmlFormat.ReadFromFile</c>.</param>
    /// <param name="includeSmallRNATags">
    /// Optional name prefixes. When non-empty, groups whose <c>Name</c> starts with
    /// none of them are dropped.
    /// </param>
    /// <param name="excudeSmallRNATags">
    /// Optional name prefixes. When non-empty, groups whose <c>Name</c> starts with
    /// any of them are dropped.
    /// </param>
    /// <returns>
    /// One region per remaining group that has at least one location with mapped
    /// reads. Every position of the region carries the same coverage value: the sum
    /// of <c>QueryCount</c> over the location's reads.
    /// </returns>
    public static List<CoverageRegion> GetSmallRNACoverageRegion(string mappedFeatureXmlFile, string[] includeSmallRNATags = null, string[] excudeSmallRNATags = null)
    {
      var smallRNAGroups = new FeatureItemGroupXmlFormat().ReadFromFile(mappedFeatureXmlFile);

      // NOTE(review): both predicates test the group's Name (m), not the item's (l),
      // so a group is either emptied completely or left untouched - confirm intent.
      if (includeSmallRNATags != null && includeSmallRNATags.Length > 0)
      {
        smallRNAGroups.ForEach(m => m.RemoveAll(l => includeSmallRNATags.All(k => !m.Name.StartsWith(k))));
        smallRNAGroups.RemoveAll(m => m.Count == 0);
      }

      if (excudeSmallRNATags != null && excudeSmallRNATags.Length > 0)
      {
        smallRNAGroups.ForEach(m => m.RemoveAll(l => excudeSmallRNATags.Any(k => m.Name.StartsWith(k))));
        smallRNAGroups.RemoveAll(m => m.Count == 0);
      }

      var result = new List<CoverageRegion>();
      foreach (var sg in smallRNAGroups)
      {
        //since the items in same group shared same reads, only the first one will be used.
        var smallRNA = sg[0];
        smallRNA.Name = (from g in sg select g.Name).Merge("/");

        smallRNA.Locations.RemoveAll(m => m.SamLocations.Count == 0);
        smallRNA.CombineLocationByMappedReads();

        // Nothing is left when no location had mapped reads; indexing
        // Locations[0] would throw, and there is no coverage to report.
        if (smallRNA.Locations.Count == 0)
        {
          continue;
        }

        //only first location will be used.
        var loc = smallRNA.Locations[0];

        //coverage in all position will be set as same as total query count
        var rg = new CoverageRegion();
        rg.Name = smallRNA.Name;
        rg.Seqname = loc.Seqname;
        rg.Start = loc.Start;
        rg.End = loc.End;
        rg.Strand = loc.Strand;
        rg.Sequence = loc.Sequence;

        var coverage = (from sloc in loc.SamLocations select sloc.SamLocation.Parent.QueryCount).Sum();

        for (int i = 0; i < loc.Length; i++)
        {
          rg.Coverages.Add(coverage);
        }
        result.Add(rg);
      }
      return result;
    }
예제 #6
0
        /// <summary>
        /// Extracts a seed of <paramref name="seedLength"/> bases from a coverage
        /// region, beginning at <paramref name="offset"/> in the region's sequence.
        /// </summary>
        /// <param name="cr">Region providing the sequence, per-base coverage sites and annotation.</param>
        /// <param name="offset">Zero-based index into <c>cr.Sequence</c> / <c>cr.Coverages</c> where the seed starts.</param>
        /// <param name="seedLength">Number of bases in the seed.</param>
        /// <param name="minCoverage">Lowest acceptable average of <c>Coverage</c> over the seed window.</param>
        /// <returns>
        /// The seed, or <c>null</c> when the window does not fit into the sequence or
        /// its average coverage is below <paramref name="minCoverage"/>. For a '+'
        /// strand region the seed sequence is the output of
        /// <c>SequenceUtils.GetReverseComplementedSequence</c> for the window.
        /// </returns>
        public static SeedItem GetSeed(CoverageRegion cr, int offset, int seedLength, double minCoverage)
        {
            var sequenceTooShort = cr.Sequence.Length < offset + seedLength;
            if (sequenceTooShort)
            {
                return null;
            }

            // Average coverage of the sites under the seed window.
            var windowSites    = cr.Coverages.Skip(offset).Take(seedLength).ToList();
            var windowCoverage = windowSites.Average(site => site.Coverage);
            if (windowCoverage < minCoverage)
            {
                return null;
            }

            var seedSequence = cr.Sequence.Substring(offset, seedLength);
            if (cr.Strand == '+')
            {
                seedSequence = SequenceUtils.GetReverseComplementedSequence(seedSequence);
            }

            var seed = new SeedItem();

            // Coordinates of the first and last base of the window, both inclusive.
            seed.Seqname      = cr.Seqname;
            seed.Start        = cr.Start + offset;
            seed.End          = cr.Start + offset + seedLength - 1;
            seed.Strand       = cr.Strand;
            seed.Coverage     = windowCoverage;
            seed.Name         = cr.Name;
            seed.Sequence     = seedSequence;
            seed.Source       = cr;
            seed.SourceOffset = offset;
            seed.GeneSymbol   = cr.GeneSymbol;

            return seed;
        }
예제 #7
0
        /// <summary>
        /// Creates one coverage region per sequence read from
        /// <paramref name="featureFile"/> by <c>SequenceUtils.Read</c>.
        /// </summary>
        /// <param name="featureFile">Path of the sequence file to read.</param>
        /// <returns>
        /// One region per sequence, with placeholder location ("Unknown", -1, -1, '*')
        /// and one <c>DEFAULT_COVERAGE</c> coverage site per base of the sequence.
        /// </returns>
        public static List<CoverageRegion> GetSmallRNACoverageRegionFromFasta(string featureFile)
        {
            var entries = SequenceUtils.Read(featureFile);
            var regions = new List<CoverageRegion>();

            foreach (var entry in entries)
            {
                // The file carries no genomic position, so location fields get placeholders.
                var region = new CoverageRegion
                {
                    Name     = entry.Name,
                    Seqname  = "Unknown",
                    Start    = -1,
                    End      = -1,
                    Strand   = '*',
                    Sequence = entry.SeqString
                };

                // One default coverage site for each base of the sequence.
                for (var remaining = entry.SeqString.Length; remaining > 0; remaining--)
                {
                    region.Coverages.Add(new CoverageSite(DEFAULT_COVERAGE));
                }

                regions.Add(region);
            }

            return regions;
        }
예제 #8
0
 /// <summary>
 /// Extends the given seeds with <c>ExtendToLongestTarget</c> and keeps only the
 /// results whose <c>Length</c> equals the greatest length found.
 /// </summary>
 /// <param name="target">Seeds to extend.</param>
 /// <param name="t2c">Optional coverage region forwarded to the extension step; may be <c>null</c>.</param>
 /// <param name="seq">Sequence the extended seeds are compared against.</param>
 /// <param name="offset">Zero-based start of the seed window inside <paramref name="seq"/>.</param>
 /// <param name="minimumSeedLength">Seed length the extension starts from.</param>
 /// <param name="maximumSeedLength">Seed length at which the extension stops.</param>
 /// <param name="minimumCoverage">Lowest acceptable average coverage, forwarded to the extension step.</param>
 /// <returns>The longest extended seeds; an empty list when the extension produced nothing.</returns>
 public static List<SeedItem> FindLongestTarget(List<SeedItem> target, CoverageRegion t2c, string seq, int offset, int minimumSeedLength, int maximumSeedLength, double minimumCoverage)
 {
   var result = ExtendToLongestTarget(target, t2c, seq, offset, minimumSeedLength, maximumSeedLength, minimumCoverage);

   // Enumerable.Max throws InvalidOperationException on an empty sequence,
   // so an empty extension result is returned as-is.
   if (result.Count == 0)
   {
     return result;
   }

   var maxLength = result.Max(l => l.Length);
   result.RemoveAll(l => l.Length < maxLength);
   return result;
 }
예제 #9
0
    /// <summary>
    /// Grows the given seeds one base at a time for as long as the grown seed still
    /// equals the corresponding window of <paramref name="seq"/>, then reduces the
    /// outcome to the longest seeds per gene.
    /// </summary>
    /// <param name="target">Seeds to extend; the list itself is not modified.</param>
    /// <param name="t2c">
    /// Optional region (may be <c>null</c>). When given, extension stops as soon as
    /// the average of its coverages over the extended window drops below
    /// <paramref name="minimumCoverage"/>.
    /// </param>
    /// <param name="seq">Sequence whose window starting at <paramref name="offset"/> each grown seed must equal.</param>
    /// <param name="offset">Zero-based start of the window inside <paramref name="seq"/> and <c>t2c.Coverages</c>.</param>
    /// <param name="minimumSeedLength">Length the extension starts from; presumably the length of the seeds in <paramref name="target"/> - confirm with callers.</param>
    /// <param name="maximumSeedLength">Length at which the extension stops.</param>
    /// <param name="minimumCoverage">Lowest acceptable average coverage, also passed to <c>GetSeed</c>.</param>
    /// <returns>
    /// Seeds ordered by descending <c>Length</c>. For each <c>GeneSymbol</c> only the
    /// seeds of that gene's greatest length are kept. Seeds sharing gene symbol and
    /// <c>GetLocation()</c> are collapsed into the first one, whose <c>Name</c> is
    /// overwritten with the merged names of the collapsed seeds.
    /// </returns>
    public static List<SeedItem> ExtendToLongestTarget(List<SeedItem> target, CoverageRegion t2c, string seq, int offset, int minimumSeedLength, int maximumSeedLength, double minimumCoverage)
    {
      var individualSeeds = new List<SeedItem>();

      // Seeds that are still extendable; replaced by their grown versions each round.
      var source = new List<SeedItem>(target);
      var extendSeedLength = minimumSeedLength;
      while (extendSeedLength < maximumSeedLength)
      {
        extendSeedLength++;

        //check the coverage in smallRNA
        if (t2c != null)
        {
          var extendCoverage = t2c.Coverages.Skip(offset).Take(extendSeedLength).Average();
          if (extendCoverage < minimumCoverage)
          {
            break;
          }
        }

        // The extended window no longer fits into seq.
        if (seq.Length < offset + extendSeedLength)
        {
          break;
        }

        var extendSeed = seq.Substring(offset, extendSeedLength);

        var extendTarget = new List<SeedItem>();
        var doneTarget = new List<SeedItem>();
        foreach (var utrTarget in source)
        {
          // '-' strand seeds keep their start and grow at the end of the window;
          // all other seeds grow by moving the window start one base to the left.
          var newoffset = utrTarget.Strand == '-' ? utrTarget.SourceOffset : utrTarget.SourceOffset - 1;
          if (newoffset < 0)
          {
            doneTarget.Add(utrTarget);
            continue;
          }

          var newseed = GetSeed(utrTarget.Source, newoffset, extendSeedLength, minimumCoverage);
          if (newseed == null)
          {
            doneTarget.Add(utrTarget);
            continue;
          }

          if (!extendSeed.Equals(newseed.Sequence))
          {
            doneTarget.Add(utrTarget);
            continue;
          }

          extendTarget.Add(newseed);
        }

        individualSeeds.AddRange(doneTarget);

        // Only the grown seeds take part in the next round.
        source = extendTarget;
        if (source.Count == 0)
        {
          break;
        }
      }

      // Whatever is still extendable when the loop ends is a valid result too.
      // Previously these seeds were kept only on the "seq too short" exit and were
      // lost when maximumSeedLength was reached or the coverage check stopped the loop.
      individualSeeds.AddRange(source);

      //For each gene, only the longest one will be kept
      var individualGenes = (from cs in individualSeeds.GroupBy(m => m.GeneSymbol)
                             let csitem = cs.GroupBy(l => l.Length).OrderByDescending(k => k.Key).First()
                             from ls in csitem
                             select ls).OrderByDescending(l => l.Length).ToList();

      //Merge the seeds with same gene symbol, same location but different name
      var final = new List<SeedItem>();
      var rmap = individualGenes.GroupBy(l => l.GeneSymbol + "_" + l.GetLocation()).ToList();
      foreach (var rm in rmap)
      {
        var item = rm.First();
        item.Name = (from r in rm
                     select r.Name).Merge("/");
        final.Add(item);
      }

      return final.OrderByDescending(l => l.Length).ToList();
    }
예제 #10
0
    /// <summary>
    /// Builds one <see cref="CoverageRegion"/> per BED record of the target file,
    /// converting BED coordinates (zero-based start, exclusive end) into
    /// GFF-style coordinates (one-based start, inclusive end).
    /// </summary>
    /// <param name="options">Options whose <c>TargetFile</c> names the BED file to read.</param>
    /// <param name="progress">Callback that receives a message with the number of records read.</param>
    /// <returns>
    /// One region per record, in the order returned by the BED reader. Each region
    /// holds a coverage value of 1000 for every base of the region.
    /// </returns>
    public static List<CoverageRegion> GetTargetCoverageRegionFromBed(ITargetBuilderOptions options, IProgressCallback progress)
    {
      var result = new List<CoverageRegion>();

      var groups = new BedItemFile<BedItem>().ReadFromFile(options.TargetFile);
      progress.SetMessage("Total {0} potential target group read from file {1}", groups.Count, options.TargetFile);

      foreach (var utr in groups)
      {
        var rg = new CoverageRegion();
        rg.Name = utr.Name;
        // NOTE(review): StringAfter("chr") presumably strips a leading "chr" prefix - confirm against the extension method.
        rg.Seqname = utr.Seqname.StringAfter("chr");
        // Zero-based BED start -> one-based start.
        rg.Start = utr.Start + 1;
        // The exclusive BED end is numerically the one-based inclusive end.
        rg.End = utr.End;
        rg.Strand = utr.Strand;

        // Start and End are both inclusive here, so the region covers
        // End - Start + 1 bases; the bound must be "<=" to give every base
        // a coverage value (the previous "<" left the last base without one).
        for (var i = rg.Start; i <= rg.End; i++)
        {
          rg.Coverages.Add(1000);
        }
        result.Add(rg);
      }

      return result;
    }
예제 #11
0
    /// <summary>
    /// Builds target coverage regions from a feature XML file. For every location of
    /// each group's first item, read counts are accumulated per position and every
    /// run of consecutive covered positions becomes one region.
    /// </summary>
    /// <param name="options">Options whose <c>TargetFile</c> names the XML file to read.</param>
    /// <param name="progress">Callback that receives a message with the number of groups read.</param>
    /// <returns>
    /// The regions in discovery order; each holds one coverage value per position,
    /// equal to the summed <c>QueryCount</c> of the reads spanning that position.
    /// </returns>
    public static List<CoverageRegion> GetTargetCoverageRegionFromXml(ITargetBuilderOptions options, IProgressCallback progress)
    {
      var regions = new List<CoverageRegion>();

      var featureGroups = new FeatureItemGroupXmlFormat().ReadFromFile(options.TargetFile);
      progress.SetMessage("Total {0} potential target group read from file {1}", featureGroups.Count, options.TargetFile);

      foreach (var featureGroup in featureGroups)
      {
        // Items of one group share the same reads: fold all names into the first
        // item and discard the others.
        var utr = featureGroup[0];
        foreach (var other in featureGroup.Skip(1))
        {
          utr.Name = utr.Name + "/" + other.Name;
        }
        featureGroup.RemoveRange(1, featureGroup.Count - 1);

        utr.Locations.RemoveAll(l => l.SamLocations.Count == 0);
        utr.CombineLocationByMappedReads();

        foreach (var location in utr.Locations)
        {
          // position -> summed query count of all reads covering it
          var depthByPosition = new Dictionary<long, int>();
          foreach (var mapped in location.SamLocations)
          {
            for (long position = mapped.SamLocation.Start; position <= mapped.SamLocation.End; position++)
            {
              int depth;
              depthByPosition.TryGetValue(position, out depth); // depth stays 0 for an unseen position
              depthByPosition[position] = depth + mapped.SamLocation.Parent.QueryCount;
            }
          }

          var positions = depthByPosition.Keys.OrderBy(p => p).ToList();

          // Walk the sorted positions; a gap (or the end of the list) closes the current run.
          int runStart = 0;
          for (int next = 1; ; next++)
          {
            bool exhausted = next == positions.Count;
            if (!exhausted && positions[next] == positions[next - 1] + 1)
            {
              continue;
            }

            var region = new CoverageRegion
            {
              Name = utr.Name,
              Seqname = location.Seqname,
              Start = positions[runStart],
              End = positions[next - 1],
              Strand = location.Strand
            };
            for (int k = runStart; k < next; k++)
            {
              region.Coverages.Add(depthByPosition[positions[k]]);
            }
            regions.Add(region);

            if (exhausted)
            {
              break;
            }

            runStart = next;
          }
        }
      }

      return regions;
    }
예제 #12
0
        /// <summary>
        /// Grows the given seeds one base at a time for as long as the grown seed still
        /// equals the corresponding window of <paramref name="seq"/>, then reduces the
        /// outcome to the longest seeds per gene.
        /// </summary>
        /// <param name="target">Seeds to extend; the list itself is not modified.</param>
        /// <param name="t2c">
        /// Optional region (may be <c>null</c>). When given, extension stops as soon as
        /// the average <c>Coverage</c> of its sites over the extended window drops below
        /// <paramref name="minimumCoverage"/>.
        /// </param>
        /// <param name="seq">Sequence whose window starting at <paramref name="offset"/> each grown seed must equal.</param>
        /// <param name="offset">Zero-based start of the window inside <paramref name="seq"/> and <c>t2c.Coverages</c>.</param>
        /// <param name="minimumSeedLength">Length the extension starts from; presumably the length of the seeds in <paramref name="target"/> - confirm with callers.</param>
        /// <param name="maximumSeedLength">Length at which the extension stops.</param>
        /// <param name="minimumCoverage">Lowest acceptable average coverage, also passed to <c>GetSeed</c>.</param>
        /// <returns>
        /// Seeds ordered by descending <c>Length</c>. For each <c>GeneSymbol</c> only the
        /// seeds of that gene's greatest length are kept. Seeds sharing gene symbol and
        /// <c>GetLocation()</c> are collapsed into the first one, whose <c>Name</c> is
        /// overwritten with the merged names of the collapsed seeds.
        /// </returns>
        public static List<SeedItem> ExtendToLongestTarget(List<SeedItem> target, CoverageRegion t2c, string seq, int offset, int minimumSeedLength, int maximumSeedLength, double minimumCoverage)
        {
            var individualSeeds  = new List<SeedItem>();

            // Seeds that are still extendable; replaced by their grown versions each round.
            var source           = new List<SeedItem>(target);
            var extendSeedLength = minimumSeedLength;

            while (extendSeedLength < maximumSeedLength)
            {
                extendSeedLength++;

                //check the coverage in smallRNA
                if (t2c != null)
                {
                    var extendCoverage = t2c.Coverages.Skip(offset).Take(extendSeedLength).Average(l => l.Coverage);
                    if (extendCoverage < minimumCoverage)
                    {
                        break;
                    }
                }

                // The extended window no longer fits into seq.
                if (seq.Length < offset + extendSeedLength)
                {
                    break;
                }

                var extendSeed = seq.Substring(offset, extendSeedLength);

                var extendTarget = new List<SeedItem>();
                var doneTarget   = new List<SeedItem>();
                foreach (var utrTarget in source)
                {
                    // '-' strand seeds keep their start and grow at the end of the window;
                    // all other seeds grow by moving the window start one base to the left.
                    var newoffset = utrTarget.Strand == '-' ? utrTarget.SourceOffset : utrTarget.SourceOffset - 1;
                    if (newoffset < 0)
                    {
                        doneTarget.Add(utrTarget);
                        continue;
                    }

                    var newseed = GetSeed(utrTarget.Source, newoffset, extendSeedLength, minimumCoverage);
                    if (newseed == null)
                    {
                        doneTarget.Add(utrTarget);
                        continue;
                    }

                    if (!extendSeed.Equals(newseed.Sequence))
                    {
                        doneTarget.Add(utrTarget);
                        continue;
                    }

                    extendTarget.Add(newseed);
                }

                individualSeeds.AddRange(doneTarget);

                // Only the grown seeds take part in the next round.
                source = extendTarget;
                if (source.Count == 0)
                {
                    break;
                }
            }

            // Whatever is still extendable when the loop ends is a valid result too.
            // Previously these seeds were kept only on the "seq too short" exit and were
            // lost when maximumSeedLength was reached or the coverage check stopped the loop.
            individualSeeds.AddRange(source);

            //For each gene, only the longest one will be kept
            var individualGenes = (from cs in individualSeeds.GroupBy(m => m.GeneSymbol)
                                   let csitem = cs.GroupBy(l => l.Length).OrderByDescending(k => k.Key).First()
                                   from ls in csitem
                                   select ls).OrderByDescending(l => l.Length).ToList();

            //Merge the seeds with same gene symbol, same location but different name
            var final = new List<SeedItem>();
            var rmap  = individualGenes.GroupBy(l => l.GeneSymbol + "_" + l.GetLocation()).ToList();

            foreach (var rm in rmap)
            {
                var item = rm.First();
                item.Name = (from r in rm
                             select r.Name).Merge("/");
                final.Add(item);
            }

            return final.OrderByDescending(l => l.Length).ToList();
        }
예제 #13
0
        /// <summary>
        /// Builds target coverage regions from a feature XML file. For every location of
        /// each group's first item, a coverage site is accumulated per position and every
        /// run of consecutive covered positions becomes one region.
        /// </summary>
        /// <param name="options">Options whose <c>TargetFile</c> names the XML file to read.</param>
        /// <param name="progress">Callback that receives a message with the number of groups read.</param>
        /// <returns>
        /// The regions in discovery order; each holds one <c>CoverageSite</c> per
        /// position, carrying the summed <c>QueryCount</c> and the names of the reads
        /// spanning that position.
        /// </returns>
        public static List<CoverageRegion> GetTargetCoverageRegionFromXml(ITargetBuilderOptions options, IProgressCallback progress)
        {
            var regions = new List<CoverageRegion>();

            var featureGroups = new FeatureItemGroupXmlFormat().ReadFromFile(options.TargetFile);

            progress.SetMessage("Total {0} potential target group read from file {1}", featureGroups.Count, options.TargetFile);

            foreach (var featureGroup in featureGroups)
            {
                // Items of one group share the same reads: fold all names into the
                // first item and discard the others.
                var utr = featureGroup[0];
                foreach (var other in featureGroup.Skip(1))
                {
                    utr.Name = utr.Name + "/" + other.Name;
                }
                featureGroup.RemoveRange(1, featureGroup.Count - 1);

                utr.Locations.RemoveAll(l => l.SamLocations.Count == 0);
                utr.CombineLocationByMappedReads();

                foreach (var location in utr.Locations)
                {
                    // position -> coverage site collecting every read that spans it
                    var siteByPosition = new Dictionary<long, CoverageSite>();
                    foreach (var mapped in location.SamLocations)
                    {
                        for (long position = mapped.SamLocation.Start; position <= mapped.SamLocation.End; position++)
                        {
                            CoverageSite site;
                            if (!siteByPosition.TryGetValue(position, out site))
                            {
                                siteByPosition[position] = new CoverageSite(mapped.SamLocation.Parent.QueryCount, mapped.SamLocation.Parent.Qname);
                                continue;
                            }

                            site.Coverage = site.Coverage + mapped.SamLocation.Parent.QueryCount;
                            site.UniqueRead.Add(mapped.SamLocation.Parent.Qname);
                        }
                    }

                    var positions = siteByPosition.Keys.OrderBy(p => p).ToList();

                    // Walk the sorted positions; a gap (or the end of the list) closes the current run.
                    int  runStart = 0;
                    int  cursor   = 1;
                    bool finished = false;
                    do
                    {
                        bool atEnd = cursor == positions.Count;
                        if (atEnd || positions[cursor] != positions[cursor - 1] + 1)
                        {
                            var region = new CoverageRegion();
                            region.Name    = utr.Name;
                            region.Seqname = location.Seqname;
                            region.Start   = positions[runStart];
                            region.End     = positions[cursor - 1];
                            region.Strand  = location.Strand;
                            for (int k = runStart; k < cursor; k++)
                            {
                                region.Coverages.Add(siteByPosition[positions[k]]);
                            }
                            regions.Add(region);

                            finished = atEnd;
                            runStart = cursor;
                        }

                        cursor++;
                    }
                    while (!finished);
                }
            }

            return regions;
        }