예제 #1
0
    /// <summary>
    /// Reads the feature groups stored in <paramref name="mappedFeatureXmlFile"/> and builds one
    /// <see cref="CoverageRegion"/> per group. Only the first item of a group and the first location
    /// of that item are used; every position of the region receives the same coverage value, namely
    /// the summed query count of the reads attached to that location.
    /// </summary>
    /// <param name="mappedFeatureXmlFile">XML file read through <c>FeatureItemGroupXmlFormat</c>.</param>
    /// <param name="includeSmallRNATags">Optional name prefixes; when given, groups whose name starts with none of them are emptied and dropped.</param>
    /// <param name="excudeSmallRNATags">Optional name prefixes; when given, groups whose name starts with any of them are emptied and dropped.</param>
    /// <returns>One coverage region for each group that still has a location with aligned reads.</returns>
    public static List<CoverageRegion> GetSmallRNACoverageRegion(string mappedFeatureXmlFile, string[] includeSmallRNATags = null, string[] excudeSmallRNATags = null)
    {
      var smallRNAGroups = new FeatureItemGroupXmlFormat().ReadFromFile(mappedFeatureXmlFile);

      if (includeSmallRNATags != null && includeSmallRNATags.Length > 0)
      {
        // NOTE(review): the predicate tests the group's name (m.Name), not the item's (l.Name),
        // so a group is either kept whole or emptied — confirm this is intended.
        smallRNAGroups.ForEach(m => m.RemoveAll(l => includeSmallRNATags.All(k => !m.Name.StartsWith(k))));
        smallRNAGroups.RemoveAll(m => m.Count == 0);
      }

      if (excudeSmallRNATags != null && excudeSmallRNATags.Length > 0)
      {
        // NOTE(review): same as above — the group name is tested, the item parameter is unused.
        smallRNAGroups.ForEach(m => m.RemoveAll(l => excudeSmallRNATags.Any(k => m.Name.StartsWith(k))));
        smallRNAGroups.RemoveAll(m => m.Count == 0);
      }

      var result = new List<CoverageRegion>();
      foreach (var sg in smallRNAGroups)
      {
        // An empty group has no representative item; sg[0] would throw.
        if (sg.Count == 0)
        {
          continue;
        }

        //since the items in same group shared same reads, only the first one will be used.
        var smallRNA = sg[0];
        smallRNA.Name = (from g in sg select g.Name).Merge("/");

        smallRNA.Locations.RemoveAll(m => m.SamLocations.Count == 0);
        smallRNA.CombineLocationByMappedReads();

        // Without any location carrying aligned reads there is nothing to report;
        // indexing Locations[0] would throw ArgumentOutOfRangeException.
        if (smallRNA.Locations.Count == 0)
        {
          continue;
        }

        //only first location will be used.
        var loc = smallRNA.Locations[0];

        var rg = new CoverageRegion();
        rg.Name = smallRNA.Name;
        rg.Seqname = loc.Seqname;
        rg.Start = loc.Start;
        rg.End = loc.End;
        rg.Strand = loc.Strand;
        rg.Sequence = loc.Sequence;

        //coverage in all position will be set as same as total query count
        var coverage = (from sloc in loc.SamLocations select sloc.SamLocation.Parent.QueryCount).Sum();

        for (int i = 0; i < loc.Length; i++)
        {
          rg.Coverages.Add(coverage);
        }
        result.Add(rg);
      }
      return result;
    }
예제 #2
0
    /// <summary>
    /// Reads the feature groups from <c>options.TargetFile</c> and converts the reads of each group
    /// into coverage regions. Each group is collapsed to its first item (renamed to the "/"-joined
    /// names of all items). For every location of that item, the per-position coverage is the sum of
    /// the query counts of all reads spanning the position, and one <see cref="CoverageRegion"/> is
    /// emitted for each run of consecutive covered positions.
    /// </summary>
    /// <param name="options">Supplies the target XML file path through <c>TargetFile</c>.</param>
    /// <param name="progress">Receives a message with the number of groups read.</param>
    /// <returns>All coverage regions of all groups, in group/location/position order.</returns>
    public static List<CoverageRegion> GetTargetCoverageRegionFromXml(ITargetBuilderOptions options, IProgressCallback progress)
    {
      var result = new List<CoverageRegion>();

      var groups = new FeatureItemGroupXmlFormat().ReadFromFile(options.TargetFile);
      progress.SetMessage("Total {0} potential target group read from file {1}", groups.Count, options.TargetFile);

      foreach (var group in groups)
      {
        // An empty group has no representative item; RemoveRange(1, -1) and group[0] would throw.
        if (group.Count == 0)
        {
          continue;
        }

        //since the items in same group shared same reads, only the first one will be used.
        if (group.Count > 1)
        {
          group[0].Name = string.Join("/", group.Select(m => m.Name).ToArray());
          group.RemoveRange(1, group.Count - 1);
        }

        var utr = group[0];

        utr.Locations.RemoveAll(m => m.SamLocations.Count == 0);
        utr.CombineLocationByMappedReads();

        foreach (var loc in utr.Locations)
        {
          // position -> summed query count of every read covering that position
          var map = new Dictionary<long, int>();
          foreach (var sloc in loc.SamLocations)
          {
            for (long pos = sloc.SamLocation.Start; pos <= sloc.SamLocation.End; pos++)
            {
              // TryGetValue leaves count at 0 when the position has not been seen yet.
              int count;
              map.TryGetValue(pos, out count);
              map[pos] = count + sloc.SamLocation.Parent.QueryCount;
            }
          }

          var keys = map.Keys.OrderBy(k => k).ToList();

          // Walk the sorted positions and close a region whenever the next position is not
          // adjacent (or the list ends). The loop body never runs for an empty position list,
          // so a location without covered positions simply yields no region.
          int start = 0;
          for (int end = 1; end <= keys.Count; end++)
          {
            if (end < keys.Count && keys[end] == keys[end - 1] + 1)
            {
              continue;
            }

            var rg = new CoverageRegion();
            rg.Name = utr.Name;
            rg.Seqname = loc.Seqname;
            rg.Start = keys[start];
            rg.End = keys[end - 1];
            rg.Strand = loc.Strand;
            for (int i = start; i < end; i++)
            {
              rg.Coverages.Add(map[keys[i]]);
            }
            result.Add(rg);

            start = end;
          }
        }
      }

      return result;
    }
    /// <summary>
    /// Loads the feature groups from <paramref name="countXmlFile"/>, keeps only the alignments
    /// flagged with a no-penalty mutation, and assigns a p-value to every remaining location.
    /// </summary>
    /// <param name="countXmlFile">XML file read through <c>FeatureItemGroupXmlFormat</c>.</param>
    /// <returns>The groups that still contain at least one flagged alignment.</returns>
    public List<FeatureItemGroup> Build(string countXmlFile)
    {
      var groups = new FeatureItemGroupXmlFormat().ReadFromFile(countXmlFile);

      Progress.SetMessage("There are {0} groups in {1}", groups.Count, countXmlFile);

      // Remember each location's query count before any alignment is filtered out.
      foreach (var grp in groups)
      {
        foreach (var feature in grp)
        {
          foreach (var location in feature.Locations)
          {
            location.QueryCountBeforeFilter = location.QueryCount;
          }
        }
      }

      // When no alignment carries a no-penalty mutation count, derive it here by checking the T2C mismatch.
      var anyNoPenaltyMutation = groups.Any(g => g.Any(f => f.Locations.Any(l => l.SamLocations.Any(s => s.NumberOfNoPenaltyMutation != 0))));
      if (!anyNoPenaltyMutation)
      {
        foreach (var grp in groups)
        {
          foreach (var feature in grp)
          {
            feature.Locations.RemoveAll(l => l.SamLocations.Count == 0);
            foreach (var location in feature.Locations)
            {
              foreach (var aligned in location.SamLocations)
              {
                var mismatch = aligned.SamLocation.GetNotGsnapMismatch(aligned.SamLocation.Parent.Sequence);
                var isT2C = null != mismatch && mismatch.IsMutation('T', 'C');

                // A T2C mismatch is moved from the mismatch count to the no-penalty count.
                aligned.NumberOfMismatch = isT2C ? aligned.SamLocation.NumberOfMismatch - 1 : aligned.SamLocation.NumberOfMismatch;
                aligned.NumberOfNoPenaltyMutation = isT2C ? 1 : 0;
              }
            }
          }
        }
      }

      // Drop alignments without a no-penalty mutation, then prune bottom-up everything left empty.
      foreach (var grp in groups)
      {
        foreach (var feature in grp)
        {
          foreach (var location in feature.Locations)
          {
            location.SamLocations.RemoveAll(s => s.NumberOfNoPenaltyMutation == 0);
          }

          feature.Locations.RemoveAll(l => l.SamLocations.Count == 0);
        }

        grp.RemoveAll(f => f.Locations.Count == 0);
      }

      groups.RemoveAll(g => g.Count == 0);

      Progress.SetMessage("There are {0} groups having T2C mutation", groups.Count);

      var remainingLocations = from grp in groups
                               from feature in grp
                               from location in feature.Locations
                               select location;
      foreach (var location in remainingLocations)
      {
        location.PValue = CalculateT2CPvalue(location.QueryCountBeforeFilter, location.QueryCount, this.t2cRate);
      }

      return groups;
    }
예제 #4
0
    /// <summary>
    /// Merges the mapped features of all count files by feature name and writes the count tables:
    /// miRNA (grouped by sequence), tRNA (grouped by code, then by amino acid), all other small RNA
    /// (grouped by identical query, plus a sequence table), and a combined table of all groups.
    /// </summary>
    /// <returns>The entries returned by the individual table writers, in writing order.</returns>
    public override IEnumerable<string> Process()
    {
      var countFiles = options.GetCountFiles();

      var featuresByName = new Dictionary<string, FeatureItem>();
      var sampleNames = new List<string>();
      for (int fileIndex = 0; fileIndex < countFiles.Count; fileIndex++)
      {
        var countFile = countFiles[fileIndex];
        sampleNames.Add(countFile.Name);

        Progress.SetMessage("Reading {0}/{1} {2}...", fileIndex + 1, countFiles.Count, countFile.File);
        var mappedGroups = new FeatureItemGroupXmlFormat().ReadFromFile(countFile.File);
        mappedGroups.GetQueries().ForEach(q => q.Sample = countFile.Name);

        // Merge by feature name: the first occurrence is registered as-is, later ones are folded into it.
        foreach (var mappedGroup in mappedGroups)
        {
          foreach (var feature in mappedGroup)
          {
            FeatureItem known;
            if (!featuresByName.TryGetValue(feature.Name, out known))
            {
              featuresByName[feature.Name] = feature;
              continue;
            }

            // Reads of an already known location are appended; unknown locations are adopted whole.
            var knownLocations = known.Locations.ToDictionary(loc => loc.GetLocation());
            foreach (var location in feature.Locations)
            {
              FeatureLocation match;
              if (knownLocations.TryGetValue(location.GetLocation(), out match))
              {
                match.SamLocations.AddRange(location.SamLocations);
              }
              else
              {
                known.Locations.Add(location);
              }
            }
          }
        }
      }

      var allFeatures = featuresByName.Values.ToList();

      sampleNames.Sort();

      var combinedGroups = new List<FeatureItemGroup>();
      var outputFiles = new List<string>();

      // miRNA, grouped by sequence
      Progress.SetMessage("Grouping microRNA by sequence ...");
      var miRNAFeatures = allFeatures.Where(f => f.Name.StartsWith(SmallRNAConsts.miRNA));
      var miRNAGroups = miRNAFeatures.GroupBySequence()
        .OrderByDescending(g => g.GetEstimatedCount())
        .ThenBy(g => g.Name)
        .ToList();

      Progress.SetMessage("Writing microRNA ...");
      var miRNACountFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.miRNA + ".count");
      var miRNAWritten = new MirnaNTACountTableWriter().WriteToFile(miRNACountFile, miRNAGroups, sampleNames, SmallRNAConsts.miRNA + ":");
      outputFiles.AddRange(miRNAWritten);
      combinedGroups.AddRange(miRNAGroups);

      // tRNA, grouped by code
      Progress.SetMessage("Grouping tRNA by amino acid code ...");
      var tRNAFeatures = allFeatures.Where(f => f.Name.StartsWith(SmallRNAConsts.tRNA)).ToList();
      var tRNAByCode = tRNAFeatures.GroupByFunction(SmallRNAUtils.GetTRNACode)
        .OrderByDescending(g => g.GetEstimatedCount())
        .ThenBy(g => g.Name)
        .ToList();
      var tRNACodeFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".count");
      Progress.SetMessage("Writing tRNA ...");
      var tRNACodeWritten = new SmallRNACountTableWriter().WriteToFile(tRNACodeFile, tRNAByCode, sampleNames, SmallRNAConsts.tRNA + ":");
      outputFiles.AddRange(tRNACodeWritten);
      combinedGroups.AddRange(tRNAByCode);

      // tRNA, grouped by amino acid (this table is not part of the combined output)
      Progress.SetMessage("Grouping tRNA by amino acid ...");
      var tRNAByAminoacid = tRNAFeatures.GroupByFunction(SmallRNAUtils.GetTRNAAminoacid, true)
        .OrderByDescending(g => g.GetEstimatedCount())
        .ThenBy(g => g.Name)
        .ToList();
      var tRNAAminoacidFile = Path.ChangeExtension(options.OutputFile, SmallRNAConsts.tRNA + ".aminoacid.count");
      Progress.SetMessage("Writing tRNA aminoacid...");
      var tRNAAminoacidWritten = new SmallRNACountTableWriter().WriteToFile(tRNAAminoacidFile, tRNAByAminoacid, sampleNames, SmallRNAConsts.tRNA + ":");
      outputFiles.AddRange(tRNAAminoacidWritten);

      // everything that is neither miRNA nor tRNA, grouped by identical query
      Progress.SetMessage("Grouping other smallRNA by identical query ...");
      var otherFeatures = allFeatures.Where(f => !(f.Name.StartsWith(SmallRNAConsts.miRNA) || f.Name.StartsWith(SmallRNAConsts.tRNA)));
      var otherGroups = otherFeatures.GroupByIdenticalQuery()
        .OrderByDescending(g => g.GetEstimatedCount())
        .ThenBy(g => g.Name)
        .ToList();
      var otherCountFile = Path.ChangeExtension(options.OutputFile, ".other.count");
      Progress.SetMessage("Writing other smallRNA ...");
      var otherWritten = new SmallRNACountTableWriter().WriteToFile(otherCountFile, otherGroups, sampleNames, "");
      outputFiles.AddRange(otherWritten);

      var otherSequenceFile = Path.ChangeExtension(options.OutputFile, ".other.sequence.count");
      var otherSequenceWritten = new SmallRNACountTableSequenceWriter().WriteToFile(otherSequenceFile, otherGroups, "");
      outputFiles.AddRange(otherSequenceWritten);
      combinedGroups.AddRange(otherGroups);

      // combined table over the miRNA, tRNA-by-code and other groups
      Progress.SetMessage("Writing all smallRNA ...");
      var allWritten = new SmallRNACountTableWriter().WriteToFile(options.OutputFile, combinedGroups, sampleNames, "");
      outputFiles.AddRange(allWritten);

      Progress.SetMessage("Done ...");
      return outputFiles;
    }
    /// <summary>
    /// Summarizes T2C statistics for every feature group of every input count XML file.
    /// Each group produces one row in the ".unfiltered.tsv" file; rows passing
    /// <c>ParclipSmallRNAT2CBuilder.Accept</c> are also written to <c>options.OutputFile</c> and
    /// counted into a per-sample summary written to <c>options.OutputFile + ".summary"</c>.
    /// </summary>
    /// <returns>Full paths of the filtered output file and of the summary file.</returns>
    public override IEnumerable<string> Process()
    {
      var sampleInfos = new List<SampleCount>();
      using (var sw = new StreamWriter(options.OutputFile))
      using (var swUnfiltered = new StreamWriter(Path.ChangeExtension(options.OutputFile, ".unfiltered.tsv")))
      {
        var header = "File\tCategory\tName\tUniqueRead\tUniqueT2CRead\tUniqueT2CRate\tAvergeT2CIn10BasesOfUniqueRead\tAvergeT2COfUniqueRead\tTotalRead\tTotalT2CRead\tTotalT2CRate\tT2C_pvalue\tAverageT2CIn10BasesOfTotalRead\tAverageT2COfTotalRead";
        swUnfiltered.WriteLine(header);
        sw.WriteLine(header);

        var inputFiles = options.GetCountXmlFiles();

        foreach (var file in inputFiles)
        {
          var sc = new SampleCount();
          sc.Name = file.Name;
          sampleInfos.Add(sc);

          var subjects = new FeatureItemGroupXmlFormat().ReadFromFile(file.File);

          // Category = the part of the first item's name before ":".
          var group = subjects.GroupBy(m => m[0].Name.StringBefore(":")).ToList();
          foreach (var g in group)
          {
            foreach (var item in g)
            {
              // Keep one alignment per query name: the first one met while walking the group.
              var queries = new HashSet<string>(item.GetAlignedLocations().ConvertAll(l => l.Parent.Qname));
              List<FeatureSamLocation> locs = new List<FeatureSamLocation>();
              foreach (var l in item)
              {
                foreach (var loc in l.Locations)
                {
                  foreach (var sl in loc.SamLocations)
                  {
                    // Remove returns true only the first time a query name is seen.
                    if (queries.Remove(sl.SamLocation.Parent.Qname))
                    {
                      locs.Add(sl);
                    }
                  }
                }
              }

              var t2c = locs.Where(m => m.NumberOfNoPenaltyMutation > 0).ToList();
              var ave_t2c_uniquereads = (t2c.Count > 0) ? t2c.ConvertAll(m => m.NumberOfNoPenaltyMutation * 10.0 / m.SamLocation.Parent.Sequence.Length).Average() : 0.0;
              var ave_t2c_perread_uniquereads = (t2c.Count > 0) ? t2c.ConvertAll(m => m.NumberOfNoPenaltyMutation).Average() : 0.0;

              // Averages over all reads: every unique read is weighted by its query count.
              // Weighted sums replace materializing one list entry per read.
              double ave_t2c_allreads = 0.0;
              double ave_t2c_perread_allreads = 0.0;
              double weightedPer10Bases = 0.0;
              double weightedPerRead = 0.0;
              long weight = 0;
              foreach (var t2citem in t2c)
              {
                var queryCount = t2citem.SamLocation.Parent.QueryCount;
                if (queryCount <= 0)
                {
                  continue;
                }

                var v = t2citem.NumberOfNoPenaltyMutation * 10.0 / t2citem.SamLocation.Parent.Sequence.Length;
                weightedPer10Bases += v * queryCount;
                weightedPerRead += (double)t2citem.NumberOfNoPenaltyMutation * queryCount;
                weight += queryCount;
              }
              if (weight > 0)
              {
                ave_t2c_allreads = weightedPer10Bases / weight;
                ave_t2c_perread_allreads = weightedPerRead / weight;
              }

              var totalCount = locs.Sum(l => l.SamLocation.Parent.QueryCount);
              var totalT2CCount = t2c.Sum(l => l.SamLocation.Parent.QueryCount);
              var pvalue = SmallRNAT2CMutationBuilder.CalculateT2CPvalue(totalCount, totalT2CCount, options.ExpectRate);

              // Both rates report 0 when there is no T2C read, so an empty group never yields NaN (0/0).
              var uniqueT2Crate = t2c.Count == 0 ? 0 : t2c.Count * 1.0 / locs.Count;
              var t2crate = totalT2CCount == 0 ? 0 : totalT2CCount * 1.0 / totalCount;
              var value = string.Format("{0}\t{1}\t{2}\t{3:0.###}\t{4:0.###}\t{5:0.###}\t{6:0.###}\t{7:0.###}\t{8:0.###}\t{9:0.###}\t{10:0.###}\t{11:0.###E+0}\t{12:0.###}\t{13:0.###}",
                file.Name,
                g.Key,
                item.Name,
                locs.Count,
                t2c.Count,
                uniqueT2Crate,
                ave_t2c_uniquereads,
                ave_t2c_perread_uniquereads,
                totalCount,
                totalT2CCount,
                t2crate,
                pvalue,
                ave_t2c_allreads,
                ave_t2c_perread_allreads);

              // Every row goes to the unfiltered table; only accepted rows continue below.
              swUnfiltered.WriteLine(value);
              if(!ParclipSmallRNAT2CBuilder.Accept(pvalue, totalCount, totalT2CCount, options.Pvalue, options.MinimumCount, options.ExpectRate))
              {
                continue;
              }

              sw.WriteLine(value);

              sc.GoodReadCount += totalCount;
              sc.GoodT2CReadCount += totalT2CCount;
              if (g.Key.Equals(SmallRNAConsts.miRNA))
              {
                sc.MiRNACount++;
              }
              else if (g.Key.Equals(SmallRNAConsts.tRNA))
              {
                sc.TRNACount++;
              }
              else
              {
                sc.OtherSmallRNACount++;
              }
            }
          }
        }
      }

      // Per-sample totals of the accepted rows.
      using (var sw = new StreamWriter(options.OutputFile + ".summary"))
      {
        sw.WriteLine("File\tTotalRead\tT2CRead\tT2CRate\tSmallRNA\tMicroRNA\ttRNA\tOtherSmallRNA");
        foreach (var si in sampleInfos)
        {
          sw.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}",
            si.Name,
            si.GoodReadCount,
            si.GoodT2CReadCount,
            si.GoodT2CRate,
            si.SmallRNACount,
            si.MiRNACount,
            si.TRNACount,
            si.OtherSmallRNACount);
        }
      }

      return new[] { Path.GetFullPath(options.OutputFile), Path.GetFullPath(options.OutputFile + ".summary") };
    }
    /// <summary>
    /// Copies the reads of <c>options.InputFile</c> to <c>options.OutputFile</c>, skipping every read
    /// whose name (without NTA tag) appears in the feature XML file or in the exclude file, any read
    /// rejected by <c>Accept</c>, and any read name already written. When <c>options.CountFile</c>
    /// exists, the count of each written read is also written to <c>OutputFile + ".dupcount"</c>.
    /// </summary>
    /// <returns>
    /// The list created at the start of the method; nothing is ever added to it.
    /// NOTE(review): other processors return the files they wrote — confirm the empty result is intended.
    /// </returns>
    /// <exception cref="KeyNotFoundException">A written read has no entry in the count map.</exception>
    /// <exception cref="UserTerminatedException">Cancellation was requested through <c>Progress</c>.</exception>
    public override IEnumerable<string> Process()
    {
      var result = new List<string>();

      var except = new HashSet<string>();
      if (File.Exists(options.XmlFile))
      {
        //exclude the reads mapped to features no matter how many number of mismatch it has
        var allmapped = new FeatureItemGroupXmlFormat().ReadFromFile(options.XmlFile);
        except.UnionWith(from g in allmapped
                         from f in g
                         from l in f.Locations
                         from sl in l.SamLocations
                         select sl.SamLocation.Parent.Qname.StringBefore(SmallRNAConsts.NTA_TAG));
      }

      if (File.Exists(options.ExcludeFile))
      {
        except.UnionWith(from l in File.ReadAllLines(options.ExcludeFile)
                         select l.StringBefore(SmallRNAConsts.NTA_TAG));
      }

      // Make counts stored under NTA-tagged names reachable by the untagged name as well.
      SmallRNACountMap cm = options.GetCountMap();
      var keys = cm.Counts.Keys.Where(m => m.Contains(SmallRNAConsts.NTA_TAG)).ToArray();
      foreach (var key in keys)
      {
        cm.Counts[key.StringBefore(SmallRNAConsts.NTA_TAG)] = cm.Counts[key];
      }

      // The count writer exists only when a count file is available; "using" accepts a null
      // resource and guarantees disposal on every exit path.
      using (StreamWriter swCount = File.Exists(options.CountFile) ? new StreamWriter(options.OutputFile + ".dupcount") : null)
      {
        Progress.SetMessage("output unmapped query...");

        var compressed = options.OutputFile.EndsWith(".gz", StringComparison.OrdinalIgnoreCase);
        using (var sw = StreamUtils.GetWriter(options.OutputFile, compressed))
        using (var sr = StreamUtils.GetReader(options.InputFile))
        {
          FastqReader reader = new FastqReader();
          FastqWriter writer = new FastqWriter();

          FastqSequence ss;
          var count = 0;
          while ((ss = reader.Parse(sr)) != null)
          {
            count++;

            // Report progress and honor cancellation every 100000 reads.
            if (count % 100000 == 0)
            {
              Progress.SetMessage("{0} reads", count);
              if (Progress.IsCancellationPending())
              {
                throw new UserTerminatedException();
              }
            }

            // Rewrite the reference with the NTA tag stripped from the name.
            // NOTE(review): presumably this also updates ss.Name, which the lookups below rely on — confirm.
            ss.Reference = ss.Name.StringBefore(SmallRNAConsts.NTA_TAG) + " " + ss.Description;
            if (except.Contains(ss.Name))
            {
              continue;
            }

            if (Accept != null && !Accept(ss))
            {
              continue;
            }

            // Register the name so a later read with the same name is not written again.
            except.Add(ss.Name);
            writer.Write(sw, ss);

            if (swCount != null)
            {
              int cmcount;
              if (!cm.Counts.TryGetValue(ss.Name, out cmcount))
              {
                throw new KeyNotFoundException(string.Format("Cannot find {0} in count map", ss.Name));
              }
              swCount.WriteLine("{0}\t{1}", ss.Name, cmcount);
            }
          }
        }
      }

      Progress.End();

      return result;
    }