/// <summary>
/// Reads an alignment file line by line and groups the mapped queries by the
/// reference name returned from <c>GetName</c>.
/// </summary>
/// <param name="fileName">Alignment file passed to <c>SAMFactory.GetReader</c>.</param>
/// <returns>
/// One item per reference name, in order of first appearance. Each item holds its
/// distinct queries ordered by <c>Qname</c>. When <c>options.PreferPrefix</c> is set,
/// items left without queries after the prefix filtering are dropped.
/// </returns>
/// <exception cref="UserTerminatedException">Cancellation is pending on <c>Progress</c>.</exception>
public List<ChromosomeCountSlimItem> Build(string fileName)
    {
      // nameMap is only (re)loaded when the category map file exists.
      // NOTE(review): presumably GetName consults nameMap -- confirm against its definition.
      if (File.Exists(options.CategoryMapFile))
      {
        Progress.SetMessage("Reading name map file " + options.CategoryMapFile + " ...");
        nameMap = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile).ToDictionary(m => m.Key, m => m.Value.Value);
      }

      var result = new List<ChromosomeCountSlimItem>();

      // qname -> query and reference name -> item lookups, so repeated lines share instances.
      var queries = new Dictionary<string, SAMChromosomeItem>();
      var chromosomes = new Dictionary<string, ChromosomeCountSlimItem>();

      Progress.SetMessage("Parsing alignment file " + fileName + " ...");
      using (var sr = SAMFactory.GetReader(fileName, true))
      {
        int count = 0;        // lines read so far
        int waitingcount = 0; // lines that passed the unmapped filter
        string line;
        while ((line = sr.ReadLine()) != null)
        {
          // Poll for cancellation every 1000 lines.
          if (count % 1000 == 0 && Progress.IsCancellationPending())
          {
            throw new UserTerminatedException();
          }

          if (count % 100000 == 0 && count > 0)
          {
            Progress.SetMessage("{0} candidates from {1} reads", waitingcount, count);
          }

          count++;

          var parts = line.Split('\t');

          // The FLAG column is machine-written; parse it independently of the current culture.
          var flag = (SAMFlags)int.Parse(parts[SAMFormatConst.FLAG_INDEX], System.Globalization.CultureInfo.InvariantCulture);

          // Unmapped reads carry no reference name worth counting.
          if (flag.HasFlag(SAMFlags.UnmappedQuery))
          {
            continue;
          }

          var qname = parts[SAMFormatConst.QNAME_INDEX];
          SAMChromosomeItem query;
          if (!queries.TryGetValue(qname, out query))
          {
            query = new SAMChromosomeItem();
            query.Qname = qname;
            queries[qname] = query;

            // The sequence is captured from the first line seen for this query only.
            if (options.KeepSequence)
            {
              query.Sequence = parts[SAMFormatConst.SEQ_INDEX];
              if (flag.HasFlag(SAMFlags.QueryOnReverseStrand))
              {
                query.Sequence = SequenceUtils.GetReverseComplementedSequence(query.Sequence);
              }
            }
          }

          var seqname = GetName(parts[SAMFormatConst.RNAME_INDEX]);
          query.Chromosomes.Add(seqname);

          ChromosomeCountSlimItem item;
          if (!chromosomes.TryGetValue(seqname, out item))
          {
            item = new ChromosomeCountSlimItem();
            item.Names.Add(seqname);
            chromosomes[seqname] = item;
            result.Add(item);
          }
          item.Queries.Add(query);

          waitingcount++;
        }

        Progress.SetMessage("Finally, there are {0} candidates from {1} reads", waitingcount, count);
      }

      // The same query/reference pair may have been added several times; de-duplicate and sort.
      foreach (var query in queries.Values)
      {
        query.Chromosomes = query.Chromosomes.Distinct().OrderBy(m => m).ToList();
      }

      foreach (var sam in chromosomes.Values)
      {
        sam.Queries = sam.Queries.Distinct().OrderBy(m => m.Qname).ToList();
      }

      var preferPrefix = options.PreferPrefix;
      if (!string.IsNullOrEmpty(preferPrefix))
      {
        // NOTE(review): this detaches each query from the reference names that START WITH
        // PreferPrefix. Given the option name, confirm that dropping (rather than keeping)
        // the prefixed names is the intended direction.
        foreach (var query in queries.Values)
        {
          // Reference names are identifiers, so compare ordinally. Materialize the matches
          // first because query.Chromosomes is modified inside the loop.
          var chroms = query.Chromosomes.Where(l => l.StartsWith(preferPrefix, System.StringComparison.Ordinal)).ToArray();
          foreach (var chrom in chroms)
          {
            chromosomes[chrom].Queries.Remove(query);
            query.Chromosomes.Remove(chrom);
          }
        }

        result.RemoveAll(l => l.Queries.Count == 0);
      }
      return result;
    }
    /// <summary>
    /// Merges the per-sample count files into one table keyed by the first name of each
    /// item, writes it through <c>WriteOutput</c>, and optionally writes the category,
    /// read and contig outputs selected by <c>options</c>.
    /// </summary>
    /// <returns>The paths of the output files that were requested, in the order they were produced.</returns>
    public override IEnumerable<string> Process()
    {
      var result = new List<string>();

      var countFiles = options.GetCountFiles();
      countFiles.Sort((m1, m2) => m1.Name.CompareTo(m2.Name));

      var format = new ChromosomeCountSlimItemXmlFormat(outputSample: true);

      // First name of an item -> merged item across all count files.
      var countMap = new Dictionary<string, ChromosomeCountSlimItem>();

      int fileIndex = 0;
      foreach (var file in countFiles)
      {
        fileIndex++;
        Progress.SetMessage("Reading {0}/{1}: {2} ...", fileIndex, countFiles.Count, file.File);

        var curcounts = format.ReadFromFile(file.File);

        // Spot check on the very first query only. The extra Count test keeps an item
        // without queries from raising ArgumentOutOfRangeException here.
        if (curcounts.Count > 0 && curcounts[0].Queries.Count > 0 && string.IsNullOrEmpty(curcounts[0].Queries[0].Sequence))
        {
          Console.WriteLine("Didn't read in the sequence of query " + curcounts[0].Queries[0].Qname);
        }

        // Tag every query with the sample it was read from.
        foreach (var m in curcounts)
        {
          foreach (var q in m.Queries)
          {
            q.Sample = file.Name;
          }
        }

        foreach (var c in curcounts)
        {
          var name = c.Names.First();
          ChromosomeCountSlimItem item;
          if (countMap.TryGetValue(name, out item))
          {
            item.Queries.AddRange(c.Queries);
          }
          else
          {
            countMap[name] = c;
          }
        }
      }

      var counts = countMap.Values.ToList();

      WriteOutput(options.OutputFile, countFiles, format, counts);

      result.Add(options.OutputFile);

      if (File.Exists(options.CategoryMapFile))
      {
        Progress.SetMessage("Reading category map ...");
        var categoryMap = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile);
        var queries = new HashSet<SAMChromosomeItem>(from c in counts
                                                     from q in c.Queries
                                                     select q);

        // Category name -> item holding every query that maps into that category.
        var dic = new Dictionary<string, ChromosomeCountSlimItem>();
        foreach (var q in queries)
        {
          // The query's names are replaced in place by their categories, so the read/contig
          // step below sees category names when the map file exists.
          // NOTE(review): a name missing from the category map fails in the indexer here.
          q.Chromosomes = (from chrom in q.Chromosomes
                           select categoryMap[chrom].Value).Distinct().OrderBy(m => m).ToList();
          foreach (var chrom in q.Chromosomes)
          {
            ChromosomeCountSlimItem item;
            if (!dic.TryGetValue(chrom, out item))
            {
              item = new ChromosomeCountSlimItem();
              item.Names.Add(chrom);
              dic[chrom] = item;
            }
            item.Queries.Add(q);
          }
        }

        // Inserts ".category" in front of the output file's own extension.
        var catFile = Path.ChangeExtension(options.OutputFile, ".category" + Path.GetExtension(options.OutputFile));
        WriteOutput(catFile, countFiles, format, dic.Values.ToList());
        result.Add(catFile);
      }

      if (options.OutputReadTable || options.OutputReadContigTable)
      {
        Progress.SetMessage("Building sequence map...");
        var reads = SmallRNASequenceUtils.ConvertFrom(counts);

        if (options.OutputReadTable)
        {
          Progress.SetMessage("Saving read file...");
          var readOutput = Path.ChangeExtension(options.OutputFile, ".read" + Path.GetExtension(options.OutputFile));
          new SmallRNASequenceFormat(int.MaxValue, false).WriteToFile(readOutput, reads);
          result.Add(readOutput);
        }

        if (options.OutputReadContigTable)
        {
          Progress.SetMessage("Building sequence contig by similarity ...");
          var contigs = SmallRNASequenceUtils.BuildContigByIdenticalSimilarity(reads, options.MinimumOverlapRate, options.MaximumExtensionBase, progress: Progress);

          Progress.SetMessage("Contig number = {0}", contigs.Count);

          Progress.SetMessage("Saving contig file...");
          var contigOutput = Path.ChangeExtension(options.OutputFile, ".contig" + Path.GetExtension(options.OutputFile));
          new SmallRNASequenceContigFormat().WriteToFile(contigOutput, contigs);
          result.Add(contigOutput);

          Progress.SetMessage("Saving sequence contig details...");
          var detailOutput = contigOutput + ".details";
          new SmallRNASequenceContigDetailFormat().WriteToFile(detailOutput, contigs);
          result.Add(detailOutput);
        }
      }

      Progress.End();

      return result;
    }
Exemple #3
0
        /// <summary>
        /// Reads an alignment file line by line and groups the mapped queries by the
        /// reference name returned from <c>GetName</c>, keeping only the names accepted by
        /// <c>options.ChromosomePattern</c> when a pattern is configured.
        /// </summary>
        /// <param name="fileName">Alignment file passed to <c>SAMFactory.GetReader</c>.</param>
        /// <returns>
        /// One item per accepted reference name, in order of first appearance. Each item
        /// holds its distinct queries ordered by <c>Qname</c>. When <c>options.PreferPrefix</c>
        /// is set, items left without queries after the prefix filtering are dropped.
        /// </returns>
        /// <exception cref="UserTerminatedException">Cancellation is pending on <c>Progress</c>.</exception>
        public List<ChromosomeCountSlimItem> Build(string fileName)
        {
            // nameMap is only (re)loaded when the category map file exists.
            // NOTE(review): presumably GetName consults nameMap -- confirm against its definition.
            if (File.Exists(options.CategoryMapFile))
            {
                Progress.SetMessage("Reading name map file " + options.CategoryMapFile + " ...");
                nameMap = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile).ToDictionary(m => m.Key, m => m.Value.Value);
            }

            var result = new List<ChromosomeCountSlimItem>();

            // qname -> query and reference name -> item lookups, so repeated lines share instances.
            var queries = new Dictionary<string, SAMChromosomeItem>();
            var chromosomes = new Dictionary<string, ChromosomeCountSlimItem>();

            // Without a pattern every reference name is accepted; otherwise the regex decides.
            Func<string, bool> acceptChromosome;
            if (string.IsNullOrEmpty(options.ChromosomePattern))
            {
                acceptChromosome = m => true;
            }
            else
            {
                var chromosomeRegex = new Regex(options.ChromosomePattern);
                acceptChromosome = chromosomeRegex.IsMatch;
            }

            Progress.SetMessage("Parsing alignment file " + fileName + " ...");
            using (var sr = SAMFactory.GetReader(fileName, true))
            {
                int count = 0;        // lines read so far
                int waitingcount = 0; // lines that passed both the unmapped and the pattern filter
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // Poll for cancellation every 1000 lines.
                    if (count % 1000 == 0 && Progress.IsCancellationPending())
                    {
                        throw new UserTerminatedException();
                    }

                    if (count % 100000 == 0 && count > 0)
                    {
                        Progress.SetMessage("{0} candidates from {1} reads", waitingcount, count);
                    }

                    count++;

                    var parts = line.Split('\t');

                    // The FLAG column is machine-written; parse it independently of the current culture.
                    var flag = (SAMFlags)int.Parse(parts[SAMFormatConst.FLAG_INDEX], System.Globalization.CultureInfo.InvariantCulture);

                    // Unmapped reads carry no reference name worth counting.
                    if (flag.HasFlag(SAMFlags.UnmappedQuery))
                    {
                        continue;
                    }

                    // Filter on the reference name before a query object is created, so
                    // rejected lines leave no trace in the lookups.
                    var seqname = GetName(parts[SAMFormatConst.RNAME_INDEX]);
                    if (!acceptChromosome(seqname))
                    {
                        continue;
                    }

                    var qname = parts[SAMFormatConst.QNAME_INDEX];
                    SAMChromosomeItem query;
                    if (!queries.TryGetValue(qname, out query))
                    {
                        query = new SAMChromosomeItem();
                        query.Qname = qname;
                        queries[qname] = query;

                        // The sequence is captured from the first accepted line of this query only.
                        if (options.KeepSequence)
                        {
                            query.Sequence = parts[SAMFormatConst.SEQ_INDEX];
                            if (flag.HasFlag(SAMFlags.QueryOnReverseStrand))
                            {
                                query.Sequence = SequenceUtils.GetReverseComplementedSequence(query.Sequence);
                            }
                        }
                    }

                    query.Chromosomes.Add(seqname);

                    ChromosomeCountSlimItem item;
                    if (!chromosomes.TryGetValue(seqname, out item))
                    {
                        item = new ChromosomeCountSlimItem();
                        item.Names.Add(seqname);
                        chromosomes[seqname] = item;
                        result.Add(item);
                    }
                    item.Queries.Add(query);

                    waitingcount++;
                }

                Progress.SetMessage("Finally, there are {0} candidates from {1} reads", waitingcount, count);
            }

            // The same query/reference pair may have been added several times; de-duplicate and sort.
            foreach (var query in queries.Values)
            {
                query.Chromosomes = query.Chromosomes.Distinct().OrderBy(m => m).ToList();
            }

            foreach (var sam in chromosomes.Values)
            {
                sam.Queries = sam.Queries.Distinct().OrderBy(m => m.Qname).ToList();
            }

            var preferPrefix = options.PreferPrefix;
            if (!string.IsNullOrEmpty(preferPrefix))
            {
                // NOTE(review): this detaches each query from the reference names that START WITH
                // PreferPrefix. Given the option name, confirm that dropping (rather than keeping)
                // the prefixed names is the intended direction.
                foreach (var query in queries.Values)
                {
                    // Reference names are identifiers, so compare ordinally. Materialize the
                    // matches first because query.Chromosomes is modified inside the loop.
                    var chroms = query.Chromosomes.Where(l => l.StartsWith(preferPrefix, System.StringComparison.Ordinal)).ToArray();
                    foreach (var chrom in chroms)
                    {
                        chromosomes[chrom].Queries.Remove(query);
                        query.Chromosomes.Remove(chrom);
                    }
                }

                result.RemoveAll(l => l.Queries.Count == 0);
            }
            return result;
        }
        /// <summary>
        /// Merges the per-sample count files into one table keyed by the first name of each
        /// item, writes it through <c>WriteOutput</c>, and optionally writes the category,
        /// read and contig outputs selected by <c>options</c>.
        /// </summary>
        /// <returns>The paths of the output files that were requested, in the order they were produced.</returns>
        public override IEnumerable<string> Process()
        {
            var result = new List<string>();

            var countFiles = options.GetCountFiles();
            countFiles.Sort((m1, m2) => m1.Name.CompareTo(m2.Name));

            var format = new ChromosomeCountSlimItemXmlFormat(outputSample: true);

            // First name of an item -> merged item across all count files.
            var countMap = new Dictionary<string, ChromosomeCountSlimItem>();

            int fileIndex = 0;
            foreach (var file in countFiles)
            {
                fileIndex++;
                Progress.SetMessage("Reading {0}/{1}: {2} ...", fileIndex, countFiles.Count, file.File);

                var curcounts = format.ReadFromFile(file.File);

                // Spot check on the very first query only. The extra Count test keeps an item
                // without queries from raising ArgumentOutOfRangeException here.
                if (curcounts.Count > 0 && curcounts[0].Queries.Count > 0 && string.IsNullOrEmpty(curcounts[0].Queries[0].Sequence))
                {
                    Console.WriteLine("Didn't read in the sequence of query " + curcounts[0].Queries[0].Qname);
                }

                // Tag every query with the sample it was read from.
                foreach (var m in curcounts)
                {
                    foreach (var q in m.Queries)
                    {
                        q.Sample = file.Name;
                    }
                }

                foreach (var c in curcounts)
                {
                    var name = c.Names.First();
                    ChromosomeCountSlimItem item;
                    if (countMap.TryGetValue(name, out item))
                    {
                        item.Queries.AddRange(c.Queries);
                    }
                    else
                    {
                        countMap[name] = c;
                    }
                }
            }

            var counts = countMap.Values.ToList();

            WriteOutput(options.OutputFile, countFiles, format, counts);

            result.Add(options.OutputFile);

            if (File.Exists(options.CategoryMapFile))
            {
                Progress.SetMessage("Reading category map ...");
                var categoryMap = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile);
                var queries = new HashSet<SAMChromosomeItem>(from c in counts
                                                             from q in c.Queries
                                                             select q);

                // Category name -> item holding every query that maps into that category.
                var dic = new Dictionary<string, ChromosomeCountSlimItem>();
                foreach (var q in queries)
                {
                    // The query's names are replaced in place by their categories, so the
                    // read/contig step below sees category names when the map file exists.
                    // NOTE(review): a name missing from the category map fails in the indexer here.
                    q.Chromosomes = (from chrom in q.Chromosomes
                                     select categoryMap[chrom].Value).Distinct().OrderBy(m => m).ToList();
                    foreach (var chrom in q.Chromosomes)
                    {
                        ChromosomeCountSlimItem item;
                        if (!dic.TryGetValue(chrom, out item))
                        {
                            item = new ChromosomeCountSlimItem();
                            item.Names.Add(chrom);
                            dic[chrom] = item;
                        }
                        item.Queries.Add(q);
                    }
                }

                // Inserts ".category" in front of the output file's own extension.
                var catFile = Path.ChangeExtension(options.OutputFile, ".category" + Path.GetExtension(options.OutputFile));
                WriteOutput(catFile, countFiles, format, dic.Values.ToList());
                result.Add(catFile);
            }

            if (options.OutputReadTable || options.OutputReadContigTable)
            {
                Progress.SetMessage("Building sequence map...");
                var reads = SmallRNASequenceUtils.ConvertFrom(counts);

                if (options.OutputReadTable)
                {
                    Progress.SetMessage("Saving read file...");
                    var readOutput = Path.ChangeExtension(options.OutputFile, ".read" + Path.GetExtension(options.OutputFile));
                    new SmallRNASequenceFormat(int.MaxValue, false).WriteToFile(readOutput, reads);
                    result.Add(readOutput);
                }

                if (options.OutputReadContigTable)
                {
                    Progress.SetMessage("Building sequence contig by similarity ...");
                    var contigs = SmallRNASequenceUtils.BuildContigByIdenticalSimilarity(reads, options.MinimumOverlapRate, options.MaximumExtensionBase, progress: Progress);

                    Progress.SetMessage("Contig number = {0}", contigs.Count);

                    Progress.SetMessage("Saving contig file...");
                    var contigOutput = Path.ChangeExtension(options.OutputFile, ".contig" + Path.GetExtension(options.OutputFile));
                    new SmallRNASequenceContigFormat().WriteToFile(contigOutput, contigs);
                    result.Add(contigOutput);

                    Progress.SetMessage("Saving sequence contig details...");
                    var detailOutput = contigOutput + ".details";
                    new SmallRNASequenceContigDetailFormat().WriteToFile(detailOutput, contigs);
                    result.Add(detailOutput);
                }
            }

            Progress.End();

            return result;
        }