Esempio n. 1
0
        /// <summary>
        /// Parses a tab-separated alignment file and groups the mapped queries by the
        /// reference (chromosome) name they were aligned to.
        /// </summary>
        /// <param name="fileName">Alignment file handed to <c>SAMFactory.GetReader</c>.</param>
        /// <returns>
        /// One <c>ChromosomeCountSlimItem</c> per accepted reference name, in order of first
        /// appearance; each holds its distinct queries sorted by <c>Qname</c>. When
        /// <c>options.PreferPrefix</c> is set, items left without any query are dropped.
        /// </returns>
        /// <exception cref="UserTerminatedException">
        /// Cancellation was reported by <c>Progress</c> while the file was being read.
        /// </exception>
        public List<ChromosomeCountSlimItem> Build(string fileName)
        {
            // The name map is optional: it is only (re)loaded when the configured file exists.
            if (File.Exists(options.CategoryMapFile))
            {
                Progress.SetMessage("Reading name map file " + options.CategoryMapFile + " ...");
                nameMap = new MapItemReader(0, 1)
                    .ReadFromFile(options.CategoryMapFile)
                    .ToDictionary(m => m.Key, m => m.Value.Value);
            }

            var slimItems = new List<ChromosomeCountSlimItem>();
            var queryByName = new Dictionary<string, SAMChromosomeItem>();
            var itemByChromosome = new Dictionary<string, ChromosomeCountSlimItem>();

            // Without a pattern every reference name is accepted.
            Func<string, bool> acceptChromosome = name => true;
            if (!string.IsNullOrEmpty(options.ChromosomePattern))
            {
                var pattern = new Regex(options.ChromosomePattern);
                acceptChromosome = name => pattern.IsMatch(name);
            }

            Progress.SetMessage("Parsing alignment file " + fileName + " ...");
            using (var reader = SAMFactory.GetReader(fileName, true))
            {
                int readCount = 0;
                int candidateCount = 0;

                for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                {
                    // Cancellation is polled once every 1000 lines.
                    if (readCount % 1000 == 0 && Progress.IsCancellationPending())
                    {
                        throw new UserTerminatedException();
                    }

                    if (readCount > 0 && readCount % 100000 == 0)
                    {
                        Progress.SetMessage("{0} candidates from {1} reads", candidateCount, readCount);
                    }

                    readCount++;

                    var fields = line.Split('\t');
                    var flags = (SAMFlags)int.Parse(fields[SAMFormatConst.FLAG_INDEX]);

                    // Unmapped reads carry no usable reference name.
                    if (flags.HasFlag(SAMFlags.UnmappedQuery))
                    {
                        continue;
                    }

                    var chromosome = GetName(fields[SAMFormatConst.RNAME_INDEX]);
                    if (!acceptChromosome(chromosome))
                    {
                        continue;
                    }

                    var queryName = fields[SAMFormatConst.QNAME_INDEX];
                    SAMChromosomeItem query;
                    if (!queryByName.TryGetValue(queryName, out query))
                    {
                        // First accepted alignment of this query: register it, optionally
                        // with its sequence (reverse-complemented for reverse-strand hits).
                        query = new SAMChromosomeItem { Qname = queryName };
                        if (options.KeepSequence)
                        {
                            var sequence = fields[SAMFormatConst.SEQ_INDEX];
                            query.Sequence = flags.HasFlag(SAMFlags.QueryOnReverseStrand)
                                ? SequenceUtils.GetReverseComplementedSequence(sequence)
                                : sequence;
                        }

                        queryByName.Add(queryName, query);
                    }

                    query.Chromosomes.Add(chromosome);

                    ChromosomeCountSlimItem item;
                    if (!itemByChromosome.TryGetValue(chromosome, out item))
                    {
                        item = new ChromosomeCountSlimItem();
                        item.Names.Add(chromosome);
                        itemByChromosome.Add(chromosome, item);
                        slimItems.Add(item);
                    }

                    item.Queries.Add(query);
                    candidateCount++;
                }

                Progress.SetMessage("Finally, there are {0} candidates from {1} reads", candidateCount, readCount);
            }

            // Alignments were appended as encountered, so both sides may contain
            // repeats; collapse them and put them into a stable order.
            foreach (var item in itemByChromosome.Values)
            {
                item.Queries = item.Queries.Distinct().OrderBy(q => q.Qname).ToList();
            }

            foreach (var query in queryByName.Values)
            {
                query.Chromosomes = query.Chromosomes.Distinct().OrderBy(c => c).ToList();
            }

            if (string.IsNullOrEmpty(options.PreferPrefix))
            {
                return slimItems;
            }

            foreach (var query in queryByName.Values)
            {
                // NOTE(review): the reference names that START with PreferPrefix are the
                // ones detached here. Given the option's name, dropping the non-matching
                // names may have been the intent - confirm before relying on this.
                // The matches are snapshotted because the list is modified in the loop.
                var prefixed = query.Chromosomes
                    .Where(c => c.StartsWith(options.PreferPrefix))
                    .ToArray();

                foreach (var chromosome in prefixed)
                {
                    itemByChromosome[chromosome].Queries.Remove(query);
                    query.Chromosomes.Remove(chromosome);
                }
            }

            slimItems.RemoveAll(entry => entry.Queries.Count == 0);
            return slimItems;
        }
    /// <summary>
    /// Parses a tab-separated alignment file and groups the mapped queries by the
    /// reference (chromosome) name they were aligned to.
    /// </summary>
    /// <param name="fileName">Alignment file handed to <c>SAMFactory.GetReader</c>.</param>
    /// <returns>
    /// One <c>ChromosomeCountSlimItem</c> per reference name, in order of first
    /// appearance; each holds its distinct queries sorted by <c>Qname</c>. When
    /// <c>options.PreferPrefix</c> is set, items left without any query are dropped.
    /// </returns>
    /// <exception cref="UserTerminatedException">
    /// Cancellation was reported by <c>Progress</c> while the file was being read.
    /// </exception>
    public List<ChromosomeCountSlimItem> Build(string fileName)
    {
      // The name map is optional: it is only (re)loaded when the configured file exists.
      if (File.Exists(options.CategoryMapFile))
      {
        Progress.SetMessage("Reading name map file " + options.CategoryMapFile + " ...");
        nameMap = new MapItemReader(0, 1)
          .ReadFromFile(options.CategoryMapFile)
          .ToDictionary(m => m.Key, m => m.Value.Value);
      }

      var slimItems = new List<ChromosomeCountSlimItem>();
      var queryByName = new Dictionary<string, SAMChromosomeItem>();
      var itemByChromosome = new Dictionary<string, ChromosomeCountSlimItem>();

      Progress.SetMessage("Parsing alignment file " + fileName + " ...");
      using (var reader = SAMFactory.GetReader(fileName, true))
      {
        int readCount = 0;
        int candidateCount = 0;

        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
          // Cancellation is polled once every 1000 lines.
          if (readCount % 1000 == 0 && Progress.IsCancellationPending())
          {
            throw new UserTerminatedException();
          }

          if (readCount > 0 && readCount % 100000 == 0)
          {
            Progress.SetMessage("{0} candidates from {1} reads", candidateCount, readCount);
          }

          readCount++;

          var fields = line.Split('\t');
          var flags = (SAMFlags)int.Parse(fields[SAMFormatConst.FLAG_INDEX]);

          // Unmapped reads carry no usable reference name.
          if (flags.HasFlag(SAMFlags.UnmappedQuery))
          {
            continue;
          }

          var queryName = fields[SAMFormatConst.QNAME_INDEX];
          SAMChromosomeItem query;
          if (!queryByName.TryGetValue(queryName, out query))
          {
            // First mapped alignment of this query: register it, optionally with
            // its sequence (reverse-complemented for reverse-strand hits).
            query = new SAMChromosomeItem { Qname = queryName };
            if (options.KeepSequence)
            {
              var sequence = fields[SAMFormatConst.SEQ_INDEX];
              query.Sequence = flags.HasFlag(SAMFlags.QueryOnReverseStrand)
                ? SequenceUtils.GetReverseComplementedSequence(sequence)
                : sequence;
            }

            queryByName.Add(queryName, query);
          }

          var chromosome = GetName(fields[SAMFormatConst.RNAME_INDEX]);
          query.Chromosomes.Add(chromosome);

          ChromosomeCountSlimItem item;
          if (!itemByChromosome.TryGetValue(chromosome, out item))
          {
            item = new ChromosomeCountSlimItem();
            item.Names.Add(chromosome);
            itemByChromosome.Add(chromosome, item);
            slimItems.Add(item);
          }

          item.Queries.Add(query);
          candidateCount++;
        }

        Progress.SetMessage("Finally, there are {0} candidates from {1} reads", candidateCount, readCount);
      }

      // Alignments were appended as encountered, so both sides may contain
      // repeats; collapse them and put them into a stable order.
      foreach (var item in itemByChromosome.Values)
      {
        item.Queries = item.Queries.Distinct().OrderBy(q => q.Qname).ToList();
      }

      foreach (var query in queryByName.Values)
      {
        query.Chromosomes = query.Chromosomes.Distinct().OrderBy(c => c).ToList();
      }

      if (string.IsNullOrEmpty(options.PreferPrefix))
      {
        return slimItems;
      }

      foreach (var query in queryByName.Values)
      {
        // NOTE(review): the reference names that START with PreferPrefix are the
        // ones detached here. Given the option's name, dropping the non-matching
        // names may have been the intent - confirm before relying on this.
        // The matches are snapshotted because the list is modified in the loop.
        var prefixed = query.Chromosomes
          .Where(c => c.StartsWith(options.PreferPrefix))
          .ToArray();

        foreach (var chromosome in prefixed)
        {
          itemByChromosome[chromosome].Queries.Remove(query);
          query.Chromosomes.Remove(chromosome);
        }
      }

      slimItems.RemoveAll(entry => entry.Queries.Count == 0);
      return slimItems;
    }
        /// <summary>
        /// Loads subject groups and the queries assigned to them from an XML file.
        /// </summary>
        /// <remarks>
        /// The document is consumed front to back with a single reader: first the
        /// <c>query</c> elements below <c>queries</c>, then every <c>subjectGroup</c>,
        /// <c>subject</c> and <c>query</c> element that follows <c>subjectResult</c>,
        /// where queries are resolved through their <c>qname</c> attribute.
        /// </remarks>
        /// <param name="fileName">Location passed to <c>XmlReader.Create</c>.</param>
        /// <returns>One item per <c>subjectGroup</c> element, in document order.</returns>
        public List<ChromosomeCountSlimItem> ReadFromFile(string fileName)
        {
            var groups = new List<ChromosomeCountSlimItem>();

            using (var reader = XmlReader.Create(fileName))
            {
                // Phase 1: collect the query definitions.
                var loadedQueries = new List<SAMChromosomeItem>();

                reader.ReadToFollowing("queries");
                for (bool hasQuery = reader.ReadToDescendant("query");
                     hasQuery;
                     hasQuery = reader.ReadToNextSibling("query"))
                {
                    var query = new SAMChromosomeItem
                    {
                        Qname = reader.GetAttribute("name"),
                        QueryCount = int.Parse(reader.GetAttribute("count"))
                    };
                    loadedQueries.Add(query);

                    // "seq" and "sample" are optional; the item keeps its own
                    // initial values when the attribute is absent.
                    var sequence = reader.GetAttribute("seq");
                    if (sequence != null)
                    {
                        query.Sequence = sequence;
                    }

                    var sample = reader.GetAttribute("sample");
                    if (sample != null)
                    {
                        query.Sample = sample;
                    }

                    // Attributes must be read before descending: the reader moves
                    // off the <query> element from here on.
                    for (bool hasLocation = reader.ReadToDescendant("location");
                         hasLocation;
                         hasLocation = reader.ReadToNextSibling("location"))
                    {
                        var chromosome = reader.GetAttribute("seqname");
                        if (!query.Chromosomes.Contains(chromosome))
                        {
                            query.Chromosomes.Add(chromosome);
                        }
                    }
                }

                Progress.SetMessage("{0} queries read.", loadedQueries.Count);

                var queryByName = loadedQueries.ToDictionary(m => m.Qname);
                loadedQueries.Clear();

                // Phase 2: walk the remaining elements and attach subjects and
                // queries to the most recently opened subject group.
                reader.ReadToFollowing("subjectResult");
                ChromosomeCountSlimItem current = null;
                while (reader.Read())
                {
                    if (reader.NodeType != XmlNodeType.Element)
                    {
                        continue;
                    }

                    switch (reader.Name)
                    {
                        case "subjectGroup":
                            current = new ChromosomeCountSlimItem();
                            groups.Add(current);
                            break;

                        case "subject":
                            current.Names.Add(reader.GetAttribute("name"));
                            break;

                        case "query":
                            var referenced = queryByName[reader.GetAttribute("qname")];
                            current.Queries.Add(referenced);
                            break;
                    }
                }

                queryByName.Clear();

                Progress.SetMessage("{0} subject groups read.", groups.Count);

                // A group with a single name cannot contain repeats.
                foreach (var entry in groups)
                {
                    if (entry.Names.Count > 1)
                    {
                        entry.Names = entry.Names.Distinct().ToList();
                    }
                }
            }

            return groups;
        }