Пример #1
0
        /// <summary>
        /// Reads the two FASTQ inputs named in <c>options.FastqFiles</c> in lock-step and copies a
        /// pair to the two files in <c>options.OutputFiles</c> only when both mates pass the filter
        /// returned by <c>options.GetFilter()</c>.
        /// </summary>
        /// <param name="fileName">Not read by this method; all paths come from <c>options</c>.</param>
        /// <returns><c>options.OutputFiles</c>.</returns>
        /// <exception cref="UserTerminatedException">
        /// Cancellation was pending when a progress checkpoint (every 100000 pairs) was reached.
        /// </exception>
        public override IEnumerable <string> Process(string fileName)
        {
            IFilter <FastqSequence> pairFilter = options.GetFilter();

            using (GzipTextReader firstInput = new GzipTextReader(options.Gzip, options.FastqFiles[0]))
            using (GzipTextReader secondInput = new GzipTextReader(options.Gzip, options.FastqFiles[1]))
            using (StreamWriter firstOutput = new StreamWriter(options.OutputFiles[0]))
            using (StreamWriter secondOutput = new StreamWriter(options.OutputFiles[1]))
            {
                FastqReader fastqReader = new FastqReader();
                FastqWriter fastqWriter = new FastqWriter();

                // pairNumber is 1-based: it is the ordinal of the pair currently being handled.
                for (var pairNumber = 1; ; pairNumber++)
                {
                    // Both inputs are always advanced together, before the end-of-input test.
                    var first  = fastqReader.Parse(firstInput.Reader);
                    var second = fastqReader.Parse(secondInput.Reader);

                    // Stop as soon as either Parse call yields null.
                    if (first == null || second == null)
                    {
                        break;
                    }

                    if (pairNumber % 100000 == 0)
                    {
                        Progress.SetMessage("{0} reads", pairNumber);
                        if (Progress.IsCancellationPending())
                        {
                            throw new UserTerminatedException();
                        }
                    }

                    // The second mate is only tested when the first one was accepted.
                    if (!pairFilter.Accept(first))
                    {
                        continue;
                    }

                    if (!pairFilter.Accept(second))
                    {
                        continue;
                    }

                    fastqWriter.Write(firstOutput, first);
                    fastqWriter.Write(secondOutput, second);
                }
            }

            return options.OutputFiles;
        }
Пример #2
0
    /// <summary>
    /// Walks the two FASTQ files in <c>options.FastqFiles</c> pair by pair and writes a pair to
    /// the matching files in <c>options.OutputFiles</c> when the filter from
    /// <c>options.GetFilter()</c> accepts both reads.
    /// </summary>
    /// <param name="fileName">Not read by this method; all paths come from <c>options</c>.</param>
    /// <returns><c>options.OutputFiles</c>.</returns>
    /// <exception cref="UserTerminatedException">
    /// Cancellation was pending at a progress checkpoint (every 100000 pairs).
    /// </exception>
    public override IEnumerable<string> Process(string fileName)
    {
      IFilter<FastqSequence> readFilter = options.GetFilter();
      using (GzipTextReader input1 = new GzipTextReader(options.Gzip, options.FastqFiles[0]))
      using (GzipTextReader input2 = new GzipTextReader(options.Gzip, options.FastqFiles[1]))
      using (StreamWriter output1 = new StreamWriter(options.OutputFiles[0]))
      using (StreamWriter output2 = new StreamWriter(options.OutputFiles[1]))
      {
        FastqReader fastqReader = new FastqReader();
        FastqWriter fastqWriter = new FastqWriter();
        var pairsRead = 0;

        // Priming read; the same two calls are repeated at the bottom of the loop.
        var mate1 = fastqReader.Parse(input1.Reader);
        var mate2 = fastqReader.Parse(input2.Reader);

        // The loop ends when either Parse call has returned null.
        while (!(mate1 == null || mate2 == null))
        {
          pairsRead++;

          var atCheckpoint = pairsRead % 100000 == 0;
          if (atCheckpoint)
          {
            Progress.SetMessage("{0} reads", pairsRead);
            if (Progress.IsCancellationPending())
            {
              throw new UserTerminatedException();
            }
          }

          // Short-circuit: the second read is not tested if the first is rejected.
          var bothAccepted = readFilter.Accept(mate1) && readFilter.Accept(mate2);
          if (bothAccepted)
          {
            fastqWriter.Write(output1, mate1);
            fastqWriter.Write(output2, mate2);
          }

          mate1 = fastqReader.Parse(input1.Reader);
          mate2 = fastqReader.Parse(input2.Reader);
        }
      }

      return options.OutputFiles;
    }
        /// <summary>
        /// Collapses reads of <c>options.InputFile</c> that share the same sequence string: the
        /// first read seen for a sequence is written to the output FASTQ, later ones only bump its
        /// <c>RepeatCount</c>. Reads shorter than <c>options.MinimumReadLength</c> are skipped.
        /// </summary>
        /// <remarks>
        /// Besides the FASTQ file (".gz" is appended unless <c>options.Gunzipped</c> is set or the
        /// name already ends with it) a ".dupcount" table is written, and, when
        /// <c>options.OutputScores</c> is set, a ".scores" table that lists for every position of
        /// every kept sequence how often each score character occurred.
        /// The FASTQ data is first written to a ".tmp" file and moved into place afterwards.
        /// </remarks>
        /// <returns>The names of the files written, in the order FASTQ, dupcount, scores.</returns>
        public override IEnumerable <string> Process()
        {
            var outputs = new List <string>();

            var targetFile    = options.OutputFile;
            var needsGzSuffix = !options.Gunzipped && !targetFile.ToLower().EndsWith(".gz");
            if (needsGzSuffix)
            {
                targetFile += ".gz";
            }
            outputs.Add(targetFile);

            // Keyed by sequence string; the value is the first read seen with that sequence.
            var uniqueReads = new Dictionary <string, FastqSequence>();

            Progress.SetMessage("Processing " + options.InputFile + " and writing to " + targetFile + "...");
            var fastqReader = new FastqReader();
            var fastqWriter = new FastqWriter();

            var pendingFile = targetFile + ".tmp";

            var total = 0;

            using (var input = StreamUtils.GetReader(options.InputFile))
            using (var output = StreamUtils.GetWriter(pendingFile, !options.Gunzipped))
            {
                for (var read = fastqReader.Parse(input); read != null; read = fastqReader.Parse(input))
                {
                    total++;
                    if (total % 100000 == 0)
                    {
                        Progress.SetMessage("{0} reads processed", total);
                    }

                    if (read.SeqString.Length < options.MinimumReadLength)
                    {
                        continue;
                    }

                    FastqSequence firstSeen;
                    if (!uniqueReads.TryGetValue(read.SeqString, out firstSeen))
                    {
                        // New sequence: remember it and emit it.
                        uniqueReads[read.SeqString] = read;
                        if (options.OutputScores)
                        {
                            read.RepeatScores.Add(read.Score);
                        }

                        fastqWriter.Write(output, read);
                    }
                    else
                    {
                        // Already emitted: only account for the repeat.
                        firstSeen.RepeatCount++;
                        if (options.OutputScores)
                        {
                            firstSeen.RepeatScores.Add(read.Score);
                        }
                    }
                }
            }

            Progress.End();

            var countFile = Path.ChangeExtension(targetFile, ".dupcount");

            outputs.Add(countFile);
            Progress.SetMessage("sort queries ...");
            var ranked = queriesToList(uniqueReads);

            Progress.SetMessage("writing duplicate count ...");
            using (var countWriter = new StreamWriter(countFile))
            {
                countWriter.WriteLine("Query\tCount\tSequence");
                foreach (var read in ranked)
                {
                    countWriter.WriteLine("{0}\t{1}\t{2}", read.Name, read.RepeatCount, read.SeqString);
                }
            }

            // Replace any previous output with the freshly written temporary file.
            if (File.Exists(targetFile))
            {
                File.Delete(targetFile);
            }
            File.Move(pendingFile, targetFile);

            if (!options.OutputScores)
            {
                return outputs;
            }

            Progress.SetMessage("writing score ...");
            var scoreFile = Path.ChangeExtension(targetFile, ".scores");
            outputs.Add(scoreFile);
            using (var scoreWriter = new StreamWriter(scoreFile))
            {
                scoreWriter.WriteLine("Query\tSequence\tPosition\tScores");
                foreach (var read in ranked)
                {
                    scoreWriter.WriteLine("{0}\t{1}", read.Name, read.SeqString);
                    for (var pos = 0; pos < read.SeqString.Length; pos++)
                    {
                        // Occurrences of each score character at this position over all repeats.
                        var tally = new Dictionary <char, int>();
                        foreach (var quality in read.RepeatScores)
                        {
                            int seen;
                            tally.TryGetValue(quality[pos], out seen);
                            tally[quality[pos]] = seen + 1;
                        }

                        scoreWriter.Write("\t\t{0}\t", pos + 1);
                        foreach (var symbol in tally.Keys.OrderBy(c => c).ToList())
                        {
                            scoreWriter.Write("{0}({1})", symbol, tally[symbol]);
                        }
                        scoreWriter.WriteLine();
                    }
                }
            }

            return outputs;
        }

        /// <summary>
        /// Returns the kept reads ordered by descending <c>RepeatCount</c>, ties broken by
        /// ascending sequence string.
        /// </summary>
        private static List <FastqSequence> queriesToList(Dictionary <string, FastqSequence> uniqueReads)
        {
            var ranked = uniqueReads.Values.ToList();

            ranked.Sort((left, right) =>
            {
                var order = right.RepeatCount.CompareTo(left.RepeatCount);
                return order != 0 ? order : left.SeqString.CompareTo(right.SeqString);
            });

            return ranked;
        }
    /// <summary>
    /// Copies every read of <c>options.InputFile</c> to <c>options.OutputFile</c> up to four
    /// times: once at full length and once with each of 1, 2 and 3 bases removed from the end,
    /// stopping as soon as the shortened length would fall below
    /// <c>options.MinimumReadLength</c>. The removed bases are appended to the read name after
    /// <c>SmallRNAConsts.NTA_TAG</c>.
    /// </summary>
    /// <remarks>
    /// When the count map reports a count file, a companion "<c>OutputFile</c>.dupcount" table
    /// is written with one line per emitted read. That writer is closed in a <c>finally</c>
    /// block so it is released even when reading or writing fails part-way.
    /// </remarks>
    /// <returns>A list holding <c>options.OutputFile</c>.</returns>
    public override IEnumerable<string> Process()
    {
      var result = new List<string>();

      // The output is gzip-compressed exactly when its name ends with ".gz".
      var gzipped = options.OutputFile.ToLower().EndsWith(".gz");
      result.Add(options.OutputFile);

      Progress.SetMessage("Processing " + options.InputFile + " and writing to " + options.OutputFile + "...");
      var parser = new FastqReader();
      var writer = new FastqWriter();

      var map = options.GetCountMap();
      StreamWriter swCount = null;
      try
      {
        if (map.HasCountFile)
        {
          swCount = new StreamWriter(options.OutputFile + ".dupcount");
          swCount.WriteLine("Query\tCount\tSequence");
        }

        int readcount = 0;
        using (var sr = StreamUtils.GetReader(options.InputFile))
        using (var sw = StreamUtils.GetWriter(options.OutputFile, gzipped))
        {
          FastqSequence seq;
          while ((seq = parser.Parse(sr)) != null)
          {
            readcount++;
            if (readcount % 100000 == 0)
            {
              Progress.SetMessage("{0} reads processed", readcount);
            }

            // Capture the untouched values first: seq is mutated and re-written below.
            var name = seq.Name;
            var sequence = seq.SeqString;
            var score = seq.Score;
            var len = sequence.Length;
            var count = map.GetCount(seq.Name);

            for (int i = 0; i < 4; i++)
            {
              var newlen = len - i;
              if (newlen < options.MinimumReadLength)
              {
                break;
              }

              // i bases are clipped from the end; nothing is clipped on the first pass.
              var clipped = i == 0 ? string.Empty : sequence.Substring(newlen);

              seq.SeqString = sequence.Substring(0, newlen);
              seq.Score = score.Substring(0, newlen);
              seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
              writer.Write(sw, seq);

              if (swCount != null)
              {
                // NOTE(review): seq.Name is read after Reference was rewritten; presumably it
                // then carries the NTA tag - confirm against FastqSequence.
                swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
              }
            }
          }
        }
      }
      finally
      {
        if (swCount != null)
        {
          swCount.Close();
        }
      }

      Progress.End();

      return result;
    }
Пример #5
0
    /// <summary>
    /// De-duplicates the reads of <c>options.InputFile</c> by sequence string. The first read
    /// carrying a sequence is written to the output FASTQ; every further read with the same
    /// sequence increments that first read's <c>RepeatCount</c>. Reads shorter than
    /// <c>options.MinimumReadLength</c> are ignored.
    /// </summary>
    /// <remarks>
    /// Files produced: the FASTQ file (written via a ".tmp" file and then moved; ".gz" is
    /// appended to the name unless <c>options.Gunzipped</c> is set or it already ends with
    /// ".gz"), a ".dupcount" table, and - only when <c>options.OutputScores</c> is set - a
    /// ".scores" table with per-position counts of each score character.
    /// </remarks>
    /// <returns>The names of the files written, in the order FASTQ, dupcount, scores.</returns>
    public override IEnumerable<string> Process()
    {
      var written = new List<string>();

      var fastqPath = options.OutputFile;
      if (!(options.Gunzipped || fastqPath.ToLower().EndsWith(".gz")))
      {
        fastqPath = fastqPath + ".gz";
      }
      written.Add(fastqPath);

      // Sequence string -> first read seen with that sequence.
      var distinct = new Dictionary<string, FastqSequence>();
      Progress.SetMessage("Processing " + options.InputFile + " and writing to " + fastqPath + "...");
      var fastqReader = new FastqReader();
      var fastqWriter = new FastqWriter();

      var tempPath = fastqPath + ".tmp";

      var processed = 0;
      using (var input = StreamUtils.GetReader(options.InputFile))
      {
        using (var output = StreamUtils.GetWriter(tempPath, !options.Gunzipped))
        {
          FastqSequence read;
          while ((read = fastqReader.Parse(input)) != null)
          {
            processed++;
            if (processed % 100000 == 0)
            {
              Progress.SetMessage("{0} reads processed", processed);
            }

            var bases = read.SeqString;
            if (bases.Length < options.MinimumReadLength)
            {
              continue;
            }

            // representative is the read that owns the counters for this sequence.
            FastqSequence representative;
            var isDuplicate = distinct.TryGetValue(bases, out representative);
            if (isDuplicate)
            {
              representative.RepeatCount++;
            }
            else
            {
              distinct[bases] = read;
              representative = read;
            }

            if (options.OutputScores)
            {
              representative.RepeatScores.Add(read.Score);
            }

            // Only the first occurrence of a sequence reaches the FASTQ output.
            if (!isDuplicate)
            {
              fastqWriter.Write(output, read);
            }
          }
        }
      }

      Progress.End();

      var dupcountPath = Path.ChangeExtension(fastqPath, ".dupcount");
      written.Add(dupcountPath);
      Progress.SetMessage("sort queries ...");
      var ordered = distinct.Values.ToList();

      // Most repeated first; equal counts are ordered by sequence string.
      ordered.Sort((a, b) =>
      {
        var byCount = b.RepeatCount.CompareTo(a.RepeatCount);
        if (byCount != 0)
        {
          return byCount;
        }
        return a.SeqString.CompareTo(b.SeqString);
      });

      Progress.SetMessage("writing duplicate count ...");
      using (var dupcountWriter = new StreamWriter(dupcountPath))
      {
        dupcountWriter.WriteLine("Query\tCount\tSequence");
        foreach (var item in ordered)
        {
          dupcountWriter.WriteLine("{0}\t{1}\t{2}", item.Name, item.RepeatCount, item.SeqString);
        }
      }

      // Swap the temporary file into place, removing any earlier output first.
      if (File.Exists(fastqPath))
      {
        File.Delete(fastqPath);
      }
      File.Move(tempPath, fastqPath);

      if (options.OutputScores)
      {
        Progress.SetMessage("writing score ...");
        var scoresPath = Path.ChangeExtension(fastqPath, ".scores");
        written.Add(scoresPath);
        using (var scoresWriter = new StreamWriter(scoresPath))
        {
          scoresWriter.WriteLine("Query\tSequence\tPosition\tScores");
          foreach (var item in ordered)
          {
            scoresWriter.WriteLine("{0}\t{1}", item.Name, item.SeqString);
            for (var position = 0; position < item.SeqString.Length; position++)
            {
              // How often each score character occurs at this position across all repeats.
              var histogram = new Dictionary<char, int>();
              foreach (var scoreLine in item.RepeatScores)
              {
                var symbol = scoreLine[position];
                if (histogram.ContainsKey(symbol))
                {
                  histogram[symbol] = histogram[symbol] + 1;
                }
                else
                {
                  histogram[symbol] = 1;
                }
              }

              scoresWriter.Write("\t\t{0}\t", position + 1);
              foreach (var scoreChar in histogram.Keys.OrderBy(c => c).ToList())
              {
                scoresWriter.Write("{0}({1})", scoreChar, histogram[scoreChar]);
              }
              scoresWriter.WriteLine();
            }
          }
        }
      }

      return written;
    }
Пример #6
0
    /// <summary>
    /// Splits the reads of <c>options.InputFile</c> into one FASTQ file per barcode.
    /// </summary>
    /// <remarks>
    /// The mapping file is read as whitespace/tab separated lines whose first two fields are
    /// barcode and file name; lines with fewer than two fields are ignored. The barcode of a
    /// read is captured by a regular expression from the end of <c>seq.Reference</c> (a run of
    /// non-whitespace characters following a colon). Reads with a known barcode are written to
    /// that barcode's file, with leading and trailing 'N' bases removed unless
    /// <c>options.UntrimTerminalN</c> is set; reads with other barcodes are only counted.
    /// A summary table of per-barcode counts is written to <c>options.SummaryFile</c> inside
    /// <c>options.OutputDirectory</c>. Output files are opened on first use and closed in a
    /// <c>finally</c> block.
    /// </remarks>
    /// <returns>The output file name of every barcode in the mapping file.</returns>
    /// <exception cref="Exception">A read's reference does not match the barcode pattern.</exception>
    public override IEnumerable<string> Process()
    {
      var result = new List<string>();

      Console.WriteLine("Read mapping file " + options.MappingFile + "...");

      // Parse the mapping file: first field is the barcode, second the file name.
      var entries = new List<BarFile>();
      foreach (var line in File.ReadAllLines(options.MappingFile))
      {
        if (string.IsNullOrWhiteSpace(line))
        {
          continue;
        }

        var fields = line.Split('\t', ' ')
                         .Select(f => f.Trim())
                         .Where(f => f.Length > 0)
                         .ToArray();
        if (fields.Length > 1)
        {
          entries.Add(new BarFile() { Barcode = fields[0], Filename = fields[1] });
        }
      }

      // One target per barcode, with the file name placed inside the output directory.
      var targets = entries
        .Select(e => new BarFile()
        {
          Barcode = e.Barcode,
          Filename = Path.Combine(options.OutputDirectory, e.Filename),
          Stream = null,
          Count = 0
        })
        .ToDictionary(t => t.Barcode);

      Console.WriteLine("There are " + targets.Count.ToString() + " indecies.");
      foreach (var entry in targets.OrderBy(t => t.Key))
      {
        Console.WriteLine("{0}\t{1}", entry.Key, entry.Value.Filename);
      }

      try
      {
        result.AddRange(targets.Select(t => t.Value.Filename));

        var fastqReader = new FastqReader();
        var fastqWriter = new FastqWriter();

        // Barcode -> number of reads, for barcodes absent from the mapping file.
        var unmapped = new Dictionary<string, int>();

        var processed = 0;
        var barcodePattern = new Regex(@".+:\s*(\S+?)\s*$");

        Progress.SetMessage("Processing " + Path.GetFullPath(options.InputFile) + " ...");
        using (var input = StreamUtils.GetReader(options.InputFile))
        {
          for (var seq = fastqReader.Parse(input); seq != null; seq = fastqReader.Parse(input))
          {
            processed++;
            if (processed % 100000 == 0)
            {
              Progress.SetMessage("{0} reads processed", processed);
            }

            var match = barcodePattern.Match(seq.Reference);
            if (!match.Success)
            {
              throw new Exception("Cannot find index from " + seq.Reference);
            }

            var barcode = match.Groups[1].Value;

            BarFile target;
            if (!targets.TryGetValue(barcode, out target))
            {
              // Unknown barcode: just count it for the summary.
              int seen;
              unmapped.TryGetValue(barcode, out seen);
              unmapped[barcode] = seen + 1;
              continue;
            }

            // Output files are created lazily, on the first read of their barcode.
            if (target.Stream == null)
            {
              target.Stream = StreamUtils.GetWriter(target.Filename, target.Filename.ToLower().EndsWith(".gz"));
            }

            if (!options.UntrimTerminalN)
            {
              // Strip trailing 'N' bases, shortening the score string in step.
              while (seq.SeqString.Length > 0 && seq.SeqString[seq.SeqString.Length - 1] == 'N')
              {
                seq.SeqString = seq.SeqString.Substring(0, seq.SeqString.Length - 1);
                seq.Score = seq.Score.Substring(0, seq.Score.Length - 1);
              }

              // Then strip leading 'N' bases the same way.
              while (seq.SeqString.Length > 0 && seq.SeqString[0] == 'N')
              {
                seq.SeqString = seq.SeqString.Substring(1);
                seq.Score = seq.Score.Substring(1);
              }
            }

            fastqWriter.Write(target.Stream, seq);
            target.Count++;
          }
        }

        using (var summary = new StreamWriter(Path.Combine(options.OutputDirectory, options.SummaryFile)))
        {
          summary.WriteLine("Type\tIndex\tCount");

          // Known barcodes in key order, then unknown ones by descending count.
          foreach (var entry in targets.OrderBy(t => t.Key))
          {
            summary.WriteLine("Sample\t{0}\t{1}", entry.Value.Barcode, entry.Value.Count);
          }

          foreach (var entry in unmapped.OrderByDescending(u => u.Value))
          {
            summary.WriteLine("Unmapped\t{0}\t{1}", entry.Key, entry.Value);
          }
        }
      }
      finally
      {
        foreach (var target in targets.Values)
        {
          if (target.Stream != null)
          {
            target.Stream.Close();
          }
        }
      }

      Progress.End();

      return result;
    }
    /// <summary>
    /// Copies the reads of <c>options.InputFile</c> that pass <paramref name="accept"/> to
    /// <paramref name="outputFile"/>, removing a terminal "CCAA", "CCA" or (for reads flagged in
    /// <c>options.CCAFile</c>) "CC" from the sequence and recording the removed bases in the
    /// read name after <c>SmallRNAConsts.NTA_TAG</c>.
    /// </summary>
    /// <param name="accept">Predicate deciding whether a read is processed; rejected reads are skipped.</param>
    /// <param name="map">
    /// Supplies the count of each read by name. When it reports a count file, a
    /// "<paramref name="outputFile"/>.dupcount" table is written as well.
    /// </param>
    /// <param name="outputFile">
    /// Destination FASTQ file, gzip-compressed when the name ends with ".gz". Data is written to
    /// "<paramref name="outputFile"/>.tmp" first and moved into place once complete; an existing
    /// file of that name is replaced.
    /// </param>
    /// <param name="dic">
    /// Tallies keyed by the read's length before clipping; the entry for each processed read is
    /// created on demand and its CCAA / CCA / CC / notNTA counter is increased by the read's count.
    /// </param>
    private void DoProcess(Func<FastqSequence, bool> accept, SmallRNACountMap map, string outputFile, Dictionary<int, CountItem> dic)
    {
      Progress.SetMessage("Processing " + options.InputFile + " and writing to " + outputFile + "...");

      // Read name -> flag from the CCA file; consulted only for reads ending in plain "CC".
      var ccaMap = new MapItemReader(0, 1).ReadFromFile(options.CCAFile).ToDictionary(m => m.Key, m => bool.Parse(m.Value.Value));

      var parser = new FastqReader();
      var writer = new FastqWriter();

      StreamWriter swCount = null;
      try
      {
        // Opened inside the try so the finally block also covers a failing header write.
        if (map.HasCountFile)
        {
          swCount = new StreamWriter(outputFile + ".dupcount");
          swCount.WriteLine("Query\tCount\tSequence");
        }

        int readcount = 0;
        var tmpFile = outputFile + ".tmp";
        using (var sr = StreamUtils.GetReader(options.InputFile))
        {
          using (var sw = StreamUtils.GetWriter(tmpFile, outputFile.ToLower().EndsWith(".gz")))
          {
            FastqSequence seq;
            while ((seq = parser.Parse(sr)) != null)
            {
              readcount++;
              if (readcount % 100000 == 0)
              {
                Progress.SetMessage("{0} reads processed", readcount);
              }

              if (!accept(seq))
              {
                continue;
              }

              var name = seq.Name;
              var sequence = seq.SeqString;
              var score = seq.Score;
              var count = map.GetCount(seq.Name);

              // NOTE(review): a count line is written here for the unclipped read and again
              // after clipping below, so each read yields two lines - confirm this is intended.
              if (swCount != null)
              {
                swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
              }

              CountItem item;
              if (!dic.TryGetValue(sequence.Length, out item))
              {
                item = new CountItem();
                dic[sequence.Length] = item;
              }

              // Longest suffix first, so "CCAA" is not mistaken for a shorter match.
              string clipped;
              if (sequence.EndsWith("CCAA"))
              {
                clipped = "CCAA";
                sequence = sequence.Substring(0, sequence.Length - 4);
                item.CCAA += count;
              }
              else if (sequence.EndsWith("CCA"))
              {
                clipped = "CCA";
                sequence = sequence.Substring(0, sequence.Length - 3);
                item.CCA += count;
              }
              else if (sequence.EndsWith("CC"))
              {
                // A bare "CC" is clipped only for reads flagged true in the CCA file.
                bool isCCA;
                if (ccaMap.TryGetValue(name, out isCCA) && isCCA)
                {
                  clipped = "CC";
                  sequence = sequence.Substring(0, sequence.Length - 2);
                  item.CC += count;
                }
                else
                {
                  clipped = string.Empty;
                  item.notNTA += count;
                }
              }
              else
              {
                clipped = string.Empty;
                item.notNTA += count;
              }

              if (!string.IsNullOrEmpty(clipped))
              {
                // Keep the score string the same length as the shortened sequence.
                seq.SeqString = sequence;
                seq.Score = score.Substring(0, sequence.Length);
                seq.Reference = string.Format("{0}{1}{2}", name, SmallRNAConsts.NTA_TAG, clipped);
              }
              else
              {
                seq.Reference = string.Format("{0}{1}", name, SmallRNAConsts.NTA_TAG);
              }

              writer.Write(sw, seq);
              if (swCount != null)
              {
                swCount.WriteLine("{0}\t{1}\t{2}", seq.Name, count, seq.SeqString);
              }
            }
          }
        }

        // File.Move fails when the destination exists, so remove a stale output first.
        if (File.Exists(outputFile))
        {
          File.Delete(outputFile);
        }
        File.Move(tmpFile, outputFile);
      }
      finally
      {
        if (swCount != null)
        {
          swCount.Close();
        }
      }
    }
Пример #8
0
        /// <summary>
        /// Copies reads from <paramref name="sourceFile"/> to <paramref name="targetFile"/>,
        /// skipping reads whose name is in <paramref name="exceptQueryNames"/>, reads rejected
        /// by <c>Filter</c> (when one is set) and any read whose name was already written.
        /// </summary>
        /// <param name="sourceFile">FASTQ file to read.</param>
        /// <param name="targetFile">FASTQ file to write; gzip-compressed when the name ends with ".gz".</param>
        /// <param name="exceptQueryNames">Read names that must not be written.</param>
        /// <param name="countFile">
        /// Optional count file. When it exists, "<paramref name="targetFile"/>.dupcount" is written
        /// with the name and count of every extracted read.
        /// </param>
        /// <returns>The number of reads written to <paramref name="targetFile"/>.</returns>
        /// <exception cref="UserTerminatedException">Cancellation was pending at a progress checkpoint.</exception>
        /// <exception cref="KeyNotFoundException">
        /// A count file was supplied but holds no entry for an extracted read.
        /// </exception>
        public int Extract(string sourceFile, string targetFile, IEnumerable <string> exceptQueryNames, string countFile)
        {
            int result = 0;

            // Starts as the exclusion list and grows with every name written, which also
            // prevents a name from being written twice.
            var except = new HashSet <string>(exceptQueryNames);

            SmallRNACountMap cm      = new SmallRNACountMap();
            StreamWriter     swCount = null;

            if (File.Exists(countFile))
            {
                // Re-key the counts by the part of the name preceding the NTA tag, matching
                // the names the reads get below.
                var oldCm = new SmallRNACountMap(countFile);
                foreach (var c in oldCm.Counts)
                {
                    cm.Counts[c.Key.StringBefore(SmallRNAConsts.NTA_TAG)] = c.Value;
                }
                swCount = new StreamWriter(targetFile + ".dupcount");
            }

            try
            {
                using (var sw = StreamUtils.GetWriter(targetFile, targetFile.ToLower().EndsWith(".gz")))
                {
                    using (var sr = StreamUtils.GetReader(sourceFile))
                    {
                        FastqReader reader = new FastqReader();
                        FastqWriter writer = new FastqWriter();

                        FastqSequence ss;
                        var           count = 0;
                        while ((ss = reader.Parse(sr)) != null)
                        {
                            count++;

                            if (count % 100000 == 0)
                            {
                                Progress.SetMessage("{0} reads", count);
                                if (Progress.IsCancellationPending())
                                {
                                    throw new UserTerminatedException();
                                }
                            }

                            // Drop the NTA tag and whatever follows it from the read name.
                            ss.Reference = ss.Name.StringBefore(SmallRNAConsts.NTA_TAG) + " " + ss.Description;
                            if (except.Contains(ss.Name))
                            {
                                continue;
                            }

                            if (Filter != null && !Filter.Accept(ss))
                            {
                                continue;
                            }

                            except.Add(ss.Name);
                            writer.Write(sw, ss);

                            if (swCount != null)
                            {
                                // Same exception type as the bare indexer, but naming the read.
                                int readCount;
                                if (!cm.Counts.TryGetValue(ss.Name, out readCount))
                                {
                                    throw new KeyNotFoundException(string.Format("Cannot find {0} in count map", ss.Name));
                                }
                                swCount.WriteLine("{0}\t{1}", ss.Name, readCount);
                            }

                            result++;
                        }
                    }
                }
            }
            finally
            {
                if (swCount != null)
                {
                    swCount.Close();
                }
            }
            return result;
        }
    /// <summary>
    /// Writes to <c>options.OutputFile</c> the reads of <c>options.InputFile</c> that are not
    /// excluded: a read is skipped when its name appears among the mapped reads of
    /// <c>options.XmlFile</c>, is listed in <c>options.ExcludeFile</c>, was already written, or is
    /// rejected by <c>Accept</c> (when one is set).
    /// </summary>
    /// <remarks>
    /// Names are compared after removing <c>SmallRNAConsts.NTA_TAG</c> and whatever follows it.
    /// When <c>options.CountFile</c> exists, "<c>OutputFile</c>.dupcount" is written with the name
    /// and count of every read that was kept.
    /// </remarks>
    /// <returns>A list holding <c>options.OutputFile</c>.</returns>
    /// <exception cref="UserTerminatedException">Cancellation was pending at a progress checkpoint.</exception>
    /// <exception cref="Exception">A kept read has no entry in the count map while counts are being written.</exception>
    public override IEnumerable<string> Process()
    {
      var result = new List<string>();

      var except = new HashSet<string>();
      if (File.Exists(options.XmlFile))
      {
        //exclude the reads mapped to features no matter how many number of mismatch it has
        var allmapped = new FeatureItemGroupXmlFormat().ReadFromFile(options.XmlFile);
        except.UnionWith(from g in allmapped
                         from f in g
                         from l in f.Locations
                         from sl in l.SamLocations
                         select sl.SamLocation.Parent.Qname.StringBefore(SmallRNAConsts.NTA_TAG));
      }

      if (File.Exists(options.ExcludeFile))
      {
        except.UnionWith(from l in File.ReadAllLines(options.ExcludeFile)
                         select l.StringBefore(SmallRNAConsts.NTA_TAG));
      }

      // Make every tagged count also reachable under its untagged name. The keys are
      // snapshotted first because the dictionary is modified inside the loop.
      SmallRNACountMap cm = options.GetCountMap();
      var keys = cm.Counts.Keys.Where(m => m.Contains(SmallRNAConsts.NTA_TAG)).ToArray();
      foreach (var key in keys)
      {
        cm.Counts[key.StringBefore(SmallRNAConsts.NTA_TAG)] = cm.Counts[key];
      }
      StreamWriter swCount = null;
      if (File.Exists(options.CountFile))
      {
        swCount = new StreamWriter(options.OutputFile + ".dupcount");
      }

      Progress.SetMessage("output unmapped query...");
      try
      {
        using (var sw = StreamUtils.GetWriter(options.OutputFile, options.OutputFile.ToLower().EndsWith(".gz")))
        {
          using (var sr = StreamUtils.GetReader(options.InputFile))
          {
            FastqReader reader = new FastqReader();
            FastqWriter writer = new FastqWriter();

            FastqSequence ss;
            var count = 0;
            while ((ss = reader.Parse(sr)) != null)
            {
              count++;

              if (count % 100000 == 0)
              {
                Progress.SetMessage("{0} reads", count);
                if (Progress.IsCancellationPending())
                {
                  throw new UserTerminatedException();
                }
              }

              // Drop the NTA tag and whatever follows it from the read name.
              ss.Reference = ss.Name.StringBefore(SmallRNAConsts.NTA_TAG) + " " + ss.Description;
              if (except.Contains(ss.Name))
              {
                continue;
              }

              if (Accept != null && !Accept(ss))
              {
                continue;
              }

              // Remember the name so that a later read with the same name is skipped.
              except.Add(ss.Name);
              writer.Write(sw, ss);

              if (swCount != null)
              {
                int cmcount;
                if (!cm.Counts.TryGetValue(ss.Name, out cmcount))
                {
                  throw new Exception(string.Format("Cannot find {0} in count map", ss.Name));
                }
                swCount.WriteLine("{0}\t{1}", ss.Name, cmcount);
              }
            }
          }
        }
      }
      finally
      {
        if (swCount != null)
        {
          swCount.Close();
        }
      }

      // Report the file that was produced; the list used to be returned empty.
      result.Add(options.OutputFile);

      Progress.End();

      return result;
    }
Пример #10
0
    /// <summary>
    /// Copies reads from <paramref name="sourceFile"/> to <paramref name="targetFile"/>,
    /// skipping reads whose name is in <paramref name="exceptQueryNames"/>, reads rejected by
    /// <c>Filter</c> (when one is set) and any read whose name was already written.
    /// </summary>
    /// <param name="sourceFile">FASTQ file to read.</param>
    /// <param name="targetFile">FASTQ file to write; gzip-compressed when the name ends with ".gz".</param>
    /// <param name="exceptQueryNames">Read names that must not be written.</param>
    /// <param name="countFile">
    /// Optional count file. When it exists, "<paramref name="targetFile"/>.dupcount" is written
    /// with the name and count of every extracted read.
    /// </param>
    /// <returns>The number of reads written to <paramref name="targetFile"/>.</returns>
    /// <exception cref="UserTerminatedException">Cancellation was pending at a progress checkpoint.</exception>
    /// <exception cref="KeyNotFoundException">
    /// A count file was supplied but holds no entry for an extracted read.
    /// </exception>
    public int Extract(string sourceFile, string targetFile, IEnumerable<string> exceptQueryNames, string countFile)
    {
      int result = 0;

      // Starts as the exclusion list and grows with every name written, which also prevents
      // a name from being written twice.
      var except = new HashSet<string>(exceptQueryNames);

      SmallRNACountMap cm = new SmallRNACountMap();
      StreamWriter swCount = null;
      if (File.Exists(countFile))
      {
        // Re-key the counts by the part of the name preceding the NTA tag, matching the
        // names the reads get below.
        var oldCm = new SmallRNACountMap(countFile);
        foreach (var c in oldCm.Counts)
        {
          cm.Counts[c.Key.StringBefore(SmallRNAConsts.NTA_TAG)] = c.Value;
        }
        swCount = new StreamWriter(targetFile + ".dupcount");
      }

      try
      {
        using (var sw = StreamUtils.GetWriter(targetFile, targetFile.ToLower().EndsWith(".gz")))
        {
          using (var sr = StreamUtils.GetReader(sourceFile))
          {
            FastqReader reader = new FastqReader();
            FastqWriter writer = new FastqWriter();

            FastqSequence ss;
            var count = 0;
            while ((ss = reader.Parse(sr)) != null)
            {
              count++;

              if (count % 100000 == 0)
              {
                Progress.SetMessage("{0} reads", count);
                if (Progress.IsCancellationPending())
                {
                  throw new UserTerminatedException();
                }
              }

              // Drop the NTA tag and whatever follows it from the read name.
              ss.Reference = ss.Name.StringBefore(SmallRNAConsts.NTA_TAG) + " " + ss.Description;
              if (except.Contains(ss.Name))
              {
                continue;
              }

              if (Filter != null && !Filter.Accept(ss))
              {
                continue;
              }

              except.Add(ss.Name);
              writer.Write(sw, ss);

              if (swCount != null)
              {
                // Same exception type as the bare indexer, but naming the read.
                int readCount;
                if (!cm.Counts.TryGetValue(ss.Name, out readCount))
                {
                  throw new KeyNotFoundException(string.Format("Cannot find {0} in count map", ss.Name));
                }
                swCount.WriteLine("{0}\t{1}", ss.Name, readCount);
              }

              result++;
            }
          }
        }
      }
      finally
      {
        if (swCount != null)
        {
          swCount.Close();
        }
      }
      return result;
    }
Пример #11
0
        /// <summary>
        /// Splits the reads of <c>options.InputFile</c> into one FASTQ file per barcode listed in
        /// <c>options.MappingFile</c>, then writes a per-barcode count table to
        /// <c>options.SummaryFile</c> inside <c>options.OutputDirectory</c>.
        /// </summary>
        /// <remarks>
        /// The barcode of a read is the last non-whitespace token after the final ':' of its
        /// reference line. Reads whose barcode is not in the mapping file are only counted and
        /// reported as "Unmapped" in the summary. Unless <c>options.UntrimTerminalN</c> is set,
        /// leading and trailing 'N' bases (and the matching score characters) are removed before
        /// a read is written.
        /// </remarks>
        /// <returns>
        /// The output file name of every barcode in the mapping file. An output file is opened
        /// only when the first read for its barcode is seen, so a returned name may refer to a
        /// file that was never created.
        /// </returns>
        /// <exception cref="Exception">A read's reference line contains no recognisable index.</exception>
        public override IEnumerable<string> Process()
        {
            var result = new List<string>();

            Console.WriteLine("Read mapping file " + options.MappingFile + "...");

            // Every non-blank mapping line is "<barcode> <file name>" separated by tabs and/or
            // spaces; lines with fewer than two non-empty fields are ignored.
            var separators = new[] { '\t', ' ' };
            var dic = File.ReadAllLines(options.MappingFile)
                          .Where(line => !string.IsNullOrWhiteSpace(line))
                          .Select(line => line.Split(separators)
                                              .Select(field => field.Trim())
                                              .Where(field => field.Length > 0)
                                              .ToArray())
                          .Where(fields => fields.Length > 1)
                          .Select(fields => new BarFile()
                          {
                              Barcode  = fields[0],
                              Filename = Path.Combine(options.OutputDirectory, fields[1]),
                              Stream   = null,
                              Count    = 0
                          })
                          .ToDictionary(m => m.Barcode);

            Console.WriteLine("There are " + dic.Count.ToString() + " indecies.");
            foreach (var barcode in dic.Keys.OrderBy(m => m))
            {
                Console.WriteLine("{0}\t{1}", barcode, dic[barcode].Filename);
            }

            try
            {
                result.AddRange(dic.Select(d => d.Value.Filename));

                var parser = new FastqReader();
                var writer = new FastqWriter();

                // Number of reads seen for each barcode that is absent from the mapping file.
                var unfound = new Dictionary<string, int>();

                int readcount = 0;

                // Captures the last whitespace-free token following a ':' in the reference line.
                var reg = new Regex(@".+:\s*(\S+?)\s*$");

                Progress.SetMessage("Processing " + Path.GetFullPath(options.InputFile) + " ...");
                using (var sr = StreamUtils.GetReader(options.InputFile))
                {
                    FastqSequence seq;
                    while ((seq = parser.Parse(sr)) != null)
                    {
                        readcount++;
                        if (readcount % 100000 == 0)
                        {
                            Progress.SetMessage("{0} reads processed", readcount);
                        }

                        var m = reg.Match(seq.Reference);
                        if (!m.Success)
                        {
                            throw new Exception("Cannot find index from " + seq.Reference);
                        }

                        var barcode = m.Groups[1].Value;

                        BarFile file;
                        if (!dic.TryGetValue(barcode, out file))
                        {
                            // TryGetValue leaves 'seen' at 0 for a barcode met for the first time.
                            int seen;
                            unfound.TryGetValue(barcode, out seen);
                            unfound[barcode] = seen + 1;
                            continue;
                        }

                        if (file.Stream == null)
                        {
                            // Open lazily; the second argument is true when the name ends with ".gz".
                            file.Stream = StreamUtils.GetWriter(file.Filename, file.Filename.EndsWith(".gz", StringComparison.OrdinalIgnoreCase));
                        }

                        if (!options.UntrimTerminalN)
                        {
                            // Find the N-free core with one index scan instead of removing one
                            // base (and allocating a new string) per iteration.
                            var bases = seq.SeqString;

                            int end = bases.Length;
                            while (end > 0 && bases[end - 1] == 'N')
                            {
                                end--;
                            }

                            int start = 0;
                            while (start < end && bases[start] == 'N')
                            {
                                start++;
                            }

                            int trailing = bases.Length - end;
                            if (start > 0 || trailing > 0)
                            {
                                // Remove the same number of characters from both ends of the score.
                                seq.SeqString = bases.Substring(start, end - start);
                                seq.Score     = seq.Score.Substring(start, seq.Score.Length - trailing - start);
                            }
                        }

                        writer.Write(file.Stream, seq);
                        file.Count++;
                    }
                }

                using (var sw = new StreamWriter(Path.Combine(options.OutputDirectory, options.SummaryFile)))
                {
                    sw.WriteLine("Type\tIndex\tCount");
                    foreach (var key in dic.Keys.OrderBy(m => m))
                    {
                        var sample = dic[key];
                        sw.WriteLine("Sample\t{0}\t{1}", sample.Barcode, sample.Count);
                    }

                    // Most frequent unmapped barcodes first.
                    foreach (var d in unfound.OrderByDescending(m => m.Value))
                    {
                        sw.WriteLine("Unmapped\t{0}\t{1}", d.Key, d.Value);
                    }
                }
            }
            finally
            {
                // Close every writer that was opened, even when processing failed part-way.
                foreach (var d in dic)
                {
                    if (null != d.Value.Stream)
                    {
                        d.Value.Stream.Close();
                    }
                }
            }

            Progress.End();

            return result;
        }
Пример #12
0
    /// <summary>
    /// Reads all <c>options.InputFiles</c> in lock-step, trims every read according to the
    /// options and writes the i-th input's reads to the i-th entry of <c>options.OutputFiles</c>.
    /// </summary>
    /// <remarks>
    /// Trimming order: keep at most the first <c>options.Last</c> bases, drop the first
    /// <c>options.Start - 1</c> bases, then (if <c>options.TrimN</c>) strip leading and trailing
    /// 'N' bases. Reads shorter than the requested cut positions are clamped rather than
    /// rejected, so a read shorter than <c>options.Start - 1</c> becomes empty. When
    /// <c>options.MinimumLength</c> is positive, a record set is skipped entirely if any of its
    /// reads ends up shorter than that length.
    /// </remarks>
    /// <returns>The entries of <c>options.OutputFiles</c>.</returns>
    /// <exception cref="Exception">
    /// One input ends before the others, or the read names at the same position differ.
    /// </exception>
    public override IEnumerable<string> Process()
    {
      var result = new List<string>();

      result.AddRange(options.OutputFiles);

      Progress.SetMessage("Processing " + options.InputFiles.Merge(",") + " and writing to " + options.OutputFiles.Merge(",") + "...");
      var writer = new FastqWriter();

      int readcount = 0;
      var srs = new List<StreamReader>();
      var sws = new List<StreamWriter>();
      var parsers = new FastqReader();
      try
      {
        Progress.SetMessage("Opening input files ...");
        srs.AddRange(options.InputFiles.Select(input => StreamUtils.GetReader(input)));

        Progress.SetMessage("Opening output files ...");
        sws.AddRange(options.OutputFiles.Select(output => StreamUtils.GetWriter(output, output.EndsWith(".gz", StringComparison.OrdinalIgnoreCase) || options.Gzipped)));

        Progress.SetMessage("Reading sequences ...");
        while (true)
        {
          // One record from every input; a null entry means that input is exhausted.
          var seqs = srs.Select(sr => parsers.Parse(sr)).ToArray();

          if (seqs.All(m => m == null))
          {
            break;
          }

          if (seqs.Any(m => m == null))
          {
            throw new Exception("The data is not properly paired :" + (from s in seqs where s != null select s.Name).Merge(" ! "));
          }

          if (seqs.Length > 1)
          {
            // Compare names after cutting everything from the first space and any "/1" or "/2".
            var names = seqs.Select(s => s.Name.StringBefore(" ").StringBefore("/1").StringBefore("/2")).ToArray();
            if (names.Any(m => !m.Equals(names[0])))
            {
              throw new Exception("The data is not properly paired: " + names.Merge(" ! "));
            }
          }

          readcount++;
          if (readcount % 100000 == 0)
          {
            Progress.SetMessage("{0} reads processed", readcount);
          }

          if (options.Last > 0)
          {
            foreach (var read in seqs)
            {
              // Clamp so that reads already shorter than Last are kept unchanged instead of
              // making Substring throw ArgumentOutOfRangeException.
              read.SeqString = read.SeqString.Substring(0, Math.Min(options.Last, read.SeqString.Length));
              read.Score = read.Score.Substring(0, Math.Min(options.Last, read.Score.Length));
            }
          }

          if (options.Start > 1)
          {
            foreach (var read in seqs)
            {
              // Clamp so that reads shorter than Start - 1 become empty instead of throwing.
              read.SeqString = read.SeqString.Substring(Math.Min(options.Start - 1, read.SeqString.Length));
              read.Score = read.Score.Substring(Math.Min(options.Start - 1, read.Score.Length));
            }
          }

          if (options.TrimN)
          {
            foreach (var read in seqs)
            {
              // Find the N-free core with one ordinal index scan.
              var bases = read.SeqString;

              int start = 0;
              while (start < bases.Length && bases[start] == 'N')
              {
                start++;
              }

              int end = bases.Length;
              while (end > start && bases[end - 1] == 'N')
              {
                end--;
              }

              int trailing = bases.Length - end;
              if (start > 0 || trailing > 0)
              {
                // Remove the same number of characters from both ends of the score.
                read.SeqString = bases.Substring(start, end - start);
                read.Score = read.Score.Substring(start, read.Score.Length - trailing - start);
              }
            }
          }

          // Drop the whole record set when any of its reads is too short.
          if (options.MinimumLength > 0 && seqs.Any(m => m.SeqString.Length < options.MinimumLength))
          {
            continue;
          }

          for (int i = 0; i < seqs.Length; i++)
          {
            writer.Write(sws[i], seqs[i]);
          }
        }
      }
      finally
      {
        srs.ForEach(m => m.Close());
        sws.ForEach(m => m.Close());
      }

      Progress.End();

      return result;
    }
Пример #13
0
    /// <summary>
    /// For every read in <c>options.InputFile</c> whose sequence ends with "CC", looks the read
    /// up in <c>options.UntrimmedFile</c> and records whether the base following the trimmed
    /// sequence there is 'A'.
    /// </summary>
    /// <remarks>
    /// The result is a tab-separated table (<c>Name</c>, <c>IsCCA</c>) written to
    /// <c>options.OutputFile</c>. Names never seen in the untrimmed file are written to
    /// <c>options.OutputFile + ".unfound"</c> before the method fails.
    /// </remarks>
    /// <returns>A single-element array containing <c>options.OutputFile</c>.</returns>
    /// <exception cref="Exception">
    /// A trimmed sequence does not occur in its untrimmed read, or some "CC" reads are missing
    /// from the untrimmed file.
    /// </exception>
    public override IEnumerable<string> Process()
    {
      var parser = new FastqReader();

      Progress.SetMessage("Reading " + options.InputFile + "...");

      // Read name -> trimmed sequence, for reads that end with "CC".
      var ccs = new Dictionary<string, string>();
      using (var sr = StreamUtils.GetReader(options.InputFile))
      {
        FastqSequence seq;
        int readcount = 0;
        while ((seq = parser.Parse(sr)) != null)
        {
          readcount++;
          if (readcount % 100000 == 0)
          {
            Progress.SetMessage("{0} / {1} reads end with CC found", ccs.Count, readcount);
          }

          // Sequence data is compared ordinally; culture rules have no meaning here.
          if (seq.SeqString.EndsWith("CC", StringComparison.Ordinal))
          {
            ccs[seq.Name] = seq.SeqString;
          }
        }
      }

      Progress.SetMessage("Processing " + options.UntrimmedFile + " and writing to " + options.OutputFile + "...");
      using (var sr = StreamUtils.GetReader(options.UntrimmedFile))
      {
        using (var sw = new StreamWriter(options.OutputFile))
        {
          sw.WriteLine("Name\tIsCCA");
          FastqSequence seq;
          int readcount = 0;

          // NOTE(review): presumably makes the parser return only reads whose name is still in
          // ccs - confirm against FastqReader.AcceptName.
          parser.AcceptName = m => ccs.ContainsKey(m);

          while ((seq = parser.Parse(sr)) != null)
          {
            readcount++;
            if (readcount % 10000 == 0)
            {
              Progress.SetMessage("{0} reads end with CC processed", readcount);
            }

            string sequence = ccs[seq.Name];
            var pos = seq.SeqString.IndexOf(sequence, StringComparison.Ordinal);
            if (pos == -1)
            {
              throw new Exception(string.Format("Cannot find trimmed sequence {0} in untrimmed sequence {1} of read {2}", sequence, seq.SeqString, seq.Name));
            }

            // Whatever follows the trimmed sequence in the untrimmed read.
            var nextseq = seq.SeqString.Substring(pos + sequence.Length);
            sw.WriteLine("{0}\t{1}", seq.Name, nextseq.StartsWith("A", StringComparison.Ordinal));

            // Handled; whatever remains in ccs afterwards was never found.
            ccs.Remove(seq.Name);
          }
        }
      }

      if (ccs.Count != 0)
      {
        var unfoundFile = options.OutputFile + ".unfound";
        using (var sw = new StreamWriter(unfoundFile))
        {
          foreach (var name in ccs.Keys)
          {
            sw.WriteLine(name);
          }
        }
        throw new Exception(string.Format("Cannot find {0} reads in untrimmed file, saved to {1}", ccs.Count, unfoundFile));
      }

      Progress.End();

      return new[] { options.OutputFile };
    }