/// <summary>
        /// Merges the count files returned by <c>_options.GetCountFiles()</c> into one tab-delimited
        /// table written to <c>_options.OutputFile</c> (one row per feature, one column per sample).
        /// When the map file exists, its value column (and, if <c>_options.ExportExtra</c> is set, its
        /// information columns) are written after the feature key. Afterwards the protein-coding
        /// and FPKM outputs are produced when the name map carries the "gene_biotype" / "length"
        /// information columns.
        /// </summary>
        /// <returns>An array holding only the full path of <c>_options.OutputFile</c>.</returns>
        public override IEnumerable<string> Process()
        {
            var countfiles = _options.GetCountFiles();

            var reader = new MapItemReader(_options.KeyIndex, _options.ValueIndex, informationIndex: _options.InformationIndex, hasHeader: !_options.HasNoHeader);

            // Stop reading a count file once the trailing summary rows start.
            reader.CheckEnd = m => m.StartsWith("__no_feature") || m.StartsWith("no_feature");

            // The key pattern is loop-invariant: build it once and reuse it for every count file,
            // for the name map and for the key filter below.
            var hasKeyRegex = !string.IsNullOrEmpty(_options.KeyRegex);
            var keyRegex    = hasKeyRegex ? new Regex(_options.KeyRegex) : null;

            var counts = new List<SampleData>();

            foreach (var file in countfiles)
            {
                Progress.SetMessage("Reading data from {0} ...", file.File);
                var data = reader.ReadFromFile(file.File);
                if (hasKeyRegex)
                {
                    // Re-key every entry by the first capture group of the key pattern.
                    counts.Add(new SampleData()
                    {
                        Name = file.Name,
                        Data = data.ToDictionary(l => keyRegex.Match(l.Key).Groups[1].Value, l => l.Value)
                    });
                }
                else
                {
                    counts.Add(new SampleData()
                    {
                        Name = file.Name,
                        Data = data
                    });
                }
            }

            MapData namemap = null;

            if (File.Exists(_options.MapFile))
            {
                Progress.SetMessage("Reading name map from {0} ...", _options.MapFile);
                namemap = new MapDataReader(0, 1).ReadFromFile(_options.MapFile);

                if (hasKeyRegex)
                {
                    // Apply the same re-keying to the name map so its keys line up with the count keys.
                    namemap.Data = namemap.Data.ToDictionary(l => keyRegex.Match(l.Key).Groups[1].Value, l => l.Value);
                }
            }

            if (hasKeyRegex)
            {
                Progress.SetMessage("Filtering key by pattern {0} ...", _options.KeyRegex);
                foreach (var sample in counts)
                {
                    // Snapshot the rejected keys first; removing while enumerating Keys is not allowed.
                    var rejected = sample.Data.Keys.Where(k => !keyRegex.Match(k).Success).ToList();
                    foreach (var key in rejected)
                    {
                        sample.Data.Remove(key);
                    }
                }
            }

            var features = (from c in counts
                            from k in c.Data.Keys
                            select k).Distinct().OrderBy(m => m).ToList();

            // Placeholder written for a sample that does not report a feature.
            var missing = _options.FillMissingWithZero ? "0" : "NA";

            var outputExtra = _options.ExportExtra && namemap != null && namemap.InfoNames.Count > 0;

            Progress.SetMessage("Writing {0} features to {1} ...", features.Count, _options.OutputFile);
            using (var sw = new StreamWriter(_options.OutputFile))
            {
                // Header: Feature [Feature_<info>...] [Feature_<value name>] <sample names...>
                sw.Write("Feature");

                if (namemap != null)
                {
                    if (outputExtra)
                    {
                        sw.Write("\t{0}", (from v in namemap.InfoNames select "Feature_" + v).Merge("\t"));
                    }
                    sw.Write("\tFeature_{0}", namemap.ValueName);
                }

                sw.WriteLine("\t" + (from c in counts select c.Name).Merge("\t"));

                foreach (var feature in features)
                {
                    // Skip features whose value is empty or "0" in every sample that reports them.
                    var observed = from count in counts
                                   where count.Data.ContainsKey(feature)
                                   select count.Data[feature];
                    if (observed.All(m => string.IsNullOrEmpty(m.Value) || m.Value.Equals("0")))
                    {
                        continue;
                    }

                    sw.Write(feature);
                    if (namemap != null)
                    {
                        if (namemap.Data.ContainsKey(feature))
                        {
                            if (outputExtra)
                            {
                                sw.Write("\t{0}", namemap.Data[feature].Informations.Merge("\t"));
                            }
                            sw.Write("\t{0}", namemap.Data[feature].Value);
                        }
                        else
                        {
                            // No direct hit: treat the key as "part+part[:suffix]" and map each
                            // '+'-separated part on its own, re-appending the ":suffix" tail.
                            var fea    = feature.StringBefore(":");
                            var suffix = feature.Contains(":") ? ":" + feature.StringAfter(":") : string.Empty;
                            var feas   = fea.Split('+');

                            if (!feas.Any(m => namemap.Data.ContainsKey(m)))
                            {
                                // Nothing maps: blank information columns and the raw feature as its name.
                                if (outputExtra)
                                {
                                    sw.Write("\t{0}", (from f in namemap.InfoNames select string.Empty).Merge("\t"));
                                }
                                sw.Write("\t{0}", feature);
                            }
                            else
                            {
                                if (outputExtra)
                                {
                                    // One column per information field; parts are joined with ';'.
                                    for (int i = 0; i < namemap.InfoNames.Count; i++)
                                    {
                                        sw.Write("\t{0}", (from f in feas
                                                           select namemap.Data.ContainsKey(f) ? namemap.Data[f].Informations[i] : string.Empty).Merge(";"));
                                    }
                                }
                                // Unmapped parts keep their original text.
                                sw.Write("\t{0}", (from f in feas
                                                   select namemap.Data.ContainsKey(f) ? namemap.Data[f].Value : f).Merge("+") + suffix);
                            }
                        }
                    }

                    foreach (var count in counts)
                    {
                        if (count.Data.ContainsKey(feature))
                        {
                            sw.Write("\t" + count.Data[feature].Value);
                        }
                        else
                        {
                            sw.Write("\t" + missing);
                        }
                    }
                    sw.WriteLine();
                }
            }

            // Output the protein-coding count table when the name map has a "gene_biotype" column.
            bool hasProteinCoding = namemap != null && namemap.InfoNames.Count > 0 && namemap.InfoNames.Contains("gene_biotype");

            if (hasProteinCoding)
            {
                WriteProteincodingFile(_options.OutputFile, ".count");
            }

            if (!_options.NoFPKM)
            {
                // The FPKM calculation is only attempted when the name map has a "length" column.
                bool hasLength = namemap != null && namemap.InfoNames.Count > 0 && namemap.InfoNames.Contains("length");
                if (hasLength)
                {
                    Progress.SetMessage("Calculating FPKM values...");
                    var outputFile = Path.ChangeExtension(_options.OutputFile, ".fpkm.tsv");
                    new HTSeqCountToFPKMCalculator(new HTSeqCountToFPKMCalculatorOptions()
                    {
                        InputFile      = _options.OutputFile,
                        GeneLengthFile = _options.MapFile,
                        KeyRegex       = _options.KeyRegex,
                        OutputFile     = outputFile
                    })
                    {
                        Progress = this.Progress
                    }.Process();

                    if (hasProteinCoding)
                    {
                        WriteProteincodingFile(outputFile, ".tsv");
                    }
                }
            }

            return new[] { Path.GetFullPath(_options.OutputFile) };
        }
    /// <summary>
    /// Groups the files found by <c>GetFiles(_options.InputDir)</c> by sample name and renders them
    /// as a <c>files =&gt; { "name" =&gt; ["path", ...], },</c> definition. When
    /// <c>_options.GroupPattern</c> is set, a matching <c>groups =&gt; { ... },</c> definition is appended.
    /// </summary>
    /// <remarks>
    /// The sample name is the file name (or the parent directory name when both
    /// <c>_options.Recursion</c> and <c>_options.UseDirName</c> are set), reduced either to the file name
    /// without extension or to the concatenated capture groups of <c>_options.NamePattern</c>.
    /// Names can then be zero-padded (<c>_options.AutoFill</c>) and translated through the map file.
    /// </remarks>
    /// <returns>The generated definition lines.</returns>
    /// <exception cref="KeyNotFoundException">A sample name has no entry in the name map file.</exception>
    /// <exception cref="Exception">A sample name does not match <c>_options.GroupPattern</c>.</exception>
    public override IEnumerable<string> Process()
    {
      var files = GetFiles(_options.InputDir);

      // Concatenates all explicit capture groups; group 0 (the whole match) is skipped.
      Func<Match, string> joinGroups = hit => hit.Groups.Cast<Group>().Skip(1).Select(g => g.Value).Merge("");

      Func<string, string> nameFunc;
      if (string.IsNullOrEmpty(_options.NamePattern))
      {
        nameFunc = Path.GetFileNameWithoutExtension;
      }
      else
      {
        nameFunc = n =>
        {
          var match = Regex.Match(n, _options.NamePattern);
          // Names the pattern does not recognise are kept verbatim.
          return match.Success ? joinGroups(match) : n;
        };
      }

      var useDirName = _options.Recursion && _options.UseDirName;
      var map = files.GroupBy(path => nameFunc(Path.GetFileName(useDirName ? Path.GetDirectoryName(path) : path)))
                     .ToDictionary(n => n.Key);

      var names = map.Keys.OrderBy(k => k).ToList();

      if (_options.AutoFill)
      {
        var nameMap = names.ToDictionary(l => l, l => l);
        var number = new Regex(@"(.+?)(\d+)$");

        var numbers = (from n in names
                       let m = number.Match(n)
                       where m.Success
                       select new { OldName = n, Prefix = m.Groups[1].Value, Value = m.Groups[2].Value }).ToList();

        // Max() throws InvalidOperationException on an empty sequence, so padding is only
        // attempted when at least one name ends with digits.
        if (numbers.Count > 0)
        {
          var numberMax = numbers.Max(l => l.Value.Length);
          foreach (var num in numbers)
          {
            if (num.Value.Length != numberMax)
            {
              // Left-pad the numeric suffix with zeros to the widest suffix seen.
              nameMap[num.OldName] = num.Prefix + num.Value.PadLeft(numberMax, '0');
            }
          }

          map = map.ToDictionary(l => nameMap[l.Key], l => l.Value);
          names = map.Keys.OrderBy(k => k).ToList();
        }
      }

      if (File.Exists(_options.MapFile))
      {
        Progress.SetMessage("Reading name map from {0} ...", _options.MapFile);
        var namemap = new MapDataReader(0, 1).ReadFromFile(_options.MapFile);

        // Report the offending key and file instead of the bare indexer failure.
        var unmapped = map.Keys.FirstOrDefault(k => !namemap.Data.ContainsKey(k));
        if (unmapped != null)
        {
          throw new KeyNotFoundException(string.Format("Cannot find key {0} in name map file {1}", unmapped, _options.MapFile));
        }

        map = map.ToDictionary(l => namemap.Data[l.Key].Value, l => l.Value);
        names = map.Keys.OrderBy(k => k).ToList();
      }

      var result = new List<string> { "files => {" };
      foreach (var name in names)
      {
        result.Add(string.Format("  \"{0}\" => [{1}],", name, (from l in map[name] select '"' + Path.GetFullPath(l) + '"').Merge(", ")));
      }
      result.Add("},");

      if (string.IsNullOrEmpty(_options.GroupPattern))
      {
        return result;
      }

      var groupmap = names.GroupBy(n =>
      {
        var match = Regex.Match(n, _options.GroupPattern);
        if (!match.Success)
        {
          // Report the pattern that was actually applied (the group pattern, not the name pattern).
          throw new Exception(string.Format("Cannot find pattern {0} in file {1}", _options.GroupPattern, n));
        }

        return joinGroups(match);
      });

      var gnames = groupmap.OrderBy(k => k.Key).ToList();

      result.Add("groups => {");
      foreach (var name in gnames)
      {
        result.Add(string.Format("  \"{0}\" => [{1}],", name.Key, (from l in name
                                                                   orderby l
                                                                   select '"' + l + '"').Merge(", ")));
      }
      result.Add("},");

      return result;
    }
        /// <summary>
        /// Collects the files found by <c>GetFiles(_options.InputDir)</c> into one <c>FileItem</c> per
        /// sample, resolving each item's sample name, group name and file paths.
        /// </summary>
        /// <remarks>
        /// Steps, in order: group files by sample name (file name, or parent directory name when
        /// both <c>_options.Recursion</c> and <c>_options.UseDirName</c> are set); optionally zero-pad
        /// trailing numbers (<c>_options.AutoFill</c>); translate names through the map file, taking
        /// the group from its "Group" column when present; derive the group from
        /// <c>_options.GroupPattern</c> when set (this overrides the map-file group); expand file
        /// names to full paths unless the input directory starts with "gs://".
        /// </remarks>
        /// <returns>The file items sorted by sample name.</returns>
        /// <exception cref="Exception">
        /// No file was found, a sample name is missing from the name map file, or a sample name
        /// does not match <c>_options.GroupPattern</c>.
        /// </exception>
        private List<FileItem> GetFileItems()
        {
            var files = GetFiles(_options.InputDir);

            if (files.Length == 0)
            {
                throw new Exception("No file found in folder " + _options.InputDir);
            }

            if (_options.Verbose)
            {
                foreach (var file in files)
                {
                    Progress.SetMessage("{0}", file);
                }
            }

            // Concatenates all explicit capture groups; group 0 (the whole match) is skipped.
            Func<Match, string> joinGroups = hit => hit.Groups.Cast<Group>().Skip(1).Select(g => g.Value).Merge("");

            Func<string, string> nameFunc;

            if (string.IsNullOrEmpty(_options.NamePattern))
            {
                nameFunc = Path.GetFileNameWithoutExtension;
            }
            else
            {
                nameFunc = n =>
                {
                    var match = Regex.Match(n, _options.NamePattern);
                    // Names the pattern does not recognise are kept verbatim.
                    return match.Success ? joinGroups(match) : n;
                };
            }

            var isGoogleStorage = _options.InputDir.StartsWith("gs://");
            var useDirName      = _options.Recursion && _options.UseDirName;

            var result = files.GroupBy(m =>
            {
                // Only the grouping key is derived from the rewritten path; the stored file names
                // keep their original text.
                var path = isGoogleStorage ? m.Replace("gs:/", "") : m;
                return nameFunc(Path.GetFileName(useDirName ? Path.GetDirectoryName(path) : path));
            }).ToList().ConvertAll(n => new FileItem()
            {
                SampleName = n.Key,
                FileNames  = n.ToList(),
                GroupName  = string.Empty
            });

            if (_options.Verbose)
            {
                foreach (var file in result)
                {
                    Progress.SetMessage("{0} => {1}", file.SampleName, file.FileNames.Merge(","));
                }
            }

            if (_options.AutoFill)
            {
                var nameMap = result.ToDictionary(l => l.SampleName, l => l);
                var number  = new Regex(@"(.+?)(\d+)$");

                var numbers = (from n in nameMap.Keys
                               let m = number.Match(n)
                               where m.Success
                               select new { OldName = n, Prefix = m.Groups[1].Value, Value = m.Groups[2].Value }).ToList();

                // Max() throws InvalidOperationException on an empty sequence, so padding is only
                // attempted when at least one sample name ends with digits.
                if (numbers.Count > 0)
                {
                    var numberMax = numbers.Max(l => l.Value.Length);
                    foreach (var num in numbers)
                    {
                        if (num.Value.Length != numberMax)
                        {
                            // Left-pad the numeric suffix with zeros to the widest suffix seen.
                            nameMap[num.OldName].SampleName = num.Prefix + num.Value.PadLeft(numberMax, '0');
                        }
                    }
                }
            }

            if (File.Exists(_options.MapFile))
            {
                var namemap = new MapDataReader(0, 1).ReadFromFile(_options.MapFile);
                if (_options.Verbose)
                {
                    Progress.SetMessage("Reading name map from {0} ...", _options.MapFile);
                    Progress.SetMessage("Current sample name ...");
                    foreach (var file in result)
                    {
                        Progress.SetMessage("{0}", file.SampleName);
                    }

                    Progress.SetMessage("New map ...");
                    foreach (var name in namemap.Data)
                    {
                        Progress.SetMessage("{0} => {1}", name.Key, name.Value.Value);
                    }
                }

                var nameMap = result.ToDictionary(l => l.SampleName, l => l);

                // -1 when the map file has no "Group" information column.
                var groupIndex = namemap.InfoNames.IndexOf("Group");
                foreach (var name in nameMap.Keys)
                {
                    if (!namemap.Data.ContainsKey(name))
                    {
                        throw new Exception(string.Format("Cannot find key {0} in name map file {1}", name, _options.MapFile));
                    }

                    nameMap[name].SampleName = namemap.Data[name].Value;
                    if (groupIndex != -1)
                    {
                        nameMap[name].GroupName = namemap.Data[name].Informations[groupIndex].ToString();
                    }
                }
            }

            if (!string.IsNullOrEmpty(_options.GroupPattern))
            {
                foreach (var file in result)
                {
                    var match = Regex.Match(file.SampleName, _options.GroupPattern);
                    if (!match.Success)
                    {
                        // Report the pattern that was actually applied (the group pattern, not the name pattern).
                        throw new Exception(string.Format("Cannot find pattern {0} in file {1}", _options.GroupPattern, file.SampleName));
                    }

                    file.GroupName = joinGroups(match);
                }
            }

            // gs:// locations are kept as given; everything else is expanded to a full path.
            if (!isGoogleStorage)
            {
                foreach (var file in result)
                {
                    file.FileNames = file.FileNames.Select(f => Path.GetFullPath(f)).ToList();
                }
            }

            result.Sort((m1, m2) => m1.SampleName.CompareTo(m2.SampleName));
            return result;
        }
// Example #4
// 0
    /// <summary>
    /// Merges the count files returned by <c>_options.GetCountFiles()</c> into one tab-delimited
    /// table written to <c>_options.OutputFile</c> (one row per feature, one column per sample).
    /// When the map file exists, its value column (and, if <c>_options.ExportExtra</c> is set, its
    /// information columns) are written after the feature key. Unless <c>_options.NoFPKM</c> is set,
    /// an FPKM table is then calculated when the first line of the map file contains "length".
    /// </summary>
    /// <returns>An array holding only the full path of <c>_options.OutputFile</c>.</returns>
    public override IEnumerable<string> Process()
    {
      var countfiles = _options.GetCountFiles();

      var reader = new MapItemReader(_options.KeyIndex, _options.ValueIndex, informationIndex: _options.InformationIndex, hasHeader: !_options.HasNoHeader);

      // Stop reading a count file once the trailing summary rows start.
      reader.CheckEnd = m => m.StartsWith("__no_feature") || m.StartsWith("no_feature");

      // The key pattern is loop-invariant: build it once and reuse it for every count file,
      // for the name map and for the key filter below.
      var hasKeyRegex = !string.IsNullOrEmpty(_options.KeyRegex);
      var keyRegex = hasKeyRegex ? new Regex(_options.KeyRegex) : null;

      var counts = new List<SampleData>();
      foreach (var file in countfiles)
      {
        Progress.SetMessage("Reading data from {0} ...", file.File);
        var data = reader.ReadFromFile(file.File);
        if (hasKeyRegex)
        {
          // Re-key every entry by the first capture group of the key pattern.
          counts.Add(new SampleData() { Name = file.Name, Data = data.ToDictionary(l => keyRegex.Match(l.Key).Groups[1].Value, l => l.Value) });
        }
        else
        {
          counts.Add(new SampleData() { Name = file.Name, Data = data });
        }
      }

      MapData namemap = null;
      if (File.Exists(_options.MapFile))
      {
        Progress.SetMessage("Reading name map from {0} ...", _options.MapFile);
        namemap = new MapDataReader(0, 1).ReadFromFile(_options.MapFile);

        if (hasKeyRegex)
        {
          // Apply the same re-keying to the name map so its keys line up with the count keys.
          namemap.Data = namemap.Data.ToDictionary(l => keyRegex.Match(l.Key).Groups[1].Value, l => l.Value);
        }
      }

      if (hasKeyRegex)
      {
        Progress.SetMessage("Filtering key by pattern {0} ...", _options.KeyRegex);
        foreach (var sample in counts)
        {
          // Snapshot the rejected keys first; removing while enumerating Keys is not allowed.
          var rejected = sample.Data.Keys.Where(k => !keyRegex.Match(k).Success).ToList();
          foreach (var key in rejected)
          {
            sample.Data.Remove(key);
          }
        }
      }

      var features = (from c in counts
                      from k in c.Data.Keys
                      select k).Distinct().OrderBy(m => m).ToList();

      // Placeholder written for a sample that does not report a feature.
      var missing = _options.FillMissingWithZero ? "0" : "NA";

      var outputExtra = _options.ExportExtra && namemap != null && namemap.InfoNames.Count > 0;

      Progress.SetMessage("Writing {0} features to {1} ...", features.Count, _options.OutputFile);
      using (var sw = new StreamWriter(_options.OutputFile))
      {
        // Header: Feature [Feature_<info>...] [Feature_<value name>] <sample names...>
        sw.Write("Feature");

        if (namemap != null)
        {
          if (outputExtra)
          {
            sw.Write("\t{0}", (from v in namemap.InfoNames select "Feature_" + v).Merge("\t"));
          }
          sw.Write("\tFeature_{0}", namemap.ValueName);
        }

        sw.WriteLine("\t" + (from c in counts select c.Name).Merge("\t"));

        foreach (var feature in features)
        {
          // Skip features whose value is empty or "0" in every sample that reports them.
          var observed = from count in counts
                         where count.Data.ContainsKey(feature)
                         select count.Data[feature];
          if (observed.All(m => string.IsNullOrEmpty(m.Value) || m.Value.Equals("0")))
          {
            continue;
          }

          sw.Write(feature);
          if (namemap != null)
          {
            if (namemap.Data.ContainsKey(feature))
            {
              if (outputExtra)
              {
                sw.Write("\t{0}", namemap.Data[feature].Informations.Merge("\t"));
              }
              sw.Write("\t{0}", namemap.Data[feature].Value);
            }
            else
            {
              // No direct hit: treat the key as "part+part[:suffix]" and map each
              // '+'-separated part on its own, re-appending the ":suffix" tail.
              var fea = feature.StringBefore(":");
              var suffix = feature.Contains(":") ? ":" + feature.StringAfter(":") : string.Empty;
              var feas = fea.Split('+');

              if (!feas.Any(m => namemap.Data.ContainsKey(m)))
              {
                // Nothing maps: blank information columns and the raw feature as its name.
                if (outputExtra)
                {
                  sw.Write("\t{0}", (from f in namemap.InfoNames select string.Empty).Merge("\t"));
                }
                sw.Write("\t{0}", feature);
              }
              else
              {
                if (outputExtra)
                {
                  // One column per information field; parts are joined with ';'.
                  for (int i = 0; i < namemap.InfoNames.Count; i++)
                  {
                    sw.Write("\t{0}", (from f in feas
                                       select namemap.Data.ContainsKey(f) ? namemap.Data[f].Informations[i] : string.Empty).Merge(";"));
                  }
                }
                // Unmapped parts keep their original text.
                sw.Write("\t{0}", (from f in feas
                                   select namemap.Data.ContainsKey(f) ? namemap.Data[f].Value : f).Merge("+") + suffix);
              }
            }
          }

          foreach (var count in counts)
          {
            if (count.Data.ContainsKey(feature))
            {
              sw.Write("\t" + count.Data[feature].Value);
            }
            else
            {
              sw.Write("\t" + missing);
            }
          }
          sw.WriteLine();
        }
      }

      if (!_options.NoFPKM && File.Exists(_options.MapFile))
      {
        // Only the first line of the map file is inspected for a "length" column.
        bool hasLength = false;
        using (var sr = new StreamReader(_options.MapFile))
        {
          var line = sr.ReadLine();
          if (line != null)
          {
            hasLength = line.Contains("length");
          }
        }

        if (hasLength)
        {
          Progress.SetMessage("Calculating FPKM values...");
          new HTSeqCountToFPKMCalculator(new HTSeqCountToFPKMCalculatorOptions()
          {
            InputFile = _options.OutputFile,
            GeneLengthFile = _options.MapFile,
            KeyRegex = _options.KeyRegex,
            OutputFile = Path.ChangeExtension(_options.OutputFile, ".fpkm.tsv")
          })
          {
            Progress = this.Progress
          }.Process();
        }
      }

      return new[] { Path.GetFullPath(_options.OutputFile) };
    }