/// <summary>
/// Merges the count files returned by <c>_options.GetCountFiles()</c> into one
/// tab-delimited table written to <c>_options.OutputFile</c>: one row per feature,
/// one column per sample. When <c>_options.MapFile</c> exists, feature names (and,
/// if requested, the extra information columns) from the map are added. Afterwards
/// the protein-coding table and the FPKM table are produced when the map provides
/// the "gene_biotype" / "length" columns.
/// </summary>
/// <returns>A single-element array holding the full path of the output file.</returns>
public override IEnumerable<string> Process()
{
    var countfiles = _options.GetCountFiles();

    var reader = new MapItemReader(_options.KeyIndex, _options.ValueIndex, informationIndex: _options.InformationIndex, hasHeader: !_options.HasNoHeader);
    // Stop reading a count file at the summary lines.
    reader.CheckEnd = m => m.StartsWith("__no_feature") || m.StartsWith("no_feature");

    // Build the key regex once instead of once per input file / per step.
    Regex keyRegex = string.IsNullOrEmpty(_options.KeyRegex) ? null : new Regex(_options.KeyRegex);

    var counts = new List<SampleData>();
    foreach (var file in countfiles)
    {
        Progress.SetMessage("Reading data from {0} ...", file.File);
        var data = reader.ReadFromFile(file.File);
        if (keyRegex != null)
        {
            // Re-key every entry by the first capture group of the key pattern.
            counts.Add(new SampleData()
            {
                Name = file.Name,
                Data = data.ToDictionary(l => keyRegex.Match(l.Key).Groups[1].Value, l => l.Value)
            });
        }
        else
        {
            counts.Add(new SampleData()
            {
                Name = file.Name,
                Data = data
            });
        }
    }

    MapData namemap = null;
    if (File.Exists(_options.MapFile))
    {
        Progress.SetMessage("Reading name map from {0} ...", _options.MapFile);
        namemap = new MapDataReader(0, 1).ReadFromFile(_options.MapFile);
        if (keyRegex != null)
        {
            // Apply the same re-keying to the map so lookups agree with the counts.
            namemap.Data = namemap.Data.ToDictionary(l => keyRegex.Match(l.Key).Groups[1].Value, l => l.Value);
        }
    }

    if (keyRegex != null)
    {
        Progress.SetMessage("Filtering key by pattern {0} ...", _options.KeyRegex);
        // NOTE(review): keys were already rewritten to capture group 1 above, so this
        // tests the rewritten key against the full pattern - confirm this is intended.
        counts.ForEach(m =>
        {
            // Snapshot the keys because entries are removed while iterating.
            var keys = m.Data.Keys.ToList();
            foreach (var key in keys)
            {
                if (!keyRegex.Match(key).Success)
                {
                    m.Data.Remove(key);
                }
            }
        });
    }

    var features = (from c in counts
                    from k in c.Data.Keys
                    select k).Distinct().OrderBy(m => m).ToList();

    var missing = _options.FillMissingWithZero ? "0" : "NA";

    var outputExtra = _options.ExportExtra && namemap != null && namemap.InfoNames.Count > 0;

    Progress.SetMessage("Writing {0} features to {1} ...", features.Count, _options.OutputFile);
    using (var sw = new StreamWriter(_options.OutputFile))
    {
        // Header: Feature [Feature_<info>...] [Feature_<value name>] <sample names...>
        sw.Write("Feature");
        if (namemap != null)
        {
            if (outputExtra)
            {
                sw.Write("\t{0}", (from v in namemap.InfoNames select "Feature_" + v).Merge("\t"));
            }
            sw.Write("\tFeature_{0}", namemap.ValueName);
        }
        sw.WriteLine("\t" + (from c in counts select c.Name).Merge("\t"));

        foreach (var feature in features)
        {
            // Skip features whose value is empty or "0" in every sample that contains them.
            if ((from count in counts
                 where count.Data.ContainsKey(feature)
                 select count.Data[feature]).All(m => string.IsNullOrEmpty(m.Value) || m.Value.Equals("0")))
            {
                continue;
            }

            sw.Write(feature);

            if (namemap != null)
            {
                if (namemap.Data.ContainsKey(feature))
                {
                    if (outputExtra)
                    {
                        sw.Write("\t{0}", namemap.Data[feature].Informations.Merge("\t"));
                    }
                    sw.Write("\t{0}", namemap.Data[feature].Value);
                }
                else
                {
                    // Not a direct hit: take the part before ':', split it on '+', and
                    // look each part up separately; the ':' suffix is re-appended.
                    var fea = feature.StringBefore(":");
                    var suffix = feature.Contains(":") ? ":" + feature.StringAfter(":") : string.Empty;
                    var feas = fea.Split('+');

                    if (!feas.Any(m => namemap.Data.ContainsKey(m)))
                    {
                        // No part is known: blank info columns, feature id as its own name.
                        if (outputExtra)
                        {
                            sw.Write("\t{0}", (from f in namemap.InfoNames select string.Empty).Merge("\t"));
                        }
                        sw.Write("\t{0}", feature);
                    }
                    else
                    {
                        if (outputExtra)
                        {
                            // One column per info name; the parts' values are joined with ';'.
                            for (int i = 0; i < namemap.InfoNames.Count; i++)
                            {
                                sw.Write("\t{0}", (from f in feas
                                                   select namemap.Data.ContainsKey(f) ? namemap.Data[f].Informations[i] : string.Empty).Merge(";"));
                            }
                        }
                        // Mapped parts are replaced by their name; unmapped parts are kept as-is.
                        sw.Write("\t{0}", (from f in feas
                                           select namemap.Data.ContainsKey(f) ? namemap.Data[f].Value : f).Merge("+") + suffix);
                    }
                }
            }

            foreach (var count in counts)
            {
                if (count.Data.ContainsKey(feature))
                {
                    sw.Write("\t" + count.Data[feature].Value);
                }
                else
                {
                    sw.Write("\t" + missing);
                }
            }
            sw.WriteLine();
        }
    }

    //output proteincoding count table
    bool hasProteinCoding = namemap != null && namemap.InfoNames.Count > 0 && namemap.InfoNames.Contains("gene_biotype");
    if (hasProteinCoding)
    {
        WriteProteincodingFile(_options.OutputFile, ".count");
    }

    if (!_options.NoFPKM)
    {
        // FPKM is only calculated when the map provides a "length" column.
        bool hasLength = namemap != null && namemap.InfoNames.Count > 0 && namemap.InfoNames.Contains("length");
        if (hasLength)
        {
            Progress.SetMessage("Calculating FPKM values...");
            var outputFile = Path.ChangeExtension(_options.OutputFile, ".fpkm.tsv");
            new HTSeqCountToFPKMCalculator(new HTSeqCountToFPKMCalculatorOptions()
            {
                InputFile = _options.OutputFile,
                GeneLengthFile = _options.MapFile,
                KeyRegex = _options.KeyRegex,
                OutputFile = outputFile
            })
            {
                Progress = this.Progress
            }.Process();

            if (hasProteinCoding)
            {
                WriteProteincodingFile(outputFile, ".tsv");
            }
        }
    }

    return new[] { Path.GetFullPath(_options.OutputFile) };
}
/// <summary>
/// Builds a text definition of the files found under <c>_options.InputDir</c>:
/// a <c>files =&gt; { ... }</c> section mapping each sample name to its files and,
/// when <c>_options.GroupPattern</c> is set, a <c>groups =&gt; { ... }</c> section
/// mapping each group name to its sample names.
/// </summary>
/// <returns>The generated lines.</returns>
/// <exception cref="KeyNotFoundException">A sample name is missing from the name map file.</exception>
/// <exception cref="Exception">A sample name does not match the group pattern.</exception>
public override IEnumerable<string> Process()
{
    var files = GetFiles(_options.InputDir);

    // Sample name = file name without extension, or the concatenated capture
    // groups of NamePattern (the input is returned unchanged when it does not match).
    Func<string, string> nameFunc;
    if (string.IsNullOrEmpty(_options.NamePattern))
    {
        nameFunc = Path.GetFileNameWithoutExtension;
    }
    else
    {
        nameFunc = n =>
        {
            var match = Regex.Match(n, _options.NamePattern);
            if (!match.Success)
            {
                return n;
            }

            var values = new List<string>();
            for (int i = 1; i < match.Groups.Count; i++)
            {
                values.Add(match.Groups[i].Value);
            }
            return values.Merge("");
        };
    }

    // Group files by sample name, derived from the parent directory name or the file name.
    var map = files.GroupBy(m =>
    {
        if (_options.Recursion && _options.UseDirName)
        {
            return nameFunc(Path.GetFileName(Path.GetDirectoryName(m)));
        }
        else
        {
            return nameFunc(Path.GetFileName(m));
        }
    }).ToDictionary(n => n.Key);

    var names = (from k in map.Keys orderby k select k).ToList();

    if (_options.AutoFill)
    {
        var nameMap = names.ToDictionary(l => l, l => l);
        Regex number = new Regex(@"(.+?)(\d+)$");
        var numbers = (from n in names
                       let m = number.Match(n)
                       where m.Success
                       select new { OldName = n, Prefix = m.Groups[1].Value, Value = m.Groups[2].Value }).ToList();

        // Max() throws on an empty sequence, so only pad when at least one
        // name actually ends with digits.
        if (numbers.Count > 0)
        {
            // Left-pad trailing numbers with zeros so all have the same width.
            var numberMax = numbers.Max(l => l.Value.Length);
            foreach (var num in numbers)
            {
                if (num.Value.Length != numberMax)
                {
                    nameMap[num.OldName] = num.Prefix + new string('0', numberMax - num.Value.Length) + num.Value;
                }
            }

            map = map.ToDictionary(l => nameMap[l.Key], l => l.Value);
            names = (from k in map.Keys orderby k select k).ToList();
        }
    }

    if (File.Exists(_options.MapFile))
    {
        Progress.SetMessage("Reading name map from {0} ...", _options.MapFile);
        var namemap = new MapDataReader(0, 1).ReadFromFile(_options.MapFile);
        map = map.ToDictionary(l =>
        {
            // Same exception type as the plain indexer, but naming the key and file.
            if (!namemap.Data.ContainsKey(l.Key))
            {
                throw new KeyNotFoundException(string.Format("Cannot find key {0} in name map file {1}", l.Key, _options.MapFile));
            }
            return namemap.Data[l.Key].Value;
        }, l => l.Value);
        names = (from k in map.Keys orderby k select k).ToList();
    }

    var result = new List<string> { "files => {" };
    foreach (var name in names)
    {
        result.Add(string.Format("  \"{0}\" => [{1}],", name, (from l in map[name] select '"' + Path.GetFullPath(l) + '"').Merge(", ")));
    }
    result.Add("},");

    if (string.IsNullOrEmpty(_options.GroupPattern))
    {
        return result;
    }

    // Group name = concatenated capture groups of GroupPattern applied to the sample name.
    var groupmap = names.GroupBy(n =>
    {
        var match = Regex.Match(n, _options.GroupPattern);
        if (!match.Success)
        {
            // Report the pattern that actually failed (GroupPattern, not NamePattern).
            throw new Exception(string.Format("Cannot find pattern {0} in file {1}", _options.GroupPattern, n));
        }

        var values = new List<string>();
        for (var i = 1; i < match.Groups.Count; i++)
        {
            values.Add(match.Groups[i].Value);
        }
        return values.Merge("");
    });

    var gnames = (from k in groupmap orderby k.Key select k).ToList();
    result.Add("groups => {");
    foreach (var name in gnames)
    {
        result.Add(string.Format("  \"{0}\" => [{1}],", name.Key, (from l in name orderby l select '"' + l + '"').Merge(", ")));
    }
    result.Add("},");

    return result;
}
/// <summary>
/// Collects the files under <c>_options.InputDir</c>, groups them into samples and
/// resolves each sample's name and group name from the configured name pattern,
/// auto-fill option, name map file and group pattern.
/// </summary>
/// <returns>The file items sorted by sample name.</returns>
/// <exception cref="Exception">
/// No file was found, a sample name is missing from the name map file, or a sample
/// name does not match the group pattern.
/// </exception>
private List<FileItem> GetFileItems()
{
    var files = GetFiles(_options.InputDir);
    if (files.Length == 0)
    {
        throw new Exception("No file found in folder " + _options.InputDir);
    }

    if (_options.Verbose)
    {
        foreach (var file in files)
        {
            Progress.SetMessage("{0}", file);
        }
    }

    var isGoogleStorage = _options.InputDir.StartsWith("gs://");

    // Sample name = file name without extension, or the concatenated capture
    // groups of NamePattern (the input is returned unchanged when it does not match).
    Func<string, string> nameFunc;
    if (string.IsNullOrEmpty(_options.NamePattern))
    {
        nameFunc = Path.GetFileNameWithoutExtension;
    }
    else
    {
        nameFunc = n =>
        {
            var match = Regex.Match(n, _options.NamePattern);
            if (!match.Success)
            {
                return n;
            }

            var values = new List<string>();
            for (int i = 1; i < match.Groups.Count; i++)
            {
                values.Add(match.Groups[i].Value);
            }
            return values.Merge("");
        };
    }

    var result = files.GroupBy(m =>
    {
        if (isGoogleStorage)
        {
            // Drop the "gs:/" scheme prefix before handing the name to the Path helpers.
            m = m.Replace("gs:/", "");
        }

        if (_options.Recursion && _options.UseDirName)
        {
            return nameFunc(Path.GetFileName(Path.GetDirectoryName(m)));
        }
        else
        {
            return nameFunc(Path.GetFileName(m));
        }
    }).ToList().ConvertAll(n => new FileItem()
    {
        SampleName = n.Key,
        FileNames = n.ToList(),
        GroupName = string.Empty
    });

    if (_options.Verbose)
    {
        foreach (var file in result)
        {
            Progress.SetMessage("{0} => {1}", file.SampleName, file.FileNames.Merge(","));
        }
    }

    if (_options.AutoFill)
    {
        var nameMap = result.ToDictionary(l => l.SampleName, l => l);
        Regex number = new Regex(@"(.+?)(\d+)$");
        var numbers = (from n in nameMap.Keys
                       let m = number.Match(n)
                       where m.Success
                       select new { OldName = n, Prefix = m.Groups[1].Value, Value = m.Groups[2].Value }).ToList();

        // Max() throws on an empty sequence, so only pad when at least one
        // sample name actually ends with digits.
        if (numbers.Count > 0)
        {
            // Left-pad trailing numbers with zeros so all have the same width.
            var numberMax = numbers.Max(l => l.Value.Length);
            foreach (var num in numbers)
            {
                if (num.Value.Length != numberMax)
                {
                    nameMap[num.OldName].SampleName = num.Prefix + new string('0', numberMax - num.Value.Length) + num.Value;
                }
            }
        }
    }

    MapData namemap = null;
    if (File.Exists(_options.MapFile))
    {
        namemap = new MapDataReader(0, 1).ReadFromFile(_options.MapFile);

        if (_options.Verbose)
        {
            Progress.SetMessage("Reading name map from {0} ...", _options.MapFile);
            Progress.SetMessage("Current sample name ...");
            foreach (var file in result)
            {
                Progress.SetMessage("{0}", file.SampleName);
            }
            Progress.SetMessage("New map ...");
            foreach (var name in namemap.Data)
            {
                Progress.SetMessage("{0} => {1}", name.Key, name.Value.Value);
            }
        }

        var nameMap = result.ToDictionary(l => l.SampleName, l => l);
        // The optional "Group" info column supplies the group name.
        var groupIndex = namemap.InfoNames.IndexOf("Group");
        foreach (var name in nameMap.Keys)
        {
            if (namemap.Data.ContainsKey(name))
            {
                nameMap[name].SampleName = namemap.Data[name].Value;
                if (groupIndex != -1)
                {
                    nameMap[name].GroupName = namemap.Data[name].Informations[groupIndex].ToString();
                }
            }
            else
            {
                throw new Exception(string.Format("Cannot find key {0} in name map file {1}", name, _options.MapFile));
            }
        }
    }

    if (!string.IsNullOrEmpty(_options.GroupPattern))
    {
        // Group name = concatenated capture groups of GroupPattern applied to the sample name.
        foreach (var file in result)
        {
            var match = Regex.Match(file.SampleName, _options.GroupPattern);
            if (!match.Success)
            {
                // Report the pattern that actually failed (GroupPattern, not NamePattern).
                throw new Exception(string.Format("Cannot find pattern {0} in file {1}", _options.GroupPattern, file.SampleName));
            }

            var values = new List<string>();
            for (var i = 1; i < match.Groups.Count; i++)
            {
                values.Add(match.Groups[i].Value);
            }
            file.GroupName = values.Merge("");
        }
    }

    foreach (var file in result)
    {
        if (isGoogleStorage)
        {
            // gs:// names are kept verbatim; only a fresh list is assigned.
            file.FileNames = file.FileNames.ToList();
        }
        else
        {
            file.FileNames = (from f in file.FileNames select Path.GetFullPath(f)).ToList();
        }
    }

    result.Sort((m1, m2) => m1.SampleName.CompareTo(m2.SampleName));

    return result;
}
/// <summary>
/// Merges the count files returned by <c>_options.GetCountFiles()</c> into one
/// tab-delimited table written to <c>_options.OutputFile</c>: one row per feature,
/// one column per sample. When <c>_options.MapFile</c> exists, feature names (and,
/// if requested, the extra information columns) from the map are added. Unless
/// <c>_options.NoFPKM</c> is set, an FPKM table is also calculated when the first
/// line of the map file contains "length".
/// </summary>
/// <returns>A single-element array holding the full path of the output file.</returns>
public override IEnumerable<string> Process()
{
    var countfiles = _options.GetCountFiles();

    var reader = new MapItemReader(_options.KeyIndex, _options.ValueIndex, informationIndex: _options.InformationIndex, hasHeader: !_options.HasNoHeader);
    // Stop reading a count file at the summary lines.
    reader.CheckEnd = m => m.StartsWith("__no_feature") || m.StartsWith("no_feature");

    // Build the key regex once instead of once per input file / per step.
    Regex keyRegex = string.IsNullOrEmpty(_options.KeyRegex) ? null : new Regex(_options.KeyRegex);

    var counts = new List<SampleData>();
    foreach (var file in countfiles)
    {
        Progress.SetMessage("Reading data from {0} ...", file.File);
        var data = reader.ReadFromFile(file.File);
        if (keyRegex != null)
        {
            // Re-key every entry by the first capture group of the key pattern.
            counts.Add(new SampleData()
            {
                Name = file.Name,
                Data = data.ToDictionary(l => keyRegex.Match(l.Key).Groups[1].Value, l => l.Value)
            });
        }
        else
        {
            counts.Add(new SampleData()
            {
                Name = file.Name,
                Data = data
            });
        }
    }

    MapData namemap = null;
    if (File.Exists(_options.MapFile))
    {
        Progress.SetMessage("Reading name map from {0} ...", _options.MapFile);
        namemap = new MapDataReader(0, 1).ReadFromFile(_options.MapFile);
        if (keyRegex != null)
        {
            // Apply the same re-keying to the map so lookups agree with the counts.
            namemap.Data = namemap.Data.ToDictionary(l => keyRegex.Match(l.Key).Groups[1].Value, l => l.Value);
        }
    }

    if (keyRegex != null)
    {
        Progress.SetMessage("Filtering key by pattern {0} ...", _options.KeyRegex);
        // NOTE(review): keys were already rewritten to capture group 1 above, so this
        // tests the rewritten key against the full pattern - confirm this is intended.
        counts.ForEach(m =>
        {
            // Snapshot the keys because entries are removed while iterating.
            var keys = m.Data.Keys.ToList();
            foreach (var key in keys)
            {
                if (!keyRegex.Match(key).Success)
                {
                    m.Data.Remove(key);
                }
            }
        });
    }

    var features = (from c in counts
                    from k in c.Data.Keys
                    select k).Distinct().OrderBy(m => m).ToList();

    var missing = _options.FillMissingWithZero ? "0" : "NA";

    var outputExtra = _options.ExportExtra && namemap != null && namemap.InfoNames.Count > 0;

    Progress.SetMessage("Writing {0} features to {1} ...", features.Count, _options.OutputFile);
    using (var sw = new StreamWriter(_options.OutputFile))
    {
        // Header: Feature [Feature_<info>...] [Feature_<value name>] <sample names...>
        sw.Write("Feature");
        if (namemap != null)
        {
            if (outputExtra)
            {
                sw.Write("\t{0}", (from v in namemap.InfoNames select "Feature_" + v).Merge("\t"));
            }
            sw.Write("\tFeature_{0}", namemap.ValueName);
        }
        sw.WriteLine("\t" + (from c in counts select c.Name).Merge("\t"));

        foreach (var feature in features)
        {
            // Skip features whose value is empty or "0" in every sample that contains them.
            if ((from count in counts
                 where count.Data.ContainsKey(feature)
                 select count.Data[feature]).All(m => string.IsNullOrEmpty(m.Value) || m.Value.Equals("0")))
            {
                continue;
            }

            sw.Write(feature);

            if (namemap != null)
            {
                if (namemap.Data.ContainsKey(feature))
                {
                    if (outputExtra)
                    {
                        sw.Write("\t{0}", namemap.Data[feature].Informations.Merge("\t"));
                    }
                    sw.Write("\t{0}", namemap.Data[feature].Value);
                }
                else
                {
                    // Not a direct hit: take the part before ':', split it on '+', and
                    // look each part up separately; the ':' suffix is re-appended.
                    var fea = feature.StringBefore(":");
                    var suffix = feature.Contains(":") ? ":" + feature.StringAfter(":") : string.Empty;
                    var feas = fea.Split('+');

                    if (!feas.Any(m => namemap.Data.ContainsKey(m)))
                    {
                        // No part is known: blank info columns, feature id as its own name.
                        if (outputExtra)
                        {
                            sw.Write("\t{0}", (from f in namemap.InfoNames select string.Empty).Merge("\t"));
                        }
                        sw.Write("\t{0}", feature);
                    }
                    else
                    {
                        if (outputExtra)
                        {
                            // One column per info name; the parts' values are joined with ';'.
                            for (int i = 0; i < namemap.InfoNames.Count; i++)
                            {
                                sw.Write("\t{0}", (from f in feas
                                                   select namemap.Data.ContainsKey(f) ? namemap.Data[f].Informations[i] : string.Empty).Merge(";"));
                            }
                        }
                        // Mapped parts are replaced by their name; unmapped parts are kept as-is.
                        sw.Write("\t{0}", (from f in feas
                                           select namemap.Data.ContainsKey(f) ? namemap.Data[f].Value : f).Merge("+") + suffix);
                    }
                }
            }

            foreach (var count in counts)
            {
                if (count.Data.ContainsKey(feature))
                {
                    sw.Write("\t" + count.Data[feature].Value);
                }
                else
                {
                    sw.Write("\t" + missing);
                }
            }
            sw.WriteLine();
        }
    }

    if (!_options.NoFPKM)
    {
        if (File.Exists(_options.MapFile))
        {
            // Only the first line of the map file is inspected for "length".
            bool hasLength = false;
            using (var sr = new StreamReader(_options.MapFile))
            {
                var line = sr.ReadLine();
                if (line != null)
                {
                    hasLength = line.Contains("length");
                }
            }

            if (hasLength)
            {
                Progress.SetMessage("Calculating FPKM values...");
                new HTSeqCountToFPKMCalculator(new HTSeqCountToFPKMCalculatorOptions()
                {
                    InputFile = _options.OutputFile,
                    GeneLengthFile = _options.MapFile,
                    KeyRegex = _options.KeyRegex,
                    OutputFile = Path.ChangeExtension(_options.OutputFile, ".fpkm.tsv")
                })
                {
                    Progress = this.Progress
                }.Process();
            }
        }
    }

    return new[] { Path.GetFullPath(_options.OutputFile) };
}