/// <summary>
/// Merges the count files returned by <c>_options.GetCountFiles()</c> into a single
/// tab-delimited table written to <c>_options.OutputFile</c>. When <c>_options.MapFile</c>
/// exists, each row is additionally annotated from that name map. Afterwards the method
/// calls <c>WriteProteincodingFile</c> if the map has a "gene_biotype" info column, and
/// runs <c>HTSeqCountToFPKMCalculator</c> if FPKM is enabled and the map has a "length"
/// info column.
/// </summary>
/// <returns>A one-element array containing the full path of <c>_options.OutputFile</c>.</returns>
public override IEnumerable<string> Process()
{
    var countfiles = _options.GetCountFiles();

    var reader = new MapItemReader(
        _options.KeyIndex,
        _options.ValueIndex,
        informationIndex: _options.InformationIndex,
        hasHeader: !_options.HasNoHeader);
    // NOTE(review): presumably makes the reader stop at the trailing summary rows;
    // the exact CheckEnd contract is defined by MapItemReader - confirm there.
    reader.CheckEnd = m => m.StartsWith("__no_feature") || m.StartsWith("no_feature");

    // Build the key pattern once; it is reused for the count files, the name map
    // and the key filter below.
    var hasKeyRegex = !string.IsNullOrEmpty(_options.KeyRegex);
    var keyRegex = hasKeyRegex ? new Regex(_options.KeyRegex) : null;

    var counts = new List<SampleData>();
    foreach (var file in countfiles)
    {
        Progress.SetMessage("Reading data from {0} ...", file.File);
        var data = reader.ReadFromFile(file.File);
        if (hasKeyRegex)
        {
            // Re-key every entry by the first capture group of the key pattern.
            counts.Add(new SampleData()
            {
                Name = file.Name,
                Data = data.ToDictionary(l => keyRegex.Match(l.Key).Groups[1].Value, l => l.Value)
            });
        }
        else
        {
            counts.Add(new SampleData() { Name = file.Name, Data = data });
        }
    }

    MapData namemap = null;
    if (File.Exists(_options.MapFile))
    {
        Progress.SetMessage("Reading name map from {0} ...", _options.MapFile);
        namemap = new MapDataReader(0, 1).ReadFromFile(_options.MapFile);
        if (hasKeyRegex)
        {
            // Apply the same re-keying to the map so its keys line up with the counts.
            namemap.Data = namemap.Data.ToDictionary(l => keyRegex.Match(l.Key).Groups[1].Value, l => l.Value);
        }
    }

    if (hasKeyRegex)
    {
        Progress.SetMessage("Filtering key by pattern {0} ...", _options.KeyRegex);
        foreach (var sample in counts)
        {
            // Snapshot the keys so entries can be removed while walking them.
            foreach (var key in sample.Data.Keys.ToList())
            {
                if (!keyRegex.Match(key).Success)
                {
                    sample.Data.Remove(key);
                }
            }
        }
    }

    // Union of all keys over all samples, sorted, defines the output rows.
    var features = (from c in counts
                    from k in c.Data.Keys
                    select k).Distinct().OrderBy(m => m).ToList();

    var missing = _options.FillMissingWithZero ? "0" : "NA";
    var outputExtra = _options.ExportExtra && namemap != null && namemap.InfoNames.Count > 0;

    Progress.SetMessage("Writing {0} features to {1} ...", features.Count, _options.OutputFile);
    using (var sw = new StreamWriter(_options.OutputFile))
    {
        // Header: Feature [Feature_<info>...] [Feature_<value name>] <sample names...>
        sw.Write("Feature");
        if (namemap != null)
        {
            if (outputExtra)
            {
                sw.Write("\t{0}", (from v in namemap.InfoNames select "Feature_" + v).Merge("\t"));
            }
            sw.Write("\tFeature_{0}", namemap.ValueName);
        }
        sw.WriteLine("\t" + (from c in counts select c.Name).Merge("\t"));

        foreach (var feature in features)
        {
            // Skip rows whose value is empty or "0" in every sample that has the feature.
            var allZero = (from count in counts
                           where count.Data.ContainsKey(feature)
                           select count.Data[feature])
                          .All(m => string.IsNullOrEmpty(m.Value) || m.Value.Equals("0"));
            if (allZero)
            {
                continue;
            }

            sw.Write(feature);

            if (namemap != null)
            {
                if (namemap.Data.ContainsKey(feature))
                {
                    // Direct hit in the name map.
                    var mapped = namemap.Data[feature];
                    if (outputExtra)
                    {
                        sw.Write("\t{0}", mapped.Informations.Merge("\t"));
                    }
                    sw.Write("\t{0}", mapped.Value);
                }
                else
                {
                    // No direct hit: the text in front of ':' is split on '+' and each part
                    // is looked up on its own; any ':' tail is re-attached to the result.
                    // NOTE(review): relies on the project's StringBefore/StringAfter helpers.
                    var fea = feature.StringBefore(":");
                    var suffix = feature.Contains(":") ? ":" + feature.StringAfter(":") : string.Empty;
                    var feas = fea.Split('+');

                    if (!feas.Any(m => namemap.Data.ContainsKey(m)))
                    {
                        // Nothing could be mapped: blank info columns, feature name as value.
                        if (outputExtra)
                        {
                            sw.Write("\t{0}", (from f in namemap.InfoNames select string.Empty).Merge("\t"));
                        }
                        sw.Write("\t{0}", feature);
                    }
                    else
                    {
                        if (outputExtra)
                        {
                            // One column per info name; per-part values joined by ';'.
                            for (int i = 0; i < namemap.InfoNames.Count; i++)
                            {
                                sw.Write("\t{0}", (from f in feas
                                                   select namemap.Data.ContainsKey(f) ? namemap.Data[f].Informations[i] : string.Empty).Merge(";"));
                            }
                        }
                        // Mapped parts are replaced by their value, unmapped parts stay as-is.
                        sw.Write("\t{0}", (from f in feas
                                           select namemap.Data.ContainsKey(f) ? namemap.Data[f].Value : f).Merge("+") + suffix);
                    }
                }
            }

            foreach (var count in counts)
            {
                if (count.Data.ContainsKey(feature))
                {
                    sw.Write("\t" + count.Data[feature].Value);
                }
                else
                {
                    sw.Write("\t" + missing);
                }
            }
            sw.WriteLine();
        }
    }

    //output proteincoding count table
    bool hasProteinCoding = namemap != null && namemap.InfoNames.Count > 0 && namemap.InfoNames.Contains("gene_biotype");
    if (hasProteinCoding)
    {
        WriteProteincodingFile(_options.OutputFile, ".count");
    }

    if (!_options.NoFPKM)
    {
        bool hasLength = namemap != null && namemap.InfoNames.Count > 0 && namemap.InfoNames.Contains("length");
        if (hasLength)
        {
            Progress.SetMessage("Calculating FPKM values...");
            var outputFile = Path.ChangeExtension(_options.OutputFile, ".fpkm.tsv");
            new HTSeqCountToFPKMCalculator(new HTSeqCountToFPKMCalculatorOptions()
            {
                InputFile = _options.OutputFile,
                GeneLengthFile = _options.MapFile,
                KeyRegex = _options.KeyRegex,
                OutputFile = outputFile
            })
            {
                Progress = this.Progress
            }.Process();

            if (hasProteinCoding)
            {
                WriteProteincodingFile(outputFile, ".tsv");
            }
        }
    }

    return new[] { Path.GetFullPath(_options.OutputFile) };
}
/// <summary>
/// Merges the count files returned by <c>_options.GetCountFiles()</c> into a single
/// tab-delimited table written to <c>_options.OutputFile</c>. When <c>_options.MapFile</c>
/// exists, each row is additionally annotated from that name map. If FPKM is enabled and
/// the first line of the map file contains the text "length", the method then runs
/// <c>HTSeqCountToFPKMCalculator</c> on the written table.
/// </summary>
/// <returns>A one-element array containing the full path of <c>_options.OutputFile</c>.</returns>
public override IEnumerable<string> Process()
{
    var countfiles = _options.GetCountFiles();

    var reader = new MapItemReader(
        _options.KeyIndex,
        _options.ValueIndex,
        informationIndex: _options.InformationIndex,
        hasHeader: !_options.HasNoHeader);
    // NOTE(review): presumably makes the reader stop at the trailing summary rows;
    // the exact CheckEnd contract is defined by MapItemReader - confirm there.
    reader.CheckEnd = m => m.StartsWith("__no_feature") || m.StartsWith("no_feature");

    // Build the key pattern once; it is reused for the count files, the name map
    // and the key filter below.
    var hasKeyRegex = !string.IsNullOrEmpty(_options.KeyRegex);
    var keyRegex = hasKeyRegex ? new Regex(_options.KeyRegex) : null;

    var counts = new List<SampleData>();
    foreach (var file in countfiles)
    {
        Progress.SetMessage("Reading data from {0} ...", file.File);
        var data = reader.ReadFromFile(file.File);
        if (hasKeyRegex)
        {
            // Re-key every entry by the first capture group of the key pattern.
            counts.Add(new SampleData()
            {
                Name = file.Name,
                Data = data.ToDictionary(l => keyRegex.Match(l.Key).Groups[1].Value, l => l.Value)
            });
        }
        else
        {
            counts.Add(new SampleData() { Name = file.Name, Data = data });
        }
    }

    MapData namemap = null;
    if (File.Exists(_options.MapFile))
    {
        Progress.SetMessage("Reading name map from {0} ...", _options.MapFile);
        namemap = new MapDataReader(0, 1).ReadFromFile(_options.MapFile);
        if (hasKeyRegex)
        {
            // Apply the same re-keying to the map so its keys line up with the counts.
            namemap.Data = namemap.Data.ToDictionary(l => keyRegex.Match(l.Key).Groups[1].Value, l => l.Value);
        }
    }

    if (hasKeyRegex)
    {
        Progress.SetMessage("Filtering key by pattern {0} ...", _options.KeyRegex);
        foreach (var sample in counts)
        {
            // Snapshot the keys so entries can be removed while walking them.
            foreach (var key in sample.Data.Keys.ToList())
            {
                if (!keyRegex.Match(key).Success)
                {
                    sample.Data.Remove(key);
                }
            }
        }
    }

    // Union of all keys over all samples, sorted, defines the output rows.
    var features = (from c in counts
                    from k in c.Data.Keys
                    select k).Distinct().OrderBy(m => m).ToList();

    var missing = _options.FillMissingWithZero ? "0" : "NA";
    var outputExtra = _options.ExportExtra && namemap != null && namemap.InfoNames.Count > 0;

    Progress.SetMessage("Writing {0} features to {1} ...", features.Count, _options.OutputFile);
    using (var sw = new StreamWriter(_options.OutputFile))
    {
        // Header: Feature [Feature_<info>...] [Feature_<value name>] <sample names...>
        sw.Write("Feature");
        if (namemap != null)
        {
            if (outputExtra)
            {
                sw.Write("\t{0}", (from v in namemap.InfoNames select "Feature_" + v).Merge("\t"));
            }
            sw.Write("\tFeature_{0}", namemap.ValueName);
        }
        sw.WriteLine("\t" + (from c in counts select c.Name).Merge("\t"));

        foreach (var feature in features)
        {
            // Skip rows whose value is empty or "0" in every sample that has the feature.
            var allZero = (from count in counts
                           where count.Data.ContainsKey(feature)
                           select count.Data[feature])
                          .All(m => string.IsNullOrEmpty(m.Value) || m.Value.Equals("0"));
            if (allZero)
            {
                continue;
            }

            sw.Write(feature);

            if (namemap != null)
            {
                if (namemap.Data.ContainsKey(feature))
                {
                    // Direct hit in the name map.
                    var mapped = namemap.Data[feature];
                    if (outputExtra)
                    {
                        sw.Write("\t{0}", mapped.Informations.Merge("\t"));
                    }
                    sw.Write("\t{0}", mapped.Value);
                }
                else
                {
                    // No direct hit: the text in front of ':' is split on '+' and each part
                    // is looked up on its own; any ':' tail is re-attached to the result.
                    // NOTE(review): relies on the project's StringBefore/StringAfter helpers.
                    var fea = feature.StringBefore(":");
                    var suffix = feature.Contains(":") ? ":" + feature.StringAfter(":") : string.Empty;
                    var feas = fea.Split('+');

                    if (!feas.Any(m => namemap.Data.ContainsKey(m)))
                    {
                        // Nothing could be mapped: blank info columns, feature name as value.
                        if (outputExtra)
                        {
                            sw.Write("\t{0}", (from f in namemap.InfoNames select string.Empty).Merge("\t"));
                        }
                        sw.Write("\t{0}", feature);
                    }
                    else
                    {
                        if (outputExtra)
                        {
                            // One column per info name; per-part values joined by ';'.
                            for (int i = 0; i < namemap.InfoNames.Count; i++)
                            {
                                sw.Write("\t{0}", (from f in feas
                                                   select namemap.Data.ContainsKey(f) ? namemap.Data[f].Informations[i] : string.Empty).Merge(";"));
                            }
                        }
                        // Mapped parts are replaced by their value, unmapped parts stay as-is.
                        sw.Write("\t{0}", (from f in feas
                                           select namemap.Data.ContainsKey(f) ? namemap.Data[f].Value : f).Merge("+") + suffix);
                    }
                }
            }

            foreach (var count in counts)
            {
                if (count.Data.ContainsKey(feature))
                {
                    sw.Write("\t" + count.Data[feature].Value);
                }
                else
                {
                    sw.Write("\t" + missing);
                }
            }
            sw.WriteLine();
        }
    }

    if (!_options.NoFPKM && File.Exists(_options.MapFile))
    {
        // FPKM is only attempted when the first line of the map file mentions "length".
        bool hasLength = false;
        using (var sr = new StreamReader(_options.MapFile))
        {
            var line = sr.ReadLine();
            if (line != null)
            {
                hasLength = line.Contains("length");
            }
        }

        if (hasLength)
        {
            Progress.SetMessage("Calculating FPKM values...");
            new HTSeqCountToFPKMCalculator(new HTSeqCountToFPKMCalculatorOptions()
            {
                InputFile = _options.OutputFile,
                GeneLengthFile = _options.MapFile,
                KeyRegex = _options.KeyRegex,
                OutputFile = Path.ChangeExtension(_options.OutputFile, ".fpkm.tsv")
            })
            {
                Progress = this.Progress
            }.Process();
        }
    }

    return new[] { Path.GetFullPath(_options.OutputFile) };
}