public override IEnumerable <string> Process() { var files = _options.GetAnnovarFiles(); var filelist = files.Keys.ToArray(); using (var sw = new StreamWriter(_options.OutputFile)) { //deal with comments using (var sr = new StreamReader(filelist[0])) { string line; while ((line = sr.ReadLine()) != null) { if (line.StartsWith("##MuTect=")) { sw.WriteLine(line); for (var i = 1; i < filelist.Length; i++) { using (var sr2 = new StreamReader(filelist[i])) { while ((line = sr2.ReadLine()) != null) { if (!line.StartsWith("##MuTect=")) { continue; } sw.WriteLine(line); break; } } } } else if (!line.StartsWith("#")) { break; } else { sw.WriteLine(line); } } } //deal with data var data = new List <FileData>(); foreach (var file in filelist) { var lines = File.ReadAllLines(file); var mutect = lines.FirstOrDefault(m => m.StartsWith("##MuTect=")); string normal, tumor, normalName, tumorName; if (mutect != null) { normal = mutect.StringAfter("normal_sample_name=").StringBefore(" "); tumor = mutect.StringAfter("tumor_sample_name=").StringBefore(" "); normalName = normal; tumorName = tumor; } else { normal = "NORMAL"; tumor = "TUMOR"; normalName = Path.GetFileName(file).StringBefore(".") + "_normal"; tumorName = Path.GetFileName(file).StringBefore(".") + "_tumor"; } var header = lines.First(m => !m.StartsWith("#")); var headers = header.Split('\t'); var infoIndex = Array.IndexOf(headers, "INFO"); var formatIndex = Array.IndexOf(headers, "FORMAT"); var normalIndex = Array.IndexOf(headers, normal); var tumorIndex = Array.IndexOf(headers, tumor); var dictionary = new Dictionary <string, FileDataValue>(); foreach (var line in lines) { if (string.IsNullOrWhiteSpace(line) || line.StartsWith("#") || line.StartsWith("Chr")) { continue; } var parts = line.Split('\t'); if (parts.Length != headers.Length) { continue; } var vnormal = GetAllele(parts, normalIndex); var vtumor = GetAllele(parts, tumorIndex); var value = new FileDataValue() { Key = parts[0] + "_" + parts[1], Parts = parts, VNormal = vnormal, VTumor = vtumor }; dictionary.Add(value.Key, value); } data.Add(new FileData() { File = file, Normal = normalName, Tumor = tumorName, Headers = headers, InfoIndex = infoIndex, FormatIndex = formatIndex, NormalIndex = normalIndex, TumorIndex = tumorIndex, Data = dictionary }); } //get all positions var keys = (from d in data from k in d.Data.Keys select k).Distinct().ToList().ConvertAll(m => { var p = m.Split('_'); return(new { Key = m, Chr = p[0], Position = int.Parse(p[1]) }); } ); GenomeUtils.SortChromosome(keys, m => m.Chr, m => m.Position); var keyMap = keys.ToDictionary(m => m.Key); //check by original vcf file to fill the other columns foreach (var d in data) { var vcf = files[d.File]; if (string.IsNullOrEmpty(vcf)) { continue; } var vd = d.Data; using (var sr = new StreamReader(vcf)) { string line; var normalIndex = -1; var tumorIndex = -1; while ((line = sr.ReadLine()) != null) { if (!line.StartsWith("#CHROM")) { continue; } var parts = line.Split('\t'); normalIndex = Array.IndexOf(parts, d.Normal); tumorIndex = Array.IndexOf(parts, d.Tumor); break; } if (normalIndex == -1) { throw new Exception(string.Format("Normal {0} is not included in detail vcf file {1} but in annovar result {1}", d.Normal, vcf, d.File)); } if (tumorIndex == -1) { throw new Exception(string.Format("Tumor {0} is not included in detail vcf file {1} but in annovar result {1}", d.Tumor, vcf, d.File)); } var minIndex = Math.Max(normalIndex, tumorIndex) + 1; while ((line = sr.ReadLine()) != null) { var parts = line.Split('\t'); if (parts.Length < minIndex) { break; } var key = parts[0] + "_" + parts[1]; if (!keyMap.ContainsKey(key)) { continue; } FileDataValue fdv; if (!vd.TryGetValue(key, out fdv)) { fdv = new FileDataValue() { Key = key, Parts = null }; vd[key] = fdv; } fdv.VNormal = GetAllele(parts, normalIndex); fdv.VTumor = GetAllele(parts, tumorIndex); } } } //write header for (var i = 0; i < data[0].Headers.Length; i++) { if (i == data[0].NormalIndex || i == data[0].TumorIndex || i == data[0].InfoIndex || i == data[0].FormatIndex) { continue; } else { if (i != 0) { sw.Write("\t"); } sw.Write(data[0].Headers[i]); } } var normalnames = (from d in data select d.Normal).Distinct().ToArray(); sw.Write("\t{0}", normalnames.Merge('\t')); var tumornames = (from d in data select d.Tumor).Distinct().ToArray(); sw.WriteLine("\t{0}", tumornames.Merge('\t')); foreach (var key in keys) { var d1 = data.First(d => d.Data.ContainsKey(key.Key) && d.Data[key.Key].Parts != null); var v1 = d1.Data[key.Key]; for (var i = 0; i < v1.Parts.Length; i++) { if (i == 0) { sw.Write("{0}", v1.Parts[0]); } else if (i == d1.InfoIndex || i == d1.FormatIndex || i == d1.NormalIndex || i == d1.TumorIndex) { continue; } else { sw.Write("\t{0}", v1.Parts[i]); } } foreach (var name in normalnames) { var dn = (from d in data where d.Normal.Equals(name) && d.Data.ContainsKey(key.Key) select d).FirstOrDefault(); if (dn == null) { sw.Write("\t"); } else { var vn = dn.Data[key.Key].VNormal; sw.Write("\t{0}", vn); } } foreach (var name in tumornames) { var dn = (from d in data where d.Tumor.Equals(name) && d.Data.ContainsKey(key.Key) select d).FirstOrDefault(); if (dn == null) { sw.Write("\t"); } else { var vn = dn.Data[key.Key].VTumor; sw.Write("\t{0}", vn); } } sw.WriteLine(); } } return(new[] { _options.OutputFile }); }
public override IEnumerable<string> Process() { var files = _options.GetAnnovarFiles(); var filelist = files.Keys.ToArray(); using (var sw = new StreamWriter(_options.OutputFile)) { //deal with comments using (var sr = new StreamReader(filelist[0])) { string line; while ((line = sr.ReadLine()) != null) { if (line.StartsWith("##MuTect=")) { sw.WriteLine(line); for (var i = 1; i < filelist.Length; i++) { using (var sr2 = new StreamReader(filelist[i])) { while ((line = sr2.ReadLine()) != null) { if (!line.StartsWith("##MuTect=")) continue; sw.WriteLine(line); break; } } } } else if (!line.StartsWith("#")) { break; } else { sw.WriteLine(line); } } } //deal with data var data = new List<FileData>(); foreach (var file in filelist) { var lines = File.ReadAllLines(file); var mutect = lines.FirstOrDefault(m => m.StartsWith("##MuTect=")); string normal, tumor, normalName, tumorName; if (mutect != null) { normal = mutect.StringAfter("normal_sample_name=").StringBefore(" "); tumor = mutect.StringAfter("tumor_sample_name=").StringBefore(" "); normalName = normal; tumorName = tumor; } else { normal = "NORMAL"; tumor = "TUMOR"; normalName = Path.GetFileName(file).StringBefore(".") + "_normal"; tumorName = Path.GetFileName(file).StringBefore(".") + "_tumor"; } var header = lines.First(m => !m.StartsWith("#")); var headers = header.Split('\t'); var infoIndex = Array.IndexOf(headers, "INFO"); var formatIndex = Array.IndexOf(headers, "FORMAT"); var normalIndex = Array.IndexOf(headers, normal); var tumorIndex = Array.IndexOf(headers, tumor); var dictionary = new Dictionary<string, FileDataValue>(); foreach (var line in lines) { if (string.IsNullOrWhiteSpace(line) || line.StartsWith("#") || line.StartsWith("Chr")) continue; var parts = line.Split('\t'); if (parts.Length != headers.Length) continue; var vnormal = GetAllele(parts, normalIndex); var vtumor = GetAllele(parts, tumorIndex); var value = new FileDataValue() { Key = parts[0] + "_" + parts[1], Parts = parts, VNormal = vnormal, VTumor = vtumor }; dictionary.Add(value.Key, value); } data.Add(new FileData() { File = file, Normal = normalName, Tumor = tumorName, Headers = headers, InfoIndex = infoIndex, FormatIndex = formatIndex, NormalIndex = normalIndex, TumorIndex = tumorIndex, Data = dictionary }); } //get all positions var keys = (from d in data from k in d.Data.Keys select k).Distinct().ToList().ConvertAll(m => { var p = m.Split('_'); return new { Key = m, Chr = p[0], Position = int.Parse(p[1]) }; } ); GenomeUtils.SortChromosome(keys, m => m.Chr, m => m.Position); var keyMap = keys.ToDictionary(m => m.Key); //check by original vcf file to fill the other columns foreach (var d in data) { var vcf = files[d.File]; if (string.IsNullOrEmpty(vcf)) continue; var vd = d.Data; using (var sr = new StreamReader(vcf)) { string line; var normalIndex = -1; var tumorIndex = -1; while ((line = sr.ReadLine()) != null) { if (!line.StartsWith("#CHROM")) continue; var parts = line.Split('\t'); normalIndex = Array.IndexOf(parts, d.Normal); tumorIndex = Array.IndexOf(parts, d.Tumor); break; } if (normalIndex == -1) { throw new Exception(string.Format("Normal {0} is not included in detail vcf file {1} but in annovar result {1}", d.Normal, vcf, d.File)); } if (tumorIndex == -1) { throw new Exception(string.Format("Tumor {0} is not included in detail vcf file {1} but in annovar result {1}", d.Tumor, vcf, d.File)); } var minIndex = Math.Max(normalIndex, tumorIndex) + 1; while ((line = sr.ReadLine()) != null) { var parts = line.Split('\t'); if (parts.Length < minIndex) { break; } var key = parts[0] + "_" + parts[1]; if (!keyMap.ContainsKey(key)) continue; FileDataValue fdv; if(!vd.TryGetValue(key, out fdv)) { fdv = new FileDataValue() { Key = key, Parts = null }; vd[key] = fdv; } fdv.VNormal = GetAllele(parts, normalIndex); fdv.VTumor = GetAllele(parts, tumorIndex); } } } //write header for (var i = 0; i < data[0].Headers.Length; i++) { if (i == data[0].NormalIndex || i == data[0].TumorIndex || i == data[0].InfoIndex || i == data[0].FormatIndex) { continue; } else { if (i != 0) { sw.Write("\t"); } sw.Write(data[0].Headers[i]); } } var normalnames = (from d in data select d.Normal).Distinct().ToArray(); sw.Write("\t{0}", normalnames.Merge('\t')); var tumornames = (from d in data select d.Tumor).Distinct().ToArray(); sw.WriteLine("\t{0}", tumornames.Merge('\t')); foreach (var key in keys) { var d1 = data.First(d => d.Data.ContainsKey(key.Key) && d.Data[key.Key].Parts != null); var v1 = d1.Data[key.Key]; for (var i = 0; i < v1.Parts.Length; i++) { if (i == 0) { sw.Write("{0}", v1.Parts[0]); } else if (i == d1.InfoIndex || i == d1.FormatIndex || i == d1.NormalIndex || i == d1.TumorIndex) { continue; } else { sw.Write("\t{0}", v1.Parts[i]); } } foreach (var name in normalnames) { var dn = (from d in data where d.Normal.Equals(name) && d.Data.ContainsKey(key.Key) select d).FirstOrDefault(); if (dn == null) { sw.Write("\t"); } else { var vn = dn.Data[key.Key].VNormal; sw.Write("\t{0}", vn); } } foreach (var name in tumornames) { var dn = (from d in data where d.Tumor.Equals(name) && d.Data.ContainsKey(key.Key) select d).FirstOrDefault(); if (dn == null) { sw.Write("\t"); } else { var vn = dn.Data[key.Key].VTumor; sw.Write("\t{0}", vn); } } sw.WriteLine(); } } return new[] { _options.OutputFile }; }