/// <summary>
/// Loads SNP items from a tab-delimited text file whose first line is a header.
/// </summary>
/// <remarks>
/// The first ten columns of every data line are read as: name, chromosome, position,
/// allele 1 (first character only), allele 2, dbSNP-is-reversed flag, 1000G allele 1,
/// 1000G allele 2 (first character each), 1000G allele 2 frequency and dataset.
/// The remaining columns come in groups of two ("allele1:allele2", allele 2 frequency) or,
/// when the header contains "_AlleleNeedFlip", groups of three (with an extra need-flip flag).
/// A group is stored in <c>Platforms</c> under the header cell of its first column after
/// that cell has been passed through <c>StringBefore("_Allele")</c>.
/// </remarks>
/// <param name="fileName">Path of the file to read.</param>
/// <returns>
/// One item per non-empty data line, in file order; an empty list when the file has no header line.
/// </returns>
public static List<SNPItem> ReadFromFile(string fileName)
{
    var result = new List<SNPItem>();

    using (var sr = new StreamReader(fileName))
    {
        var line = sr.ReadLine();
        if (line == null)
        {
            // ReadLine returns null at end of stream: an empty file has no header and no items.
            return result;
        }

        var hasFlip = line.Contains("_AlleleNeedFlip");
        var platforms = line.Split('\t').Select(m => m.StringBefore("_Allele")).ToList();

        // Number of columns each platform occupies: alleles, frequency and, optionally, the flip flag.
        var step = hasFlip ? 3 : 2;

        while ((line = sr.ReadLine()) != null)
        {
            if (line.Length == 0)
            {
                // A blank line (typically a trailing newline) carries no record.
                continue;
            }

            var parts = line.Split('\t');

            // Fixed columns, per the original note in this method:
            // Dbsnp_ID, CHROM, POS, Dbsnp_RefAllele, Dbsnp_AltAllele, Dbsnp_IsReversed,
            // 1000G_REF, 1000G_ALT, 1000G_MAF, Dataset, followed by the per-platform columns.
            var item = new SNPItem()
            {
                Name = parts[0],
                Chrom = int.Parse(parts[1]),
                Position = int.Parse(parts[2]),
                Allele1 = parts[3][0],
                Allele2 = parts[4],
                DbsnpIsReversed = bool.Parse(parts[5]),
                G1000Allele1 = parts[6][0],
                G1000Allele2 = parts[7][0],
                G1000Allele2Frequency = double.Parse(parts[8]),
                Dataset = parts[9]
            };

            for (int i = 10; i < parts.Length; i += step)
            {
                // An empty allele cell means this SNP has no data for the platform.
                if (string.IsNullOrEmpty(parts[i]))
                {
                    continue;
                }

                var alleles = parts[i].Split(':');
                var aitem = new Alleles()
                {
                    Allele1 = alleles[0],
                    Allele2 = alleles[1]
                };
                aitem.Allele2Frequency = double.Parse(parts[i + 1]);
                if (hasFlip)
                {
                    aitem.NeedFlip = bool.Parse(parts[i + 2]);
                }

                // Header and data columns share indices, so column i names this platform.
                item.Platforms[platforms[i]] = aitem;
            }

            result.Add(item);
        }
    }

    return result;
}
/// <summary>
/// Writes the lines of every input file into a single output file and records, in a
/// companion ".info" file, each written marker id together with its imputed flag.
/// </summary>
/// <remarks>
/// A line whose first field equals "---" is treated as imputed. It is written only when its
/// third field is a key among the target SNP positions of the file's chromosome; the output
/// line is then the target's chromosome, the marker id and the remainder of the line as
/// returned by <c>StringAfter</c> on the second field. The marker id is the target's name
/// (or the second field when the name is empty), prefixed with "dataset:" when the target
/// has a dataset. Every other line is copied unchanged.
/// </remarks>
/// <returns>An array holding only the output file path.</returns>
public override IEnumerable<string> Process()
{
    var targets = SNPItem.ReadFromFile(_options.TargetSnpFile);

    using (var output = new StreamWriter(_options.OutputFile))
    using (var info = new StreamWriter(_options.OutputFile + ".info"))
    {
        info.WriteLine("MarkerId\tIsImputed");

        foreach (var inputFile in _options.InputFiles)
        {
            int chrom = DetectChromosome(inputFile);
            Progress.SetMessage("Chromosome {0} : {1}", chrom, inputFile);

            // Index this chromosome's targets by position text so they can be looked up by field.
            var onChromosome = targets.Where(snp => snp.Chrom == chrom);
            var byPosition = onChromosome.ToDictionary(snp => snp.Position.ToString());

            using (var reader = new StreamReader(inputFile))
            {
                string row;
                while ((row = reader.ReadLine()) != null)
                {
                    string[] fields = row.Take(delimiter, 3);
                    bool imputed = fields[0].Equals("---");

                    if (!imputed)
                    {
                        // Non-imputed lines pass through untouched.
                        output.WriteLine(row);
                        info.WriteLine("{0}\t{1}", fields[1], imputed);
                        continue;
                    }

                    SNPItem target;
                    if (!byPosition.TryGetValue(fields[2], out target))
                    {
                        // Imputed line at a position that is not a target: dropped.
                        continue;
                    }

                    string markerName;
                    if (string.IsNullOrEmpty(target.Name))
                    {
                        markerName = fields[1];
                    }
                    else
                    {
                        markerName = target.Name;
                    }

                    string markerId;
                    if (string.IsNullOrEmpty(target.Dataset))
                    {
                        markerId = markerName;
                    }
                    else
                    {
                        markerId = target.Dataset + ":" + markerName;
                    }

                    output.WriteLine("{0} {1}{2}", target.Chrom, markerId, row.StringAfter(fields[1]));
                    info.WriteLine("{0}\t{1}", markerId, imputed);
                }
            }
        }
    }

    return new[] { _options.OutputFile };
}
/// <summary>
/// Fills the 1000 Genomes alleles and allele 2 frequency of the given SNP items from a VCF file.
/// </summary>
/// <remarks>
/// Items and VCF records are paired through the string produced by <paramref name="keyFunc"/>;
/// for every record a temporary item is built from the chromosome, position and ID columns to
/// compute its key. For a paired item, the first single-character ALT allele for which
/// <c>IsSourceAllelesMatchedWithG1000()</c> returns true supplies <c>G1000Allele2</c> and the
/// AF value at the same index. When no ALT allele qualifies, the item's 1000G fields are reset
/// to ' ', ' ' and 0.0.
/// </remarks>
/// <param name="snpItems">Items to update in place; their keys must be unique.</param>
/// <param name="g1000VcfFile">Path of the VCF file to read.</param>
/// <param name="keyFunc">Produces the lookup key for an item.</param>
/// <param name="progress">Progress sink; a <c>ConsoleProgressCallback</c> is used when null.</param>
private static void DoFillAllele2FrequencyFrom1000Gome(this IEnumerable<SNPItem> snpItems, string g1000VcfFile, Func<SNPItem, string> keyFunc, IProgressCallback progress = null)
{
    if (progress == null)
    {
        progress = new ConsoleProgressCallback();
    }

    var dic = snpItems.ToDictionary(m => keyFunc(m));

    progress.SetMessage("Filling MAF from {0} ...", g1000VcfFile);
    using (var sr = new StreamReader(g1000VcfFile))
    {
        progress.SetRange(0, sr.BaseStream.Length);

        // Skip the "##" meta lines. The first line without that prefix (the column header)
        // is consumed by this loop as well, so the next loop starts at the first record.
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            if (!line.StartsWith("##", StringComparison.Ordinal))
            {
                break;
            }
        }

        int linecount = 0;
        while ((line = sr.ReadLine()) != null)
        {
            linecount++;
            if (linecount % 10000 == 0)
            {
                progress.SetPosition(sr.GetCharpos());
            }

            var parts = line.Split('\t');

            // Temporary item used only to compute the record's lookup key.
            var snp = new SNPItem()
            {
                Chrom = HumanChromosomeToInt(parts[0]),
                Position = int.Parse(parts[1]),
                Name = parts[2]
            };

            SNPItem loc;
            if (!dic.TryGetValue(keyFunc(snp), out loc))
            {
                continue;
            }

            loc.G1000Allele1 = parts[3][0];
            var allele2 = parts[4].Split(',');
            var frequencies = parts[7].StringAfter("AF=").StringBefore(";").Split(',');

            bool bFound = false;
            for (int i = 0; i < allele2.Length; i++)
            {
                // Only single-character ALT alleles are considered.
                if (allele2[i].Length != 1)
                {
                    continue;
                }

                loc.G1000Allele2 = allele2[i][0];
                if (loc.IsSourceAllelesMatchedWithG1000())
                {
                    // VCF numbers always use '.' as the decimal separator, so the parse must
                    // not depend on the current culture.
                    loc.G1000Allele2Frequency = double.Parse(frequencies[i], System.Globalization.CultureInfo.InvariantCulture);
                    bFound = true;
                    break;
                }
            }

            // NOTE(review): if several records map to the same key, a later record without a
            // matching allele clears an earlier match - confirm this is intended.
            if (!bFound)
            {
                loc.G1000Allele1 = ' ';
                loc.G1000Allele2 = ' ';
                loc.G1000Allele2Frequency = 0.0;
            }
        }
    }

    progress.SetMessage("Filling MAF finished.");
}
/// <summary>
/// Loads SNP items from a tab-delimited text file whose first line is a header.
/// </summary>
/// <remarks>
/// The first ten columns of every data line are read as: name, chromosome, position,
/// allele 1 (first character only), allele 2, dbSNP-is-reversed flag, 1000G allele 1,
/// 1000G allele 2 (first character each), 1000G allele 2 frequency and dataset.
/// The remaining columns come in groups of two ("allele1:allele2", allele 2 frequency) or,
/// when the header contains "_AlleleNeedFlip", groups of three (with an extra need-flip flag).
/// A group is stored in <c>Platforms</c> under the header cell of its first column after
/// that cell has been passed through <c>StringBefore("_Allele")</c>.
/// </remarks>
/// <param name="fileName">Path of the file to read.</param>
/// <returns>
/// One item per non-empty data line, in file order; an empty list when the file has no header line.
/// </returns>
public static List<SNPItem> ReadFromFile(string fileName)
{
    var result = new List<SNPItem>();

    using (var sr = new StreamReader(fileName))
    {
        var line = sr.ReadLine();
        if (line == null)
        {
            // ReadLine returns null at end of stream: an empty file has no header and no items.
            return result;
        }

        var hasFlip = line.Contains("_AlleleNeedFlip");
        var platforms = line.Split('\t').Select(m => m.StringBefore("_Allele")).ToList();

        // Number of columns each platform occupies: alleles, frequency and, optionally, the flip flag.
        var step = hasFlip ? 3 : 2;

        while ((line = sr.ReadLine()) != null)
        {
            if (line.Length == 0)
            {
                // A blank line (typically a trailing newline) carries no record.
                continue;
            }

            var parts = line.Split('\t');

            // Fixed columns, per the original note in this method:
            // Dbsnp_ID, CHROM, POS, Dbsnp_RefAllele, Dbsnp_AltAllele, Dbsnp_IsReversed,
            // 1000G_REF, 1000G_ALT, 1000G_MAF, Dataset, followed by the per-platform columns.
            var item = new SNPItem()
            {
                Name = parts[0],
                Chrom = int.Parse(parts[1]),
                Position = int.Parse(parts[2]),
                Allele1 = parts[3][0],
                Allele2 = parts[4],
                DbsnpIsReversed = bool.Parse(parts[5]),
                G1000Allele1 = parts[6][0],
                G1000Allele2 = parts[7][0],
                G1000Allele2Frequency = double.Parse(parts[8]),
                Dataset = parts[9]
            };

            for (int i = 10; i < parts.Length; i += step)
            {
                // An empty allele cell means this SNP has no data for the platform.
                if (string.IsNullOrEmpty(parts[i]))
                {
                    continue;
                }

                var alleles = parts[i].Split(':');
                var aitem = new Alleles()
                {
                    Allele1 = alleles[0],
                    Allele2 = alleles[1]
                };
                aitem.Allele2Frequency = double.Parse(parts[i + 1]);
                if (hasFlip)
                {
                    aitem.NeedFlip = bool.Parse(parts[i + 2]);
                }

                // Header and data columns share indices, so column i names this platform.
                item.Platforms[platforms[i]] = aitem;
            }

            result.Add(item);
        }
    }

    return result;
}
/// <summary>
/// Fills the 1000 Genomes alleles and allele 2 frequency of the given SNP items from a VCF file.
/// </summary>
/// <remarks>
/// Items and VCF records are paired through the string produced by <paramref name="keyFunc"/>;
/// for every record a temporary item is built from the chromosome, position and ID columns to
/// compute its key. For a paired item, the first single-character ALT allele for which
/// <c>IsSourceAllelesMatchedWithG1000()</c> returns true supplies <c>G1000Allele2</c> and the
/// AF value at the same index. When no ALT allele qualifies, the item's 1000G fields are reset
/// to ' ', ' ' and 0.0.
/// </remarks>
/// <param name="snpItems">Items to update in place; their keys must be unique.</param>
/// <param name="g1000VcfFile">Path of the VCF file to read.</param>
/// <param name="keyFunc">Produces the lookup key for an item.</param>
/// <param name="progress">Progress sink; a <c>ConsoleProgressCallback</c> is used when null.</param>
private static void DoFillAllele2FrequencyFrom1000Gome(this IEnumerable<SNPItem> snpItems, string g1000VcfFile, Func<SNPItem, string> keyFunc, IProgressCallback progress = null)
{
    if (progress == null)
    {
        progress = new ConsoleProgressCallback();
    }

    var dic = snpItems.ToDictionary(m => keyFunc(m));

    progress.SetMessage("Filling MAF from {0} ...", g1000VcfFile);
    using (var sr = new StreamReader(g1000VcfFile))
    {
        progress.SetRange(0, sr.BaseStream.Length);

        // Skip the "##" meta lines. The first line without that prefix (the column header)
        // is consumed by this loop as well, so the next loop starts at the first record.
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            if (!line.StartsWith("##", StringComparison.Ordinal))
            {
                break;
            }
        }

        int linecount = 0;
        while ((line = sr.ReadLine()) != null)
        {
            linecount++;
            if (linecount % 10000 == 0)
            {
                progress.SetPosition(sr.GetCharpos());
            }

            var parts = line.Split('\t');

            // Temporary item used only to compute the record's lookup key.
            var snp = new SNPItem()
            {
                Chrom = HumanChromosomeToInt(parts[0]),
                Position = int.Parse(parts[1]),
                Name = parts[2]
            };

            SNPItem loc;
            if (!dic.TryGetValue(keyFunc(snp), out loc))
            {
                continue;
            }

            loc.G1000Allele1 = parts[3][0];
            var allele2 = parts[4].Split(',');
            var frequencies = parts[7].StringAfter("AF=").StringBefore(";").Split(',');

            bool bFound = false;
            for (int i = 0; i < allele2.Length; i++)
            {
                // Only single-character ALT alleles are considered.
                if (allele2[i].Length != 1)
                {
                    continue;
                }

                loc.G1000Allele2 = allele2[i][0];
                if (loc.IsSourceAllelesMatchedWithG1000())
                {
                    // VCF numbers always use '.' as the decimal separator, so the parse must
                    // not depend on the current culture.
                    loc.G1000Allele2Frequency = double.Parse(frequencies[i], System.Globalization.CultureInfo.InvariantCulture);
                    bFound = true;
                    break;
                }
            }

            // NOTE(review): if several records map to the same key, a later record without a
            // matching allele clears an earlier match - confirm this is intended.
            if (!bFound)
            {
                loc.G1000Allele1 = ' ';
                loc.G1000Allele2 = ' ';
                loc.G1000Allele2Frequency = 0.0;
            }
        }
    }

    progress.SetMessage("Filling MAF finished.");
}