/// <summary>
/// Fill reference allele from genome fasta file.
/// Sequences are read from the file one at a time; for every sequence whose name maps
/// (via HumanChromosomeToInt) to a chromosome present in <paramref name="snpItems"/>,
/// the RefChar of each SNP on that chromosome is set to the upper-cased base found at
/// the SNP position.
/// </summary>
/// <param name="snpItems">SNP items to update in place; they are grouped by their Chrom value.</param>
/// <param name="fastaFile">Path of the genome fasta file to read.</param>
/// <param name="progress">Progress reporter; a ConsoleProgressCallback is used when null.</param>
public static void FillReferenceAlleleFromFasta(this IEnumerable<SNPItem> snpItems, string fastaFile, IProgressCallback progress = null)
{
  if (progress == null)
  {
    progress = new ConsoleProgressCallback();
  }

  var snpsByChromosome = snpItems.ToGroupDictionary(m => m.Chrom);

  progress.SetMessage("Filling reference allele from {0} file ...", fastaFile);
  using (var reader = new StreamReader(fastaFile))
  {
    var fastaFormat = new FastaFormat();
    Sequence seq;
    while ((seq = fastaFormat.ReadSequence(reader)) != null)
    {
      progress.SetMessage("chromosome " + seq.Name + " ...");

      var chr = HumanChromosomeToInt(seq.Name);
      if (!snpsByChromosome.ContainsKey(chr))
      {
        // No SNP was requested on this chromosome.
        continue;
      }

      foreach (var snp in snpsByChromosome[chr])
      {
        // Position is used as a 1-based coordinate, hence the -1 when indexing the sequence string.
        // Invariant upper-casing keeps the stored base independent of the current thread culture.
        snp.RefChar = char.ToUpperInvariant(seq.SeqString[snp.Position - 1]);
      }
    }
  }
  progress.SetMessage("Filling reference allele finished.");
}
/// <summary>
/// Fills G1000Allele1, G1000Allele2 and G1000Allele2Frequency of the given SNP items
/// from the records of a tab-delimited VCF file. A VCF record is matched to a SNP item
/// when <paramref name="keyFunc"/> yields the same key for both.
/// </summary>
/// <param name="snpItems">SNP items to update in place; <paramref name="keyFunc"/> must give each a unique key.</param>
/// <param name="g1000VcfFile">Path of the VCF file to read.</param>
/// <param name="keyFunc">Builds the lookup key of a SNP item (it is also applied to a temporary item built from each VCF record's chromosome, position and name columns).</param>
/// <param name="progress">Progress reporter; a ConsoleProgressCallback is used when null.</param>
private static void DoFillAllele2FrequencyFrom1000Gome(this IEnumerable<SNPItem> snpItems, string g1000VcfFile, Func<SNPItem, string> keyFunc, IProgressCallback progress = null)
{
  if (progress == null)
  {
    progress = new ConsoleProgressCallback();
  }

  var dic = snpItems.ToDictionary(m => keyFunc(m));

  // VCF is a machine-readable format: numbers must be parsed independently of the current culture.
  var invariant = System.Globalization.CultureInfo.InvariantCulture;

  progress.SetMessage("Filling MAF from {0} ...", g1000VcfFile);
  using (var sr = new StreamReader(g1000VcfFile))
  {
    progress.SetRange(0, sr.BaseStream.Length);

    // Skip the "##" meta lines; the first line not starting with "##" is consumed as well
    // (presumably the "#CHROM ..." column header - verify against the input files).
    string line;
    while ((line = sr.ReadLine()) != null)
    {
      if (!line.StartsWith("##", StringComparison.Ordinal))
      {
        break;
      }
    }

    int linecount = 0;
    while ((line = sr.ReadLine()) != null)
    {
      linecount++;

      // Report progress only every 10000 lines.
      if (linecount % 10000 == 0)
      {
        progress.SetPosition(sr.GetCharpos());
      }

      // An empty line carries no record; splitting it would fail on the missing columns below.
      if (line.Length == 0)
      {
        continue;
      }

      var parts = line.Split('\t');

      // Temporary item built only to compute the lookup key of this record.
      var snp = new SNPItem()
      {
        Chrom = HumanChromosomeToInt(parts[0]),
        Position = int.Parse(parts[1], invariant),
        Name = parts[2]
      };

      SNPItem loc;
      if (!dic.TryGetValue(keyFunc(snp), out loc))
      {
        continue;
      }

      loc.G1000Allele1 = parts[3][0];

      // Column 5 holds comma-separated alternative alleles; the "AF=" entry of column 8
      // is read as the comma-separated frequencies at the same indices.
      var allele2 = parts[4].Split(',');
      var frequencies = parts[7].StringAfter("AF=").StringBefore(";").Split(',');

      bool bFound = false;
      for (int i = 0; i < allele2.Length; i++)
      {
        // Only single-character alternative alleles are considered.
        if (allele2[i].Length != 1)
        {
          continue;
        }

        // Tentatively assign the candidate so the match check can see it.
        loc.G1000Allele2 = allele2[i][0];
        if (loc.IsSourceAllelesMatchedWithG1000())
        {
          loc.G1000Allele2Frequency = double.Parse(frequencies[i], invariant);
          bFound = true;
          break;
        }
      }

      if (!bFound)
      {
        // No alternative allele matched: reset the fields assigned above.
        loc.G1000Allele1 = ' ';
        loc.G1000Allele2 = ' ';
        loc.G1000Allele2Frequency = 0.0;
      }
    }
  }
  progress.SetMessage("Filling MAF finished.");
}