/// <summary>
/// Walks every locus of the binary PLINK input, tallies the two haplotype bits
/// of each non-missing sample, stores the resulting statistics on the locus and
/// finally writes the annotated locus list to the configured output file.
/// </summary>
/// <remarks>
/// A set bit is tallied as allele 2 and a cleared bit as allele 1. The stored
/// frequency is the allele-1 tally divided by the total number of tallied bits;
/// it is NaN when every sample of a locus is missing.
/// </remarks>
/// <returns>An array that contains only the output file name.</returns>
public override IEnumerable<string> Process()
{
    using (var bed = new PlinkBedRandomFile(_options.InputFile) { Progress = this.Progress })
    {
        var loci = bed.Data.Locus;
        var samples = bed.Data.Individual;

        Progress.SetRange(0, loci.Count);

        for (int locusIndex = 0; locusIndex < loci.Count; locusIndex++)
        {
            Progress.SetPosition(locusIndex);

            var current = loci[locusIndex];
            var genotype = bed.Read(current.MarkerId);

            int called = 0;
            int allele2Hits = 0;

            for (int sampleIndex = 0; sampleIndex < samples.Count; sampleIndex++)
            {
                var first = genotype[0, sampleIndex];
                var second = genotype[1, sampleIndex];

                if (PlinkData.IsMissing(first, second))
                {
                    continue;
                }

                called++;

                if (first)
                {
                    allele2Hits++;
                }

                if (second)
                {
                    allele2Hits++;
                }
            }

            // Every called sample contributes exactly two bits, so the allele-1
            // tally is whatever is left after removing the allele-2 hits.
            int totalHits = 2 * called;
            int allele1Hits = totalHits - allele2Hits;

            current.Allele1Frequency = ((double)allele1Hits) / totalHits;
            current.TotalSample = samples.Count;
            current.ValidSample = called;
        }

        PlinkLocus.WriteToFile(_options.OutputFile, loci, false, true, true);
    }

    return new string[] { _options.OutputFile };
}
/// <summary>
/// Loads the whole data set through the reader supplied by the options, tallies
/// the haplotype flags of every non-missing sample per locus, stores the
/// resulting statistics on the locus and writes the locus list to the output file.
/// </summary>
/// <remarks>
/// The stored frequency is the number of set "allele 2" flags divided by the
/// total number of tallied flags; it is NaN when every sample of a locus is missing.
/// </remarks>
/// <returns>An array that contains only the output file name.</returns>
public override IEnumerable<string> Process()
{
    Progress.SetMessage("Reading data from " + _options.InputFile + "...");

    var data = _options.GetFileReader().ReadFromFile(_options.InputFile);
    var loci = data.Locus;
    var samples = data.Individual;

    for (int locusIndex = 0; locusIndex < loci.Count; locusIndex++)
    {
        var current = loci[locusIndex];

        int called = 0;
        int allele2Hits = 0;

        for (int sampleIndex = 0; sampleIndex < samples.Count; sampleIndex++)
        {
            if (data.IsMissing(locusIndex, sampleIndex))
            {
                continue;
            }

            called++;

            if (data.IsHaplotype1Allele2[locusIndex, sampleIndex])
            {
                allele2Hits++;
            }

            if (data.IsHaplotype2Allele2[locusIndex, sampleIndex])
            {
                allele2Hits++;
            }
        }

        // Each called sample contributes two flags, so the denominator is 2 * called.
        current.Allele1Frequency = ((double)allele2Hits) / (2 * called);
        current.TotalSample = samples.Count;
        current.ValidSample = called;
    }

    PlinkLocus.WriteToFile(_options.OutputFile, loci, false, true);

    return new string[] { _options.OutputFile };
}
/// <summary>
/// Reads a space-delimited genotype file, builds one <c>PlinkLocus</c> per line
/// and writes the locus list, including the computed frequency, to the output file.
/// </summary>
/// <remarks>
/// Each line starts with five fields (chromosome, marker id, physical position,
/// allele 1, allele 2) followed by groups of three fields per sample. A "1" in
/// the first field of a group counts two copies of allele 1, a "1" in the second
/// field counts one copy of each allele, and a "1" in the third field counts two
/// copies of allele 2. Any other group is reported on standard error and ignored.
/// The stored frequency is the allele-2 tally divided by the total tally; it is
/// NaN when no group of the line could be counted.
/// </remarks>
/// <returns>An array that contains only the output file name.</returns>
public override IEnumerable<string> Process()
{
    var locusList = new List<PlinkLocus>();

    using (var sr = new StreamReader(_options.InputFile))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            // A blank line carries no locus; parsing it would only raise a FormatException.
            if (line.Trim().Length == 0)
            {
                continue;
            }

            var parts = line.Split(' ');
            var locus = new PlinkLocus()
            {
                Chromosome = int.Parse(parts[0]),
                MarkerId = parts[1],
                PhysicalPosition = int.Parse(parts[2]),
                Allele1 = parts[3],
                Allele2 = parts[4]
            };
            locusList.Add(locus);

            var count1 = 0;
            var count2 = 0;

            // Only complete triples are inspected. Bounding the loop by i + 2 keeps
            // a trailing partial group (for example the empty field produced by a
            // trailing space) from indexing past the end of the array.
            for (int i = 5; i + 2 < parts.Length; i += 3)
            {
                if (parts[i].Equals("1"))
                {
                    count1 += 2;
                }
                else if (parts[i + 1].Equals("1"))
                {
                    count1++;
                    count2++;
                }
                else if (parts[i + 2].Equals("1"))
                {
                    count2 += 2;
                }
                else
                {
                    //unknown, ignore
                    Console.Error.WriteLine(string.Format("Unknown, name={0}, i={1}, genotype={2} {3} {4}", locus.MarkerId, i, parts[i], parts[i + 1], parts[i + 2]));
                }
            }

            locus.Allele1Frequency = ((double)(count2)) / (count1 + count2);
        }
    }

    PlinkLocus.WriteToFile(_options.OutputFile, locusList, false, true);

    return new string[] { _options.OutputFile };
}
/// <summary>
/// Converts a space-delimited genotype file into a locus list: one
/// <c>PlinkLocus</c> is created per input line, its frequency is computed from
/// the per-sample genotype triples, and the list is written to the output file.
/// </summary>
/// <remarks>
/// The first five fields of a line are chromosome, marker id, physical position,
/// allele 1 and allele 2. The remaining fields are read in groups of three: a
/// "1" in the first position adds two to the allele-1 tally, a "1" in the second
/// adds one to each tally, a "1" in the third adds two to the allele-2 tally.
/// Groups without a "1" are logged to standard error and skipped. The frequency
/// is the allele-2 tally over the total tally (NaN when nothing was counted).
/// </remarks>
/// <returns>An array that contains only the output file name.</returns>
public override IEnumerable<string> Process()
{
    var locusList = new List<PlinkLocus>();

    using (var sr = new StreamReader(_options.InputFile))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            // Skip blank lines instead of failing in int.Parse on an empty field.
            if (line.Trim().Length == 0)
            {
                continue;
            }

            var parts = line.Split(' ');
            var locus = new PlinkLocus()
            {
                Chromosome = int.Parse(parts[0]),
                MarkerId = parts[1],
                PhysicalPosition = int.Parse(parts[2]),
                Allele1 = parts[3],
                Allele2 = parts[4]
            };
            locusList.Add(locus);

            var count1 = 0;
            var count2 = 0;

            // The body reads parts[i], parts[i + 1] and parts[i + 2], so the loop
            // must stop before a trailing incomplete group (e.g. the empty field
            // left by a trailing space) to avoid an IndexOutOfRangeException.
            for (int i = 5; i + 2 < parts.Length; i += 3)
            {
                if (parts[i].Equals("1"))
                {
                    count1 += 2;
                }
                else if (parts[i + 1].Equals("1"))
                {
                    count1++;
                    count2++;
                }
                else if (parts[i + 2].Equals("1"))
                {
                    count2 += 2;
                }
                else
                {
                    //unknown, ignore
                    Console.Error.WriteLine(string.Format("Unknown, name={0}, i={1}, genotype={2} {3} {4}", locus.MarkerId, i, parts[i], parts[i + 1], parts[i + 2]));
                }
            }

            locus.Allele1Frequency = ((double)(count2)) / (count1 + count2);
        }
    }

    PlinkLocus.WriteToFile(_options.OutputFile, locusList, false, true);

    return new string[] { _options.OutputFile };
}
/// <summary>
/// Parses a tab-delimited bim file (the locus file of the bed format) into a
/// list of loci. Blank lines and lines whose marker id is empty are skipped.
/// </summary>
/// <param name="fileName">Path of the bim file.</param>
/// <param name="hasPlatform">
/// When true, two extra columns (platform name and valid platform count) are
/// read directly after the six standard columns.
/// </param>
/// <param name="hasAllele2Freqency">
/// When true, three extra columns (frequency, total sample count, valid sample
/// count) are read after the standard columns and any platform columns.
/// </param>
/// <returns>The loci in file order.</returns>
public static List<PlinkLocus> ReadFromBimFile(string fileName, bool hasPlatform = false, bool hasAllele2Freqency = false)
{
    var loci = new List<PlinkLocus>();

    using (var reader = new StreamReader(fileName))
    {
        for (var raw = reader.ReadLine(); raw != null; raw = reader.ReadLine())
        {
            var text = raw.Trim();
            if (text.Length == 0)
            {
                continue;
            }

            var fields = text.Split('\t');
            if (fields[1].Length == 0)
            {
                continue;
            }

            var entry = new PlinkLocus
            {
                Chromosome = int.Parse(fields[0]),
                MarkerId = fields[1],
                GeneticDistance = double.Parse(fields[2]),
                PhysicalPosition = int.Parse(fields[3]),
                Allele1 = fields[4],
                Allele2 = fields[5]
            };

            // Optional columns start right after the six standard ones.
            var next = 6;

            if (hasPlatform)
            {
                entry.Platform = fields[next];
                entry.ValidPlatformCount = int.Parse(fields[next + 1]);
                next += 2;
            }

            if (hasAllele2Freqency)
            {
                entry.Allele1Frequency = double.Parse(fields[next]);
                entry.TotalSample = int.Parse(fields[next + 1]);
                entry.ValidSample = int.Parse(fields[next + 2]);
            }

            loci.Add(entry);
        }
    }

    return loci;
}
/// <summary>
/// Loads the loci described by a bim file (bed format). The file is read line
/// by line; each non-blank, tab-separated line with a non-empty marker id
/// becomes one <c>PlinkLocus</c>.
/// </summary>
/// <param name="fileName">Path of the bim file.</param>
/// <param name="hasPlatform">
/// True if the platform name and valid platform count follow the six standard columns.
/// </param>
/// <param name="hasAllele2Freqency">
/// True if the frequency, total sample count and valid sample count columns
/// follow the standard (and, if present, platform) columns.
/// </param>
/// <returns>The loci in the order they appear in the file.</returns>
public static List<PlinkLocus> ReadFromBimFile(string fileName, bool hasPlatform = false, bool hasAllele2Freqency = false)
{
    var loci = new List<PlinkLocus>();

    using (var reader = new StreamReader(fileName))
    {
        while (true)
        {
            var text = reader.ReadLine();
            if (text == null)
            {
                break;
            }

            text = text.Trim();
            if (text.Length == 0)
            {
                continue;
            }

            var columns = text.Split('\t');
            if (columns[1].Length == 0)
            {
                continue;
            }

            var entry = new PlinkLocus();
            entry.Chromosome = int.Parse(columns[0]);
            entry.MarkerId = columns[1];
            entry.GeneticDistance = double.Parse(columns[2]);
            entry.PhysicalPosition = int.Parse(columns[3]);
            entry.Allele1 = columns[4];
            entry.Allele2 = columns[5];

            // Position of the first optional column.
            var cursor = 6;

            if (hasPlatform)
            {
                entry.Platform = columns[cursor];
                entry.ValidPlatformCount = int.Parse(columns[cursor + 1]);
                cursor += 2;
            }

            if (hasAllele2Freqency)
            {
                entry.Allele1Frequency = double.Parse(columns[cursor]);
                entry.TotalSample = int.Parse(columns[cursor + 1]);
                entry.ValidSample = int.Parse(columns[cursor + 2]);
            }

            loci.Add(entry);
        }
    }

    return loci;
}
/// <summary>
/// Read locus from map file of ped format. Blank lines and lines whose marker
/// id is empty are skipped.
/// </summary>
/// <remarks>
/// Lines are tab-delimited: chromosome, marker id, genetic distance, physical
/// position. When at least six columns are present, columns five and six are
/// taken as the two alleles; otherwise both alleles are set to <c>MISSING</c>.
/// </remarks>
/// <param name="fileName">map file</param>
/// <returns>list of PlinkLocus</returns>
public static List<PlinkLocus> ReadFromMapFile(string fileName)
{
    var result = new List<PlinkLocus>();

    using (var sr = new StreamReader(fileName))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            line = line.Trim();
            if (string.IsNullOrEmpty(line))
            {
                continue;
            }

            var parts = line.Split('\t');
            if (string.IsNullOrEmpty(parts[1]))
            {
                continue;
            }

            var locus = new PlinkLocus();
            locus.Chromosome = int.Parse(parts[0]);
            locus.MarkerId = parts[1];

            // The genetic distance may be fractional. Parsing it as a double
            // (as ReadFromBimFile does) still accepts whole numbers but no
            // longer throws a FormatException on values such as "0.5".
            locus.GeneticDistance = double.Parse(parts[2]);
            locus.PhysicalPosition = int.Parse(parts[3]);

            if (parts.Length >= 6)
            {
                locus.Allele1 = parts[4];
                locus.Allele2 = parts[5];
            }
            else
            {
                locus.Allele1 = MISSING;
                locus.Allele2 = MISSING;
            }

            result.Add(locus);
        }
    }

    return result;
}
/// <summary>
/// Creates a <c>PlinkData</c> whose locus list is loaded from the map file that
/// accompanies <paramref name="fileName"/>. A file with the ".tmap" extension is
/// preferred; if it does not exist, the ".map" file is used.
/// </summary>
/// <param name="fileName">File whose extension is replaced to locate the map file.</param>
/// <returns>A new <c>PlinkData</c> with only its locus list populated here.</returns>
/// <exception cref="FileNotFoundException">
/// Neither the ".tmap" nor the ".map" file exists; the message names the ".map" file.
/// </exception>
private static PlinkData ReadLocus(string fileName)
{
    var data = new PlinkData();

    string candidate = null;
    foreach (var extension in new[] { ".tmap", ".map" })
    {
        candidate = FileUtils.ChangeExtension(fileName, extension);
        if (File.Exists(candidate))
        {
            data.Locus = PlinkLocus.ReadFromMapFile(candidate);
            return data;
        }
    }

    // candidate now holds the last name tried, i.e. the ".map" file.
    throw new FileNotFoundException("File not found: " + candidate);
}
/// <summary>
/// Tells whether a locus is treated as an insertion/deletion: either allele is
/// not exactly one character long, or either allele is the code "I" or "D".
/// </summary>
/// <param name="m">Locus to inspect.</param>
/// <returns>True when the locus matches one of the indel conditions.</returns>
private static bool IsIndel(PlinkLocus m)
{
    if (m.Allele1.Length != 1)
    {
        return true;
    }

    if (m.Allele2.Length != 1)
    {
        return true;
    }

    if (m.Allele1.Equals("I") || m.Allele1.Equals("D"))
    {
        return true;
    }

    return m.Allele2.Equals("I") || m.Allele2.Equals("D");
}
/// <summary>
/// Tells whether both alleles of a locus are the code "0".
/// </summary>
/// <param name="m">Locus to inspect.</param>
/// <returns>True only when allele 1 and allele 2 both equal "0".</returns>
private static bool IsMissing(PlinkLocus m)
{
    if (!m.Allele1.Equals("0"))
    {
        return false;
    }

    return m.Allele2.Equals("0");
}
/// <summary>
/// Converts a binary PLINK data set into per-chromosome ".gen" files (each with
/// a matching ".sample" file) and writes two side files: a ".namemap" file and
/// a ".stat" file.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. Load the loci from the ".bim" file and drop those flagged by IsIndel or IsMissing.
/// 2. Fill dbSNP ids by position and write the returned name map to "&lt;prefix&gt;.namemap".
/// 3. Drop SNPs whose dbSNP reference allele is still a blank character, and
///    report names shared by more than one SNP on standard error.
/// 4. If the configured 1000 Genomes / FASTA files exist, let the corresponding
///    helpers annotate the SNPs.
/// 5. Write one line per SNP, including its suggested strand action, to "&lt;prefix&gt;.stat".
/// 6. For every chromosome, write the ".sample" file and the ".gen" file with
///    one genotype triple per individual.
/// </remarks>
/// <returns>The ".stat" file name followed by the names of all ".gen" files.</returns>
public override IEnumerable<string> Process()
{
    var outputs = new List<string>();

    var bimPath = Path.ChangeExtension(options.InputFile, ".bim");
    var loci = PlinkLocus.ReadFromBimFile(bimPath, false, false);
    loci.RemoveAll(m => IsIndel(m) || IsMissing(m));

    var snpItems = loci.Select(locus => new SNPItem()
    {
        Chrom = locus.Chromosome,
        Name = locus.MarkerId,
        Position = locus.PhysicalPosition,
        Allele1 = locus.Allele1[0],
        Allele2 = locus.Allele2
    }).ToList();

    var nameMap = snpItems.FillDbsnpIdByPosition(options.DbsnpFile, this.Progress);

    using (var nameWriter = new StreamWriter(options.OutputPrefix + ".namemap"))
    {
        nameWriter.WriteLine("NewName\tOldName");
        foreach (var pair in nameMap)
        {
            nameWriter.WriteLine("{0}\t{1}", pair.Key, pair.Value);
        }
    }

    // Discard every SNP for which no dbSNP entry was found.
    snpItems.RemoveAll(m => m.DbsnpRefAllele == ' ');

    // Report names that are shared by more than one SNP.
    var byName = snpItems.ToGroupDictionary(m => m.Name);
    foreach (var entry in byName)
    {
        if (entry.Value.Count <= 1)
        {
            continue;
        }

        Console.Error.WriteLine("Duplicated SNP:" + entry.Key);
        foreach (var item in entry.Value)
        {
            Console.Error.WriteLine("{0}:{1}-{2}:{3},{4}:{5},{6}", entry.Key, item.Chrom, item.Position, item.Allele1, item.Allele2, item.DbsnpRefAllele, item.DbsnpAltAllele);
        }
    }

    if (File.Exists(options.G1000File))
    {
        snpItems.FindAllele2FrequencyFrom1000GomeByName(options.G1000File, this.Progress);
    }

    if (File.Exists(options.FastaFile))
    {
        snpItems.FillReferenceAlleleFromFasta(options.FastaFile, this.Progress);
    }

    var actionByName = new Dictionary<string, StrandAction>();

    var statPath = options.OutputPrefix + ".stat";
    outputs.Add(statPath);

    using (var statWriter = new StreamWriter(statPath))
    {
        statWriter.WriteLine("Name\tChromosome\tPosition\tSource_Allele1\tSource_Allele2\tReference_Allele\tDbsnp_RefAllele\tDbsnp_AltAllele\tDbsnp_IsReversed\tG1000_RefAllele\tG1000_AltAllele\tG1000_MAF\tAction");

        foreach (var item in snpItems)
        {
            StrandAction suggested = item.SuggestAction();

            statWriter.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11:0.####}\t{12}",
                item.Name,
                item.Chrom,
                item.Position,
                item.Allele1,
                item.Allele2,
                item.RefChar,
                item.DbsnpRefAllele,
                item.DbsnpAltAllele,
                item.DbsnpIsReversed,
                item.G1000Allele1,
                item.G1000Allele2,
                item.G1000Allele2Frequency,
                suggested);

            actionByName[item.Name] = suggested;
        }
    }

    using (var reader = new PlinkBedRandomFile(options.InputFile) { Progress = this.Progress })
    {
        var data = reader.Data;
        var chromosomes = snpItems.Select(v => v.Chrom).Distinct().OrderBy(m => m).ToArray();

        foreach (var chrom in chromosomes)
        {
            var genPath = string.Format("{0}.{1}.gen", options.OutputPrefix, chrom.ToString().PadLeft(2, '0'));
            outputs.Add(genPath);

            var samplePath = FileUtils.ChangeExtension(genPath, ".sample");
            new GwasSampleFormat().WriteToFile(samplePath, data.Individual);

            // Write the gen file of this chromosome.
            using (var genWriter = new StreamWriter(genPath))
            {
                genWriter.NewLine = Environment.NewLine;

                var chromItems = snpItems.Where(m => m.Chrom == chrom).ToList();
                GenomeUtils.SortChromosome(chromItems, m => chrom.ToString(), m => m.Position);

                foreach (var snp in chromItems)
                {
                    var bits = reader.Read(nameMap[snp.Name]);
                    var action = actionByName[snp.Name];

                    genWriter.Write("{0} {1} {2} {3} {4}", snp.Chrom, snp.Name, snp.Position, snp.DbsnpRefAllele, snp.DbsnpAltAllele);

                    // For the two "switch" actions a set bit maps to the dbSNP
                    // alternate allele; for every other action it maps to the
                    // dbSNP reference allele.
                    bool switched = StrandAction.Switch == action || StrandAction.FlipSwitch == action;

                    for (int individualIndex = 0; individualIndex < data.Individual.Count; individualIndex++)
                    {
                        if (PlinkData.IsMissing(bits[0, individualIndex], bits[1, individualIndex]))
                        {
                            genWriter.Write(" 0 0 0");
                            continue;
                        }

                        char first;
                        char second;
                        if (switched)
                        {
                            first = bits[0, individualIndex] ? snp.DbsnpAltAllele : snp.DbsnpRefAllele;
                            second = bits[1, individualIndex] ? snp.DbsnpAltAllele : snp.DbsnpRefAllele;
                        }
                        else
                        {
                            first = bits[0, individualIndex] ? snp.DbsnpRefAllele : snp.DbsnpAltAllele;
                            second = bits[1, individualIndex] ? snp.DbsnpRefAllele : snp.DbsnpAltAllele;
                        }

                        // Triple order: both reference, one of each, both alternate.
                        string triple;
                        if (first != second)
                        {
                            triple = " 0 1 0";
                        }
                        else if (first == snp.DbsnpRefAllele)
                        {
                            triple = " 1 0 0";
                        }
                        else
                        {
                            triple = " 0 0 1";
                        }

                        genWriter.Write(triple);
                    }

                    genWriter.WriteLine();
                }
            }
        }
    }

    return outputs;
}
/// <summary>
/// Returns true when the locus looks like an insertion/deletion: an allele that
/// is not a single character, or an allele equal to the code "I" or "D".
/// </summary>
/// <param name="m">Locus to inspect.</param>
/// <returns>True when any of the indel conditions holds; otherwise false.</returns>
private static bool IsIndel(PlinkLocus m)
{
    bool singleCharacterAlleles = m.Allele1.Length == 1 && m.Allele2.Length == 1;
    if (!singleCharacterAlleles)
    {
        return true;
    }

    bool firstIsCode = m.Allele1.Equals("I") || m.Allele1.Equals("D");
    if (firstIsCode)
    {
        return true;
    }

    bool secondIsCode = m.Allele2.Equals("I") || m.Allele2.Equals("D");
    return secondIsCode;
}
/// <summary>
/// Returns true when allele 1 and allele 2 of the locus are both the code "0".
/// </summary>
/// <param name="m">Locus to inspect.</param>
/// <returns>True only when both alleles equal "0".</returns>
private static bool IsMissing(PlinkLocus m)
{
    bool firstIsZero = m.Allele1.Equals("0");
    return firstIsZero ? m.Allele2.Equals("0") : false;
}
/// <summary>
/// Loads a complete binary PLINK data set: individuals from the ".fam" file,
/// loci from the ".bim" file and all genotype bits from the binary file itself.
/// </summary>
/// <remarks>
/// Every byte read holds four genotypes of two bits each; the first bit of a
/// pair goes to <c>IsHaplotype1Allele2</c> and the second to
/// <c>IsHaplotype2Allele2</c>, both indexed as [locus, individual]. In
/// SNP-major mode the outer loop runs over loci, otherwise over individuals,
/// and each outer iteration starts on a fresh byte.
/// </remarks>
/// <param name="fileName">
/// Path used to derive the ".fam" and ".bim" names and passed to
/// <c>OpenBinaryFile</c> to open the binary data.
/// </param>
/// <returns>The populated data set.</returns>
/// <exception cref="FileNotFoundException">The ".fam" or ".bim" file does not exist.</exception>
public PlinkData ReadFromFile(string fileName)
{
    var famFile = FileUtils.ChangeExtension(fileName, ".fam");
    if (!File.Exists(famFile))
    {
        throw new FileNotFoundException("File not found: " + famFile);
    }

    var bimFile = FileUtils.ChangeExtension(fileName, ".bim");
    if (!File.Exists(bimFile))
    {
        throw new FileNotFoundException("File not found: " + bimFile);
    }

    var result = new PlinkData();
    result.Individual = PlinkIndividual.ReadFromFile(famFile);
    result.Locus = PlinkLocus.ReadFromBimFile(bimFile);
    result.AllocateDataMemory();

    try
    {
        // Opening happens inside the try block so that the reader is also
        // released when header detection throws after the file was opened.
        OpenBinaryFile(fileName);

        bool snpMajor = IsSNPMajor;
        int outerCount = snpMajor ? result.Locus.Count : result.Individual.Count;
        int innerCount = snpMajor ? result.Individual.Count : result.Locus.Count;

        for (int outer = 0; outer < outerCount; outer++)
        {
            int inner = 0;
            while (inner < innerCount)
            {
                var bits = ReadByte();

                // Bit pairs start at offsets 0, 2, 4 and 6 of the byte.
                for (int bit = 0; bit < 7 && inner < innerCount; bit += 2, inner++)
                {
                    int locusIndex = snpMajor ? outer : inner;
                    int individualIndex = snpMajor ? inner : outer;

                    result.IsHaplotype1Allele2[locusIndex, individualIndex] = bits[bit];
                    result.IsHaplotype2Allele2[locusIndex, individualIndex] = bits[bit + 1];
                }
            }
        }
    }
    finally
    {
        // The reader is null when opening failed before the file was opened.
        if (_reader != null)
        {
            _reader.Close();
            _reader = null;
        }
    }

    return result;
}
/// <summary>
/// Loads the individuals (".fam") and loci (".bim") into <c>Data</c>, opens the
/// binary file and inspects its leading bytes to decide whether the genotype
/// data is stored in SNP-major or individual-major order.
/// </summary>
/// <remarks>
/// Header detection, in order:
/// 1. If the first two bytes match the two expected bit patterns, the file is
///    treated as v1.00 and bit 0 of the third byte selects SNP-major mode.
/// 2. Otherwise the file is reopened and its first byte is read again. If any
///    of bits 1..7 is set, the file is assumed to be older than v0.99: it is
///    reopened once more and individual-major mode is used.
/// 3. Otherwise the file is treated as v0.99 and bit 0 of that byte selects
///    SNP-major mode.
/// The stream position reached afterwards is stored in <c>_startPosition</c>.
/// </remarks>
/// <param name="fileName">
/// Path used to derive the ".fam" and ".bim" names and passed to <c>DoOpenFile</c>.
/// </param>
/// <exception cref="FileNotFoundException">The ".fam" or ".bim" file does not exist.</exception>
public void OpenBinaryFile(string fileName)
{
    var famFile = FileUtils.ChangeExtension(fileName, ".fam");
    if (!File.Exists(famFile))
    {
        throw new FileNotFoundException("File not found: " + famFile);
    }

    var bimFile = FileUtils.ChangeExtension(fileName, ".bim");
    if (!File.Exists(bimFile))
    {
        throw new FileNotFoundException("File not found: " + bimFile);
    }

    Data = new PlinkData();
    Data.Individual = PlinkIndividual.ReadFromFile(famFile);
    Data.Locus = PlinkLocus.ReadFromBimFile(bimFile);
    Data.BuildMap();

    DoOpenFile(fileName);

    BitArray header = ReadByte();
    bool isVersion1 = false;

    // First expected byte: bits 2, 3, 5 and 6 set, all others clear.
    bool firstMatches = header[2] && header[3] && header[5] && header[6]
                        && !header[0] && !header[1] && !header[4] && !header[7];
    if (firstMatches)
    {
        // Next number
        header = ReadByte();

        // Second expected byte: bits 0, 1, 3 and 4 set, all others clear.
        bool secondMatches = header[0] && header[1] && header[3] && header[4]
                             && !header[2] && !header[5] && !header[6] && !header[7];
        if (secondMatches)
        {
            isVersion1 = true;

            header = ReadByte();
            IsSNPMajor = header[0];
            Progress.SetMessage(IsSNPMajor
                ? "Detected that binary PED file is v1.00 SNP-major mode\n"
                : "Detected that binary PED file is v1.00 individual-major mode\n");
        }
    }

    if (!isVersion1)
    {
        // Not a v1.00 header: start over and look at the first byte again.
        Progress.SetMessage("Warning, old BED file <v1.00 : will try to recover...\n");
        DoOpenFile(fileName);
        header = ReadByte();

        bool anyUpperBitSet = header[1] || header[2] || header[3] || header[4]
                              || header[5] || header[6] || header[7];
        if (anyUpperBitSet)
        {
            // Older than v0.99: no header byte at all, so reopen from the start.
            Progress.SetMessage(" *** Possible problem: guessing that BED is < v0.99 *** ");
            Progress.SetMessage(" *** High chance of data corruption, spurious results *** ");
            IsSNPMajor = false;
            DoOpenFile(fileName);
        }
        else
        {
            // v0.99: the single header byte carries the mode in bit 0.
            IsSNPMajor = header[0];
            Progress.SetMessage("Binary PED file is v0.99\n");
            Progress.SetMessage(IsSNPMajor
                ? "Detected that binary PED file is in SNP-major mode\n"
                : "Detected that binary PED file is in individual-major mode\n");
        }
    }

    _startPosition = _reader.BaseStream.Position;
}
/// <summary>
/// Read locus from map file of ped format. Blank lines and lines with an empty
/// marker id are ignored.
/// </summary>
/// <remarks>
/// Each line is split on tabs into chromosome, marker id, genetic distance and
/// physical position. If the line has six or more columns, the fifth and sixth
/// are used as allele 1 and allele 2; otherwise both alleles become <c>MISSING</c>.
/// </remarks>
/// <param name="fileName">map file</param>
/// <returns>list of PlinkLocus</returns>
public static List<PlinkLocus> ReadFromMapFile(string fileName)
{
    var result = new List<PlinkLocus>();

    using (var sr = new StreamReader(fileName))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            line = line.Trim();
            if (string.IsNullOrEmpty(line))
            {
                continue;
            }

            var parts = line.Split('\t');
            if (string.IsNullOrEmpty(parts[1]))
            {
                continue;
            }

            var locus = new PlinkLocus();
            locus.Chromosome = int.Parse(parts[0]);
            locus.MarkerId = parts[1];

            // Genetic distances can be fractional. double.Parse matches what
            // ReadFromBimFile does for the same property and, unlike int.Parse,
            // does not fail on a value such as "0.5".
            locus.GeneticDistance = double.Parse(parts[2]);
            locus.PhysicalPosition = int.Parse(parts[3]);

            if (parts.Length >= 6)
            {
                locus.Allele1 = parts[4];
                locus.Allele2 = parts[5];
            }
            else
            {
                locus.Allele1 = MISSING;
                locus.Allele2 = MISSING;
            }

            result.Add(locus);
        }
    }

    return result;
}