/// <summary>
/// Walks every locus of the PLINK binary input, tallies allele-1 / allele-2
/// occurrences over the samples that are not missing, stores the resulting
/// statistics on each locus and writes the locus list to the output file.
/// </summary>
/// <returns>A one-element array containing the output file path.</returns>
public override IEnumerable<string> Process()
{
    using (var bed = new PlinkBedRandomFile(_options.InputFile) { Progress = this.Progress })
    {
        var loci = bed.Data.Locus;
        var individuals = bed.Data.Individual;

        Progress.SetRange(0, loci.Count);

        for (int locusIndex = 0; locusIndex < loci.Count; locusIndex++)
        {
            Progress.SetPosition(locusIndex);

            var current = loci[locusIndex];
            var genotypes = bed.Read(current.MarkerId);

            int allele1Total = 0;
            int allele2Total = 0;
            int calledSamples = 0;

            for (int sample = 0; sample < individuals.Count; sample++)
            {
                var first = genotypes[0, sample];
                var second = genotypes[1, sample];

                if (!PlinkData.IsMissing(first, second))
                {
                    calledSamples++;

                    // A set flag counts towards allele 2, a cleared flag towards allele 1.
                    int allele2Copies = (first ? 1 : 0) + (second ? 1 : 0);
                    allele2Total += allele2Copies;
                    allele1Total += 2 - allele2Copies;
                }
            }

            // When no sample is called both totals are zero and the quotient is NaN.
            current.Allele1Frequency = ((double)allele1Total) / (allele1Total + allele2Total);
            current.TotalSample = individuals.Count;
            current.ValidSample = calledSamples;
        }

        PlinkLocus.WriteToFile(_options.OutputFile, loci, false, true, true);
    }

    return new string[] { _options.OutputFile };
}
/// <summary>
/// Asserts that <paramref name="data"/> matches the expected fixture:
/// two loci, seven individuals, and the expected allele / genotype strings
/// for each locus.
/// </summary>
private static void Validate(PlinkData data)
{
    Assert.AreEqual(2, data.Locus.Count);
    Assert.AreEqual(7, data.Individual.Count);

    // Expected values, indexed by locus.
    string[] expectedAllele1 = { "GCCCGC0", "ATTTA0T" };
    string[] expectedAllele2 = { "GGGCGG0", "ATAAA0A" };
    string[] expectedGenotype = { "2,1,1,0,2,1,3", "2,0,1,1,2,3,1" };

    // Alleles are checked first for every locus, then the genotype strings.
    for (int locus = 0; locus < expectedAllele1.Length; locus++)
    {
        Assert.AreEqual(expectedAllele1[locus], data.LocusAllele1(locus));
        Assert.AreEqual(expectedAllele2[locus], data.LocusAllele2(locus));
    }

    for (int locus = 0; locus < expectedGenotype.Length; locus++)
    {
        Assert.AreEqual(expectedGenotype[locus], data.LocusGenoType(locus, ","));
    }
}
/// <summary>
/// Loads a complete PLINK binary data set: individuals from the companion
/// ".fam" file, loci from the companion ".bim" file, and the genotype bits
/// from the binary file itself.
/// </summary>
/// <param name="fileName">Path of the binary file; the ".fam" and ".bim" paths are derived from it.</param>
/// <returns>The populated <see cref="PlinkData"/>.</returns>
/// <exception cref="FileNotFoundException">The ".fam" or ".bim" companion file does not exist.</exception>
public PlinkData ReadFromFile(string fileName)
{
    var famFile = FileUtils.ChangeExtension(fileName, ".fam");
    if (!File.Exists(famFile))
    {
        throw new FileNotFoundException("File not found: " + famFile);
    }

    var bimFile = FileUtils.ChangeExtension(fileName, ".bim");
    if (!File.Exists(bimFile))
    {
        throw new FileNotFoundException("File not found: " + bimFile);
    }

    var result = new PlinkData();
    result.Individual = PlinkIndividual.ReadFromFile(famFile);
    result.Locus = PlinkLocus.ReadFromBimFile(bimFile);
    result.AllocateDataMemory();

    try
    {
        // Opened inside the try block so that a failure while opening or
        // inspecting the header still releases the reader in the finally block.
        OpenBinaryFile(fileName);

        if (IsSNPMajor)
        {
            // One row per locus; each byte carries up to four individuals (two bits each).
            for (int i = 0; i < result.Locus.Count; i++)
            {
                int j = 0;
                while (j < result.Individual.Count)
                {
                    var b = ReadByte();
                    // Unused trailing bit pairs of a row's last byte are skipped.
                    for (int c = 0; c < 8 && j < result.Individual.Count; c += 2, j++)
                    {
                        result.IsHaplotype1Allele2[i, j] = b[c];
                        result.IsHaplotype2Allele2[i, j] = b[c + 1];
                    }
                }
            }
        }
        else
        {
            // One row per individual; each byte carries up to four loci (two bits each).
            for (int i = 0; i < result.Individual.Count; i++)
            {
                int j = 0;
                while (j < result.Locus.Count)
                {
                    var b = ReadByte();
                    for (int c = 0; c < 8 && j < result.Locus.Count; c += 2, j++)
                    {
                        result.IsHaplotype1Allele2[j, i] = b[c];
                        result.IsHaplotype2Allele2[j, i] = b[c + 1];
                    }
                }
            }
        }
    }
    finally
    {
        // The reader may be unset if opening failed before it was created.
        if (_reader != null)
        {
            _reader.Close();
            _reader = null;
        }
    }

    return result;
}
/// <summary>
/// Exports the PLINK binary input as per-chromosome ".gen" files (each with a
/// matching ".sample" file), after annotating the SNPs with dbSNP information
/// and, when the optional files exist, 1000 Genomes and FASTA reference data.
/// Also writes a ".namemap" file and a ".stat" file under the output prefix.
/// </summary>
/// <returns>The ".stat" file followed by every ".gen" file written.</returns>
public override IEnumerable<string> Process()
{
    var outputs = new List<string>();

    // Load loci from the .bim file and drop indels and missing entries.
    var bimPath = Path.ChangeExtension(options.InputFile, ".bim");
    var loci = PlinkLocus.ReadFromBimFile(bimPath, false, false);
    loci.RemoveAll(m => IsIndel(m) || IsMissing(m));

    var snpItems = loci.Select(snp => new SNPItem()
    {
        Chrom = snp.Chromosome,
        Name = snp.MarkerId,
        Position = snp.PhysicalPosition,
        Allele1 = snp.Allele1[0],
        Allele2 = snp.Allele2
    }).ToList();

    // The returned map is written with its keys under "NewName" and values under "OldName".
    var nameMap = snpItems.FillDbsnpIdByPosition(options.DbsnpFile, this.Progress);
    using (var mapWriter = new StreamWriter(options.OutputPrefix + ".namemap"))
    {
        mapWriter.WriteLine("NewName\tOldName");
        foreach (var entry in nameMap)
        {
            mapWriter.WriteLine("{0}\t{1}", entry.Key, entry.Value);
        }
    }

    // Discard SNPs for which no dbSNP reference allele was filled in.
    snpItems.RemoveAll(m => m.DbsnpRefAllele == ' ');

    // Report names that are shared by more than one SNP.
    var byName = snpItems.ToGroupDictionary(m => m.Name);
    foreach (var group in byName)
    {
        if (group.Value.Count <= 1)
        {
            continue;
        }

        Console.Error.WriteLine("Duplicated SNP:" + group.Key);
        foreach (var dup in group.Value)
        {
            Console.Error.WriteLine("{0}:{1}-{2}:{3},{4}:{5},{6}", group.Key, dup.Chrom, dup.Position, dup.Allele1, dup.Allele2, dup.DbsnpRefAllele, dup.DbsnpAltAllele);
        }
    }

    // Optional annotation sources are used only when the files exist.
    if (File.Exists(options.G1000File))
    {
        snpItems.FindAllele2FrequencyFrom1000GomeByName(options.G1000File, this.Progress);
    }

    if (File.Exists(options.FastaFile))
    {
        snpItems.FillReferenceAlleleFromFasta(options.FastaFile, this.Progress);
    }

    // Write one statistics row per SNP and remember the suggested action by name.
    Dictionary<string, StrandAction> actionByName = new Dictionary<string, StrandAction>();
    var statPath = options.OutputPrefix + ".stat";
    outputs.Add(statPath);
    using (var statWriter = new StreamWriter(statPath))
    {
        statWriter.WriteLine("Name\tChromosome\tPosition\tSource_Allele1\tSource_Allele2\tReference_Allele\tDbsnp_RefAllele\tDbsnp_AltAllele\tDbsnp_IsReversed\tG1000_RefAllele\tG1000_AltAllele\tG1000_MAF\tAction");
        foreach (var item in snpItems)
        {
            StrandAction suggested = item.SuggestAction();
            statWriter.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11:0.####}\t{12}",
                item.Name,
                item.Chrom,
                item.Position,
                item.Allele1,
                item.Allele2,
                item.RefChar,
                item.DbsnpRefAllele,
                item.DbsnpAltAllele,
                item.DbsnpIsReversed,
                item.G1000Allele1,
                item.G1000Allele2,
                item.G1000Allele2Frequency,
                suggested);
            actionByName[item.Name] = suggested;
        }
    }

    using (var reader = new PlinkBedRandomFile(options.InputFile) { Progress = this.Progress })
    {
        var data = reader.Data;
        var chromosomes = snpItems.Select(v => v.Chrom).Distinct().OrderBy(m => m).ToArray();

        foreach (var chr in chromosomes)
        {
            var genPath = string.Format("{0}.{1}.gen", options.OutputPrefix, chr.ToString().PadLeft(2, '0'));
            outputs.Add(genPath);

            var samplePath = FileUtils.ChangeExtension(genPath, ".sample");
            new GwasSampleFormat().WriteToFile(samplePath, data.Individual);

            // Write the .gen file for this chromosome.
            using (var genWriter = new StreamWriter(genPath))
            {
                genWriter.NewLine = Environment.NewLine;

                var chrItems = snpItems.Where(m => m.Chrom == chr).ToList();
                GenomeUtils.SortChromosome(chrItems, m => chr.ToString(), m => m.Position);

                foreach (var snp in chrItems)
                {
                    // Genotypes are fetched through the name map entry for this SNP.
                    var genotypes = reader.Read(nameMap[snp.Name]);
                    var action = actionByName[snp.Name];
                    bool switched = StrandAction.Switch == action || StrandAction.FlipSwitch == action;

                    genWriter.Write("{0} {1} {2} {3} {4}", snp.Chrom, snp.Name, snp.Position, snp.DbsnpRefAllele, snp.DbsnpAltAllele);

                    for (int individualIndex = 0; individualIndex < data.Individual.Count; individualIndex++)
                    {
                        var first = genotypes[0, individualIndex];
                        var second = genotypes[1, individualIndex];

                        string triple;
                        if (PlinkData.IsMissing(first, second))
                        {
                            triple = " 0 0 0";
                        }
                        else
                        {
                            // For Switch / FlipSwitch a set flag maps to the dbSNP alt allele,
                            // otherwise a set flag maps to the dbSNP ref allele.
                            char alle1;
                            char alle2;
                            if (switched)
                            {
                                alle1 = first ? snp.DbsnpAltAllele : snp.DbsnpRefAllele;
                                alle2 = second ? snp.DbsnpAltAllele : snp.DbsnpRefAllele;
                            }
                            else
                            {
                                alle1 = first ? snp.DbsnpRefAllele : snp.DbsnpAltAllele;
                                alle2 = second ? snp.DbsnpRefAllele : snp.DbsnpAltAllele;
                            }

                            if (alle1 != alle2)
                            {
                                triple = " 0 1 0";
                            }
                            else if (alle1 == snp.DbsnpRefAllele)
                            {
                                triple = " 1 0 0";
                            }
                            else
                            {
                                triple = " 0 0 1";
                            }
                        }

                        genWriter.Write(triple);
                    }

                    genWriter.WriteLine();
                }
            }
        }
    }

    return outputs;
}
/// <summary>
/// Reads a whole PLINK binary data set into memory. Individuals come from the
/// ".fam" companion file, loci from the ".bim" companion file, and the
/// haplotype flags from the binary file.
/// </summary>
/// <param name="fileName">Path of the binary file; companion file names are derived by changing its extension.</param>
/// <returns>The populated <see cref="PlinkData"/> instance.</returns>
/// <exception cref="FileNotFoundException">Thrown when the ".fam" or ".bim" file is missing.</exception>
public PlinkData ReadFromFile(string fileName)
{
    var famFile = FileUtils.ChangeExtension(fileName, ".fam");
    if (!File.Exists(famFile))
    {
        throw new FileNotFoundException("File not found: " + famFile);
    }

    var bimFile = FileUtils.ChangeExtension(fileName, ".bim");
    if (!File.Exists(bimFile))
    {
        throw new FileNotFoundException("File not found: " + bimFile);
    }

    var result = new PlinkData();
    result.Individual = PlinkIndividual.ReadFromFile(famFile);
    result.Locus = PlinkLocus.ReadFromBimFile(bimFile);
    result.AllocateDataMemory();

    try
    {
        // Kept inside the try block: if opening or header detection throws,
        // the finally block below still closes whatever reader was created.
        OpenBinaryFile(fileName);

        if (IsSNPMajor)
        {
            // Rows are loci; every byte packs up to four individuals as bit pairs.
            for (int i = 0; i < result.Locus.Count; i++)
            {
                int j = 0;
                while (j < result.Individual.Count)
                {
                    var b = ReadByte();
                    // Stop early on the last byte of a row; its remaining bits are not used.
                    for (int c = 0; c < 8 && j < result.Individual.Count; c += 2, j++)
                    {
                        result.IsHaplotype1Allele2[i, j] = b[c];
                        result.IsHaplotype2Allele2[i, j] = b[c + 1];
                    }
                }
            }
        }
        else
        {
            // Rows are individuals; every byte packs up to four loci as bit pairs.
            for (int i = 0; i < result.Individual.Count; i++)
            {
                int j = 0;
                while (j < result.Locus.Count)
                {
                    var b = ReadByte();
                    for (int c = 0; c < 8 && j < result.Locus.Count; c += 2, j++)
                    {
                        result.IsHaplotype1Allele2[j, i] = b[c];
                        result.IsHaplotype2Allele2[j, i] = b[c + 1];
                    }
                }
            }
        }
    }
    finally
    {
        // Guard against the reader never having been created.
        if (_reader != null)
        {
            _reader.Close();
            _reader = null;
        }
    }

    return result;
}
/// <summary>
/// Loads the ".fam" / ".bim" metadata that accompanies <paramref name="fileName"/>,
/// opens the binary file, inspects its leading bytes to decide which format
/// version and storage order (SNP-major or individual-major) it uses, and
/// records the stream position at which detection finished.
/// </summary>
/// <param name="fileName">Path of the binary file; companion file names are derived from it.</param>
/// <exception cref="FileNotFoundException">The ".fam" or ".bim" companion file does not exist.</exception>
public void OpenBinaryFile(string fileName)
{
    var famFile = FileUtils.ChangeExtension(fileName, ".fam");
    if (!File.Exists(famFile))
    {
        throw new FileNotFoundException("File not found: " + famFile);
    }

    var bimFile = FileUtils.ChangeExtension(fileName, ".bim");
    if (!File.Exists(bimFile))
    {
        throw new FileNotFoundException("File not found: " + bimFile);
    }

    Data = new PlinkData();
    Data.Individual = PlinkIndividual.ReadFromFile(famFile);
    Data.Locus = PlinkLocus.ReadFromBimFile(bimFile);
    Data.BuildMap();

    DoOpenFile(fileName);
    BitArray bits = ReadByte();

    // v1.00 files start with two signature bytes followed by a mode byte.
    bool isVersion1 = false;

    // First signature byte: bits 2, 3, 5 and 6 set, all others clear.
    bool firstSignature = bits[2] && bits[3] && bits[5] && bits[6]
                          && !bits[0] && !bits[1] && !bits[4] && !bits[7];
    if (firstSignature)
    {
        // Second signature byte: bits 0, 1, 3 and 4 set, all others clear.
        bits = ReadByte();
        bool secondSignature = bits[0] && bits[1] && bits[3] && bits[4]
                               && !bits[2] && !bits[5] && !bits[6] && !bits[7];
        if (secondSignature)
        {
            bits = ReadByte();
            IsSNPMajor = bits[0];
            Progress.SetMessage(IsSNPMajor
                ? "Detected that binary PED file is v1.00 SNP-major mode\n"
                : "Detected that binary PED file is v1.00 individual-major mode\n");
            isVersion1 = true;
        }
    }

    if (!isVersion1)
    {
        // Older format: reopen the file and look at its first byte again.
        Progress.SetMessage("Warning, old BED file <v1.00 : will try to recover...\n");
        DoOpenFile(fileName);
        bits = ReadByte();

        bool anyHighBit = bits[1] || bits[2] || bits[3] || bits[4] || bits[5] || bits[6] || bits[7];
        if (anyHighBit)
        {
            // Not a plain mode byte: treat as pre-0.99 and reopen so that no
            // header byte has been consumed.
            Progress.SetMessage(" *** Possible problem: guessing that BED is < v0.99 *** ");
            Progress.SetMessage(" *** High chance of data corruption, spurious results *** ");
            IsSNPMajor = false;
            DoOpenFile(fileName);
        }
        else
        {
            // v0.99: the single leading byte carries the mode in bit 0.
            IsSNPMajor = bits[0];
            Progress.SetMessage("Binary PED file is v0.99\n");
            Progress.SetMessage(IsSNPMajor
                ? "Detected that binary PED file is in SNP-major mode\n"
                : "Detected that binary PED file is in individual-major mode\n");
        }
    }

    _startPosition = _reader.BaseStream.Position;
}
/// <summary>
/// Reads the ".fam" and ".bim" companions of <paramref name="fileName"/> into
/// <c>Data</c>, opens the binary file and works out from its first bytes which
/// format version it is and whether it is stored SNP-major or individual-major.
/// The stream position reached after detection is remembered in <c>_startPosition</c>.
/// </summary>
/// <param name="fileName">Path of the binary file to open.</param>
/// <exception cref="FileNotFoundException">Thrown when the ".fam" or ".bim" file is missing.</exception>
public void OpenBinaryFile(string fileName)
{
    var famFile = FileUtils.ChangeExtension(fileName, ".fam");
    if (!File.Exists(famFile))
    {
        throw new FileNotFoundException("File not found: " + famFile);
    }

    var bimFile = FileUtils.ChangeExtension(fileName, ".bim");
    if (!File.Exists(bimFile))
    {
        throw new FileNotFoundException("File not found: " + bimFile);
    }

    Data = new PlinkData();
    Data.Individual = PlinkIndividual.ReadFromFile(famFile);
    Data.Locus = PlinkLocus.ReadFromBimFile(bimFile);
    Data.BuildMap();

    DoOpenFile(fileName);
    BitArray header = ReadByte();

    // Stays false unless both signature bytes of the v1.00 format are found.
    bool recognisedV1 = false;

    bool signatureOne = (header[2] && header[3] && header[5] && header[6])
                        && !(header[0] || header[1] || header[4] || header[7]);
    if (signatureOne)
    {
        header = ReadByte();
        bool signatureTwo = (header[0] && header[1] && header[3] && header[4])
                            && !(header[2] || header[5] || header[6] || header[7]);
        if (signatureTwo)
        {
            // Third byte: bit 0 selects the storage order.
            header = ReadByte();
            IsSNPMajor = header[0];

            string v1Message;
            if (IsSNPMajor)
            {
                v1Message = "Detected that binary PED file is v1.00 SNP-major mode\n";
            }
            else
            {
                v1Message = "Detected that binary PED file is v1.00 individual-major mode\n";
            }
            Progress.SetMessage(v1Message);

            recognisedV1 = true;
        }
    }

    if (!recognisedV1)
    {
        // Start over from the beginning of the file for the legacy formats.
        Progress.SetMessage("Warning, old BED file <v1.00 : will try to recover...\n");
        DoOpenFile(fileName);
        header = ReadByte();

        bool looksLikeModeByte = !(header[1] || header[2] || header[3] || header[4] || header[5] || header[6] || header[7]);
        if (looksLikeModeByte)
        {
            // v0.99: only bit 0 may be set and it selects the storage order.
            IsSNPMajor = header[0];
            Progress.SetMessage("Binary PED file is v0.99\n");

            string legacyMessage;
            if (IsSNPMajor)
            {
                legacyMessage = "Detected that binary PED file is in SNP-major mode\n";
            }
            else
            {
                legacyMessage = "Detected that binary PED file is in individual-major mode\n";
            }
            Progress.SetMessage(legacyMessage);
        }
        else
        {
            // Pre-0.99 guess: assume individual-major and reopen so the byte
            // just read is available again.
            Progress.SetMessage(" *** Possible problem: guessing that BED is < v0.99 *** ");
            Progress.SetMessage(" *** High chance of data corruption, spurious results *** ");
            IsSNPMajor = false;
            DoOpenFile(fileName);
        }
    }

    _startPosition = _reader.BaseStream.Position;
}