/// <summary>
/// Creates a <c>VCFParser</c> over <c>VcfFile</c>, stores it in <c>Vcp</c>,
/// and asserts that the stored parser is not null.
/// </summary>
public void WhenIOpenTheFile()
{
    var parser = new VCFParser(VcfFile);
    Vcp = parser;

    Assert.IsNotNull(Vcp);
}
/// <summary>
/// Parses the VCF file at <paramref name="fileName"/> into
/// <c>Repository.parsedVariants</c>, timing the parse and announcing an execution report.
/// </summary>
/// <param name="fileName">
/// File to load. It is passed through <c>ValidateFileName</c>, which may replace it
/// via its <c>out</c> argument before parsing.
/// </param>
/// <returns>
/// <c>true</c> when parsing completed; <c>false</c> when the file name is rejected
/// or the parser throws (the error is announced through <c>Herald</c>).
/// </returns>
private bool tmpLoadVCF(string fileName)
{
    bool nameIsValid = ValidateFileName(fileName, out fileName);
    if (!nameIsValid)
    {
        return false;
    }

    _parserSTW.Restart();
    var vcfParser = new VCFParser<Variant, VariantData>(
        source: fileName,
        species: Genomes.HomoSapiens,
        assembly: Assemblies.hg19);

    try
    {
        Repository.parsedVariants = vcfParser.Parse();
    }
    catch (Exception e)
    {
        // A failure on a file that sits in the working directory and carries the
        // log file's extension gets a dedicated message; anything else reports
        // the exception text.
        bool inWorkingDirectory =
            Path.GetDirectoryName(fileName) + Path.DirectorySeparatorChar == _workingDirectory;

        string message;
        if (inWorkingDirectory && Path.GetExtension(fileName) == _logFile)
        {
            message = "The requested extension should not have same extension as the log file.";
        }
        else
        {
            message = string.Format("{0}", e.Message);
        }

        Herald.Announce(Herald.MessageType.Error, message);
        return false;
    }

    _parserSTW.Stop();
    var elapsed = _parserSTW.Elapsed;
    _accumulatedLoadET += elapsed.TotalSeconds;

    var report = new ExecutionReport(Repository.parsedVariants.intervalsCount, _parserSTW.Elapsed);
    Herald.AnnounceExeReport("Loaded", report);
    return true;
}
/// <summary>
/// Applies the single variant from chr_1_one_heterozygous_synonymous.vcf to a one-transcript
/// gene model and checks that the resulting protein has the same base sequence as the
/// unmodified one, is annotated as SILENT and HETEROZYGOUS at 1:69666, and that the
/// annotation appears in the first line of the written FASTA database.
/// </summary>
public void OneTranscriptOneHeterozygousSynonymous()
{
    string testDir = TestContext.CurrentContext.TestDirectory;
    string silentLabel = FunctionalClass.SILENT.ToString();
    const string variantLocus = "1:69666";

    // Load the reference genome and the single-variant VCF.
    Genome genome = new Genome(Path.Combine(testDir, "TestData", "chr1_sample.fa"));
    VCFParser vcf = new VCFParser(Path.Combine(testDir, "TestVcfs", "chr_1_one_heterozygous_synonymous.vcf"));
    List<Variant> variants = vcf.Select(record => new Variant(null, record, genome)).ToList();
    Assert.AreEqual(1, variants.Count);

    // Translate once without variants, then again with the variant applied.
    GeneModel geneModel = new GeneModel(genome, Path.Combine(testDir, "TestData", "chr1_one_transcript.gtf"));
    List<Protein> proteins_wo_variant = geneModel.Translate(true).ToList();
    List<Transcript> transcripts = geneModel.ApplyVariants(variants);
    List<Protein> proteins = transcripts.Select(transcript => transcript.Protein()).ToList();

    Assert.AreEqual(1, geneModel.Genes.Count);
    Assert.AreEqual(1, proteins.Count);
    Assert.AreEqual(1, proteins_wo_variant.Count);

    // Both sequences collapse to a single set entry when they are identical.
    var distinctSequences = new HashSet<string>
    {
        proteins[0].BaseSequence,
        proteins_wo_variant[0].BaseSequence
    };
    Assert.AreEqual(1, distinctSequences.Count);

    Assert.IsTrue(proteins.Any(p => p.FullName.Contains(silentLabel))); // synonymous
    Assert.IsTrue(proteins.Any(p => p.FullName.Contains(GenotypeType.HETEROZYGOUS.ToString()))); // synonymous
    Assert.IsTrue(proteins.Any(p => p.FullName.Contains(variantLocus)));

    // Round-trip through the FASTA writer and inspect the header line.
    string proteinFasta = Path.Combine(testDir, "TestVcfs", "chr_1_one_heterozygous_synonymous.fasta");
    ProteinDbWriter.WriteFastaDatabase(proteins, proteinFasta, " ");
    string[] proteinFastaLines = File.ReadLines(proteinFasta).ToArray();
    string fastaHeader = proteinFastaLines[0];
    Assert.IsTrue(fastaHeader.Contains(silentLabel)); // synonymous
    Assert.IsTrue(fastaHeader.Contains(variantLocus));
}
/// <summary>
/// Smoke test: applies the single-base substitutions from chr19problematic.vcf to the
/// chr19variantTranscript.gff3 gene model and builds a protein for every resulting
/// transcript. It contains no assertions; it only has to complete without throwing.
/// </summary>
public void Chr19VariantTranscript()
{
    string testDir = TestContext.CurrentContext.TestDirectory;
    string problemDir = Path.Combine(testDir, "ProblematicChr19");

    Genome genome = new Genome(Path.Combine(testDir, "Homo_sapiens.GRCh38.dna.chromosome.19.fa"));
    GeneModel geneModel = new GeneModel(genome, Path.Combine(problemDir, "chr19variantTranscript.gff3"));

    // Keep only variants where both the reference and second allele are one base long.
    var vcf = new VCFParser(Path.Combine(problemDir, "chr19problematic.vcf"));
    var variants = vcf
        .Select(record => new Variant(null, record, genome.Chromosomes[0]))
        .Where(variant => variant.SecondAlleleString.Length == 1 && variant.ReferenceAlleleString.Length == 1)
        .ToList();

    List<Transcript> transcripts = geneModel.ApplyVariants(variants).ToList();
    List<Protein> proteins = transcripts.Select(transcript => transcript.Protein(null)).ToList();
}
/// <summary>
/// Checks that the VCF fixture exists, reads its first record and that record's genotypes,
/// then enumerates the parser again and prints how many records that pass yielded.
/// </summary>
/// <param name="fname">Path of the VCF file to parse.</param>
public void ParseVcf([Values("testData/NA12878.knowledgebase.snapshot.20131119.b37.vcf.gz")] string fname)
{
    Assert.IsTrue(File.Exists(fname));

    var parser = new VCFParser(fname);

    // Touch the first record and its genotypes; the values themselves are not checked.
    var firstRecord = parser.First();
    var firstGenotypes = firstRecord.Genotypes;

    int recordCount = parser.Select(record => record.NoCallCount).Count();
    Console.WriteLine("Count: {0}", recordCount);
}
/// <summary>
/// Times a pass over the NA12878 knowledgebase VCF: reads the first record and its
/// genotypes, enumerates the parser, and prints the elapsed time.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // fname is not declared in this method; it is assumed to be a member declared elsewhere.
    fname = "testData/NA12878.knowledgebase.snapshot.20131119.b37.vcf.gz";

    var timer = new System.Diagnostics.Stopwatch();
    timer.Start();

    var parser = new VCFParser(fname);
    var firstRecord = parser.First();
    var firstGenotypes = firstRecord.Genotypes;
    var recordCount = parser.Select(record => record.NoCallCount).Count();

    timer.Stop();
    Console.WriteLine(timer.Elapsed.ToString());
}
/// <summary>
/// Times a pass over the NA12878 knowledgebase VCF, prints the genotype count of the first
/// record together with the number of records enumerated, prints the elapsed time, and
/// waits for a key press before exiting.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main (string[] args)
{
    const string vcfPath = "testData/NA12878.knowledgebase.snapshot.20131119.b37.vcf.gz";

    var timer = new System.Diagnostics.Stopwatch();
    timer.Start();

    VCFParser parser = new VCFParser(vcfPath);
    var firstRecord = parser.First();
    var genotypeCount = firstRecord.Genotypes.Count;
    var recordCount = parser.Select(record => record.NoCallCount).Count();

    timer.Stop();

    Console.WriteLine("Count: {0}/{1}", genotypeCount, recordCount);
    Console.WriteLine("Elapsed: {0}", timer.Elapsed);

    // Keep the console window open until the user presses a key.
    Console.ReadKey();
}
/// <summary>
/// Builds a one-codon transcript ("AAA"), applies the variant from ugh.vcf, and checks that
/// the variant transcript reads "AGA", translates to "R" instead of "K", and carries a
/// MISSENSE annotation.
/// </summary>
public void TestMissenseMutation()
{
    // Build a minimal gene -> transcript -> exon/CDS hierarchy over a three-base sequence.
    byte[] bases = "AAA".Select(cc => (byte)cc).ToArray();
    Sequence seq = new Sequence(Alphabets.DNA, bases, false) { ID = "1" };
    Chromosome chromosome = new Chromosome(seq, null);
    Gene gene = new Gene("", chromosome, "", "+", 1, 3, null);
    Transcript t = new Transcript("", gene, "", "+", 1, 3, "", null, null);
    Exon exon = new Exon(t, seq, "", 1, 3, seq.ID, "+", null, null);
    t.Exons = new List<Exon> { exon };
    CDS codingRegion = new CDS(t, seq.ID, "", "+", 1, 3, null, 0);
    t.CodingDomainSequences = new List<CDS> { codingRegion };

    // ugh.vcf holds a homozygous variation that should change the codon from AAA to AGA,
    // which code for K and R respectively:
    // # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample
    // 1 2 . A G 64.77 . info GT:AD:DP:GQ:PL 1/1:2,3:5:69:93,0,69
    string vcfPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestVcfs", "ugh.vcf");
    List<Variant> variants = new VCFParser(vcfPath)
        .Select(v => new Variant(null, v, new Chromosome(seq, null)))
        .ToList();

    // Attach the variants and expand the transcript into its variant forms.
    t.Variants = new HashSet<Variant>(variants);
    List<Transcript> variantTranscripts = GeneModel.ApplyVariantsCombinitorially(t);

    // The original transcript is unchanged ...
    Assert.AreEqual("AAA", SequenceExtensions.ConvertToString(t.Exons[0].Sequence));
    Assert.AreEqual("K", t.Protein().BaseSequence);

    // ... while the first variant transcript carries the substitution.
    Transcript mutated = variantTranscripts[0];
    Assert.AreEqual("AGA", SequenceExtensions.ConvertToString(mutated.Exons[0].Sequence));
    Assert.AreEqual("R", variantTranscripts[0].Protein().BaseSequence);

    // Make sure it gets annotated as a missense mutation.
    string missenseLabel = FunctionalClass.MISSENSE.ToString();
    Assert.IsTrue(variantTranscripts[0].VariantAnnotations.Any(annotation => annotation.Contains(missenseLabel)));
}
/// <summary>
/// Loads a VCF file: registers the patient named in the "individual-id" header, records every
/// header line as collection-level information, and stores each variant together with its
/// attributes, filters, genotypes and quality.
/// </summary>
/// <param name="filePath">Path of the VCF file to load.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when the "individual-id" header lacks one of the required fields
/// (Dbxref, First_name, Last_name, DOB), when Dbxref is not of the form "a:b",
/// or when the "reference" or "fileDate" header is absent.
/// </exception>
public override void LoadData(string filePath)
{
    var vcfParser = new VCFParser(filePath);
    var header = vcfParser.Header;
    var collectionInformationList = new List<patient_variant_information>();
    var patient = new patient();

    // We pull out all of the metadata from the header (all lines) and write them as information
    // lines associated with this result.
    foreach (var headerItem in header.MetaDataInInputOrder)
    {
        if (headerItem.Key == "individual-id")
        {
            var individualParts = headerItem.Value.Replace("<", "").Replace(">", "").Split(new char[] { ',' });
            var individualData = individualParts.Select(x => x.Split(new char[] { '=' })).ToArray();

            // Looks up one key=value pair and fails with a descriptive message instead of a
            // NullReferenceException / IndexOutOfRangeException when it is missing.
            Func<string, string> requireField = fieldName =>
            {
                var pair = individualData.FirstOrDefault(x => x[0] == fieldName);
                if (pair == null || pair.Length < 2)
                {
                    throw new InvalidOperationException(
                        string.Format("The VCF 'individual-id' header has no value for '{0}'.", fieldName));
                }

                return pair[1];
            };

            var mrnParts = requireField("Dbxref").Split(':');
            if (mrnParts.Length < 2)
            {
                throw new InvalidOperationException(
                    "The VCF 'individual-id' header has a 'Dbxref' value that is not of the form 'a:b'.");
            }

            // NOTE(review): DateTime.Parse uses the current culture; confirm the DOB format
            // written by the producers of these files before pinning a culture here.
            patient = patientRepo.AddPatient(
                mrnParts[1],
                mrnParts[0],
                requireField("First_name"),
                requireField("Last_name"),
                DateTime.Parse(requireField("DOB")));
        }
        else if (headerItem.GetType() == typeof(VCFInfoHeaderLine))
        {
            // The exact-type test above makes the direct cast safe.
            var info = (VCFInfoHeaderLine)headerItem;
            collectionInformationList.Add(AddHeaderInformation(
                string.Format("VCF:{0}", headerItem.Key),
                CleanHeaderValue("INFO", info.ToString())));
        }
        else if (headerItem.GetType() == typeof(VCFFilterHeaderLine))
        {
            var filter = (VCFFilterHeaderLine)headerItem;
            collectionInformationList.Add(AddHeaderInformation(
                string.Format("VCF:{0}", headerItem.Key),
                CleanHeaderValue("FILTER", filter.ToString())));
        }
        else if (headerItem.GetType() == typeof(VCFFormatHeaderLine))
        {
            var format = (VCFFormatHeaderLine)headerItem;
            collectionInformationList.Add(AddHeaderInformation(
                string.Format("VCF:{0}", headerItem.Key),
                CleanHeaderValue("FORMAT", format.ToString())));
        }
        else
        {
            collectionInformationList.Add(AddHeaderInformation(
                string.Format("VCF:{0}", headerItem.Key),
                headerItem.Value));
        }
    }

    // Both headers are required: First() throws InvalidOperationException when one is absent.
    var reference = header.MetaDataInInputOrder.First(x => x.Key == "reference").Value;
    DateTime? resultDate = DateTime.ParseExact(
        header.MetaDataInInputOrder.First(x => x.Key == "fileDate").Value,
        "yyyyMMdd",
        CultureInfo.InvariantCulture,
        DateTimeStyles.None);

    var patientVariants = new List<patient_variants>();
    var featureInformationList = new Dictionary<patient_variants, List<patient_variant_information>>();

    while (vcfParser.MoveNext())
    {
        var current = vcfParser.Current;
        var variant = variantRepo.AddVariant(
            null, current.ID, "dbSNP", current.Chr, current.Start, current.End, reference, current.Reference.BaseString);

        var patientVariant = new patient_variants()
        {
            patient_id = patient.id,
            reference_id = variant.id,
            resulted_on = resultDate,
            variant_type = Enums.PatientVariantType.SNP
        };
        SetVariantValues(patientVariant, current);
        patientVariants.Add(patientVariant);

        var attributeList = new List<patient_variant_information>();
        foreach (var attribute in current.Attributes)
        {
            attributeList.Add(AddVariantInformation(string.Format("VCF:{0}", attribute.Key), attribute.Value.ToString()));
        }

        if (current.FiltersMaybeNull != null)
        {
            foreach (var filter in current.FiltersMaybeNull)
            {
                attributeList.Add(AddVariantInformation("VCF:Filter", filter));
            }
        }

        foreach (var genotype in current.Genotypes)
        {
            attributeList.Add(AddVariantInformation("VCF:Genotype", genotype.ToMHGRString()));
        }

        attributeList.Add(AddVariantInformation("VCF:Quality", current.PhredScaledQual.ToString()));
        // In addition to the per-filter rows above, the filters are stored once more as a
        // single comma-joined row.
        attributeList.Add(AddVariantInformation("VCF:Filter", string.Join(",", current.Filters.ToArray())));

        featureInformationList.Add(patientVariant, attributeList);
    }

    // Save the collection to get its ID
    var source = sourceRepo.AddSource("VCF", "VCF file");
    var file = AddResultFile(filePath, source);
    var collection = patientRepo.AddCollection(patient, file);

    // Save the collection-level header data
    collectionInformationList.ForEach(x => x.item_id = collection.id);
    variantRepo.AddPatientVariantInformationList(collectionInformationList);
    variantRepo.AddPatientVariants(patientVariants);

    // Save the individual attributes associated with each feature.
    // Must be done after the patient variants are written to DB (above), since we
    // rely on the ID being set.
    foreach (var pair in featureInformationList)
    {
        foreach (var attribute in pair.Value)
        {
            attribute.item_id = pair.Key.id;
        }

        variantRepo.AddPatientVariantInformationList(pair.Value);
    }

    variantRepo.AddPatientVariantsToCollection(collection, patientVariants);

    featureInformationList.Clear();
    collectionInformationList.Clear();
    patientVariants.Clear();
}
public void test() { VCFParser parser = new VCFParser(Path.Combine(TestContext.CurrentContext.TestDirectory, @"testData", @"NA12878.knowledgebase.snapshot.20131119.b37.vcf.gz")); List <VariantContext> context = parser.Select(x => x).ToList(); }
/// <summary>
/// Loads a VCF file as a tree of <c>result_entities</c>: a root entity for the file, child
/// entities for every header line, and per-variant entities for each record, then hands the
/// whole set to <c>entityRepo.AddVCF</c>.
/// </summary>
/// <param name="filePath">Path of the VCF file to load.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when the "individual-id" header lacks one of the required fields
/// (Dbxref, First_name, Last_name, DOB) or when Dbxref is not of the form "a:b".
/// </exception>
public override void LoadData(string filePath)
{
    var vcfParser = new VCFParser(filePath);
    var header = vcfParser.Header;
    var patient = new patient();
    var source = sourceRepo.AddSource("VCF", "VCF file");
    var file = AddResultFile(filePath, source);

    // Process the file-level pragmas
    result_entities rootEntity = new result_entities()
    {
        attribute_id = EntityRepository.GetAttribute(null, null, "Variant Call Format result", null).id,
        result_file_id = file.id
    };

    // We pull out all of the metadata from the header (all lines) and write them as information
    // lines associated with this result. Header entities are created with patient id 0 and
    // patched after the loop, because the patient is only known once the "individual-id"
    // header has been seen.
    var headerEntities = new List<result_entities>();
    foreach (var headerItem in header.MetaDataInInputOrder)
    {
        if (headerItem.Key == "individual-id")
        {
            var individualParts = headerItem.Value.Replace("<", "").Replace(">", "").Split(new char[] { ',' });
            var individualData = individualParts.Select(x => x.Split(new char[] { '=' })).ToArray();

            // Looks up one key=value pair and fails with a descriptive message instead of a
            // NullReferenceException / IndexOutOfRangeException when it is missing.
            Func<string, string> requireField = fieldName =>
            {
                var pair = individualData.FirstOrDefault(x => x[0] == fieldName);
                if (pair == null || pair.Length < 2)
                {
                    throw new InvalidOperationException(
                        string.Format("The VCF 'individual-id' header has no value for '{0}'.", fieldName));
                }

                return pair[1];
            };

            var mrnParts = requireField("Dbxref").Split(':');
            if (mrnParts.Length < 2)
            {
                throw new InvalidOperationException(
                    "The VCF 'individual-id' header has a 'Dbxref' value that is not of the form 'a:b'.");
            }

            // NOTE(review): DateTime.Parse uses the current culture; confirm the DOB format
            // written by the producers of these files before pinning a culture here.
            patient = patientRepo.AddPatient(
                mrnParts[1],
                mrnParts[0],
                requireField("First_name"),
                requireField("Last_name"),
                DateTime.Parse(requireField("DOB")));
        }
        else if (headerItem.Key == "fileDate")
        {
            DateTime resultDate = DateTime.ParseExact(headerItem.Value, "yyyyMMdd", CultureInfo.InvariantCulture, DateTimeStyles.None);
            // NOTE(review): ToShortDateString formats with the current culture, so the stored
            // text depends on the machine's regional settings.
            headerEntities.Add(CreateEntityAttribute("Resulted on", 0, file.id, rootEntity, resultDate.ToShortDateString()));
        }
        else if (headerItem.GetType() == typeof(VCFInfoHeaderLine))
        {
            // The exact-type test above makes the direct cast safe.
            var info = (VCFInfoHeaderLine)headerItem;
            var infoEntity = CreateEntityAttribute("INFO", 0, file.id, rootEntity, null);
            headerEntities.Add(infoEntity);
            headerEntities.Add(CreateEntityAttribute("ID", 0, file.id, infoEntity, info.ID));
            headerEntities.Add(CreateEntityAttribute("Number", 0, file.id, infoEntity, info.CountType.ToString()));
            headerEntities.Add(CreateEntityAttribute("Type", 0, file.id, infoEntity, info.Type.ToString()));
            headerEntities.Add(CreateEntityAttribute("Description", 0, file.id, infoEntity, info.Description));
        }
        else if (headerItem.GetType() == typeof(VCFFilterHeaderLine))
        {
            var filter = (VCFFilterHeaderLine)headerItem;
            var filterEntity = CreateEntityAttribute("FILTER", 0, file.id, rootEntity, null);
            headerEntities.Add(filterEntity);
            foreach (var field in filter.GenericFields())
            {
                headerEntities.Add(CreateEntityAttribute(field.Key, 0, file.id, filterEntity, field.Value));
            }
        }
        else if (headerItem.GetType() == typeof(VCFFormatHeaderLine))
        {
            var format = (VCFFormatHeaderLine)headerItem;
            var formatEntity = CreateEntityAttribute("FORMAT", 0, file.id, rootEntity, null);
            headerEntities.Add(formatEntity);
            headerEntities.Add(CreateEntityAttribute("ID", 0, file.id, formatEntity, format.ID));
            headerEntities.Add(CreateEntityAttribute("Number", 0, file.id, formatEntity, format.CountType.ToString()));
            headerEntities.Add(CreateEntityAttribute("Type", 0, file.id, formatEntity, format.Type.ToString()));
            headerEntities.Add(CreateEntityAttribute("Description", 0, file.id, formatEntity, format.Description));
        }
        else
        {
            var headerEntity = CreateEntityAttribute(headerItem.Key, 0, file.id, rootEntity, headerItem.Value);
            headerEntities.Add(headerEntity);
        }
    }

    // Now that the patient is known, stamp it on the root and on every header entity.
    rootEntity.patient_id = patient.id;
    headerEntities.ForEach(x => x.patient_id = patient.id);

    var variantEntities = new List<result_entities>();
    while (vcfParser.MoveNext())
    {
        var current = vcfParser.Current;

        // One container entity per VCF record, parented to the file's root entity.
        result_entities variantEntity = new result_entities()
        {
            attribute_id = EntityRepository.GetAttribute(null, null, "Variant Call Format variant", null).id,
            result_file_id = file.id,
            patient_id = patient.id,
            parent = rootEntity
        };
        variantEntities.Add(variantEntity);

        // The dbSNP identifier entity hangs off the container.
        result_entities snpEntity = new result_entities()
        {
            attribute_id = EntityRepository.GetAttribute(current.ID, "dbSNP", null, null).id,
            result_file_id = file.id,
            patient_id = patient.id,
            parent = variantEntity
        };
        variantEntities.Add(snpEntity);

        SetVariantValues(current, patient.id, file.id, snpEntity, variantEntities);

        variantEntities.Add(CreateEntityAttribute("Chromosome", patient.id, file.id, variantEntity, current.Chr));
        variantEntities.Add(CreateEntityAttribute("Start position", patient.id, file.id, variantEntity, current.Start.ToString()));
        variantEntities.Add(CreateEntityAttribute("End position", patient.id, file.id, variantEntity, current.End.ToString()));
        variantEntities.Add(CreateEntityAttribute("Reference base", patient.id, file.id, variantEntity, current.Reference.BaseString));
        variantEntities.Add(CreateEntityAttribute("Quality", patient.id, file.id, variantEntity, current.PhredScaledQual.ToString()));

        foreach (var attr in current.Attributes)
        {
            variantEntities.Add(CreateEntityAttribute(string.Format("INFO:{0}", attr.Key), patient.id, file.id, variantEntity, attr.Value.ToString()));
        }

        if (current.FiltersMaybeNull != null)
        {
            foreach (var filter in current.FiltersMaybeNull)
            {
                variantEntities.Add(CreateEntityAttribute(string.Format("FILTER:{0}", filter), patient.id, file.id, variantEntity, string.Empty));
            }
        }

        //foreach (var genotype in current.Genotypes)
        //{
        //    attributeList.Add(AddVariantInformation("VCF:Genotype", genotype.ToMHGRString()));
        //}
        //attributeList.Add(AddVariantInformation("VCF:Quality", current.PhredScaledQual.ToString()));
        //attributeList.Add(AddVariantInformation("VCF:Filter", string.Join(",", current.Filters.ToArray())));
        //featureInformationList.Add(patientVariant, attributeList);
    }

    entityRepo.AddVCF(rootEntity, headerEntities, variantEntities);
}