/// <summary>
/// Gathers the alleles that the locus comparer reports as co-located with
/// <paramref name="CurrentVariant"/>. Any pending backlog is consumed first; after that,
/// records are pulled from <paramref name="Reader"/> one at a time.
/// </summary>
/// <param name="Reader">Source of further VCF records.</param>
/// <param name="CurrentVariant">Allele whose locus the incoming records are compared against.</param>
/// <param name="mFirst">Forwarded unchanged to <c>AlleleCompareByLoci.OrderAlleles</c>.</param>
/// <param name="BackLogExists">
/// On entry: whether <paramref name="TheBackLog"/> holds alleles that have not been processed yet.
/// On exit: whether a block positioned after the current locus was stored there.
/// </param>
/// <param name="TheBackLog">Alleles read ahead but not yet consumed; set to null on exit when nothing is pending.</param>
/// <returns>The co-located alleles in the order they were encountered (possibly empty).</returns>
/// <exception cref="InvalidDataException">The comparer returned a value other than 0 or -1.</exception>
private static List<CalledAllele> AssembleColocatedList(
    VcfReader Reader,
    CalledAllele CurrentVariant,
    bool mFirst,
    ref bool BackLogExists,
    ref List<CalledAllele> TheBackLog)
{
    var colocated = new List<CalledAllele>();
    var keepReading = true;

    while (keepReading)
    {
        List<CalledAllele> candidates;

        if (BackLogExists)
        {
            // Use up the block left over from the previous call before touching the reader.
            candidates = TheBackLog;
            BackLogExists = false;
        }
        else
        {
            var rawRecord = new VcfVariant();
            keepReading = Reader.GetNextVariant(rawRecord);
            if (!keepReading)
            {
                break;
            }

            candidates = VcfVariantUtilities.Convert(new List<VcfVariant> { rawRecord }).ToList();
        }

        // Only the first allele of the block is compared: 0 means co-located,
        // -1 means the current variant comes first.
        int order = AlleleCompareByLoci.OrderAlleles(CurrentVariant, candidates.First(), mFirst);

        if (order == 0)
        {
            colocated.AddRange(candidates);
        }
        else if (order == -1)
        {
            // The block lies beyond the current locus: park it for the next call and stop.
            TheBackLog = candidates;
            keepReading = false;
            BackLogExists = true;
        }
        else
        {
            throw new InvalidDataException("Vcf needs to be ordered.");
        }
    }

    if (!BackLogExists)
    {
        TheBackLog = null;
    }

    return colocated;
}
/// <summary>
/// Computes the fraction of observations that belong to one of the two alleles of
/// <paramref name="variant"/>. The reference count is the numerator when
/// <c>BAllelePreference</c> ranks the reference allele strictly lower than the first
/// alternate allele; otherwise the variant count is.
/// </summary>
/// <param name="variant">Record supplying the reference allele and the first alternate allele.</param>
/// <param name="referenceCount">Observations of the reference allele.</param>
/// <param name="variantCount">Observations of the alternate allele.</param>
/// <returns>
/// The frequency, or null when the two counts sum to less than one or when either allele is ".".
/// </returns>
public static double? GetBAlleleFrequency(VcfVariant variant, int referenceCount, int variantCount)
{
    double totalAlleleCount = referenceCount + variantCount;
    if (totalAlleleCount < 1)
    {
        return null;
    }

    // The alternate allele is only looked at once the reference allele is known not to be ".".
    string refAllele = variant.ReferenceAllele;
    if (refAllele.Equals("."))
    {
        return null;
    }

    string altAllele = variant.VariantAlleles[0];
    if (altAllele.Equals("."))
    {
        return null;
    }

    bool useReferenceCount = BAllelePreference(refAllele) < BAllelePreference(altAllele);
    int numerator = useReferenceCount ? referenceCount : variantCount;
    return numerator / totalAlleleCount;
}
/// <summary>
/// Tells whether two records share chromosome name, position, reference allele
/// and first alternate allele.
/// </summary>
private static bool VariantsMatch(VcfVariant variant1, VcfVariant variant2)
{
    if (variant1.ReferenceName != variant2.ReferenceName)
    {
        return false;
    }

    if (variant1.ReferencePosition != variant2.ReferencePosition)
    {
        return false;
    }

    if (variant1.ReferenceAllele != variant2.ReferenceAllele)
    {
        return false;
    }

    // Only the first alternate allele of each record takes part in the comparison.
    return variant1.VariantAlleles.First() == variant2.VariantAlleles.First();
}
/// <summary>
/// Extracts the two numbers needed to recompute a quality score: the total depth from the
/// INFO "DP" tag and the call count from the second value of the first sample's "AD" tag.
/// </summary>
/// <param name="originalVar">Record to inspect.</param>
/// <param name="depth">Parsed DP value on success; -1 otherwise.</param>
/// <param name="callCount">Parsed second AD value on success; -1 otherwise.</param>
/// <returns>
/// True only when BOTH values were found and parsed. Previously a record with a valid DP but a
/// missing or non-two-valued AD returned true with <paramref name="callCount"/> left at -1.
/// </returns>
public static bool HaveInfoToUpdateQ(VcfVariant originalVar, out int depth, out int callCount)
{
    depth = -1;
    callCount = -1;

    if ((originalVar.InfoFields == null) || (originalVar.Genotypes == null) || (originalVar.Genotypes.Count < 1))
    {
        return false;
    }

    // Guard against a null entry for the first sample instead of dereferencing it blindly.
    var firstSample = originalVar.Genotypes[0];
    if (firstSample == null)
    {
        return false;
    }

    string depthText;
    if (!originalVar.InfoFields.TryGetValue("DP", out depthText))
    {
        return false;
    }

    string alleleDepthText;
    if (!firstSample.TryGetValue("AD", out alleleDepthText) || alleleDepthText == null)
    {
        return false;
    }

    // AD is expected as "refCount,altCount"; any other arity cannot be interpreted here.
    string[] alleleDepths = alleleDepthText.Split(',');
    if (alleleDepths.Length != 2)
    {
        return false;
    }

    int parsedDepth;
    int parsedCallCount;
    if (!int.TryParse(depthText, out parsedDepth) || !int.TryParse(alleleDepths[1], out parsedCallCount))
    {
        return false;
    }

    // Publish the values only when both parsed, so callers never see a half-filled result.
    depth = parsedDepth;
    callCount = parsedCallCount;
    return true;
}
public void OverlapWorks_DupDel([NotNull] string truthVar, [NotNull] string queryVar, string type, bool isTp)
{
    // Builds one truth and one query variant from raw VCF lines, runs the overlap step and
    // checks the decision each side ends up with after Finalize.
    const string sampleName = "blah";
    const bool isCrossTypeOn = true;

    var parserSettings = VcfVariantParserSettings.Create(new List<string> { sampleName });
    var parsedTruth = VcfVariant.TryParse(truthVar, parserSettings).GetOrThrow();

    var requestedType = WittyerType.Parse(type);
    var inputSpecs = InputSpec
        .GenerateCustomInputSpecs(!isCrossTypeOn, new[] { requestedType }, percentDistance: PercentDistance)
        .ToDictionary(spec => spec.VariantType, spec => spec);

    // Both CreateVariant calls share these two collections.
    var bndLookup = new Dictionary<IGeneralBnd, IVcfVariant>();
    var errors = new List<string>();

    var truthV = (IMutableWittyerSimpleVariant)WittyerVcfReader.CreateVariant(
        parsedTruth, parsedTruth.Samples.First().Value, true, sampleName,
        inputSpecs, bndLookup, errors, isCrossTypeOn);

    var parsedQuery = VcfVariant.TryParse(queryVar, parserSettings).GetOrThrow();
    var queryV = (IMutableWittyerVariant)WittyerVcfReader.CreateVariant(
        parsedQuery, parsedQuery.Samples.First().Value, false, sampleName,
        inputSpecs, bndLookup, errors, isCrossTypeOn);

    var forest = TruthForest.Create(sampleName, VcfHeader.CreateBuilder(VcfVersion.FourPointOne).Build());
    forest.AddTarget(truthV);

    OverlappingUtils.DoOverlapping(
        forest.VariantTrees, queryV, OverlappingUtils.IsVariantAlleleMatch, isCrossTypeOn, true);

    queryV.Finalize(WitDecision.FalsePositive, EvaluationMode.CrossTypeAndSimpleCounting, null);
    truthV.Finalize(WitDecision.FalseNegative, EvaluationMode.CrossTypeAndSimpleCounting, null);

    var expectedQueryDecision = isTp ? WitDecision.TruePositive : WitDecision.FalsePositive;
    var expectedTruthDecision = isTp ? WitDecision.TruePositive : WitDecision.FalseNegative;
    Assert.Equal(expectedQueryDecision, queryV.Sample.Wit);
    Assert.Equal(expectedTruthDecision, truthV.Sample.Wit);
}
/// <summary>
/// Reads the next record from <paramref name="vr"/> and checks that its two variant types
/// equal the expected pair.
/// </summary>
/// <param name="vr">Reader positioned before the record to check.</param>
/// <param name="type1">Expected value of <c>VarType1</c>.</param>
/// <param name="type2">Expected value of <c>VarType2</c>.</param>
/// <returns>
/// True when a record was read and both types match. False when the reader had nothing left;
/// previously that case compared the types of an unpopulated variant.
/// </returns>
private bool TestVariant(VcfReader vr, VariantType type1, VariantType type2)
{
    var testVar = new VcfVariant();
    if (!vr.GetNextVariant(testVar))
    {
        return false;
    }

    return (testVar.VarType1 == type1) && (testVar.VarType2 == type2);
}
/// <summary>
/// Creates a comparison entry for <paramref name="variant"/>, recording which of the two
/// inputs contained it and starting with an empty result table.
/// </summary>
/// <param name="variant">The variant this entry describes.</param>
/// <param name="inBaseline">Stored in <c>InBaseline</c>.</param>
/// <param name="inTest">Stored in <c>InTest</c>.</param>
public Comparison(VcfVariant variant, bool inBaseline, bool inTest)
{
    this.ComparisonResults = new Dictionary<string, ComparisonResult>();
    this.Variant = variant;
    this.InBaseline = inBaseline;
    this.InTest = inTest;
}
public void GenerateVcfStrings_IncludeHeaders()
{
    // Verifies how many header lines GenerateVcfStrings emits for a result that holds a
    // single intra-chromosome breakend pair.
    if (MiscUtils.IsRunningAnyLinux)
    {
        return; // currently failing on linux :(
    }

    var parserSettings = VcfVariantParserSettings.Create(ImmutableList.Create(SampleName));
    var breakends = VcfVariant.TryParse(Bnd1, parserSettings)
        .FollowedBy(VcfVariant.TryParse(Bnd2, parserSettings))
        .EnumerateSuccesses()
        .ToList();

    var firstBreakend = breakends[0];
    var firstSample = firstBreakend
        .ToTryOfGenotypedVcfVariant(VariantNormalizer.TrimCommonBases)
        .GetOrThrow()
        .Samples.Values.First();
    var wittyerBnd = WittyerBndInternal.Create(
        firstBreakend, firstSample, WittyerType.IntraChromosomeBreakend,
        new List<uint>(), uint.MinValue, null, breakends[1]);

    var header = VcfHeader.CreateBuilder(VcfVersion.FourPointOne).Build();
    var contigs = breakends.Select(v => v.Contig).Distinct().ToList();
    var noVariants = new Dictionary<WittyerType, IReadOnlyList<IWittyerVariant>>();
    var bndsByType = new Dictionary<WittyerType, IReadOnlyList<IWittyerBnd>>
    {
        { WittyerType.IntraChromosomeBreakend, new List<IWittyerBnd> { wittyerBnd } }
    };
    var result = WittyerResult.Create(
        header, SampleName, contigs, false, noVariants, bndsByType, new List<IVcfVariant>());

    // Keep only the leading run of lines that carry the header prefix.
    var headerLines = WittyerVcfWriter.GenerateVcfStrings(result, null, null)
        .TakeWhile(line => line.StartsWith(VcfConstants.Header.Prefix))
        .ToList();

    // 11 = VcfVersion, WHO, WHAT, WHERE, WHY, WIT, WIN, WOW, date, version, column names
    Assert.Equal(11, headerLines.Count);
}
/// <summary>
/// Feeds every record of <paramref name="vcfIn"/> to a <c>MutationCounter</c> whose writer
/// targets a ".counts" file in <paramref name="outDir"/>.
/// </summary>
/// <param name="vcfIn">Path of the VCF to read.</param>
/// <param name="outDir">Directory that receives the counts file.</param>
/// <returns>
/// The counts path: the input file name with every ".vcf" occurrence replaced by ".counts",
/// combined with <paramref name="outDir"/>.
/// </returns>
/// <remarks>A failure while adding a record is logged and then rethrown unchanged.</remarks>
public static string WriteCountsFile(string vcfIn, string outDir)
{
    // A single VcfVariant instance is refilled by the reader for every record.
    var record = new VcfVariant();
    string countsFileName = Path.GetFileName(vcfIn).Replace(".vcf", ".counts");
    string countsPath = Path.Combine(outDir, countsFileName);
    var counter = new MutationCounter();

    using (var reader = new VcfReader(vcfIn))
    {
        counter.StartWriter(countsPath);

        while (reader.GetNextVariant(record))
        {
            try
            {
                counter.Add(record);
            }
            catch (Exception ex)
            {
                string message = string.Format(
                    "Fatal error processing vcf; Check {0}, position {1}.  Exception: {2}",
                    record.ReferenceName, record.ReferencePosition, ex);
                Logger.WriteToLog(message);
                throw;
            }
        }

        counter.CloseFalseCallsWriter();
    }

    return countsPath;
}
public static void WittyerVariantIntervalCorrect([NotNull] string variant, uint start, uint end, uint posStart, uint posEnd, uint endStart, uint endEnd)
{
    // Parses a VCF line for the "tumor" sample, builds the Wittyer variant and checks its
    // main interval and its two confidence intervals against the expected coordinates.
    const string sampleName = "tumor";

    var parserSettings = VcfVariantParserSettings.Create(
        ImmutableList.Create("normal", sampleName), GenomeAssembly.Hg38);
    var parsed = VcfVariant.TryParse(variant, parserSettings).GetOrThrowDebug();

    // Only the out value is needed here; the return value is ignored.
    WittyerType.ParseFromVariant(parsed, false, sampleName, out var svType);
    if (svType == null)
    {
        throw new NotSupportedException("This test does not handle svType null");
    }

    var wittyerVariant = WittyerVariantInternal.Create(
        parsed, parsed.Samples[sampleName], svType, Bins, PercentDistance, BasepairDistance);

    var expectedStart = ContigAndInterval.Create(parsed.Contig, start, end);
    var expectedPos = BedInterval.Create(posStart, posEnd);
    var expectedEnd = BedInterval.Create(endStart, endEnd);

    // MultiAssert collects the three comparisons and reports them together in AssertAll.
    MultiAssert.Equal(expectedStart, wittyerVariant);
    MultiAssert.Equal(expectedPos, wittyerVariant.CiPosInterval);
    MultiAssert.Equal(expectedEnd, wittyerVariant.CiEndInterval);
    MultiAssert.AssertAll();
}
public void GetNextVariantTests()
{
    // Exercises the three GetNextVariant overloads against the first records of VcfTestFile_1.
    //
    // The expected lines use explicit \t escapes: VCF columns are tab-separated and the
    // second check splits on '\t', so the literals must not depend on an editor or a
    // whitespace-normalising tool preserving invisible tab characters.
    const string firstLine = "chr1\t10\t.\tA\t.\t25\tPASS\tDP=500\tGT:GQ:AD:VF:NL:SB:NC\t1/1:25:0,0:0.0000:23:0.0000:0.0010";
    const string secondLine = "chr1\t20\t.\tA\tT\t25\tPASS\tDP=500\tGT:GQ:AD:VF:NL:SB:NC\t1/1:25:0,0:0.0000:23:0.0000:0.0010";

    // Dispose the reader so the test does not keep the file handle open.
    using (var vr = new VcfReader(VcfTestFile_1))
    {
        // Overload 1: variant + raw line.
        var resultVariant = new VcfVariant();
        string resultString;
        vr.GetNextVariant(resultVariant, out resultString);

        // xUnit convention is Assert.Equal(expected, actual).
        Assert.Equal(firstLine, resultString.TrimEnd('\r'));
        Assert.Equal("chr1", resultVariant.ReferenceName);
        Assert.Equal("A", resultVariant.ReferenceAllele);
        Assert.Equal(".", resultVariant.VariantAlleles.First());
        Assert.Equal(1452, vr.Position());

        // Overload 2: variant + raw line + split columns.
        string[] resultStringArray;
        resultVariant = new VcfVariant();
        vr.GetNextVariant(resultVariant, out resultString, out resultStringArray);
        Assert.Equal(secondLine, resultString.TrimEnd('\r'));

        // Strip a trailing carriage return from every column before comparing.
        for (var i = 0; i < resultStringArray.Length; i++)
        {
            resultStringArray[i] = resultStringArray[i].TrimEnd('\r');
        }

        Assert.Equal(secondLine.Split('\t'), resultStringArray);
        Assert.Equal("chr1", resultVariant.ReferenceName);

        // Overload 3: variant only.
        resultVariant = new VcfVariant();
        vr.GetNextVariant(resultVariant);
        Assert.Equal("chr1", resultVariant.ReferenceName);
        Assert.Equal("A", resultVariant.ReferenceAllele);
        Assert.Equal("AT", resultVariant.VariantAlleles.First());
    }
}
/// <summary>
/// Reads the INFO "DP" value of <paramref name="variant"/> as a double.
/// </summary>
/// <returns>
/// Whatever <c>TryParseInfoDouble</c> left in its out argument; its success flag is not checked.
/// </returns>
private static double GetDP(VcfVariant variant)
{
    double depth;
    variant.TryParseInfoDouble("DP", out depth);
    return depth;
}
/// <summary>
/// Reads the copy number ("CN") and segment end ("END") of <paramref name="variant"/>.
/// The last sample column is consulted first; values present in the INFO column then
/// take precedence.
/// </summary>
/// <param name="variant">Record to inspect.</param>
/// <param name="end">The END value, or -1 when neither source provides one.</param>
/// <returns>The CN value, or -1 when neither source provides one.</returns>
/// <exception cref="FormatException">A CN or END value that is present is not an integer.</exception>
protected int GetCopyNumber(VcfVariant variant, out int end)
{
    int copyNumber = -1;
    end = -1;
    string text;

    if (variant.GenotypeColumns != null && variant.GenotypeColumns.Count > 0)
    {
        Dictionary<string, string> lastSample = variant.GenotypeColumns[variant.GenotypeColumns.Count - 1];

        // Guard against a null entry for the sample instead of failing with a NullReferenceException.
        if (lastSample != null)
        {
            if (lastSample.TryGetValue("CN", out text))
            {
                copyNumber = int.Parse(text);
            }

            if (lastSample.TryGetValue("END", out text))
            {
                end = int.Parse(text);
            }
        }
    }

    // INFO values are applied last, so they override anything taken from the sample column.
    if (variant.InfoFields.TryGetValue("END", out text))
    {
        end = int.Parse(text);
    }

    if (variant.InfoFields.TryGetValue("CN", out text))
    {
        copyNumber = int.Parse(text);
    }

    return copyNumber;
}
/// <summary>
/// Tells whether two records describe the same change: same position, same first
/// alternate allele and same reference allele. The chromosome is not compared; callers
/// are expected to pass records from the same chromosome.
/// </summary>
private static bool VariantsMatch(VcfVariant variant1, VcfVariant variant2)
{
    if (variant1.ReferencePosition != variant2.ReferencePosition)
    {
        return false;
    }

    if (variant1.VariantAlleles[0] != variant2.VariantAlleles[0])
    {
        return false;
    }

    return variant1.ReferenceAllele == variant2.ReferenceAllele;
}
/// <summary>
/// Converts a VCF record into a <c>CandidateAllele</c>, classifying it from its reference
/// allele and its first alternate allele.
/// </summary>
/// <remarks>
/// Classification, applied in order:
/// identical alleles are Reference; equal lengths are Snv (length 1) or Mnv;
/// a one-base reference is Insertion; a one-base alternate is Deletion.
/// Everything else, including a null or empty allele, stays Unsupported.
/// </remarks>
/// <param name="vcfVariant">Record to convert; only its first alternate allele is used.</param>
/// <returns>A candidate carrying the record's chromosome, position, alleles and the derived category.</returns>
private static CandidateAllele Map(VcfVariant vcfVariant)
{
    var referenceAllele = vcfVariant.ReferenceAllele;
    var alternateAllele = vcfVariant.VariantAlleles[0];
    var type = AlleleCategory.Unsupported;

    if (!String.IsNullOrEmpty(referenceAllele) && !String.IsNullOrEmpty(alternateAllele))
    {
        if (referenceAllele == alternateAllele)
        {
            type = AlleleCategory.Reference;
        }
        // This must be an "else if": identical alleles always have equal lengths, so a plain
        // "if" here overwrote Reference with Snv/Mnv every time.
        else if (referenceAllele.Length == alternateAllele.Length)
        {
            type = alternateAllele.Length == 1 ? AlleleCategory.Snv : AlleleCategory.Mnv;
        }
        else if (referenceAllele.Length == 1)
        {
            type = AlleleCategory.Insertion;
        }
        else if (alternateAllele.Length == 1)
        {
            type = AlleleCategory.Deletion;
        }
    }

    return new CandidateAllele(
        vcfVariant.ReferenceName,
        vcfVariant.ReferencePosition,
        referenceAllele,
        alternateAllele,
        type);
}
public void GetNextVariantTests()
{
    // Exercises the three GetNextVariant overloads against the first records of VcfTestFile_1.
    //
    // The expected lines use explicit \t escapes: VCF columns are tab-separated and the
    // second check splits on '\t', so the literals must not depend on an editor or a
    // whitespace-normalising tool preserving invisible tab characters.
    const string firstLine = "chr1\t10\t.\tA\t.\t25\tPASS\tDP=500\tGT:GQ:AD:VF:NL:SB:NC\t1/1:25:0,0:0.0000:23:0.0000:0.0010";
    const string secondLine = "chr1\t20\t.\tA\tT\t25\tPASS\tDP=500\tGT:GQ:AD:VF:NL:SB:NC\t1/1:25:0,0:0.0000:23:0.0000:0.0010";

    // Dispose the reader so the test does not keep the file handle open.
    using (var vr = new VcfReader(VcfTestFile_1))
    {
        // Overload 1: variant + raw line.
        var resultVariant = new VcfVariant();
        string resultString;
        vr.GetNextVariant(resultVariant, out resultString);

        // xUnit convention is Assert.Equal(expected, actual).
        Assert.Equal(firstLine, resultString.TrimEnd('\r'));
        Assert.Equal("chr1", resultVariant.ReferenceName);
        Assert.Equal("A", resultVariant.ReferenceAllele);
        Assert.Equal(".", resultVariant.VariantAlleles.First());

        // The stream position depends on how the test VCF was checked out:
        // 1452 with CRLF line endings, 1428 with LF only. The LF value has been seen on
        // some user configurations and is purely cosmetic. If you hit it, check your
        // .gitattributes, e.g.:
        //   * text=auto
        //   *.cs diff=csharp
        //   *.bam binary
        //   *.vcf text
        //   *.fa text eol=crlf
        // See http://davidlaing.com/2012/09/19/customise-your-gitattributes-to-become-a-git-ninja/
        if (vr.Position() == 1428)
        {
            Console.WriteLine("This isn't critical, but you might want to change your line endings convention. ");
            Console.WriteLine("This project was developed with \\CR\\LF , not \\LF convention.");
        }
        else
        {
            Assert.Equal(1452, vr.Position());
        }

        // Overload 2: variant + raw line + split columns.
        string[] resultStringArray;
        resultVariant = new VcfVariant();
        vr.GetNextVariant(resultVariant, out resultString, out resultStringArray);
        Assert.Equal(secondLine, resultString.TrimEnd('\r'));

        // Strip a trailing carriage return from every column before comparing.
        for (var i = 0; i < resultStringArray.Length; i++)
        {
            resultStringArray[i] = resultStringArray[i].TrimEnd('\r');
        }

        Assert.Equal(secondLine.Split('\t'), resultStringArray);
        Assert.Equal("chr1", resultVariant.ReferenceName);

        // Overload 3: variant only.
        resultVariant = new VcfVariant();
        vr.GetNextVariant(resultVariant);
        Assert.Equal("chr1", resultVariant.ReferenceName);
        Assert.Equal("A", resultVariant.ReferenceAllele);
        Assert.Equal("AT", resultVariant.VariantAlleles.First());
    }
}
/// <summary>
/// Orders a called allele against a VCF record by wrapping the allele's coordinate and
/// chromosome in a temporary <c>VcfVariant</c> and delegating to
/// <c>Extensions.OrderVariants</c>.
/// </summary>
/// <param name="a">Allele supplying the left-hand position.</param>
/// <param name="b">Record supplying the right-hand position.</param>
/// <param name="mFirst">Forwarded unchanged to <c>Extensions.OrderVariants</c>.</param>
/// <returns>The value returned by <c>Extensions.OrderVariants</c>.</returns>
public static int OrderVariants(CalledAllele a, VcfVariant b, bool mFirst)
{
    // Only position and chromosome name are populated on the stand-in record.
    var positionOnly = new VcfVariant();
    positionOnly.ReferencePosition = a.Coordinate;
    positionOnly.ReferenceName = a.Chromosome;

    return Extensions.OrderVariants(positionOnly, b, mFirst);
}
public void ToStringNoValueWorks()
{
    // Parsing NoValueUnsorted and writing it back must produce NoValueSorted.
    var sampleNames = ImmutableList.Create(SampleName);
    var parserSettings = VcfVariantParserSettings.Create(sampleNames);
    var parsed = VcfVariant.TryParse(NoValueUnsorted, parserSettings).GetOrThrow();

    Assert.Equal(NoValueSorted, WittyerVcfWriter.ToString(parsed, null));
}
public void ToStringBnd()
{
    // Round trip: writing the parsed Bnd1 record must reproduce the Bnd1 text exactly.
    var sampleNames = ImmutableList.Create(SampleName);
    var parserSettings = VcfVariantParserSettings.Create(sampleNames);
    var parsed = VcfVariant.TryParse(Bnd1, parserSettings).GetOrThrow();

    Assert.Equal(Bnd1, WittyerVcfWriter.ToString(parsed, null));
}
/// <summary>
/// Tells whether a reference-panel entry and a VCF record agree on chromosome, position,
/// reference allele and (first) alternate allele.
/// </summary>
/// <param name="entry">Panel entry; its forward-strand fields are the ones compared.</param>
/// <param name="var">VCF record; only its first alternate allele is compared.</param>
/// <returns>True when all four properties are equal.</returns>
public static bool CheckForMatch(RefPanelEntry entry, VcfVariant var)
{
    if (entry.Chr != var.ReferenceName)
    {
        return false;
    }

    if (entry.FwdStandFirstPositionOfMutation != var.ReferencePosition)
    {
        return false;
    }

    if (entry.FwdStrandRefAllele != var.ReferenceAllele)
    {
        return false;
    }

    return entry.FwdStrandAltAllele == var.VariantAlleles[0];
}
public static void ParseReferenceVariantWorks([NotNull] string inputVariant)
{
    // The given line, parsed for sample "NA12878", must be typed as CopyNumberReference.
    var sampleNames = ImmutableList.Create("NA12878", "haha");
    var parserSettings = VcfVariantParserSettings.Create(sampleNames, GenomeAssembly.Hg19);
    var parsed = VcfVariant.TryParse(inputVariant, parserSettings).GetOrThrowDebug();

    WittyerType.ParseFromVariant(parsed, false, "NA12878", out var parsedType);

    Assert.Equal(WittyerType.CopyNumberReference, parsedType);
}
public void GetInsertionIntervalNoLenIns()
{
    // For the UnknownLength record, the insertion interval must yield no length:
    // either the interval itself is null or its GetLength() is.
    var parserSettings = VcfVariantParserSettings.Create(ImmutableList.Create("blah"));
    var parsed = VcfVariant.TryParse(UnknownLength, parserSettings).GetOrThrow();

    var insertionInterval = WittyerBndInternal.GetInsertionInterval(parsed);

    MultiAssert.Equal(null, insertionInterval?.GetLength());
    MultiAssert.AssertAll();
}
/// <summary>
/// Appends the text form of <paramref name="variant"/> as one line of the output file.
/// </summary>
/// <param name="variant">Record to write; its <c>ToString()</c> supplies the line.</param>
/// <exception cref="ApplicationException">The file is not open.</exception>
public void WriteVariant(VcfVariant variant)
{
    if (IsOpen)
    {
        _writer.WriteLine(variant.ToString());
        return;
    }

    // Writing to a file that was never opened (or already closed) is a programming error.
    throw new ApplicationException("ERROR: An attempt was made to write a variant to an unopened file.");
}
// Builds a translocation breakend pair from two VCF lines, both parsed for the single
// sample "normal" on GRCh37. The first line supplies the primary record and its sample;
// the second line supplies the mate passed as the last Create argument.
private static IWittyerBnd CreateWittyerBnd([NotNull] string bndLine1, [NotNull] string bndLine2)
{
    var primarySettings = VcfVariantParserSettings.Create(ImmutableList.Create("normal"), GenomeAssembly.Grch37);
    var primary = VcfVariant.TryParse(bndLine1, primarySettings).GetOrThrowDebug();
    var primarySample = primary.Samples["normal"];

    // A separate settings instance is created for the second line, as before.
    var mateSettings = VcfVariantParserSettings.Create(ImmutableList.Create("normal"), GenomeAssembly.Grch37);
    var mate = VcfVariant.TryParse(bndLine2, mateSettings).GetOrThrowDebug();

    return WittyerBndInternal.Create(
        primary, primarySample, WittyerType.TranslocationBreakend,
        Bins, BasepairDistance, PercentDistance, mate);
}
public void TestGetBAlleleFrequency(string refAllele, string altAllele, int refCount, int altCount, double? expectedFreq)
{
    // Builds a minimal record with one alternate allele and checks the computed frequency.
    var variant = new VcfVariant
    {
        ReferenceAllele = refAllele,
        VariantAlleles = new string[] { altAllele }
    };

    double? actualFreq = SNVReviewer.GetBAlleleFrequency(variant, refCount, altCount);

    Assert.Equal(expectedFreq, actualFreq);
}
public void VennVcf_CombineTwoPoolVariants_MergeRefCalls()
{
    // Regression test for an issue with multiple co-located variants in one pool and only
    // a reference call in the other, at chr15:92604460. The consensus must contain a
    // single reference call there (not several).
    var outDir = TestPaths.LocalScratchDirectory;
    var vcfPathRoot = _TestDataPath;

    string VcfPath_PoolA = Path.Combine(vcfPathRoot, "C64-Ct-4_S17.genome.vcf");
    string VcfPath_PoolB = Path.Combine(vcfPathRoot, "C64-Ct-4_S18.genome.vcf");
    string VcfPath_Consensus = Path.Combine(vcfPathRoot, "ExpectedConsensus2.vcf");
    string OutputPath = Path.Combine(outDir, "Consensus2.vcf");

    // Start from a clean slate so a stale output cannot make the test pass.
    if (File.Exists(OutputPath))
    {
        File.Delete(OutputPath);
    }

    VennVcfOptions parameters = new VennVcfOptions();
    parameters.VariantCallingParams.MinimumFrequencyFilter = 0.03f;
    parameters.InputFiles = new string[] { VcfPath_PoolA, VcfPath_PoolB };
    parameters.OutputDirectory = outDir;
    parameters.ConsensusFileName = OutputPath;

    VennProcessor venn = new VennProcessor(parameters.InputFiles, parameters);
    venn.DoPairwiseVenn(false);

    // Assert.True states the intent directly instead of comparing a boolean to true.
    Assert.True(File.Exists(OutputPath));

    List<VcfVariant> CombinedVariants = VcfReader.GetAllVariantsInFile(OutputPath);
    List<VcfVariant> ExpectedVariants = VcfReader.GetAllVariantsInFile(VcfPath_Consensus);
    Assert.Equal(ExpectedVariants.Count, CombinedVariants.Count);

    int NumVariantsAtPos92604460 = 0;
    for (int i = 0; i < ExpectedVariants.Count; i++)
    {
        VcfVariant EVariant = ExpectedVariants[i];
        VcfVariant Variant = CombinedVariants[i];

        if ((Variant.ReferencePosition == 92604460) && (Variant.ReferenceName == "chr15"))
        {
            NumVariantsAtPos92604460++;
        }

        Assert.Equal(EVariant.ToString(), Variant.ToString());
    }

    // xUnit convention is Assert.Equal(expected, actual); the swapped order produced a
    // misleading failure message.
    Assert.Equal(1, NumVariantsAtPos92604460);
}
/// <summary>
/// Asserts that a called allele matches a baseline VCF record: alleles, chromosome,
/// position and the genotype derived from the first sample's "GT" value.
/// </summary>
/// <param name="baseline">Expected record; must carry exactly one alternate allele.</param>
/// <param name="test">Allele under test.</param>
public static void CheckVariantsMatch(VcfVariant baseline, CalledAllele test)
{
    // Check the allele count before indexing, so a malformed baseline fails with a clear
    // assertion rather than an index error. Expected value goes first (xUnit convention).
    Assert.Equal(1, baseline.VariantAlleles.Length);
    string baselineAlt = baseline.VariantAlleles[0];

    Assert.Equal(baseline.ReferenceAllele, test.ReferenceAllele);
    Assert.Equal(baselineAlt, test.AlternateAllele);
    Assert.Equal(baseline.ReferenceName, test.Chromosome);
    Assert.Equal(baseline.ReferencePosition, test.ReferencePosition);

    // An alternate allele of "." counts as zero alternates when mapping the GT string.
    int numAlts = (baselineAlt == ".") ? 0 : baseline.VariantAlleles.Length;
    Assert.Equal(VcfVariantUtilities.MapGTString(baseline.Genotypes[0]["GT"], numAlts), test.Genotype);
}
/// <summary>
/// Reads one record from the variant source and converts it to called alleles.
/// </summary>
/// <returns>
/// The alleles produced by <c>VcfVariantUtilities.Convert</c> for the record read, or an
/// empty list when the source has no further record.
/// </returns>
private IEnumerable<CalledAllele> GetNextBlockOfOriginalAllelesFromVcfVar()
{
    var nextRecord = new VcfVariant();

    if (_variantSource.GetNextVariant(nextRecord))
    {
        return VcfVariantUtilities.Convert(new List<VcfVariant> { nextRecord });
    }

    return new List<CalledAllele>();
}
/// <summary>
/// Step 1: Load the normal het SNVs of interest.
/// Reads <paramref name="vcfPath"/> and stores in <c>Variants</c> every record on
/// <c>Chromosome</c> that is a single-allele SNV and, when sample columns are present,
/// has FILTER "PASS", genotype 0/1 or 1/0, and a GQX of at least 30 (if GQX is given).
/// </summary>
/// <param name="vcfPath">Path of the VCF file to read.</param>
/// <exception cref="FormatException">A GQX value that is present is not a number.</exception>
protected void LoadVariants(string vcfPath)
{
    Console.WriteLine("{0} Loading variants of interest from {1}", DateTime.Now, vcfPath);
    this.Variants = new List<VcfVariant>();
    int countThisChromosome = 0;

    using (VcfReader reader = new VcfReader(vcfPath, requireGenotypes: false))
    {
        // The reader hands back the record through an out argument, so nothing has to be
        // allocated up front or after each stored record.
        VcfVariant variant;
        while (reader.GetNextVariant(out variant))
        {
            if (variant.ReferenceName != this.Chromosome)
            {
                // Shortcut: once records for the desired chromosome have been seen, the
                // first record from another chromosome ends the scan.
                if (countThisChromosome > 0)
                {
                    break;
                }

                continue;
            }

            countThisChromosome++;

            // Single-allele SNVs only:
            if (variant.VariantAlleles.Length != 1 || variant.VariantAlleles[0].Length != 1 || variant.ReferenceAllele.Length != 1)
            {
                continue;
            }

            // The sample-based filters apply only when sample columns exist; they are not
            // available if we use a dbSNP VCF file, whose FILTER may not say PASS either.
            bool hasGenotypes = variant.GenotypeColumns != null && variant.GenotypeColumns.Any();
            if (hasGenotypes)
            {
                // PF variants only:
                if (variant.Filters != "PASS")
                {
                    continue;
                }

                var firstSample = variant.GenotypeColumns[0];

                // No genotype - we don't know if it's a het SNV.
                string genotype;
                if (!firstSample.TryGetValue("GT", out genotype))
                {
                    continue;
                }

                if (genotype != "0/1" && genotype != "1/0")
                {
                    continue;
                }

                // Also require a high enough quality score. A missing GQX field is allowed,
                // in case another caller (e.g. Pisces) produced the file.
                string gqxText;
                if (firstSample.TryGetValue("GQX", out gqxText))
                {
                    // VCF numbers always use '.' as the decimal separator; parse them
                    // independently of the machine's regional settings.
                    float gqx = float.Parse(gqxText, System.Globalization.CultureInfo.InvariantCulture);
                    if (gqx < 30)
                    {
                        continue;
                    }
                }
            }

            // Note: Let's NOT require the variant be in dbSNP. Maybe we didn't do annotation,
            // either because we chose not to or because we're on a reference without
            // annotation available.

            // Remember all the variants that pass all our tests:
            this.Variants.Add(variant);
        }
    }

    Console.WriteLine("Retained {0} variants, out of {1} records for {2}", this.Variants.Count, countThisChromosome, this.Chromosome);
}
/// <summary>
/// Copies <paramref name="vcfIn"/> to <paramref name="vcfOut"/>, passing each record whose
/// mutation category appears in the calibration table through <c>UpdateVariant</c> first.
/// </summary>
/// <remarks>
/// Nothing is written when the counts file is missing or when the calibration table is
/// null or empty. An existing <paramref name="vcfOut"/> is deleted before writing.
/// Output lines end with "\n".
/// </remarks>
/// <param name="vcfIn">VCF to read.</param>
/// <param name="vcfOut">VCF to write.</param>
/// <param name="sampleCountsFileName">Counts file the calibration table is derived from.</param>
/// <param name="baselineQNoise">Forwarded to <c>GetPhredScaledCalibratedRates</c>.</param>
/// <param name="zFactor">Forwarded to <c>GetPhredScaledCalibratedRates</c>.</param>
/// <param name="maxQscore">Forwarded to <c>UpdateVariant</c>.</param>
/// <param name="filterQScore">Forwarded to <c>UpdateVariant</c>.</param>
public static void Recalibrate(string vcfIn, string vcfOut, string sampleCountsFileName, int baselineQNoise, double zFactor, int maxQscore, int filterQScore)
{
    if (!File.Exists(sampleCountsFileName))
    {
        Logger.WriteToLog("Cannot recalibrate. Cannot find {0} ", sampleCountsFileName);
        return;
    }

    Logger.WriteToLog("Found counts file: {0} ", sampleCountsFileName);

    var calibratedRates = GetPhredScaledCalibratedRates(baselineQNoise, zFactor, sampleCountsFileName);

    // If there is no work to do here, leave without touching the output path.
    bool nothingToApply = (calibratedRates == null) || (calibratedRates.Count == 0);
    if (nothingToApply)
    {
        return;
    }

    if (File.Exists(vcfOut))
    {
        File.Delete(vcfOut);
    }

    using (VcfReader reader = new VcfReader(vcfIn))
    using (StreamWriter writer = new StreamWriter(vcfOut))
    {
        writer.NewLine = "\n";

        // The header is copied through unchanged.
        foreach (string headerLine in reader.HeaderLines)
        {
            writer.WriteLine(headerLine);
        }

        // One VcfVariant instance is refilled by the reader for every record.
        var record = new VcfVariant();
        while (reader.GetNextVariant(record))
        {
            var category = MutationCounter.GetMutationCategory(record);
            if (calibratedRates.ContainsKey(category))
            {
                UpdateVariant(maxQscore, filterQScore, calibratedRates, record, category);
            }

            writer.WriteLine(record);
        }
    }
}
/// <summary>
/// Classifies a single-alternate record into a <c>MutationCategory</c>.
/// </summary>
/// <remarks>
/// Order of evaluation:
/// no alternate allele, an alternate of ".", or an alternate equal to the reference gives Reference;
/// a longer reference gives Deletion; a longer alternate gives Insertion;
/// equal lengths other than 1 give Other;
/// single-base substitutions are matched case-insensitively against the category named
/// "&lt;ref&gt;to&lt;alt&gt;", falling back to Other.
/// The reference test runs first so that a multi-base reference with ALT "." is not
/// reported as a deletion, and identical multi-base alleles are not reported as Other.
/// </remarks>
/// <param name="consensusVariant">Record to classify.</param>
/// <returns>The category of the record.</returns>
/// <exception cref="ArgumentException">The record has more than one alternate allele.</exception>
public static MutationCategory GetMutationCategory(VcfVariant consensusVariant)
{
    if (consensusVariant.VariantAlleles.Length == 0)
    {
        return MutationCategory.Reference;
    }

    if (consensusVariant.VariantAlleles.Length > 1)
    {
        throw new ArgumentException("This method is expecting only one variant allele per variant entry");
    }

    string refAllele = consensusVariant.ReferenceAllele;
    string altAllele = consensusVariant.VariantAlleles[0];

    // "No change" is independent of allele length, so decide it before comparing lengths.
    if ((altAllele == ".") || (altAllele == refAllele))
    {
        return MutationCategory.Reference;
    }

    int refLength = refAllele.Length;
    int altLength = altAllele.Length;

    if (refLength > altLength)
    {
        return MutationCategory.Deletion;
    }

    if (refLength < altLength)
    {
        return MutationCategory.Insertion;
    }

    // Lengths are equal from here on; anything longer than one base is not a simple substitution.
    if (refLength != 1)
    {
        return MutationCategory.Other;
    }

    string categoryName = refAllele + "to" + altAllele;
    foreach (MutationCategory mutation in GetAllMutationCategories())
    {
        // Ordinal comparison avoids culture-specific casing rules for what are plain identifiers.
        if (string.Equals(categoryName, mutation.ToString(), StringComparison.OrdinalIgnoreCase))
        {
            return mutation;
        }
    }

    return MutationCategory.Other;
}
/// <summary>
/// Parses one VCF line and returns its reference index together with its begin and end
/// positions, widened by <paramref name="flankingLength"/> on each side.
/// </summary>
/// <param name="vcfLine">Tab-separated VCF record.</param>
/// <param name="renamer">Passed to the <c>VariantFeature</c> constructor.</param>
/// <param name="flankingLength">Bases subtracted from the begin and added to the end.</param>
/// <returns>(reference index, begin - flank, end + flank).</returns>
/// <exception cref="GeneralException">The line has fewer than <c>VcfCommon.MinNumColumns</c> columns.</exception>
private static Tuple<ushort, int, int> GetTuple(string vcfLine, ChromosomeRenamer renamer, int flankingLength = 0)
{
    string[] columns = vcfLine.Split('\t');

    bool tooFewColumns = columns.Length < VcfCommon.MinNumColumns;
    if (tooFewColumns)
    {
        throw new GeneralException($"Expected at least {VcfCommon.MinNumColumns} fields in the vcf string: [{vcfLine}]");
    }

    var feature = new VariantFeature(new VcfVariant(columns, vcfLine, false), renamer, new VID());

    var referenceIndex = feature.ReferenceIndex;
    var paddedBegin = feature.VcfReferenceBegin - flankingLength;
    var paddedEnd = feature.VcfReferenceEnd + flankingLength;

    return new Tuple<ushort, int, int>(referenceIndex, paddedBegin, paddedEnd);
}
/// <summary>
/// Lazily yields one <c>VcfVariant</c> per remaining line of the file.
/// Usage: <c>foreach (VcfVariant variant in reader.GetVariants())</c>.
/// Yields nothing when the file is not open.
/// </summary>
/// <exception cref="ApplicationException">
/// Genotypes are required but a record has none. Because the method is an iterator,
/// the exception surfaces while enumerating.
/// </exception>
public IEnumerable<VcfVariant> GetVariants()
{
    // Sanity check: make sure the file is open.
    if (!IsOpen)
    {
        yield break;
    }

    string line;
    while ((line = Reader.ReadLine()) != null)
    {
        // Every yielded record is a fresh object.
        var variant = new VcfVariant();
        ConvertColumnsToVariant(line.Split('\t'), variant);

        if (RequireGenotypes)
        {
            bool genotypesMissing = variant.Genotypes == null || variant.Genotypes.Count == 0;
            if (genotypesMissing)
            {
                throw new ApplicationException("Missing genotype columns in VCF file");
            }
        }

        yield return variant;
    }
}
/// <summary>
/// Sets <c>VarType1</c> and <c>VarType2</c> of <paramref name="variant"/> from the "GT"
/// value of its first sample.
/// </summary>
/// <remarks>
/// A missing genotype (no first sample, no GT tag, "." or "./.") marks both types as Missing.
/// Otherwise the first two allele indices are read, separated by '/' or '|'. An index that
/// is absent or not a number takes the value of the other one, so "./1", "1/." and the
/// haploid "1" are all treated as homozygous.
/// Allele indices may have several digits ("0/10"); earlier only the first character
/// on each side was read.
/// </remarks>
private static void AssignVariantType(VcfVariant variant)
{
    string genotype = null;

    // Guard against an empty genotype list and a null first entry before looking for GT.
    if (variant.Genotypes.Count > 0 && variant.Genotypes[0] != null)
    {
        variant.Genotypes[0].TryGetValue("GT", out genotype);
    }

    // Sanity check: support missing genotypes.
    if (genotype == null || genotype == "./." || genotype == ".")
    {
        variant.VarType1 = VariantType.Missing;
        variant.VarType2 = VariantType.Missing;
        return;
    }

    // Handle usual cases like 0/0, 0/1, 1|0, 1/1 as well as special cases like ./1, 1/.
    string[] alleleTokens = genotype.Split('/', '|');
    int haplotypeA = ParseGenotypeAlleleIndex(alleleTokens[0]);
    int haplotypeB = alleleTokens.Length > 1 ? ParseGenotypeAlleleIndex(alleleTokens[1]) : -1;

    // Treat things like ./1 or 0/. as homozygous:
    if (haplotypeA == -1)
    {
        haplotypeA = haplotypeB;
    }

    if (haplotypeB == -1)
    {
        haplotypeB = haplotypeA;
    }

    variant.VarType1 = GetAlleleVariantType(variant, haplotypeA);
    variant.VarType2 = GetAlleleVariantType(variant, haplotypeB);
}

/// <summary>
/// Parses one allele index of a GT value; returns -1 for anything that is not a plain
/// sequence of digits (for example "." or an empty token).
/// </summary>
private static int ParseGenotypeAlleleIndex(string token)
{
    int index;
    bool parsed = int.TryParse(
        token,
        System.Globalization.NumberStyles.None,
        System.Globalization.CultureInfo.InvariantCulture,
        out index);
    return parsed ? index : -1;
}
/// <summary>
/// Assign a variant type to the allele selected by a genotype index.
/// Index 0 is Reference and index -1 is Missing; any other index selects
/// <c>VariantAlleles[haplotype - 1]</c>, which is classified against the reference allele
/// by the (string, string) overload. The rules documented for that classification are:
/// - If ref==alt, type is reference.
/// - Otherwise, trim off any common prefix and any common suffix. Let |ref| denote the length of the
///   reference allele after trimming, and |alt| denote the length of the alt allele after trimming.
/// - If |ref|=0, it's an insertion
/// - If |alt|=0, it's a deletion
/// - If |ref|=|alt|=1, it's a SNV
/// - If |ref| = |alt| > 1, it's a MNP
/// - If |ref|>0 and |alt|>0 and |ref| != |alt|, it's a complex event
/// </summary>
/// <param name="variant">Record supplying the reference and alternate alleles.</param>
/// <param name="haplotype">Allele index taken from the GT value.</param>
/// <exception cref="Exception">The index is larger than the number of alternate alleles.</exception>
private static VariantType GetAlleleVariantType(VcfVariant variant, int haplotype)
{
    switch (haplotype)
    {
        case 0:
            return VariantType.Reference;
        case -1:
            return VariantType.Missing;
    }

    int alternateCount = variant.VariantAlleles.Length;
    if (haplotype > alternateCount)
    {
        string message = string.Format(
            "Error in variant at {0}:{1} - GT tag specifies nonexistent allele",
            variant.ReferenceName, variant.ReferencePosition);
        throw new Exception(message);
    }

    // GT indices are 1-based for alternate alleles (0 is the reference).
    return GetAlleleVariantType(variant.ReferenceAllele, variant.VariantAlleles[haplotype - 1]);
}
/// <summary>
/// Populates a vcf variant object given an array of vcf columns.
/// </summary>
/// <remarks>
/// The same <paramref name="variant"/> instance may be refilled for successive lines, so
/// every field derived from the line is assigned on every call. This includes
/// <c>HasQuality</c>, which earlier stayed false for all later records once a single
/// record with QUAL "." had been seen.
/// When the line has no sample columns, the genotype-related members are left untouched.
/// </remarks>
/// <param name="cols">The tab-split columns of one VCF data line.</param>
/// <param name="variant">Object to fill.</param>
/// <exception cref="FormatException">The POS column is not an integer.</exception>
protected void ConvertColumnsToVariant(string[] cols, VcfVariant variant)
{
    variant.ReferenceName = cols[VcfCommon.ChromIndex];
    variant.ReferencePosition = int.Parse(cols[VcfCommon.PosIndex]);
    variant.Identifier = cols[VcfCommon.IDIndex];
    variant.ReferenceAllele = cols[VcfCommon.RefIndex];
    variant.Filters = cols[VcfCommon.FilterIndex];

    // QUAL may legitimately be "." (vcf 4.1 allows the missing value here; CFTR and Strelka use it).
    // HasQuality is set both ways so a reused variant object does not keep a stale flag.
    string qualityText = cols[VcfCommon.QualIndex];
    variant.HasQuality = qualityText != ".";

    // VCF numbers use '.' as decimal separator regardless of the machine's regional settings.
    // On failure (e.g. ".") Quality is left at 0 by TryParse.
    double.TryParse(
        qualityText,
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
        System.Globalization.CultureInfo.InvariantCulture,
        out variant.Quality);

    // parse the variant alleles
    variant.VariantAlleles = cols[VcfCommon.AltIndex].Split(',');

    // parse the info fields
    variant.InfoFields = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
    string InfoData = cols[VcfCommon.InfoIndex];
    if (InfoData == ".")
    {
        InfoData = ""; // Special case: a "." in the INFO field should be treated like an empty string.
    }

    string[] infoCols = InfoData.Split(InfoSplitChars, StringSplitOptions.RemoveEmptyEntries);
    int numInfoCols = infoCols.Length;

    // The tag-order array is reused when its length already fits.
    if ((variant.InfoTagOrder == null) || (numInfoCols != variant.InfoTagOrder.Length))
    {
        variant.InfoTagOrder = new string[numInfoCols];
    }

    // Split each entry at the FIRST '=' only, so values that themselves contain '='
    // are kept whole instead of being cut at the second '='.
    char[] keyValueSeparator = { '=' };
    for (int infoColIndex = 0; infoColIndex < numInfoCols; infoColIndex++)
    {
        string[] infoFieldKvp = infoCols[infoColIndex].Split(keyValueSeparator, 2);
        variant.InfoTagOrder[infoColIndex] = infoFieldKvp[0];

        // A flag without '=' is stored with a null value.
        variant.InfoFields[infoFieldKvp[0]] = (infoFieldKvp.Length == 1 ? null : infoFieldKvp[1]);
    }

    if (cols.Length > VcfCommon.GenotypeIndex) // Genotype columns present
    {
        // parse the genotype format field; the split result is cached until FORMAT changes
        if (cols[VcfCommon.FormatIndex] != GenotypeTagString)
        {
            GenotypeTagString = cols[VcfCommon.FormatIndex];
            GenotypeTagOrder = GenotypeTagString.Split(':');
        }

        variant.GenotypeTagOrder = GenotypeTagOrder;

        // parse the genotype data for each sample
        variant.Genotypes = new List<Dictionary<string, string>>();
        for (int sampleIndex = 0; sampleIndex < this.Samples.Count; sampleIndex++)
        {
            string genotypeColumn = cols[VcfCommon.GenotypeIndex + sampleIndex];
            if (genotypeColumn == ".")
            {
                // A sample column of "." is recorded as a null entry.
                variant.Genotypes.Add(null);
            }
            else
            {
                string[] genotypeCols = genotypeColumn.Split(':');
                variant.Genotypes.Add(ParseGenotype(variant.GenotypeTagOrder, genotypeCols));
            }
        }

        // specify the variant type:
        AssignVariantType(variant);
    }
}
/// <summary>
/// Reads the copy number ("CN") and segment end ("END") of <paramref name="variant"/>.
/// The last sample's genotype fields are consulted first; values present in the INFO
/// column then take precedence.
/// </summary>
/// <param name="variant">Record to inspect.</param>
/// <param name="end">The END value, or -1 when neither source provides one.</param>
/// <returns>The CN value, or -1 when neither source provides one.</returns>
/// <exception cref="FormatException">A CN or END value that is present is not an integer.</exception>
protected int GetCopyNumber(VcfVariant variant, out int end)
{
    int copyNumber = -1;
    end = -1;
    string text;

    if (variant.Genotypes != null && variant.Genotypes.Count > 0)
    {
        Dictionary<string, string> lastSample = variant.Genotypes[variant.Genotypes.Count - 1];

        // The reader stores null for a sample column of ".", so the entry must be checked
        // before it is dereferenced.
        if (lastSample != null)
        {
            if (lastSample.TryGetValue("CN", out text))
            {
                copyNumber = int.Parse(text);
            }

            if (lastSample.TryGetValue("END", out text))
            {
                end = int.Parse(text);
            }
        }
    }

    // INFO values are applied last, so they override anything taken from the sample column.
    if (variant.InfoFields.TryGetValue("END", out text))
    {
        end = int.Parse(text);
    }

    if (variant.InfoFields.TryGetValue("CN", out text))
    {
        copyNumber = int.Parse(text);
    }

    return copyNumber;
}
/// <summary>
/// Retrieves the next available variant and returns false if no variants are available.
/// </summary>
/// <param name="variant">Object that is overwritten with the contents of the next line.</param>
/// <returns>False when the file is not open or has no further line; true otherwise.</returns>
/// <exception cref="ApplicationException">Genotypes are required but the record has none.</exception>
public bool GetNextVariant(VcfVariant variant)
{
    // sanity check: make sure the file is open
    if (!IsOpen)
    {
        return false;
    }

    // grab the next vcf line
    string line = Reader.ReadLine();
    if (line == null)
    {
        return false;
    }

    // split the columns and convert them to a variant
    ConvertColumnsToVariant(line.Split('\t'), variant);

    // Same check as GetVariants(): a line without sample columns may leave Genotypes
    // null, which must raise the documented error rather than a NullReferenceException.
    if (RequireGenotypes && (variant.Genotypes == null || variant.Genotypes.Count == 0))
    {
        throw new ApplicationException("Missing genotype columns in VCF file");
    }

    return true;
}