/// <summary>
/// Feeds one "." genotype together with two two-allele genotypes ("1|2" and "0/2")
/// to <c>AlleleBlock.GetMaxPloidy</c> and expects a maximum ploidy of 2.
/// </summary>
public void GetPloidyFromGenotypes_DotIsIgnored()
{
    // Arrange: a "." call plus one phased and one unphased call.
    var dotCall = Genotype.GetGenotype(".");
    var phasedCall = Genotype.GetGenotype("1|2");
    var unphasedCall = Genotype.GetGenotype("0/2");

    // Act
    var observedPloidy = AlleleBlock.GetMaxPloidy(new[] { dotCall, phasedCall, unphasedCall });

    // Assert
    Assert.Equal(2, observedPloidy);
}
/// <summary>
/// Builds genotype blocks for three samples over start positions 100, 102, 103 and 104
/// (function block range = start + 2) and checks that the two haplotypes of sample 2
/// are reported under the allele blocks {1, 0, 1} and {2, 0, 1}, both at position index 1.
/// </summary>
public void GetAlleleBlockToSampleHaplotype_AlleleBlock_WithInternalRefPositions_SplitIfOutOfRange()
{
    // Arrange: per-sample genotype blocks; the third one is created with a second argument of 1.
    var firstSampleGenotypes = new[] { "1|2", "0/0", "0|0", "1/1" }
        .Select(gt => Genotype.GetGenotype(gt))
        .ToArray();
    var firstSampleBlock = new GenotypeBlock(firstSampleGenotypes);

    var secondSampleGenotypes = new[] { "1/1", "0|0", "1|1" }
        .Select(gt => Genotype.GetGenotype(gt))
        .ToArray();
    var secondSampleBlock = new GenotypeBlock(secondSampleGenotypes);

    var thirdSampleGenotypes = new[] { "1|2", "0|0", "1|1" }
        .Select(gt => Genotype.GetGenotype(gt))
        .ToArray();
    var thirdSampleBlock = new GenotypeBlock(thirdSampleGenotypes, 1);

    var genotypeToSample = new Dictionary<GenotypeBlock, List<int>>();
    genotypeToSample.Add(firstSampleBlock, new List<int> { 0 });
    genotypeToSample.Add(secondSampleBlock, new List<int> { 1 });
    genotypeToSample.Add(thirdSampleBlock, new List<int> { 2 });

    // The same (empty) set instance is used for every position.
    var noUnsupportedVars = new HashSet<int>();
    var indexOfUnsupportedVars = Enumerable
        .Repeat(noUnsupportedVars, firstSampleBlock.Genotypes.Length)
        .ToArray();

    var starts = new[] { 100, 102, 103, 104 };
    var functionBlockRanges = starts.Select(start => start + 2).ToList();

    // Act
    var haplotypesByBlock = AlleleBlock.GetAlleleBlockToSampleHaplotype(
        genotypeToSample,
        indexOfUnsupportedVars,
        starts,
        functionBlockRanges,
        out _);

    // Assert: each expected block must be present and map to exactly the listed haplotypes.
    void ExpectHaplotypes(AlleleBlock block, params SampleHaplotype[] haplotypes)
    {
        Assert.True(haplotypesByBlock.ContainsKey(block));
        Assert.True(haplotypesByBlock[block].SequenceEqual(haplotypes));
    }

    ExpectHaplotypes(new AlleleBlock(1, new[] { 1, 0, 1 }, 0, 0), new SampleHaplotype(2, 0));
    ExpectHaplotypes(new AlleleBlock(1, new[] { 2, 0, 1 }, 0, 0), new SampleHaplotype(2, 1));
}
/// <summary>
/// Extends an allele block {1, 1} at position index 2 by calling
/// <c>AlleleBlockMerger.ExtendAlleleBlock(block, 2, 2)</c> and expects the result to equal
/// a block at position index 0 with allele indexes {0, 0, 1, 1, 0, 0}.
/// </summary>
public void ExtendAlleleBlock_AsExpected()
{
    // Arrange
    var alleleBlock1 = new AlleleBlock(2, new[] { 1, 1 }, 2, 2);
    var expectedBlock1 = new AlleleBlock(0, new[] { 0, 0, 1, 1, 0, 0 }, -1, -1);

    // Act
    var extendedBlock1 = AlleleBlockMerger.ExtendAlleleBlock(alleleBlock1, 2, 2);

    // Assert: xUnit's contract is Assert.Equal(expected, actual); keeping that order makes
    // the "Expected"/"Actual" labels in a failure message point at the right values.
    Assert.Equal(expectedBlock1, extendedBlock1);
}
/// <summary>
/// Gathers the VCF-level and per-sample annotations for the positions covered by
/// <paramref name="alleleBlock"/> and packs them into a <see cref="VariantInfo"/>.
/// </summary>
/// <param name="positionSet">Source of the simple positions and of the per-sample GQ, PS and GT values.</param>
/// <param name="alleleBlock">
/// Block whose <c>PositionIndex</c> is the first covered position and whose
/// <c>AlleleIndexes.Length</c> is the number of covered positions.
/// </param>
/// <returns>
/// A <see cref="VariantInfo"/> built from: the QUAL summary of the covered positions, their FILTER
/// fields, one GQ summary and one phase-set value per sample, the ploidy of each sample that is
/// homozygous reference across the whole block (null otherwise), and an empty filter list per sample.
/// </returns>
private static VariantInfo GetVariantInfo(PositionSet positionSet, AlleleBlock alleleBlock)
{
    var positions = positionSet.SimplePositions;
    int startIndex = alleleBlock.PositionIndex;
    int numPositions = alleleBlock.AlleleIndexes.Length;
    int numSamples = positionSet.NumSamples;

    string qual = GetStringWithMinValueOrDot(
        Enumerable.Range(startIndex, numPositions)
            .Select(x => positions[x].VcfFields[VcfCommon.QualIndex]));

    var filters = Enumerable.Range(startIndex, numPositions)
        .Select(i => positions[i].VcfFields[VcfCommon.FilterIndex])
        .ToArray();

    var gqValues = new string[numSamples];
    var psValues = new string[numSamples];
    var homoReferenceSamplePloidy = new int?[numSamples];
    var sampleFilters = new List<bool>[numSamples];

    for (var i = 0; i < numSamples; i++)
    {
        var genotypesThisSample = positionSet.GtInfo.Values[i];

        gqValues[i] = GetStringWithMinValueOrDot(
            new ArraySegment<string>(positionSet.GqInfo.Values[i], startIndex, numPositions).ToArray());

        var psTagsThisSample = new ArraySegment<string>(positionSet.PsInfo.Values[i], startIndex, numPositions);

        // Only evaluate homozygosity for the positions inside this block; projecting the
        // whole per-sample genotype array for every block and sample is wasted work.
        var homozygousFlags = genotypesThisSample
            .Skip(startIndex)
            .Take(numPositions)
            .Select(x => x.IsHomozygous)
            .ToArray();
        var isHomozygous = new ArraySegment<bool>(homozygousFlags);

        psValues[i] = GetPhaseSetForRecomposedVariant(psTagsThisSample, isHomozygous);

        // The ploidy is only recorded for samples that are homozygous reference at every
        // covered position; it is taken from the genotype at the first covered position.
        if (Genotype.IsAllHomozygousReference(genotypesThisSample, startIndex, numPositions))
        {
            homoReferenceSamplePloidy[i] = genotypesThisSample[startIndex].AlleleIndexes.Length;
        }

        // Per-sample filter lists start out empty.
        sampleFilters[i] = new List<bool>();
    }

    return new VariantInfo(qual, filters, gqValues, psValues, homoReferenceSamplePloidy, sampleFilters);
}
/// <summary>
/// Four samples all carry "1|1", "1|1" at the same two positions but differ in whether a "0|0"
/// genotype precedes and/or follows them. The test expects four distinct allele blocks that share
/// position index 1 and allele indexes {1, 1} and differ only in their last two constructor arguments.
/// </summary>
public void GetAlleleBlockToSampleHaplotype_AwareOfTrimmedRefPositions()
{
    // Arrange: "0|0" on both sides, before only, after only, and on neither side.
    var refOnBothSides = new GenotypeBlock(
        new[] { "0|0", "1|1", "1|1", "0|0" }.Select(gt => Genotype.GetGenotype(gt)).ToArray());
    var refBeforeOnly = new GenotypeBlock(
        new[] { "0|0", "1|1", "1|1" }.Select(gt => Genotype.GetGenotype(gt)).ToArray());
    var refAfterOnly = new GenotypeBlock(
        new[] { "1|1", "1|1", "0|0" }.Select(gt => Genotype.GetGenotype(gt)).ToArray(), 1);
    var noFlankingRef = new GenotypeBlock(
        new[] { "1|1", "1|1" }.Select(gt => Genotype.GetGenotype(gt)).ToArray(), 1);

    var genotypeToSample = new Dictionary<GenotypeBlock, List<int>>();
    genotypeToSample.Add(refOnBothSides, new List<int> { 0 });
    genotypeToSample.Add(refBeforeOnly, new List<int> { 1 });
    genotypeToSample.Add(refAfterOnly, new List<int> { 2 });
    genotypeToSample.Add(noFlankingRef, new List<int> { 3 });

    // The same (empty) set instance is used for all four positions.
    var noUnsupportedVars = new HashSet<int>();
    var indexOfUnsupportedVars = Enumerable.Repeat(noUnsupportedVars, 4).ToArray();

    var starts = Enumerable.Range(100, 4).ToArray();
    var functionBlockRanges = starts.Select(start => start + 2).ToList();

    // Act
    var haplotypesByBlock = AlleleBlock.GetAlleleBlockToSampleHaplotype(
        genotypeToSample,
        indexOfUnsupportedVars,
        starts,
        functionBlockRanges,
        out _);

    // Assert: each expected block must be present and map to both haplotypes of its sample.
    void ExpectHaplotypes(AlleleBlock block, params SampleHaplotype[] haplotypes)
    {
        Assert.True(haplotypesByBlock.ContainsKey(block));
        Assert.True(haplotypesByBlock[block].SequenceEqual(haplotypes));
    }

    // NOTE(review): the last two AlleleBlock arguments presumably record reference positions
    // on each side of the block - confirm against AlleleBlock.
    ExpectHaplotypes(new AlleleBlock(1, new[] { 1, 1 }, 1, 1), new SampleHaplotype(0, 0), new SampleHaplotype(0, 1));
    ExpectHaplotypes(new AlleleBlock(1, new[] { 1, 1 }, 1, 0), new SampleHaplotype(1, 0), new SampleHaplotype(1, 1));
    ExpectHaplotypes(new AlleleBlock(1, new[] { 1, 1 }, 0, 1), new SampleHaplotype(2, 0), new SampleHaplotype(2, 1));
    ExpectHaplotypes(new AlleleBlock(1, new[] { 1, 1 }, 0, 0), new SampleHaplotype(3, 0), new SampleHaplotype(3, 1));
}
/// <summary>
/// Builds genotype blocks for four samples over start positions 100, 101, 102 and 104
/// (function block range = start + 2) in which haplotypes alternate between reference and
/// alternate alleles, then checks the allele blocks reported for samples 0, 1 and 3.
/// </summary>
public void GetAlleleBlockToSampleHaplotype_AlleleBlock_OneAlleleIsRef_EachTime()
{
    // Arrange: the second block is the only one created with a second argument of 1.
    var sample0Block = new GenotypeBlock(
        new[] { "1|0", "0|1", "1|0", "0|1" }.Select(gt => Genotype.GetGenotype(gt)).ToArray());
    var sample1Block = new GenotypeBlock(
        new[] { "1/1", "0|1", "1|0" }.Select(gt => Genotype.GetGenotype(gt)).ToArray(), 1);
    var sample2Block = new GenotypeBlock(
        new[] { "0|0", "1|0", "0|1", "0|0" }.Select(gt => Genotype.GetGenotype(gt)).ToArray());
    var sample3Block = new GenotypeBlock(
        new[] { "0|1", "1|0", "1|0" }.Select(gt => Genotype.GetGenotype(gt)).ToArray());

    var genotypeToSample = new Dictionary<GenotypeBlock, List<int>>();
    genotypeToSample.Add(sample0Block, new List<int> { 0 });
    genotypeToSample.Add(sample1Block, new List<int> { 1 });
    genotypeToSample.Add(sample2Block, new List<int> { 2 });
    genotypeToSample.Add(sample3Block, new List<int> { 3 });

    // The same (empty) set instance is used for every position.
    var noUnsupportedVars = new HashSet<int>();
    var indexOfUnsupportedVars = Enumerable
        .Repeat(noUnsupportedVars, sample0Block.Genotypes.Length)
        .ToArray();

    var starts = new[] { 100, 101, 102, 104 };
    var functionBlockRanges = starts.Select(start => start + 2).ToList();

    // Act
    var haplotypesByBlock = AlleleBlock.GetAlleleBlockToSampleHaplotype(
        genotypeToSample,
        indexOfUnsupportedVars,
        starts,
        functionBlockRanges,
        out _);

    // Assert: each expected block must be present and map to exactly the listed haplotypes.
    void ExpectHaplotypes(AlleleBlock block, params SampleHaplotype[] haplotypes)
    {
        Assert.True(haplotypesByBlock.ContainsKey(block));
        Assert.True(haplotypesByBlock[block].SequenceEqual(haplotypes));
    }

    // Sample 0: one block per haplotype, both starting at position index 0.
    ExpectHaplotypes(new AlleleBlock(0, new[] { 1, 0, 1 }, 0, 0), new SampleHaplotype(0, 0));
    ExpectHaplotypes(new AlleleBlock(0, new[] { 0, 1, 0 }, 0, 0), new SampleHaplotype(0, 1));

    // Samples 1 and 3: blocks at position index 1; {1, 1} is shared between the two samples.
    ExpectHaplotypes(new AlleleBlock(1, new[] { 1, 0 }, 0, 0), new SampleHaplotype(1, 0));
    ExpectHaplotypes(new AlleleBlock(1, new[] { 1, 1 }, 0, 0), new SampleHaplotype(1, 1), new SampleHaplotype(3, 0));
    ExpectHaplotypes(new AlleleBlock(1, new[] { 0, 0 }, 0, 0), new SampleHaplotype(3, 1));
}
/// <summary>
/// End-to-end check of <c>VariantGenerator.GetPositionsAndRefAltAlleles</c>: a single sample with
/// genotypes "1|2", "1/1", "0|1", "0/1" at starts 356, 358, 360 and 361 is turned into allele blocks,
/// the blocks are merged, and the first two merged blocks are expected to span 356-360 with
/// reference "GAATC" and alternate sequences "CATTC" and "TATTG" respectively.
/// </summary>
public void GetPositionsAndRefAltAlleles_AsExpected()
{
    // Arrange
    var sampleGenotypes = new[] { "1|2", "1/1", "0|1", "0/1" }
        .Select(gt => Genotype.GetGenotype(gt))
        .ToArray();
    var genotypeBlock = new GenotypeBlock(sampleGenotypes);

    var genotypeToSample = new Dictionary<GenotypeBlock, List<int>>();
    genotypeToSample.Add(genotypeBlock, new List<int> { 0 });

    // The same (empty) set instance is used for every position.
    var noUnsupportedVars = new HashSet<int>();
    var indexOfUnsupportedVars = Enumerable
        .Repeat(noUnsupportedVars, genotypeBlock.Genotypes.Length)
        .ToArray();

    var starts = new[] { 356, 358, 360, 361 };
    var functionBlockRanges = new List<int> { 358, 360, 362, 364 };

    // One allele array per start position; element 0 is used as the reference allele.
    var alleles = new[]
    {
        new[] { "G", "C", "T" },
        new[] { "A", "T" },
        new[] { "C", "G" },
        new[] { "G", "T" }
    };
    const string refSequence = "GAATCG";

    // Act
    var haplotypesByBlock = AlleleBlock.GetAlleleBlockToSampleHaplotype(
        genotypeToSample,
        indexOfUnsupportedVars,
        starts,
        functionBlockRanges,
        out var alleleBlockGraph);
    var mergedEntries = AlleleBlockMerger.Merge(haplotypesByBlock, alleleBlockGraph).ToArray();

    var alleleSet = new AlleleSet(ChromosomeUtilities.Chr1, starts, alleles);
    var alleleBlocks = mergedEntries.Select(entry => entry.Key).ToArray();
    var sequence = new NSequence();

    var result1 = VariantGenerator.GetPositionsAndRefAltAlleles(
        alleleBlocks[0], alleleSet, refSequence, starts[0], null, sequence, _vidCreator);
    var result2 = VariantGenerator.GetPositionsAndRefAltAlleles(
        alleleBlocks[1], alleleSet, refSequence, starts[0], null, sequence, _vidCreator);

    // Assert: first merged block.
    var observed1 = (result1.Start, result1.End, result1.Ref, result1.Alt);
    Assert.Equal((356, 360, "GAATC", "CATTC"), observed1);

    // Only the leading entries of the reported index list are compared.
    var cursor1 = 0;
    foreach (int expectedIndex in new[] { 0, 1 })
    {
        Assert.Equal(expectedIndex, result1.VarPosIndexesInAlleleBlock[cursor1]);
        cursor1++;
    }

    // Assert: second merged block.
    var observed2 = (result2.Start, result2.End, result2.Ref, result2.Alt);
    Assert.Equal((356, 360, "GAATC", "TATTG"), observed2);

    var cursor2 = 0;
    foreach (int expectedIndex in new[] { 0, 1, 2 })
    {
        Assert.Equal(expectedIndex, result2.VarPosIndexesInAlleleBlock[cursor2]);
        cursor2++;
    }
}
decomposedVids) GetPositionsAndRefAltAlleles(AlleleBlock alleleBlock, AlleleSet alleleSet, string totalRefSequence, int regionStart, List <ISimplePosition> simplePositions, ISequence sequence, IVariantIdCreator vidCreator) { int numPositions = alleleBlock.AlleleIndexes.Length; int firstPositionIndex = alleleBlock.PositionIndex; int lastPositionIndex = alleleBlock.PositionIndex + numPositions - 1; int blockStart = alleleSet.Starts[firstPositionIndex]; int blockEnd = alleleSet.Starts[lastPositionIndex]; string lastRefAllele = alleleSet.VariantArrays[lastPositionIndex][0]; int blockRefLength = blockEnd - blockStart + lastRefAllele.Length; string refSequence = totalRefSequence.Substring(blockStart - regionStart, blockRefLength); var refSequenceStart = 0; var altSequenceSegments = new LinkedList <string>(); var variantPosIndexesInAlleleBlock = new List <int>(); var vidListsNeedUpdate = new List <List <string> >(); var decomposedVids = new List <string>(); if (FindConflictAllele(alleleBlock, alleleSet)) { return(default);
/// <summary>
/// Builds allele blocks for two samples over starts 100, 101 and 102 (function block range = start + 2),
/// merges them with <c>AlleleBlockMerger.Merge</c>, and expects three merged blocks at position
/// index 0 ({1, 1, 1}, {0, 1, 1} and {0, 0, 1}) mapped to the listed sample haplotypes.
/// </summary>
public void Merge_AsExpected()
{
    // Arrange
    var sample0Block = new GenotypeBlock(
        new[] { "1|0", "1|1", "1|1" }.Select(gt => Genotype.GetGenotype(gt)).ToArray());
    var sample1Block = new GenotypeBlock(
        new[] { "0|0", "1|0", "1|1" }.Select(gt => Genotype.GetGenotype(gt)).ToArray());

    var genotypeToSample = new Dictionary<GenotypeBlock, List<int>>();
    genotypeToSample.Add(sample0Block, new List<int> { 0 });
    genotypeToSample.Add(sample1Block, new List<int> { 1 });

    // The same (empty) set instance is used for all three positions.
    var noUnsupportedVars = new HashSet<int>();
    var indexOfUnsupportedVars = Enumerable.Repeat(noUnsupportedVars, 3).ToArray();

    var starts = Enumerable.Range(100, 3).ToArray();
    var functionBlockRanges = starts.Select(start => start + 2).ToList();

    // Act
    var haplotypesByBlock = AlleleBlock.GetAlleleBlockToSampleHaplotype(
        genotypeToSample,
        indexOfUnsupportedVars,
        starts,
        functionBlockRanges,
        out var alleleBlockGraph);
    var mergedHaplotypesByBlock = AlleleBlockMerger.Merge(haplotypesByBlock, alleleBlockGraph);

    // Assert: each expected merged block must be present and map to exactly the listed haplotypes.
    void ExpectHaplotypes(AlleleBlock block, params SampleHaplotype[] haplotypes)
    {
        Assert.True(mergedHaplotypesByBlock.ContainsKey(block));
        Assert.True(mergedHaplotypesByBlock[block].SequenceEqual(haplotypes));
    }

    ExpectHaplotypes(new AlleleBlock(0, new[] { 1, 1, 1 }, -1, -1), new SampleHaplotype(0, 0));
    ExpectHaplotypes(new AlleleBlock(0, new[] { 0, 1, 1 }, -1, -1), new SampleHaplotype(0, 1), new SampleHaplotype(1, 0));
    ExpectHaplotypes(new AlleleBlock(0, new[] { 0, 0, 1 }, -1, -1), new SampleHaplotype(1, 1));
}
/// <summary>
/// Builds the recomposed reference and alternate sequences for one allele block by splicing the
/// block's non-reference alleles into the reference sequence, and (when positions are supplied)
/// links the contributing variant IDs to the ID of the recomposed variant.
/// </summary>
/// <param name="alleleBlock">Block giving the first position index and the allele index chosen at each covered position.</param>
/// <param name="alleleSet">Start coordinates and allele arrays per position; element 0 of each array is used as the reference allele.</param>
/// <param name="totalRefSequence">Reference bases of the enclosing region.</param>
/// <param name="regionStart">Coordinate corresponding to index 0 of <paramref name="totalRefSequence"/>.</param>
/// <param name="simplePositions">
/// Positions whose <c>Vids</c>, <c>IsDecomposed</c> and <c>LinkedVids</c> entries are updated in place;
/// pass null to skip all variant-ID bookkeeping.
/// </param>
/// <returns>
/// The block start, the end of the block on the reference, the reference sequence, the recomposed
/// alternate sequence, the in-block indexes of the non-reference positions, and the variant IDs of
/// the contributing alleles (empty when <paramref name="simplePositions"/> is null).
/// </returns>
/// <exception cref="UserErrorException">
/// Thrown when a non-reference allele starts before the end of the reference span consumed by the
/// previously spliced alternate allele.
/// </exception>
internal static (int Start, int End, string Ref, string Alt, List<int> VarPosIndexesInAlleleBlock, List<string> decomposedVids) GetPositionsAndRefAltAlleles(AlleleBlock alleleBlock, AlleleSet alleleSet, string totalRefSequence, int regionStart, List<ISimplePosition> simplePositions)
{
    int numPositions = alleleBlock.AlleleIndexes.Length;
    int firstPositionIndex = alleleBlock.PositionIndex;
    int lastPositionIndex = alleleBlock.PositionIndex + numPositions - 1;

    int blockStart = alleleSet.Starts[firstPositionIndex];
    int blockEnd = alleleSet.Starts[lastPositionIndex];
    string lastRefAllele = alleleSet.VariantArrays[lastPositionIndex][0];

    // The block's reference span runs to the end of the last position's reference allele.
    int blockRefLength = blockEnd - blockStart + lastRefAllele.Length;
    var refSequence = totalRefSequence.Substring(blockStart - regionStart, blockRefLength);

    // Offset (within refSequence) of the first reference base not yet consumed.
    int refSequenceStart = 0;

    // Most recently spliced alternate allele; reported if the next one conflicts with it.
    string previousAltAllele = null;

    var altSequenceSegments = new LinkedList<string>();
    var variantPosIndexesInAlleleBlock = new List<int>();
    var vidListsNeedUpdate = new List<List<string>>();
    var decomposedVids = new List<string>();

    for (int positionIndex = firstPositionIndex; positionIndex <= lastPositionIndex; positionIndex++)
    {
        int indexInBlock = positionIndex - firstPositionIndex;
        int alleleIndex = alleleBlock.AlleleIndexes[indexInBlock];

        //only non-reference alleles considered
        if (alleleIndex == 0)
        {
            continue;
        }

        variantPosIndexesInAlleleBlock.Add(indexInBlock);

        string refAllele = alleleSet.VariantArrays[positionIndex][0];
        string altAllele = alleleSet.VariantArrays[positionIndex][alleleIndex];
        int positionOnRefSequence = alleleSet.Starts[positionIndex] - blockStart;
        int refRegionBetweenTwoAltAlleles = positionOnRefSequence - refSequenceStart;

        if (refRegionBetweenTwoAltAlleles < 0)
        {
            // Report the allele that was actually spliced last. Looking it up as
            // VariantArrays[positionIndex - 1][alleleIndex] is wrong when reference positions
            // sit in between or when the earlier position used a different allele index, and
            // can even go out of range and mask this error.
            throw new UserErrorException($"Conflicting alternative alleles identified at {alleleSet.Chromosome.UcscName}:{alleleSet.Starts[positionIndex]}: both \"{previousAltAllele}\" and \"{altAllele}\" are present.");
        }

        // Copy the untouched reference bases up to this position, then the alternate allele.
        string refSequenceBefore = refSequence.Substring(refSequenceStart, refRegionBetweenTwoAltAlleles);
        altSequenceSegments.AddLast(refSequenceBefore);
        altSequenceSegments.AddLast(altAllele);
        refSequenceStart = positionOnRefSequence + refAllele.Length;
        previousAltAllele = altAllele;

        if (simplePositions == null)
        {
            continue;
        }

        var thisPosition = simplePositions[positionIndex];

        // alleleIndex is 1-based for altAlleles
        int varIndex = alleleIndex - 1;

        //Only SNVs get recomposed for now
        if (thisPosition.Vids[varIndex] == null)
        {
            thisPosition.Vids[varIndex] = SmallVariantCreator.GetVid(alleleSet.Chromosome.EnsemblName, thisPosition.Start, thisPosition.End, thisPosition.AltAlleles[varIndex], VariantType.SNV);
            thisPosition.IsDecomposed[varIndex] = true;
        }

        decomposedVids.Add(thisPosition.Vids[varIndex]);

        if (thisPosition.LinkedVids[varIndex] == null)
        {
            thisPosition.LinkedVids[varIndex] = new List<string>();
        }

        // Remember the list so the recomposed variant ID can be appended once it is known.
        vidListsNeedUpdate.Add(thisPosition.LinkedVids[varIndex]);
    }

    // Append whatever reference bases remain after the last alternate allele.
    altSequenceSegments.AddLast(refSequence.Substring(refSequenceStart));
    var recomposedAllele = string.Concat(altSequenceSegments);
    var blockRefEnd = blockStart + blockRefLength - 1;

    var recomposedVariantId = SmallVariantCreator.GetVid(alleleSet.Chromosome.EnsemblName, blockStart, blockRefEnd, recomposedAllele, VariantType.MNV);
    vidListsNeedUpdate.ForEach(x => x.Add(recomposedVariantId));

    return (blockStart, blockRefEnd, refSequence, recomposedAllele, variantPosIndexesInAlleleBlock, decomposedVids);
}