public void MultiIndelContainsIndel()
        {
            // The lone (non-multi) deletion that every multi-indel below is checked against.
            var singleDeletion = new PreIndel(new CandidateAllele("chr1", 105, "AT", "A", AlleleCategory.Deletion));

            // Multi-indel whose partner (OtherIndel) is exactly the lone deletion.
            var multiWithMatchingPartner = new PreIndel(new CandidateAllele("chr1", 100, "A", "ATC", AlleleCategory.Insertion))
            {
                InMulti    = true,
                OtherIndel = "chr1:105 AT>A"
            };

            // Multi-indel whose partner sits at a different position, so neither half matches.
            var multiWithDifferentPartner = new PreIndel(new CandidateAllele("chr1", 100, "A", "ATC", AlleleCategory.Insertion))
            {
                InMulti    = true,
                OtherIndel = "chr1:107 AT>A"
            };

            // Multi-indel whose primary allele is the lone deletion itself.
            var multiWithDeletionAsPrimary = new PreIndel(new CandidateAllele("chr1", 105, "AT", "A", AlleleCategory.Deletion))
            {
                InMulti    = true,
                OtherIndel = "chr1:100 AT>A"
            };

            Assert.True(Helper.MultiIndelContainsIndel(multiWithMatchingPartner, singleDeletion));
            Assert.False(Helper.MultiIndelContainsIndel(multiWithDifferentPartner, singleDeletion));
            Assert.True(Helper.MultiIndelContainsIndel(multiWithDeletionAsPrimary, singleDeletion));

            // Passing a non-multi indel in the "multi" slot is rejected with an ArgumentException.
            var notAMulti = new PreIndel(new CandidateAllele("chr1", 100, "A", "ATC", AlleleCategory.Insertion));

            Assert.Throws<ArgumentException>(() => { Helper.MultiIndelContainsIndel(notAMulti, singleDeletion); });
        }
Example #2
0
        /// <summary>
        /// Stamps <paramref name="indel"/> with its observation count and a heuristic score derived from the
        /// pooled evidence, then returns the same instance.
        /// </summary>
        /// <remarks>
        /// The score starts from the observation count and is scaled by: the inverse strand imbalance, the inverse
        /// anchor imbalance, reputable and stitched support, average quality relative to 30, the fraction of the
        /// average anchor not attributed to "mess", the indel length (in steps of 5), and the fraction of observations
        /// that did not come from mate-unmapped or unanchored-repeat reads. It is floored at zero before the last two
        /// factors are applied.
        /// Throws <see cref="DivideByZeroException"/> when <paramref name="observationCount"/> is 0, because the
        /// average anchor is computed with integer division.
        /// </remarks>
        /// <param name="indel">Indel to update; mutated in place.</param>
        /// <param name="anchorLeft">Summed left-anchor length over all observations.</param>
        /// <param name="anchorRight">Summed right-anchor length over all observations.</param>
        /// <param name="observationCount">Number of observations pooled into the other arguments.</param>
        /// <param name="mess">Summed "mess" value over all observations.</param>
        /// <param name="fwdSupport">Forward-strand support.</param>
        /// <param name="reverseSupport">Reverse-strand support.</param>
        /// <param name="reputableSupport">Reputable support; added to the support multiplier.</param>
        /// <param name="avgQuals">Average quality; divided by 30 in the score.</param>
        /// <param name="stitchedSupport">Stitched support; divided by the strand balance in the score.</param>
        /// <param name="numFromMateUnmapped">Observations to discount as coming from mate-unmapped reads.</param>
        /// <param name="numFromUnanchoredRepeat">Observations to discount as coming from unanchored repeats.</param>
        /// <returns>The same <paramref name="indel"/> instance, with Observations and Score set.</returns>
        private static PreIndel GetIndelFromEntry(PreIndel indel, int anchorLeft, int anchorRight,
                                                  int observationCount, int mess, float fwdSupport, float reverseSupport, float reputableSupport,
                                                  float avgQuals, float stitchedSupport, int numFromMateUnmapped, int numFromUnanchoredRepeat)
        {
            // Both operands are ints, so the average anchor is truncated toward zero.
            var averageAnchor = (anchorLeft + anchorRight) / observationCount;
            var averageMess   = mess / (float)observationCount;

            // Strand imbalance: larger strand over smaller strand, with the denominator floored at 1.
            // (An earlier formula for this value was computed and immediately overwritten; it has been removed.)
            float balance = fwdSupport >= reverseSupport
                ? fwdSupport / Math.Max(1, reverseSupport)
                : reverseSupport / Math.Max(1, fwdSupport);

            // Still want to care about absolute anchor lengths (maybe reads are variable lengths), but also definitely
            // need to care about anchor balance (3 reads at 90/10 would have same avg anchor as 3 reads at 50/50).
            // Same shape as the strand imbalance: larger side over smaller side, denominator floored at 1.
            float anchorBalance = anchorLeft >= anchorRight
                ? anchorLeft / (float)Math.Max(1, anchorRight)
                : anchorRight / (float)Math.Max(1, anchorLeft);

            // Fraction of the average anchor that is not "mess".
            var averageCleanAnchor = (averageAnchor - averageMess) / (float)averageAnchor;

            // Evidence-based part of the score, truncated to an int and floored at zero.
            var evidenceScore = Math.Max(0, (int)(observationCount * (1 / balance) * (1 / anchorBalance) * (1 + reputableSupport + (stitchedSupport / balance)) * (avgQuals / 30) * averageCleanAnchor * 10));

            indel.Observations = observationCount;

            // Boost by indel length, then discount observations from mate-unmapped / unanchored-repeat reads.
            indel.Score = (int)(evidenceScore * (1 + (indel.Length / 5)) * ((observationCount - numFromMateUnmapped - numFromUnanchoredRepeat) / (float)observationCount));

            return indel;
        }
Example #3
0
        /// <summary>
        /// Rebuilds a <see cref="PreIndel"/> from a hashable key, pools all of its evidence into one
        /// <see cref="IndelEvidence"/>, re-runs ExtractIndelsFromEntry on it, and appends any resulting indels to
        /// <paramref name="indelsToAdd"/>.
        /// </summary>
        /// <param name="allowRescue">Forwarded to ExtractIndelsFromEntry.</param>
        /// <param name="indelToRecalculate">Hashable indel paired with the evidence entries collected for it.</param>
        /// <param name="statusCounter">Forwarded to ExtractIndelsFromEntry.</param>
        /// <param name="edgeThreshold">Forwarded to ExtractIndelsFromEntry.</param>
        /// <param name="indelsToAdd">Destination list; receives the extracted indels when extraction returns non-null.</param>
        private void RecalculateIndelAndAddIfNeeded(bool allowRescue, KeyValuePair <HashableIndel, List <IndelEvidence> > indelToRecalculate,
                                                    IndelStatusCounter statusCounter, double edgeThreshold, List <PreIndel> indelsToAdd)
        {
            var key    = indelToRecalculate.Key;
            var allele = new CandidateAllele(key.Chromosome, key.ReferencePosition,
                                             key.ReferenceAllele, key.AlternateAllele, key.Type);

            // Carry the multi-indel linkage over from the hashable key.
            var rebuilt = new PreIndel(allele)
            {
                InMulti    = key.InMulti,
                OtherIndel = key.OtherIndel
            };

            // Fold every evidence entry for this key into a single accumulator.
            var pooledEvidence = new IndelEvidence();
            foreach (var evidence in indelToRecalculate.Value)
            {
                pooledEvidence.AddIndelEvidence(evidence);
            }

            var entryKey   = rebuilt.ToString() + "|" + rebuilt.OtherIndel;
            var candidates = new List <PreIndel> { rebuilt };
            var extracted  = ExtractIndelsFromEntry(pooledEvidence, entryKey, statusCounter, edgeThreshold, allowRescue, candidates);

            if (extracted == null)
            {
                return;
            }

            indelsToAdd.AddRange(extracted);
        }
Example #4
0
        /// <summary>
        /// Reads the indel's reference allele out of the snippet sequence.
        /// </summary>
        /// <param name="snippet">Snippet whose Sequence is sliced.</param>
        /// <param name="preIndel">Indel supplying the reference position and the reference allele length.</param>
        /// <param name="contextStart">Offset subtracted from the reference position to index into the snippet.</param>
        /// <returns>
        /// The substring starting at index (ReferencePosition - 1 - contextStart) with the same length as the
        /// indel's current reference allele.
        /// </returns>
        private static string ActualReferenceAllele(GenomeSnippet snippet, PreIndel preIndel, int contextStart)
        {
            var startIndex   = preIndel.ReferencePosition - 1 - contextStart;
            var alleleLength = preIndel.ReferenceAllele.Length;

            return snippet.Sequence.Substring(startIndex, alleleLength);
        }
Example #5
0
 /// <summary>
 /// Accumulates one indel observation into <paramref name="indelMetrics"/>.
 /// </summary>
 /// <param name="indelMetrics">Accumulator; Position is overwritten, the other fields are added to.</param>
 /// <param name="indel">Observed indel supplying position, anchors, length and average quality.</param>
 /// <param name="totalNm">Value from which the indel length is subtracted to obtain the "mess" contribution.</param>
 private static void AddIndelMetrics(IndelEvidence indelMetrics, PreIndel indel, int totalNm)
 {
     // Whatever part of totalNm is not accounted for by the indel itself; never negative.
     var messFromRead = Math.Max(0, totalNm - indel.Length);

     indelMetrics.Position = indel.ReferencePosition;

     indelMetrics.LeftAnchor  += indel.LeftAnchor;
     indelMetrics.RightAnchor += indel.RightAnchor;
     indelMetrics.Mess        += messFromRead;
     indelMetrics.Quality     += indel.AverageQualityRounded;
 }
Example #6
0
        /// <summary>
        /// Returns the reference bases that immediately follow the indel's reference allele in the snippet.
        /// </summary>
        /// <param name="snippet">Snippet whose Sequence is sliced.</param>
        /// <param name="preIndel">Indel supplying the position, reference allele length and indel length.</param>
        /// <param name="contextStart">Offset subtracted from the reference position to index into the snippet.</param>
        /// <returns>A substring of length max(10, 3 * indel length) starting right after the reference allele.</returns>
        private static string ReferenceSuffix(GenomeSnippet snippet, PreIndel preIndel, int contextStart)
        {
            var suffixLength = Math.Max(10, 3 * preIndel.Length);
            var suffixStart  = preIndel.ReferencePosition + preIndel.ReferenceAllele.Length - 1 - contextStart;

            return snippet.Sequence.Substring(suffixStart, suffixLength);
        }
        public void CandidateToString()
        {
            // Both indel types are rendered as "chrom:pos ref>alt".
            var insertion = new PreIndel(new CandidateAllele("chr1", 123, "A", "ATC", AlleleCategory.Insertion));
            var deletion  = new PreIndel(new CandidateAllele("chr1", 456, "ATC", "A", AlleleCategory.Deletion));

            Assert.Equal("chr1:123 A>ATC", Helper.CandidateToString(insertion));
            Assert.Equal("chr1:456 ATC>A", Helper.CandidateToString(deletion));
        }
Example #8
0
        /// <summary>
        /// Returns the reference bases leading up to, and ending on, the indel's anchor base.
        /// </summary>
        /// <remarks>
        /// The window starts max(10, 3 * indel length) + 1 bases before the anchor base, clamped to the start of
        /// the snippet, and ends on the anchor base at index (ReferencePosition - 1 - contextStart).
        /// </remarks>
        /// <param name="snippet">Snippet whose Sequence is sliced.</param>
        /// <param name="preIndel">Indel supplying the reference position and indel length.</param>
        /// <param name="contextStart">Offset subtracted from the reference position to index into the snippet.</param>
        /// <returns>The prefix substring described above.</returns>
        private static string ReferencePrefix(GenomeSnippet snippet, PreIndel preIndel, int contextStart)
        {
            var offset      = Math.Max(10, 3 * preIndel.Length);
            var prefixStart = Math.Max(0, preIndel.ReferencePosition - 1 - contextStart - offset - 1);

            // prefixStart is an index relative to contextStart, so the end of the window must be expressed in the
            // same coordinates. Using the absolute ReferencePosition here only gave the right length when
            // contextStart was 0; otherwise the length was too long by contextStart.
            var prefixLength = preIndel.ReferencePosition - contextStart - prefixStart;

            return snippet.Sequence.Substring(prefixStart, prefixLength);
        }
Example #9
0
        /// <summary>
        /// Builds the alternate allele that goes with a reference allele taken from the genome.
        /// </summary>
        /// <param name="preIndel">Indel whose alternate allele supplies the inserted bases (everything after its first base).</param>
        /// <param name="actualReferenceAllele">Reference allele as read from the reference sequence.</param>
        /// <returns>
        /// For a one-base reference allele: that base followed by the indel's alternate allele minus its first base.
        /// For any other length: just the first base of the reference allele.
        /// </returns>
        private static string ActualAltAllele(PreIndel preIndel, string actualReferenceAllele)
        {
            if (actualReferenceAllele.Length != 1)
            {
                // Multi-base reference allele: the alt collapses to the leading base.
                return actualReferenceAllele[0].ToString();
            }

            // Single-base reference allele: keep the real base and append the inserted bases.
            var insertedBases = preIndel.AlternateAllele.Substring(1);
            return actualReferenceAllele + insertedBases;
        }
Example #10
0
        /// <summary>
        /// Tells whether two indels describe the same event: same chromosome, reference position, reference
        /// allele and alternate allele.
        /// </summary>
        /// <param name="indel1">First indel.</param>
        /// <param name="indel2">Second indel.</param>
        /// <returns>True when all four fields are equal; otherwise false.</returns>
        private static bool IndelsMatch(PreIndel indel1, PreIndel indel2)
        {
            return indel1.Chromosome == indel2.Chromosome
                   && indel1.ReferencePosition == indel2.ReferencePosition
                   && indel1.ReferenceAllele == indel2.ReferenceAllele
                   && indel1.AlternateAllele == indel2.AlternateAllele;
        }
Example #11
0
        /// <summary>
        /// Records one read's contribution to an indel's evidence: first the read-level metrics, then the
        /// indel-level metrics.
        /// </summary>
        /// <param name="bamAlignment">Read that carries the indel; forwarded to AddReadLevelIndelMetrics.</param>
        /// <param name="isReputable">Forwarded to AddReadLevelIndelMetrics.</param>
        /// <param name="stitched">Forwarded to AddReadLevelIndelMetrics.</param>
        /// <param name="indelMetrics">Evidence accumulator updated by both helpers.</param>
        /// <param name="indel">Observed indel; forwarded to AddIndelMetrics.</param>
        /// <param name="totalNm">Forwarded to AddIndelMetrics.</param>
        private static void UpdateIndelMetrics(BamAlignment bamAlignment, bool isReputable, bool stitched, IndelEvidence indelMetrics,
                                               PreIndel indel, int totalNm)
        {
            // TODO - are read-level repeats that informative? Because this is kind of a perf burden
            // (^ Removed for now for that reason), so every read is reported as non-repeat. Former check:
            //var isRepeat = StitchingLogic.OverlapEvaluator.IsRepeat(bamAlignment.Bases.Substring(0, (int)indel.LeftAnchor), 2, out repeatUnit) || StitchingLogic.OverlapEvaluator.IsRepeat(bamAlignment.Bases.Substring(0, (int)indel.RightAnchor), 2, out repeatUnit);
            const bool readIsRepeat = false;

            AddReadLevelIndelMetrics(bamAlignment, isReputable, stitched, indelMetrics, readIsRepeat);
            AddIndelMetrics(indelMetrics, indel, totalNm);
        }
Example #12
0
        /// <summary>
        /// Decides whether a <see cref="PreIndel"/> corresponds to a <see cref="HashableIndel"/>.
        /// </summary>
        /// <param name="pre">Candidate indel.</param>
        /// <param name="hashable">Hashable indel to compare against.</param>
        /// <returns>
        /// False when chromosome or reference position differ. Otherwise, for insertions, the result of
        /// InsertionsAreMatch on the two alternate alleles; for anything else, whether the two reference alleles
        /// have the same length.
        /// </returns>
        public static bool IsMatch(PreIndel pre, HashableIndel hashable)
        {
            if (pre.Chromosome != hashable.Chromosome || pre.ReferencePosition != hashable.ReferencePosition)
            {
                return false;
            }

            if (pre.Type == AlleleCategory.Insertion)
            {
                return InsertionsAreMatch(pre.AlternateAllele, hashable.AlternateAllele);
            }

            // Non-insertions are compared by reference allele length only, not by base content.
            return pre.ReferenceAllele.Length == hashable.ReferenceAllele.Length;
        }
Example #13
0
        // TODO EXTRACT TO SHARED
        /// <summary>
        /// Parses an indel written as "chrom:pos ref&gt;alt" back into a <see cref="PreIndel"/>.
        /// </summary>
        /// <param name="splittedIndel">Indel string; split on space, ':' and '&gt;' into chromosome, position, reference allele and alternate allele.</param>
        /// <returns>
        /// A new indel typed as a deletion when the reference allele is longer than the alternate allele,
        /// and as an insertion otherwise.
        /// </returns>
        private static PreIndel GetIndelKey(string splittedIndel)
        {
            // Splitting on all three delimiters at once yields the same tokens, in the same order, as splitting on
            // space first and then on ':' / '>' within each piece.
            var fields = splittedIndel.Split(' ', ':', '>').ToList();

            var chromosome = fields[0];
            var position   = int.Parse(fields[1]);
            var reference  = fields[2];
            var alternate  = fields[3];

            AlleleCategory category;
            if (reference.Length > alternate.Length)
            {
                category = AlleleCategory.Deletion;
            }
            else
            {
                category = AlleleCategory.Insertion;
            }

            return new PreIndel(new CandidateAllele(chromosome, position, reference, alternate, category));
        }
Example #14
0
        /// <summary>
        /// Tells whether a single indel is one of the two halves of a multi-indel.
        /// </summary>
        /// <param name="multiIndel">Indel flagged InMulti; its own allele and its OtherIndel string are both checked.</param>
        /// <param name="singleIndel">Indel not flagged InMulti.</param>
        /// <returns>
        /// True when the string form of <paramref name="singleIndel"/> equals either the multi-indel's OtherIndel
        /// or the string form of the multi-indel itself.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="multiIndel"/> is not flagged InMulti, or <paramref name="singleIndel"/> is.
        /// </exception>
        public static bool MultiIndelContainsIndel(PreIndel multiIndel, PreIndel singleIndel)
        {
            var rolesAreValid = multiIndel.InMulti && !singleIndel.InMulti;
            if (!rolesAreValid)
            {
                throw new ArgumentException("Not looking at a single and a multi.");
            }

            var target = CandidateToString(singleIndel);

            // Check the partner first, then the multi-indel's own primary allele.
            return multiIndel.OtherIndel == target || CandidateToString(multiIndel) == target;
        }
Example #15
0
        /// <summary>
        /// Converts a <see cref="PreIndel"/> into a <see cref="HashableIndel"/> and returns the result of
        /// Helper.CopyHashable on it.
        /// </summary>
        /// <param name="preIndel">Source indel; its alleles determine the type and length of the result.</param>
        /// <param name="score">Score to store on the hashable indel (defaults to 0).</param>
        /// <returns>The value returned by Helper.CopyHashable for the populated hashable indel.</returns>
        private static HashableIndel GetHashableIndel(PreIndel preIndel, int score = 0)
        {
            var referenceLength = preIndel.ReferenceAllele.Length;
            var alternateLength = preIndel.AlternateAllele.Length;

            // Type is derived from the allele lengths rather than copied from the PreIndel.
            var category = referenceLength > alternateLength
                ? AlleleCategory.Deletion
                : AlleleCategory.Insertion;

            var identifier = new HashableIndel
            {
                Chromosome        = preIndel.Chromosome,
                ReferencePosition = preIndel.ReferencePosition,
                ReferenceAllele   = preIndel.ReferenceAllele,
                AlternateAllele   = preIndel.AlternateAllele,
                Type              = category,
                Length            = Math.Abs(referenceLength - alternateLength),
                Score             = score,
                InMulti           = preIndel.InMulti,
                OtherIndel        = preIndel.OtherIndel
            };

            return Helper.CopyHashable(identifier);
        }
Example #16
0
        public void GetHashableIndel()
        {
            // Reference: ZZ, five X, four CAG units, XYZ, followed by TTTTT padding.
            const string reference = "ZZXXXXXCAGCAGCAGCAGXYZ";

            var snippet = new GenomeSnippet
            {
                Chromosome    = "chr1",
                Sequence      = reference + "TTTTT",
                StartPosition = 0
            };

            // Deletion of one CAG unit anchored on the last X (position 7).
            var deletion         = new PreIndel(new CandidateAllele("chr1", 7, "XCAG", "X", AlleleCategory.Deletion));
            var deletionHashable = HashableIndelSource.GetHashableIndel(snippet, deletion, 0, false);

            Assert.Equal("ZZXXXXX", deletionHashable.RefPrefix);
            Assert.Equal("CAGCAGCAGX", deletionHashable.RefSuffix);

            // Insertion of one CAG unit at the same anchor: same prefix, suffix starts right after the anchor.
            var insertion         = new PreIndel(new CandidateAllele("chr1", 7, "X", "XCAG", AlleleCategory.Insertion));
            var insertionHashable = HashableIndelSource.GetHashableIndel(snippet, insertion, 0, false);

            Assert.Equal("ZZXXXXX", insertionHashable.RefPrefix);
            Assert.Equal("CAGCAGCAGC", insertionHashable.RefSuffix);
        }
 /// <summary>
 /// Tells whether <paramref name="indel"/> is the partner (OtherIndel) of a multi-indel.
 /// </summary>
 /// <param name="hashable">Hashable indel; only considered when it is flagged InMulti.</param>
 /// <param name="indel">Candidate whose string form is compared with the hashable's OtherIndel.</param>
 /// <returns>True when the hashable is part of a multi-indel and its OtherIndel equals the candidate's string form.</returns>
 private bool IsMultiMatch(HashableIndel hashable, PreIndel indel)
 {
     // TODO shouldn't this also check the normal indel?
     if (!hashable.InMulti)
     {
         return false;
     }

     var candidateKey = Helper.CandidateToString(indel);
     return candidateKey == hashable.OtherIndel;
 }
Example #18
0
 /// <summary>
 /// Renders an indel as "chrom:pos ref&gt;alt", e.g. "chr1:123 A&gt;ATC".
 /// </summary>
 /// <param name="indel">Indel to render.</param>
 /// <returns>Chromosome, ':', reference position, a space, reference allele, '&gt;', alternate allele.</returns>
 public static string CandidateToString(PreIndel indel)
 {
     return string.Format("{0}:{1} {2}>{3}",
                          indel.Chromosome,
                          indel.ReferencePosition,
                          indel.ReferenceAllele,
                          indel.AlternateAllele);
 }
        /// <summary>
        /// Walks a read's CIGAR operations and collects the insertions ('I') and deletions ('D') that pass a
        /// base-quality filter.
        /// </summary>
        /// <remarks>
        /// Candidates use 'N' placeholders for reference bases. An insertion is kept unless more than 10% of its
        /// bases fall below <paramref name="minBaseCallQuality"/>. A deletion is kept only when the read bases at
        /// the current read index and the one after it both exist and meet the minimum quality.
        /// Each candidate records the lengths of the directly adjacent 'M' operations as its left/right anchors
        /// (0 when the neighbor is missing or is not an 'M').
        /// </remarks>
        /// <param name="read">Alignment whose Position, CigarData, Bases and Qualities are inspected.</param>
        /// <param name="chromosomeName">Chromosome name stamped on every candidate.</param>
        /// <param name="minBaseCallQuality">Minimum base quality used by the insertion and deletion filters.</param>
        /// <returns>The candidates found, in CIGAR order; empty when the read has no passing indels.</returns>
        public List <PreIndel> FindIndels(BamAlignment read, string chromosomeName, int minBaseCallQuality = 10)
        {
            var candidates = new List <PreIndel>();

            var startIndexInRead      = 0;
            var startIndexInReference = read.Position;

            for (var cigarOpIndex = 0; cigarOpIndex < read.CigarData.Count; cigarOpIndex++)
            {
                var operation = read.CigarData[cigarOpIndex];
                switch (operation.Type)
                {
                    case 'I':
                    {
                        var totalQualities         = 0;
                        var qualitiesNotGoodEnough = 0;
                        for (var i = 0; i < operation.Length; i++)
                        {
                            var indexInRead = startIndexInRead + i;
                            if (indexInRead > read.Qualities.Length - 1)
                            {
                                // TODO invalid, throw?
                                break;
                            }

                            var qualAtBase = read.Qualities[indexInRead];
                            totalQualities += (int)qualAtBase;

                            if (qualAtBase < minBaseCallQuality)
                            {
                                qualitiesNotGoodEnough++;
                            }
                        }

                        // Reject the insertion when more than 10% of its bases are below the quality floor.
                        var insertionQualityGoodEnough = !(qualitiesNotGoodEnough / (float)operation.Length > 0.1);

                        // TODO check whether positions are off by one

                        var referenceBase = "N";
                        var insertion     = new PreIndel(new CandidateAllele(chromosomeName, startIndexInReference, referenceBase,
                                                                             referenceBase + read.Bases.Substring(startIndexInRead, (int)operation.Length),
                                                                             AlleleCategory.Insertion));
                        if (insertionQualityGoodEnough)
                        {
                            candidates.Add(new PreIndel(insertion)
                            {
                                LeftAnchor            = GetMatchLengthAt(read, cigarOpIndex - 1),
                                RightAnchor           = GetMatchLengthAt(read, cigarOpIndex + 1),
                                AverageQualityRounded = totalQualities / (int)operation.Length     // Loss of fraction here is ok
                            });
                        }

                        break;
                    }

                    case 'D':
                    {
                        // Requires the read bases at startIndexInRead and startIndexInRead + 1 to both meet the
                        // quality floor; a missing second base counts as low quality.
                        // NOTE(review): the original comment described these as the bases before and after the
                        // deletion - confirm the indexing matches that intent.
                        var deletionQualityGoodEnough = read.Qualities[startIndexInRead] >= minBaseCallQuality &&
                                                        startIndexInRead + 1 < read.Qualities.Length &&
                                                        read.Qualities[startIndexInRead + 1] >= minBaseCallQuality;

                        // TODO this is not legit for the ref bases but going to do this for now. May not even really need to care about the reference base, honestly.
                        var referenceBases = new string('N', (int)operation.Length + 1);
                        var deletion       = new PreIndel(new CandidateAllele(chromosomeName, startIndexInReference, referenceBases, "N", AlleleCategory.Deletion));
                        if (deletionQualityGoodEnough)
                        {
                            candidates.Add(new PreIndel(deletion)
                            {
                                LeftAnchor  = GetMatchLengthAt(read, cigarOpIndex - 1),
                                RightAnchor = GetMatchLengthAt(read, cigarOpIndex + 1),

                                // The quality filter above guarantees the second base exists, so average both
                                // qualities directly. The previous bounds check (Length > index + 2) was off by
                                // one and zeroed the second quality when it was the last base of the read.
                                AverageQualityRounded = (read.Qualities[startIndexInRead] + read.Qualities[startIndexInRead + 1]) / 2
                            });
                        }

                        break;
                    }

                    default:
                        break;
                }

                if (operation.IsReadSpan())
                {
                    startIndexInRead += (int)operation.Length;
                }

                if (operation.IsReferenceSpan())
                {
                    startIndexInReference += (int)operation.Length;
                }
            }

            return candidates;
        }

        /// <summary>
        /// Returns the length of the CIGAR operation at <paramref name="cigarOpIndex"/> when it is an 'M'
        /// operation, or 0 when the index is out of range or the operation is of another type.
        /// </summary>
        private static int GetMatchLengthAt(BamAlignment read, int cigarOpIndex)
        {
            if (cigarOpIndex < 0 || cigarOpIndex >= read.CigarData.Count)
            {
                return 0;
            }

            var neighbor = read.CigarData[cigarOpIndex];
            return neighbor.Type == 'M' ? (int)neighbor.Length : 0;
        }
Example #20
0
        public void GetFinalIndelsForChromosome()
        {
            // Exercises HashableIndelSource.GetFinalIndelsForChromosome over several independent scenarios.
            // Each scenario builds a list of scored PreIndels (with 'N' placeholder bases) and a synthetic
            // ChrReference, then checks how many indels come back and, where relevant, which ones.
            var preIndels  = new List <PreIndel>();
            var insertion1 = new PreIndel(new CandidateAllele("chr1", 100, "N", "NGA", AlleleCategory.Insertion));

            insertion1.Score = 100;
            var deletion = new PreIndel(new CandidateAllele("chr1", 5, "NNNN", "N", AlleleCategory.Deletion));

            deletion.Score = 100;
            var insertionSimilarToIns1 = new PreIndel(new CandidateAllele("chr1", 100, "N", "NGC", AlleleCategory.Insertion));

            insertionSimilarToIns1.Score = 20;
            var insertion2 = new PreIndel(new CandidateAllele("chr1", 302, "N", "NTCATCA", AlleleCategory.Insertion));

            insertion2.Score = 100;
            var insertionSimilarConsequenceToIns2 = new PreIndel(new CandidateAllele("chr1", 305, "N", "NTCATGA", AlleleCategory.Insertion));

            insertionSimilarConsequenceToIns2.Score = 20;
            var insertionNotSimilarEnoughConsequenceToIns2 = new PreIndel(new CandidateAllele("chr1", 305, "N", "NTCAGTA", AlleleCategory.Insertion));

            insertionNotSimilarEnoughConsequenceToIns2.Score = 20;
            var insertionContainingInsertion2 = new PreIndel(new CandidateAllele("chr1", 302, "N", "NTCATCATCATCA", AlleleCategory.Insertion));

            insertionContainingInsertion2.Score = 20;
            // TODO add edge cases in terms of score, negative cases in terms of different variant types

            preIndels = new List <PreIndel>()
            {
                deletion, insertion1, insertionSimilarToIns1,
                insertion2, insertionSimilarConsequenceToIns2, insertionNotSimilarEnoughConsequenceToIns2,
                insertionContainingInsertion2
            };

            // insertionSimilarToIns1 is removed for being very similar to insertion 1 and much lower quality
            // insertionSimilarConsequenceToIns2 is removed for having almost the exact same consequence as insertion 2 and much lower quality
            // insertionNotSimilarEnoughConsequenceToIns2 is pretty close to insertion 2 in terms of consequence, and weaker, but not similar enough, so can stay
            // insertionContainingInsertion2 has exact same nearby consequence and position as insertion 2 but it is hard to call, being a long dup. so it gets to stay.

            var indelSource  = new HashableIndelSource();
            var chrReference = new ChrReference()
            {
                FastaPath = "abc", Name = "chr1",
                Sequence  = new string('A', 99) + new string('T', 5) + new string('C', 195) +
                            // 299 bases so far (99 A + 5 T + 195 C); the TCA repeat block starts here
                            string.Join("", Enumerable.Repeat("TCA", 20)) + new string('G', 300)
            };

            var finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);

            // Rehydrate with reference sequence and keep the right ones
            // (the 'N' placeholders are expected to come back as real reference bases)
            Assert.Equal(5, finalIndels.Count);
            EnsureIndelNotPresent(finalIndels, insertionSimilarToIns1.ReferencePosition, "A", "AGC");
            EnsureIndelNotPresent(finalIndels, insertionSimilarConsequenceToIns2.ReferencePosition, "A", "ATCATGA");
            var ins1 = CheckForIndel(finalIndels, 100, "T", "TGA", 100);

            Assert.False(ins1.IsDuplication);
            Assert.False(ins1.IsRepeat);
            var del = CheckForIndel(finalIndels, 5, "AAAA", "A", 100);

            Assert.False(del.IsDuplication);
            Assert.True(del.IsRepeat);
            var ins2 = CheckForIndel(finalIndels, 302, "A", "ATCATCA", 100);

            Assert.True(ins2.IsRepeat);
            Assert.True(ins2.IsDuplication);
            var ins2NotSimilarEnough = CheckForIndel(finalIndels, 305, "A", "ATCAGTA", 20);

            Assert.True(ins2NotSimilarEnough.IsRepeat);
            Assert.False(ins2NotSimilarEnough.IsDuplication);
            var longerInsertion = CheckForIndel(finalIndels, 302, "A", "ATCATCATCATCA", 20);

            Assert.True(longerInsertion.IsRepeat);
            Assert.True(longerInsertion.IsDuplication);
            Assert.True(longerInsertion.HardToCall);

            // Should handle scenario of stutter
            //         012345678901234567890
            // ...CCCCCCGGGGGTTTTTAAAAATATATA
            //              *ins TGG
            //          *ins GGG
            // ...CCCCCCGGGGGTGGTTTTTAAAAATATATA
            // ...CCCCCCGGGGGGGGTTTTTAAAAATATATA
            var homopolymerIns = new PreIndel(new CandidateAllele("chr1", 300, "N", "NGGG", AlleleCategory.Insertion));

            homopolymerIns.Score = 100;
            var homopolymerInsWithStutter = new PreIndel(new CandidateAllele("chr1", 305, "N", "NTGG", AlleleCategory.Insertion));

            homopolymerInsWithStutter.Score = 10;
            preIndels = new List <PreIndel>()
            {
                homopolymerIns, homopolymerInsWithStutter
            };

            indelSource  = new HashableIndelSource();
            chrReference = new ChrReference()
            {
                FastaPath = "abc",
                Name      = "chr1",
                Sequence  = new string('C', 300) + "GGGGGTTTTTAAAAATATATA" + new string('G', 300)
            };
            finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
            // Only one of the two stutter-related insertions is expected to remain.
            Assert.Equal(1, finalIndels.Count);

            // Two long insertions modeled on the pair below (test positions are 780 and 854); both are expected to be kept.
            //chr1: 125080780 N > NTTTGATTCCATTCGATGATCACTACATTCAGTTCCATTCAATGATGATTCCAACAGATTCCATTTGGTGACTCCATTCGATTCTATTCATTGATGATTCCA
            //chr1: 125080854 N > NATTCGATTCTATTCATTGATGATTCCATTTGATTCCATTCGATGATGACTGCCTTCAGTTCCATTCGGTGATGATTCCAACAGATTCCATTTGGTGACTCA
            var realLongIns1 = new PreIndel(new CandidateAllele("chr1", 780, "N", "NTTTGATTCCATTCGATGATCACTACATTCAGTTCCATTCAATGATGATTCCAACAGATTCCATTTGGTGACTCCATTCGATTCTATTCATTGATGATTCCA", AlleleCategory.Insertion));

            realLongIns1.Score = 100;
            var realLongIns2 = new PreIndel(new CandidateAllele("chr1", 854, "N", "NATTCGATTCTATTCATTGATGATTCCATTTGATTCCATTCGATGATGACTGCCTTCAGTTCCATTCGGTGATGATTCCAACAGATTCCATTTGGTGACTCA", AlleleCategory.Insertion));

            realLongIns2.Score = 20;
            preIndels          = new List <PreIndel>()
            {
                realLongIns1, realLongIns2
            };

            indelSource  = new HashableIndelSource();
            chrReference = new ChrReference()
            {
                FastaPath = "abc",
                Name      = "chr1",
                Sequence  = new string('A', 3000)
            };

            finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
            Assert.Equal(2, finalIndels.Count);

            // Long deletion - should adjust snippet width to accommodate
            var longDel1 = new PreIndel(new CandidateAllele("chr1", 100, new string('N', 200), "N", AlleleCategory.Deletion));

            longDel1.Score = 100;
            var longDel2 = new PreIndel(new CandidateAllele("chr1", 150, new string('N', 200), "N", AlleleCategory.Deletion));

            longDel2.Score = 20;
            preIndels      = new List <PreIndel>()
            {
                longDel1, longDel2
            };

            indelSource  = new HashableIndelSource();
            chrReference = new ChrReference()
            {
                FastaPath = "abc",
                Name      = "chr1",
                Sequence  = new string('A', 100) + new string('T', 100) + new string('C', 1000)
            };

            finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
            Assert.Equal(2, finalIndels.Count);

            // Same two long deletions against a reference whose T run is 500 long instead of 100:
            // only one of them is expected to be kept.
            chrReference = new ChrReference()
            {
                FastaPath = "abc",
                Name      = "chr1",
                Sequence  = new string('A', 100) + new string('T', 500) + new string('C', 1000)
            };

            finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
            Assert.Equal(1, finalIndels.Count);


            //         012345678901234567890
            // ...CCCCCCGGGGGGGGAGGTTTTTAAAAATATATA
            // ...CCCCCC---GGGGGAGGTTTTTAAAAATATATA // del 1
            // ...CCCCCCGGGGGGGG---TTTTTAAAAATATATA // del 2
            // ...CCCCCCGGGGGGGGA---TTTTAAAAATATATA // del 3
            // ...CCCCCCGGGGGAGGTTTTTAAAAATATATA // effective 1
            // ...CCCCCCGGGGGGGGTTTTTAAAAATATATA // effective 2
            // ...CCCCCCGGGGGGGGATTTTAAAAATATATA // effective 3 - edit distance of 2 from eff1, 1 from eff2

            var homopolymerDel = new PreIndel(new CandidateAllele("chr1", 300, "NNNN", "N", AlleleCategory.Deletion));

            homopolymerDel.Score = 100;
            var homopolymerDelMuchWeakerOneMismatch = new PreIndel(new CandidateAllele("chr1", 308, "NNNN", "N", AlleleCategory.Deletion));

            homopolymerDelMuchWeakerOneMismatch.Score = 10;
            var homopolymerDelMuchWeakerTwoMismatch = new PreIndel(new CandidateAllele("chr1", 309, "NNNN", "N", AlleleCategory.Deletion));

            homopolymerDelMuchWeakerTwoMismatch.Score = 10;
            preIndels = new List <PreIndel>()
            {
                homopolymerDel, homopolymerDelMuchWeakerOneMismatch, homopolymerDelMuchWeakerTwoMismatch
            };

            indelSource  = new HashableIndelSource();
            chrReference = new ChrReference()
            {
                FastaPath = "abc",
                Name      = "chr1",
                Sequence  = new string('C', 300) + "GGGGGGGGAGGTTTTTAAAAATATATA" + new string('G', 300)
            };
            finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
            // With the strong deletion at 300 and two weak ones, the deletion at 308 is dropped and the one at 309 stays.
            Assert.Equal(2, finalIndels.Count);
            CheckForIndel(finalIndels, 300, "CGGG", "C", 100);
            EnsureIndelNotPresent(finalIndels, 308, "GAGG", "G");
            CheckForIndel(finalIndels, 309, "AGGT", "A", 10);

            // Same deletions but flip the scores -- The deletions have very similar consequences, but there is not a clear stronger deletion, which makes us less confident that these are mismatching versions of the same deletion. Keep all.
            // (The same PreIndel instances are still in preIndels, so changing Score here affects the next call.)
            homopolymerDelMuchWeakerTwoMismatch.Score = 60;
            homopolymerDelMuchWeakerOneMismatch.Score = 60;
            finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
            Assert.Equal(3, finalIndels.Count);
            CheckForIndel(finalIndels, 300, "CGGG", "C", 100);
            CheckForIndel(finalIndels, 308, "GAGG", "G", 60);
            CheckForIndel(finalIndels, 309, "AGGT", "A", 60);

            // Same deletions but flip the scores -- The strongest deletion is edit distance of 1 away from both of the others
            homopolymerDel.Score = 40;
            homopolymerDelMuchWeakerTwoMismatch.Score = 10;
            homopolymerDelMuchWeakerOneMismatch.Score = 100;
            finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
            Assert.Equal(1, finalIndels.Count);
            EnsureIndelNotPresent(finalIndels, 300, "CGGG", "C");
            CheckForIndel(finalIndels, 308, "GAGG", "G", 100);
            EnsureIndelNotPresent(finalIndels, 309, "AGGT", "A");
        }
Example #21
0
        public static HashableIndel GetHashableIndel(GenomeSnippet snippet, PreIndel preIndel, int contextStart, bool debug)
        {
            var actualReferenceAllele = ActualReferenceAllele(snippet, preIndel, contextStart);

            var actualAltAllele = ActualAltAllele(preIndel, actualReferenceAllele);

            var indelType = actualReferenceAllele.Length > actualAltAllele.Length
                ? AlleleCategory.Deletion
                : AlleleCategory.Insertion;

            string repeatUnit;
            var    variantBases = indelType == AlleleCategory.Insertion
                ? actualAltAllele.Substring(1)
                : actualReferenceAllele.Substring(1);

            const int maxRepeatUnitLength = 3;
            var       isRepeat            = StitchingLogic.OverlapEvaluator.IsRepeat(variantBases, maxRepeatUnitLength
                                                                                     , out repeatUnit);

            var isDuplication = Helper.IsDuplication(snippet.Sequence, preIndel.ReferencePosition, isRepeat, repeatUnit, actualAltAllele);

            var numRepeatsLeft = 0;
            var numRepeats     = 0;

            if (indelType == AlleleCategory.Insertion && preIndel.Length > 3)
            {
                var currentPos = preIndel.ReferencePosition - snippet.StartPosition;
                while (true)
                {
                    // TODO < or <=
                    if (snippet.Sequence.Length <= currentPos + preIndel.Length)
                    {
                        break;
                    }
                    // Need to go both directions because we're allowing inexact.
                    var referenceAfterInsertion = snippet.Sequence.Substring(currentPos, preIndel.Length);

                    bool stillMatch = false;
                    if (referenceAfterInsertion != variantBases)
                    {
                        var numMismatches = Helper.GetHammingNumMismatches(referenceAfterInsertion, variantBases);
                        if (numMismatches <= 1)
                        {
                            stillMatch = true;
                        }
                    }
                    else
                    {
                        stillMatch = true;
                    }

                    if (stillMatch)
                    {
                        numRepeats++;
                        currentPos += preIndel.Length;
                    }
                    else
                    {
                        break;
                    }
                }

                var currentPosLeft = preIndel.ReferencePosition - preIndel.Length - snippet.StartPosition;
                while (true)
                {
                    // Need to go both directions because we're allowing inexact.
                    if (currentPosLeft < 0)
                    {
                        break;
                    }
                    var referenceAfterInsertion = snippet.Sequence.Substring(currentPosLeft, preIndel.Length);

                    bool stillMatch = false;
                    if (referenceAfterInsertion != variantBases)
                    {
                        var numMismatches = Helper.GetHammingNumMismatches(referenceAfterInsertion, variantBases);
                        if (numMismatches <= 1)
                        {
                            stillMatch = true;
                        }
                    }
                    else
                    {
                        stillMatch = true;
                    }

                    if (stillMatch)
                    {
                        numRepeatsLeft++;
                        currentPosLeft -= preIndel.Length;
                    }
                    else
                    {
                        break;
                    }
                }
            }

            string newRepeatUnit;
            var    repeats = Helper.ComputeRMxNLengthForIndel(preIndel.ReferencePosition - snippet.StartPosition, variantBases, snippet.Sequence, 6, out newRepeatUnit);

            if (repeats >= 6) // TODO make this configurable?
            {
                isRepeat   = true;
                repeatUnit = newRepeatUnit;
            }

            string otherIndel = "";

            if (preIndel.InMulti)
            {
                var otherAsPre = GetIndelKey(preIndel.OtherIndel);
                otherAsPre.ReferenceAllele = ActualReferenceAllele(snippet, otherAsPre, contextStart);
                otherAsPre.AlternateAllele = ActualAltAllele(otherAsPre, otherAsPre.ReferenceAllele);
                otherIndel = Helper.CandidateToString(otherAsPre);
            }

            var length = Math.Abs(actualReferenceAllele.Length - actualAltAllele.Length);
            var isUntrustworthyInRepeatRegion = false;

            if (length == 1)
            {
                isUntrustworthyInRepeatRegion = Helper.IsInHomopolymerStretch(snippet.Sequence, preIndel.ReferencePosition);
            }

            // TODO ADD TESTS!!
            var refPrefix = ReferencePrefix(snippet, preIndel, contextStart);
            var refSuffix = ReferenceSuffix(snippet, preIndel, contextStart);

            //Read-end repeats of this repeat unit that are this length or smaller should not be trusted as insertion evidence, but larger ones can
            var numBasesBeforeInsertionUnique = 0;

            if (indelType == AlleleCategory.Insertion)
            {
                var sequenceToCheckFor = isRepeat ? repeatUnit : actualAltAllele;

                for (int i = 0; i < refSuffix.Length - sequenceToCheckFor.Length; i += sequenceToCheckFor.Length)
                {
                    if (refSuffix.Substring(i, sequenceToCheckFor.Length) == sequenceToCheckFor)
                    {
                        numBasesBeforeInsertionUnique++;
                    }
                    else
                    {
                        break;
                    }
                }
            }


            var indelIdentifier = new HashableIndel
            {
                Chromosome        = preIndel.Chromosome,
                ReferencePosition = preIndel.ReferencePosition,
                ReferenceAllele   = actualReferenceAllele,
                AlternateAllele   = actualAltAllele,
                Type          = indelType,
                Length        = length,
                Score         = preIndel.Score,
                InMulti       = preIndel.InMulti,
                OtherIndel    = otherIndel,
                IsRepeat      = isRepeat,
                RepeatUnit    = repeatUnit,
                IsDuplication = isDuplication,
                IsUntrustworthyInRepeatRegion = isUntrustworthyInRepeatRegion,
                RefPrefix = refPrefix,
                RefSuffix = refSuffix,
                NumBasesInReferenceSuffixBeforeUnique = numBasesBeforeInsertionUnique,
                NumRepeatsNearby   = repeats,
                NumApproxDupsLeft  = numRepeatsLeft,
                NumApproxDupsRight = numRepeats
            };

            indelIdentifier = Helper.CopyHashable(indelIdentifier, otherIndel);

            if (isDuplication && debug)
            {
                Console.WriteLine($"Found a duplication: {indelIdentifier.StringRepresentation}");
            }

            if (isRepeat && debug)
            {
                Console.WriteLine($"Found a repeat: {indelIdentifier.StringRepresentation}, {repeatUnit}");
            }

            return(indelIdentifier);
        }
Example #22
0
        /// <summary>
        /// Adds to <paramref name="blacklistedIndels"/> the nearby indels that look like noise around
        /// <paramref name="indel"/>: same type, no longer than it, and scoring less than half of it.
        /// Nothing is blacklisted unless <paramref name="indel"/> holds more than 33% of the combined
        /// score of itself and its neighbours.
        /// </summary>
        /// <param name="indelsForChromRaw">Candidate indels to search for neighbours of <paramref name="indel"/>.</param>
        /// <param name="indel">The indel whose neighbourhood is being pruned.</param>
        /// <param name="blacklistedIndels">Output list; pruned neighbours are appended to it.</param>
        /// <param name="binSize">
        /// Maximum reference-position distance for a neighbour. For deletions the window is widened
        /// by the deletion length.
        /// </param>
        private static void PruneOverlappingIndels(List <PreIndel> indelsForChromRaw, PreIndel indel, List <PreIndel> blacklistedIndels, int binSize)
        {
            // Prune out stuff that would be overlapping or within buffer.
            // The window does not depend on the neighbour, so compute it once.
            var window = binSize + (indel.Type == AlleleCategory.Deletion ? indel.Length : 0);

            var nearbyIndels = indelsForChromRaw
                               .Where(x => !IndelsMatch(indel, x) &&
                                           Math.Abs(x.ReferencePosition - indel.ReferencePosition) <= window)
                               .ToList();

            if (nearbyIndels.Count == 0)
            {
                // No neighbours means nothing can be blacklisted.
                return;
            }

            // TODO consider making the threshold less for larger binsizes?
            // Widen every score to long *before* adding: summing in the (narrower) Score type and
            // widening only the result would defeat the purpose of the long accumulator.
            // No sort is needed for a sum.
            long sumOfScores = nearbyIndels.Sum(x => (long)x.Score) + indel.Score;

            if (!((indel.Score / (float)sumOfScores) > 0.33))
            {
                // This indel is not dominant enough in its neighbourhood to prune others.
                return;
            }

            var okQualityThreshold = indel.Score * 0.3;
            var noiseThreshold     = indel.Score * 0.5;

            var indelsToBlacklist = nearbyIndels.Where(x =>
            {
                // Same allele: never blacklist.
                var sameAllele = x.ReferencePosition == indel.ReferencePosition &&
                                 x.ReferenceAllele == indel.ReferenceAllele &&
                                 x.AlternateAllele == indel.AlternateAllele;
                if (sameAllele)
                {
                    return false;
                }

                // Indel contained in multi other, and other is at least ok-ish quality.
                // The InMulti checks come first so MultiIndelContainsIndel only sees a multi-indel as its first argument.
                if (!indel.InMulti && x.InMulti && Helper.MultiIndelContainsIndel(x, indel) &&
                    x.Score >= okQualityThreshold)
                {
                    return false;
                }

                // Other contained in multi indel, and other is at least ok-ish quality.
                if (indel.InMulti && !x.InMulti && Helper.MultiIndelContainsIndel(indel, x) &&
                    x.Score >= okQualityThreshold)
                {
                    return false;
                }

                // (Much) lower scoring, shorter indel of the same type is likely to just be noise around this.
                // Note: this could be an issue with concurrent somatic and germline variants, ie if these two
                // observed indels do _not_ represent the same biological event.
                return x.Score < noiseThreshold && x.Length <= indel.Length && x.Type == indel.Type;
            }).ToList();

            blacklistedIndels.AddRange(indelsToBlacklist);
        }