/// <summary>
/// Given a pairwise alignment, shift the gaps so that all indels start as
/// early (as far left) as possible.  For example:
///
/// <code>
/// TTTTAAAATTTT  -> Converts to ->  TTTTAAAATTTT
/// TTTTAA--TTTT                     TTTT--AATTTT
/// </code>
///
/// The first sequence of the alignment is treated as the reference and the second
/// sequence as the query.  A new pairwise sequence alignment holding the left-aligned
/// sequences is returned together with the list of called variants.
/// </summary>
/// <param name="aln">The alignment. It must contain exactly one pairwise aligned sequence, and
/// the second sequence of that pair must be of type QualitativeSequence or Sequence.</param>
/// <param name="callVariants">If true, variants are called; otherwise the second item of the returned tuple is null.</param>
/// <returns>A tuple of the left-aligned alignment and the called variants (null when <paramref name="callVariants"/> is false).</returns>
/// <exception cref="NullReferenceException">Thrown if <paramref name="aln"/> or either aligned sequence is null.</exception>
/// <exception cref="ArgumentException">Thrown if the alignment does not hold exactly one aligned pair,
/// or if the query is neither a Sequence nor a QualitativeSequence.</exception>
public static Tuple<IPairwiseSequenceAlignment, List<Variant>> LeftAlignIndelsAndCallVariants(IPairwiseSequenceAlignment aln, bool callVariants = true)
{
    // NOTE(review): ArgumentNullException would be the conventional type for the null checks
    // below; NullReferenceException is kept so callers that already catch it are unaffected.
    if (aln == null) {
        throw new NullReferenceException ("aln");
    }
    if (aln.PairwiseAlignedSequences == null || aln.PairwiseAlignedSequences.Count != 1) {
        throw new ArgumentException ("The pairwise aligned sequence should only have one alignment");
    }
    var frstAln = aln.PairwiseAlignedSequences.First ();
    var seq1 = frstAln.FirstSequence;
    var seq2 = frstAln.SecondSequence;
    if (seq1 == null) {
        throw new NullReferenceException ("seq1");
    } else if (seq2 == null) {
        throw new NullReferenceException ("seq2");
    }

    //TODO: Might implement an ambiguity check later.
    #if FALSE
    if (seq1.Alphabet.HasAmbiguity || seq2.Alphabet.HasAmbiguity) {
        throw new ArgumentException ("Cannot left align sequences with ambiguous symbols.");
    }
    #endif

    // Note we have to copy unless we can guarantee the array will not be mutated.
    byte[] refseq = seq1.ToArray ();
    ISequence newQuery;
    List<Variant> variants = null;

    var qualitativeQuery = seq2 as QualitativeSequence;
    var plainQuery = seq2 as Sequence;
    if (qualitativeQuery != null) {
        // Query carries quality values: pair every base with its quality score.
        var query = Enumerable.Zip (qualitativeQuery,
                                    qualitativeQuery.GetQualityScores (),
                                    (bp, qv) => new BPandQV (bp, (byte)qv, false)).ToArray ();
        AlignmentUtils.LeftAlignIndels (refseq, query);
        AlignmentUtils.VerifyNoGapsOnEnds (refseq, query);
        if (callVariants) {
            variants = VariantCaller.CallVariants (refseq, query, seq2.IsMarkedAsReverseComplement ());
        }
        var newQueryQS = new QualitativeSequence (qualitativeQuery.Alphabet,
                                                  qualitativeQuery.FormatType,
                                                  query.Select (z => z.BP).ToArray (),
                                                  query.Select (p => p.QV).ToArray (),
                                                  false);
        newQueryQS.Metadata = seq2.Metadata;
        newQuery = newQueryQS;
    } else if (plainQuery != null) {
        // Query has no quality values: use 0 as a placeholder for every base.
        var query = plainQuery.Select (v => new BPandQV (v, 0, false)).ToArray ();
        AlignmentUtils.LeftAlignIndels (refseq, query);
        AlignmentUtils.VerifyNoGapsOnEnds (refseq, query);
        // ISequence does not have a settable Metadata, so build the concrete type first.
        var newQueryS = new Sequence (plainQuery.Alphabet, query.Select (z => z.BP).ToArray (), false);
        newQueryS.Metadata = seq2.Metadata;
        if (callVariants) {
            variants = VariantCaller.CallVariants (refseq, query, seq2.IsMarkedAsReverseComplement ());
        }
        newQuery = newQueryS;
    } else {
        throw new ArgumentException ("Can only left align indels if the query sequence is of type Sequence or QualitativeSequence.");
    }

    // variants is null when callVariants is false; without the null check the loop below
    // would throw instead of returning the left-aligned alignment.
    if (variants != null && aln.FirstSequence != null && aln.FirstSequence.ID != null) {
        foreach (var v in variants) {
            v.RefName = aln.FirstSequence.ID;
        }
    }

    var newRef = new Sequence (seq1.Alphabet, refseq, false);
    newRef.ID = seq1.ID;
    newRef.Metadata = seq1.Metadata;
    newQuery.ID = seq2.ID;

    // The new alignment keeps the original (unaligned) sequences as its first/second
    // sequence and carries the left-aligned pair as its single aligned sequence.
    var newaln = new PairwiseSequenceAlignment (aln.FirstSequence, aln.SecondSequence);
    var pas = new PairwiseAlignedSequence ();
    pas.FirstSequence = newRef;
    pas.SecondSequence = newQuery;
    newaln.Add (pas);
    return new Tuple<IPairwiseSequenceAlignment, List<Variant>> (newaln, variants);
}
/// <summary>
/// Checks that <c>VariantCaller.LeftAlignIndelsAndCallVariants</c> throws a
/// <see cref="FormatException"/> for each of two alignments: one whose query ends
/// in a gap, and one whose query has a gap near its start.
/// </summary>
public static void TestExceptionThrownForUnclippedAlignment()
{
    // Each row is { reference, query }.
    // NOTE(review): the second case presumably fails because left alignment moves the
    // gap to the very start of the query - confirm against AlignmentUtils.
    string[][] alignmentCases =
    {
        new[] { "ACAATATA", "ACAATAT-" },
        new[] { "AAACAATATA", "AA-CAATATA" }
    };

    foreach (string[] alignmentCase in alignmentCases)
    {
        var reference = new Sequence(DnaAlphabet.Instance, alignmentCase[0]);
        var query = new Sequence(DnaAlphabet.Instance, alignmentCase[1]);

        var alignment = new PairwiseSequenceAlignment(reference, query);
        var alignedPair = new PairwiseAlignedSequence
        {
            FirstSequence = reference,
            SecondSequence = query
        };
        alignment.Add(alignedPair);

        Assert.Throws<FormatException>(
            () => VariantCaller.LeftAlignIndelsAndCallVariants(alignment, true));
    }
}
/// <summary>
/// Left-aligns an alignment containing two deletions and one insertion, then checks
/// both the shifted sequences and the three indel variants that are called from it.
/// </summary>
public static void TestLeftAlignmentStep()
{
    // Build the input alignment (reference first, query second).
    var reference = new Sequence(DnaAlphabet.Instance, "ACAATAAAAGCGCGCGCGCGTTACGTATAT--ATGGATAT");
    var query = new Sequence(DnaAlphabet.Instance, "ACAATAA-AGC--GCGC--GTTACGTATATATATGGATAT");
    var alignment = new PairwiseSequenceAlignment(reference, query);
    var inputPair = new PairwiseAlignedSequence
    {
        FirstSequence = reference,
        SecondSequence = query
    };
    alignment.Add(inputPair);

    var result = VariantCaller.LeftAlignIndelsAndCallVariants(alignment, true);

    // Check the left alignment
    alignment = result.Item1 as PairwiseSequenceAlignment;
    string leftAlignedRef = alignment.PairwiseAlignedSequences[0].FirstSequence.ConvertToString();
    string leftAlignedQuery = alignment.PairwiseAlignedSequences[0].SecondSequence.ConvertToString();
    string expectedRef = "ACAATAAAAGCGCGCGCGCGTTACG--TATATATGGATAT";
    string expectedQuery = "ACAAT-AAA----GCGCGCGTTACGTATATATATGGATAT";
    Assert.AreEqual(expectedRef, leftAlignedRef);
    Assert.AreEqual(expectedQuery, leftAlignedQuery);

    // And it's hard, so we might as well check the variants
    List<Variant> calledVariants = result.Item2;
    Assert.AreEqual(3, calledVariants.Count);

    // Expected properties of each variant, indexed in the order the variants are returned.
    string[] expectedBases = { "A", "GCGC", "TA" };
    char[] expectedHpBases = { 'A', 'G', 'T' };
    bool[] expectedInHp = { true, false, false };
    int[] expectedLengths = { 1, 4, 2 };
    int[] expectedStarts = { 4, 8, 24 };
    IndelType[] expectedTypes = { IndelType.Deletion, IndelType.Deletion, IndelType.Insertion };

    for (int idx = 0; idx < 3; idx++)
    {
        Variant current = calledVariants[idx];
        Assert.AreEqual(VariantType.INDEL, current.Type);

        var indel = current as IndelVariant;
        Assert.AreEqual(expectedHpBases[idx], indel.HomopolymerBase);
        Assert.AreEqual(expectedStarts[idx], indel.StartPosition);
        Assert.AreEqual(expectedLengths[idx], indel.Length);
        Assert.AreEqual(expectedBases[idx], indel.InsertedOrDeletedBases);
        Assert.AreEqual(expectedInHp[idx], indel.InHomopolymer);
        Assert.AreEqual(expectedTypes[idx], indel.InsertionOrDeletion);
    }
}
/// <summary>
/// Builds a pairwise sequence alignment from the two sequences configured under
/// <paramref name="nodeName"/> and validates either its properties or the sequences
/// read back from it.
/// </summary>
/// <param name="nodeName">Name of the XML node holding the sequences, alphabet name and expected values.</param>
/// <param name="validateProperty">
/// If true, the read-only flag, alignment count and score are validated against the XML values;
/// otherwise the aligned sequences read back are compared with the original inputs.
/// </param>
private void ValidateGeneralSequenceAlignment(string nodeName, bool validateProperty)
{
    // Read the xml file for getting both the files for aligning.
    string origSequence1 = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.SequenceNode1);
    string origSequence2 = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.SequenceNode2);
    IAlphabet alphabet = Utility.GetAlphabet(this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlphabetNameNode));

    ApplicationLog.WriteLine(string.Format("SequenceAlignment P1 : First sequence used is '{0}'.", origSequence1));
    ApplicationLog.WriteLine(string.Format("SequenceAlignment P1 : Second sequence used is '{0}'.", origSequence2));

    // Create two sequences
    ISequence aInput = new Sequence(alphabet, origSequence1);
    ISequence bInput = new Sequence(alphabet, origSequence2);

    // Wrap the pair in an alignment and store that alignment in a list.
    IList<IPairwiseSequenceAlignment> sequenceAlignmentObj = new List<IPairwiseSequenceAlignment>();
    var alignSeq = new PairwiseAlignedSequence {FirstSequence = aInput, SecondSequence = bInput};
    IPairwiseSequenceAlignment seqAlignObj = new PairwiseSequenceAlignment();
    seqAlignObj.Add(alignSeq);
    sequenceAlignmentObj.Add(seqAlignObj);

    // Read the output back and validate the same.
    IList<PairwiseAlignedSequence> newAlignedSequences = sequenceAlignmentObj[0].PairwiseAlignedSequences;

    // NOTE(review): these two lines log the original inputs, not the sequences read back.
    ApplicationLog.WriteLine(string.Format("SequenceAlignment P1 : First sequence read is '{0}'.", origSequence1));
    ApplicationLog.WriteLine(string.Format("SequenceAlignment P1 : Second sequence read is '{0}'.", origSequence2));

    if (validateProperty)
    {
        string score = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.MatchScoreNode);
        string seqCount = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.SequenceCountNode);

        // Expected value goes first so that assertion failure messages read correctly.
        Assert.IsFalse(sequenceAlignmentObj.IsReadOnly);
        Assert.AreEqual(seqCount, sequenceAlignmentObj.Count.ToString((IFormatProvider) null));
        Assert.AreEqual(
            score,
            sequenceAlignmentObj[0].PairwiseAlignedSequences[0].Score.ToString((IFormatProvider) null));

        ApplicationLog.WriteLine("SequenceAlignment P1 : Successfully validated the IsRead Property");
        ApplicationLog.WriteLine("SequenceAlignment P1 : Successfully validated the Count Property");
        ApplicationLog.WriteLine("SequenceAlignment P1 : Successfully validated the Sequences Property");
    }
    else
    {
        Assert.AreEqual(origSequence1,
            new String(newAlignedSequences[0].FirstSequence.Select(a => (char) a).ToArray()));
        Assert.AreEqual(origSequence2,
            new String(newAlignedSequences[0].SecondSequence.Select(a => (char) a).ToArray()));
    }
}
/// <summary>
/// Builds a pairwise sequence alignment from the DNA sequences configured under
/// <c>Constants.AlignDnaAlgorithmNodeName</c> and validates its Count, FirstSequence,
/// SecondSequence, IsReadOnly, Documentation and PairwiseAlignedSequences properties.
/// </summary>
public void ValidateSequenceAlignmentProperties()
{
    string nodeName = Constants.AlignDnaAlgorithmNodeName;
    var xml = this.utilityObj.xmlUtil;

    // Pull the input sequences, alphabet and expected count from the xml configuration.
    string origSequence1 = xml.GetTextValue(nodeName, Constants.SequenceNode1);
    string origSequence2 = xml.GetTextValue(nodeName, Constants.SequenceNode2);
    IAlphabet alphabet = Utility.GetAlphabet(xml.GetTextValue(nodeName, Constants.AlphabetNameNode));
    string seqCount = xml.GetTextValue(nodeName, Constants.SequenceCountNode);

    // Create the two input sequences.
    ISequence aInput = new Sequence(alphabet, origSequence1);
    ISequence bInput = new Sequence(alphabet, origSequence2);

    // Wrap them in an aligned pair, add the pair to an alignment, and the alignment to a list.
    var alignSeq = new PairwiseAlignedSequence
    {
        FirstSequence = aInput,
        SecondSequence = bInput
    };
    IPairwiseSequenceAlignment seqAlignObj = new PairwiseSequenceAlignment(aInput, bInput);
    seqAlignObj.Add(alignSeq);

    IList<IPairwiseSequenceAlignment> sequenceAlignmentObj = new List<IPairwiseSequenceAlignment>();
    sequenceAlignmentObj.Add(seqAlignObj);

    // Validate all properties of sequence alignment class.
    string actualCount = seqAlignObj.Count.ToString((IFormatProvider) null);
    Assert.AreEqual(seqCount, actualCount);

    string actualFirst = new string(seqAlignObj.FirstSequence.Select(a => (char) a).ToArray());
    Assert.AreEqual(origSequence1, actualFirst);

    string actualSecond = new string(seqAlignObj.SecondSequence.Select(a => (char) a).ToArray());
    Assert.AreEqual(origSequence2, actualSecond);

    Assert.IsFalse(seqAlignObj.IsReadOnly);
    Assert.IsNull(seqAlignObj.Documentation);

    string actualPairCount = seqAlignObj.PairwiseAlignedSequences.Count.ToString((IFormatProvider) null);
    Assert.AreEqual(seqCount, actualPairCount);

    ApplicationLog.WriteLine("SequenceAlignment P1 : Successfully validated the IsRead Property");
    ApplicationLog.WriteLine("SequenceAlignment P1 : Successfully validated the Count Property");
    ApplicationLog.WriteLine("SequenceAlignment P1 : Successfully validated the Sequences Property");
}
/// <summary>
/// Adds a pair of sequences (configured under <c>Constants.AlignAlgorithmNodeName</c>)
/// to a pairwise sequence alignment and validates that the same sequences are read back.
/// </summary>
public void SequenceAlignmentAddSequence()
{
    // Read the xml file for getting both the files for aligning.
    string origSequence1 = this.utilityObj.xmlUtil.GetTextValue(Constants.AlignAlgorithmNodeName, Constants.SequenceNode1);
    string origSequence2 = this.utilityObj.xmlUtil.GetTextValue(Constants.AlignAlgorithmNodeName, Constants.SequenceNode2);
    IAlphabet alphabet = Utility.GetAlphabet(this.utilityObj.xmlUtil.GetTextValue(Constants.AlignAlgorithmNodeName, Constants.AlphabetNameNode));

    ApplicationLog.WriteLine(string.Format(null, "SequenceAlignment BVT : First sequence used is '{0}'.", origSequence1));
    ApplicationLog.WriteLine(string.Format(null,"SequenceAlignment BVT : Second sequence used is '{0}'.", origSequence2));

    // Create two sequences
    ISequence aInput = new Sequence(alphabet, origSequence1);
    ISequence bInput = new Sequence(alphabet, origSequence2);

    // Wrap the pair in an alignment and store that alignment in a list.
    IList<IPairwiseSequenceAlignment> sequenceAlignmentObj = new List<IPairwiseSequenceAlignment>();
    var alignSeq = new PairwiseAlignedSequence {FirstSequence = aInput, SecondSequence = bInput};
    IPairwiseSequenceAlignment seqAlignObj = new PairwiseSequenceAlignment();
    seqAlignObj.Add(alignSeq);
    sequenceAlignmentObj.Add(seqAlignObj);

    // Read the output back and validate the same.
    IList<PairwiseAlignedSequence> newAlignedSequences = sequenceAlignmentObj[0].PairwiseAlignedSequences;

    // NOTE(review): these two lines log the original inputs, not the sequences read back.
    ApplicationLog.WriteLine(string.Format(null, "SequenceAlignment BVT : First sequence read is '{0}'.", origSequence1));
    ApplicationLog.WriteLine(string.Format(null, "SequenceAlignment BVT : Second sequence read is '{0}'.", origSequence2));

    // Expected value goes first so that assertion failure messages read correctly.
    Assert.AreEqual(origSequence1, newAlignedSequences[0].FirstSequence.ConvertToString());
    Assert.AreEqual(origSequence2, newAlignedSequences[0].SecondSequence.ConvertToString());
}