/// <summary>
/// Performs Stage 1, 2, and 3 as described in class description.
/// Stage 1 builds a guide tree from a k-mer distance matrix and aligns progressively.
/// Stage 2 (only when UseStageB is set) rebuilds the guide tree from the distance matrix
/// of the current alignment and aligns progressively again.
/// Stage 3 refines the alignment by cutting guide-tree edges and re-aligning the two
/// resulting profiles. Stages 2 and 3 are skipped when FasterVersion is set.
/// Whenever a stage scores higher than the stored score, _alignedSequences and
/// _alignmentScore are updated with that stage's result.
/// </summary>
/// <param name="sequences">input unaligned sequences</param>
/// <returns>
/// Always an empty list; the return value only exists so that pairwise and
/// multiple sequence aligners share the same output type.
/// </returns>
/// <exception cref="ArgumentNullException">sequences is null</exception>
/// <exception cref="ArgumentException">the configured profile aligner name is not supported</exception>
public IList<MBF.Algorithms.Alignment.ISequenceAlignment> Align(IList<ISequence> sequences)
{
    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    // Initializations: make the consensus resolver agree with the input alphabet.
    if (sequences.Count > 0)
    {
        if (ConsensusResolver == null)
        {
            ConsensusResolver = new SimpleConsensusResolver(sequences[0].Alphabet);
        }
        else
        {
            ConsensusResolver.SequenceAlphabet = sequences[0].Alphabet;
        }
    }

    // Get ProfileAligner ready
    IProfileAligner profileAligner = CreateProfileAligner();

    // NOTE(review): the score fields are not reset here, so on a repeated call a
    // lower-scoring run would leave _alignedSequences empty - confirm whether
    // calling Align more than once per instance is supported.
    _alignedSequences = new List<ISequence>(sequences.Count);
    float currentScore = 0;

    // STAGE 1
    Performance.Snapshot("Stage 1");

    // Generate DistanceMatrix
    KmerDistanceMatrixGenerator kmerDistanceMatrixGenerator =
        new KmerDistanceMatrixGenerator(sequences, _kmerLength, _moleculeType, _distanceFunctionName);

    // Hierarchical clustering
    IHierarchicalClustering hierarchicalClustering = new HierarchicalClusteringParallel(
        kmerDistanceMatrixGenerator.DistanceMatrix, _hierarchicalClusteringMethodName);

    // Generate Guide Tree
    BinaryGuideTree binaryGuideTree = new BinaryGuideTree(hierarchicalClustering);

    // Progressive Alignment
    IProgressiveAligner progressiveAlignerA = new ProgressiveAligner(profileAligner);
    progressiveAlignerA.Align(sequences, binaryGuideTree);

    currentScore = MsaUtils.MultipleAlignmentScoreFunction(
        progressiveAlignerA.AlignedSequences, SimilarityMatrix, GapOpenCost, GapExtensionCost);
    if (currentScore > _alignmentScoreA)
    {
        _alignmentScoreA = currentScore;
        _alignedSequencesA = progressiveAlignerA.AlignedSequences;
    }

    if (_alignmentScoreA > _alignmentScore)
    {
        _alignmentScore = _alignmentScoreA;
        _alignedSequences = _alignedSequencesA;
    }

    if (PAMSAMMultipleSequenceAligner.FasterVersion)
    {
        // Fast path: later stages are skipped and simply mirror the Stage 1 result.
        _alignedSequencesB = _alignedSequencesA;
        _alignedSequencesC = _alignedSequencesA;
        _alignmentScoreB = _alignmentScoreA;
        _alignmentScoreC = _alignmentScoreA;
    }
    else
    {
        // Tree used by Stage 3; Stage 2 replaces it when enabled.
        BinaryGuideTree refinementTree = binaryGuideTree;
        KimuraDistanceMatrixGenerator kimuraDistanceMatrixGenerator = new KimuraDistanceMatrixGenerator();

        if (PAMSAMMultipleSequenceAligner.UseStageB)
        {
            // STAGE 2 (single pass)
            Performance.Snapshot("Stage 2");

            // Generate DistanceMatrix from Multiple Sequence Alignment
            kimuraDistanceMatrixGenerator.GenerateDistanceMatrix(_alignedSequences);

            // Hierarchical clustering
            IHierarchicalClustering hierarchicalClusteringB = new HierarchicalClusteringParallel(
                kimuraDistanceMatrixGenerator.DistanceMatrix, _hierarchicalClusteringMethodName);

            // Generate Guide Tree and compare it against the Stage 1 tree
            refinementTree = new BinaryGuideTree(hierarchicalClusteringB);
            BinaryGuideTree.CompareTwoTrees(refinementTree, binaryGuideTree);

            // Progressive Alignment
            IProgressiveAligner progressiveAlignerB = new ProgressiveAligner(profileAligner);
            progressiveAlignerB.Align(sequences, refinementTree);

            currentScore = MsaUtils.MultipleAlignmentScoreFunction(
                progressiveAlignerB.AlignedSequences, SimilarityMatrix, GapOpenCost, GapExtensionCost);
            if (currentScore > _alignmentScoreB)
            {
                _alignmentScoreB = currentScore;
                _alignedSequencesB = progressiveAlignerB.AlignedSequences;
            }

            if (_alignmentScoreB > _alignmentScore)
            {
                _alignmentScore = _alignmentScoreB;
                _alignedSequences = _alignedSequencesB;
            }
        }

        // STAGE 3
        Performance.Snapshot("Stage 3");

        // Refinement is capped at a single pass (an earlier cap of
        // sequences.Count * 2 - 2 is disabled) and skipped for exactly two sequences.
        int maxRefineMentTime = 1;
        if (sequences.Count == 2)
        {
            maxRefineMentTime = 0;
        }

        // Refinement works on a private copy of the best alignment so far.
        _alignedSequencesC = new List<ISequence>(sequences.Count);
        for (int i = 0; i < sequences.Count; ++i)
        {
            _alignedSequencesC.Add(new Sequence(_alphabet, _alignedSequences[i].ToString()));
        }

        int refinementTime = 0;
        while (refinementTime < maxRefineMentTime)
        {
            ++refinementTime;
            Performance.Snapshot("Refinement iter " + refinementTime.ToString());
            bool needRefinement = false;

            for (int edgeIndex = 0; edgeIndex < refinementTree.NumberOfEdges; ++edgeIndex)
            {
                // Cut the tree at this edge to split the sequences into two groups.
                List<int>[] leafNodeIndices = refinementTree.SeparateSequencesByCuttingTree(edgeIndex);

                List<int>[] allIndelPositions;
                IProfileAlignment[] separatedProfileAlignments = ProfileAlignment.ProfileExtraction(
                    _alignedSequencesC, leafNodeIndices[0], leafNodeIndices[1], out allIndelPositions);

                // The profile with fewer sequences is passed first; eStrings stay
                // indexed by the original group (0 or 1) either way.
                List<int>[] eStrings = new List<int>[2];
                if (separatedProfileAlignments[0].NumberOfSequences < separatedProfileAlignments[1].NumberOfSequences)
                {
                    profileAligner.Align(separatedProfileAlignments[0], separatedProfileAlignments[1]);
                    eStrings[0] = profileAligner.GenerateEString(profileAligner.AlignedA);
                    eStrings[1] = profileAligner.GenerateEString(profileAligner.AlignedB);
                }
                else
                {
                    profileAligner.Align(separatedProfileAlignments[1], separatedProfileAlignments[0]);
                    eStrings[0] = profileAligner.GenerateEString(profileAligner.AlignedB);
                    eStrings[1] = profileAligner.GenerateEString(profileAligner.AlignedA);
                }

                for (int set = 0; set < 2; ++set)
                {
                    int currentSet = set;
                    Parallel.ForEach(leafNodeIndices[currentSet], PAMSAMMultipleSequenceAligner.parallelOption, sequenceIndex =>
                    {
                        Sequence seq = new Sequence(_alphabet, "");
                        seq.IsReadOnly = false;

                        // Copy the sequence, skipping the positions listed in
                        // allIndelPositions for this group.
                        int indexAllIndel = 0;
                        for (int j = 0; j < _alignedSequencesC[sequenceIndex].Count; ++j)
                        {
                            if (indexAllIndel < allIndelPositions[currentSet].Count
                                && j == allIndelPositions[currentSet][indexAllIndel])
                            {
                                ++indexAllIndel;
                            }
                            else
                            {
                                seq.Add(_alignedSequencesC[sequenceIndex][j]);
                            }
                        }

                        // Rebuild the sequence from the new profile alignment's eString.
                        seq = profileAligner.GenerateSequenceFromEString(eStrings[currentSet], seq);
                        seq.IsReadOnly = true;
                        _alignedSequencesC[sequenceIndex] = seq;
                    });
                }

                // NOTE(review): _alignedSequencesC keeps the re-aligned sequences even
                // when the score below does not improve - confirm this is intended.
                currentScore = MsaUtils.MultipleAlignmentScoreFunction(
                    _alignedSequencesC, SimilarityMatrix, GapOpenCost, GapExtensionCost);

                if (currentScore > _alignmentScoreC)
                {
                    _alignmentScoreC = currentScore;
                    needRefinement = true;

                    // recreate the tree from the improved alignment and restart the edge scan
                    kimuraDistanceMatrixGenerator.GenerateDistanceMatrix(_alignedSequencesC);
                    IHierarchicalClustering hierarchicalClusteringC = new HierarchicalClusteringParallel(
                        kimuraDistanceMatrixGenerator.DistanceMatrix, _hierarchicalClusteringMethodName);
                    refinementTree = new BinaryGuideTree(hierarchicalClusteringC);
                    break;
                }
            }

            if (!needRefinement)
            {
                // No edge cut improved the score; stop refining.
                break;
            }
        }

        if (_alignmentScoreC > _alignmentScore)
        {
            _alignmentScore = _alignmentScoreC;
            _alignedSequences = _alignedSequencesC;
        }

        Performance.Snapshot("Stop Stage 3");
    }

    // just for the purpose of integrating PW and MSA with the same output
    IList<MBF.Algorithms.Alignment.ISequenceAlignment> results =
        new List<MBF.Algorithms.Alignment.ISequenceAlignment>();
    return results;
}

/// <summary>
/// Creates the profile aligner selected by _profileAlignerName, using the serial
/// implementation when _degreeOfParallelism is 1 and the parallel one otherwise.
/// </summary>
/// <returns>a new profile aligner configured with the current scoring parameters</returns>
/// <exception cref="ArgumentException">the profile aligner name is not supported</exception>
private IProfileAligner CreateProfileAligner()
{
    bool serial = _degreeOfParallelism == 1;

    switch (_profileAlignerName)
    {
        case ProfileAlignerNames.NeedlemanWunschProfileAligner:
            if (serial)
            {
                return new NeedlemanWunschProfileAlignerSerial(
                    SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);
            }

            return new NeedlemanWunschProfileAlignerParallel(
                SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);

        case ProfileAlignerNames.SmithWatermanProfileAligner:
            if (serial)
            {
                return new SmithWatermanProfileAlignerSerial(
                    SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);
            }

            return new SmithWatermanProfileAlignerParallel(
                SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);

        default:
            throw new ArgumentException("Invalid profile aligner name");
    }
}
/// <summary>
/// Construct an aligner: validates the arguments, stores the alignment parameters,
/// and then immediately runs Align on the input sequences.
/// </summary>
/// <param name="sequences">input sequences</param>
/// <param name="moleculeType">molecular type: Protein, DNA or RNA</param>
/// <param name="kmerLength">positive integer of kmer length</param>
/// <param name="distanceFunctionName">enum: distance function name</param>
/// <param name="hierarchicalClusteringMethodName">enum: cluster update method</param>
/// <param name="profileAlignerMethodName">enum: profile-profile aligner name</param>
/// <param name="profileFunctionName">enum: profile-profile distance function</param>
/// <param name="similarityMatrix">similarity matrix</param>
/// <param name="gapOpenPenalty">negative gapOpenPenalty</param>
/// <param name="gapExtendPenalty">negative gapExtendPenalty</param>
/// <param name="numberOfPartitions">the number of partitions in dynamic programming</param>
/// <param name="degreeOfParallelism">degree of parallelism option for parallel extension</param>
/// <exception cref="ArgumentNullException">sequences or similarityMatrix is null</exception>
/// <exception cref="ArgumentException">
/// sequences is empty; degreeOfParallelism or numberOfPartitions is not positive;
/// the sequences do not share one alphabet; or the alphabet, molecule type and
/// similarity matrix are inconsistent with each other
/// </exception>
public PAMSAMMultipleSequenceAligner(
    IList<ISequence> sequences,
    MoleculeType moleculeType,
    int kmerLength,
    DistanceFunctionTypes distanceFunctionName,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    ProfileAlignerNames profileAlignerMethodName,
    ProfileScoreFunctionNames profileFunctionName,
    SimilarityMatrix similarityMatrix,
    int gapOpenPenalty,
    int gapExtendPenalty,
    int numberOfPartitions,
    int degreeOfParallelism)
{
    Performance.Start();

    if (null == sequences)
    {
        throw new ArgumentNullException("sequences");
    }

    if (sequences.Count == 0)
    {
        throw new ArgumentException("Empty input sequences");
    }

    // Fail with a descriptive exception instead of a NullReferenceException when
    // the matrix name is read during validation below.
    if (null == similarityMatrix)
    {
        throw new ArgumentNullException("similarityMatrix");
    }

    // Set parallel extension option
    if (degreeOfParallelism <= 0)
    {
        throw new ArgumentException("Invalid parallel degree parameter");
    }

    PAMSAMMultipleSequenceAligner.parallelOption = new ParallelOptions { MaxDegreeOfParallelism = degreeOfParallelism };

    // Align reads _degreeOfParallelism to choose between the serial and parallel
    // profile aligners, so the validated value has to be recorded here as well.
    _degreeOfParallelism = degreeOfParallelism;

    if (numberOfPartitions <= 0)
    {
        throw new ArgumentException("Invalid number of partition parameter");
    }

    _numberOfPartitions = numberOfPartitions;

    // Validate data type: every sequence must share the first sequence's alphabet.
    // This is a plain loop on purpose: throwing from inside Parallel.For would reach
    // the caller wrapped in an AggregateException rather than as ArgumentException.
    _alphabet = sequences[0].Alphabet;
    for (int i = 1; i < sequences.Count; ++i)
    {
        if (sequences[i].Alphabet != _alphabet)
        {
            throw new ArgumentException("Inconsistent sequence alphabet");
        }
    }

    // Look up the molecule type and the similarity matrix names accepted for
    // the alphabet of the input sequences.
    MoleculeType expectedMoleculeType;
    List<String> acceptedMatrixNames;
    switch (_alphabet.Name)
    {
        case "DNA":
            expectedMoleculeType = MoleculeType.DNA;
            acceptedMatrixNames = new List<String> { "AmbiguousDNA" };
            break;
        case "Protein":
            expectedMoleculeType = MoleculeType.Protein;
            acceptedMatrixNames = new List<String>
            {
                "BLOSUM45", "BLOSUM50", "BLOSUM62", "BLOSUM80", "BLOSUM90",
                "PAM250", "PAM30", "PAM70"
            };
            break;
        case "RNA":
            expectedMoleculeType = MoleculeType.RNA;
            acceptedMatrixNames = new List<String> { "AmbiguousRNA" };
            break;
        default:
            throw new ArgumentException("Invalid alphabet");
    }

    if (moleculeType != expectedMoleculeType)
    {
        throw new ArgumentException("Inconsistent molecule type");
    }

    if (!acceptedMatrixNames.Contains(similarityMatrix.Name))
    {
        throw new ArgumentException("Inconsistent similarity matrix");
    }

    // Initialize parameters
    _moleculeType = moleculeType;
    _kmerLength = kmerLength;
    _distanceFunctionName = distanceFunctionName;
    _hierarchicalClusteringMethodName = hierarchicalClusteringMethodName;
    _profileAlignerName = profileAlignerMethodName;
    _profileProfileFunctionName = profileFunctionName;
    SimilarityMatrix = similarityMatrix;
    GapOpenCost = gapOpenPenalty;
    GapExtensionCost = gapExtendPenalty;

    MsaUtils.SetProfileItemSets(moleculeType);

    // From here on use the library's alphabet instance for the molecule type.
    switch (moleculeType)
    {
        case MoleculeType.DNA:
            _alphabet = Alphabets.DNA;
            break;
        case MoleculeType.Protein:
            _alphabet = Alphabets.Protein;
            break;
        case MoleculeType.RNA:
            _alphabet = Alphabets.RNA;
            break;
        default:
            // Not reachable after the validation above; kept as a defensive check.
            throw new ArgumentException("Invalid molecular type");
    }

    Performance.Snapshot("Start Aligning");

    // Work...
    Align(sequences);
}