/// <summary>
/// Derives one weight per leaf of the guide tree.
///
/// For every leaf the edge lengths on the path from the leaf up to the root are
/// summed. The sums are then scaled so that their average is 1, and finally each
/// scaled value is replaced by its reciprocal, so a leaf far from the root ends up
/// with a small weight and a leaf close to the root with a large one.
/// </summary>
/// <param name="tree">a binary guide tree; entries 0 .. NumberOfLeaves-1 of its
/// node list are read as the leaves</param>
public SequenceWeighting(BinaryGuideTree tree)
{
    // A freshly allocated float array is zero-filled, so every weight starts at 0.
    _weights = new float[tree.NumberOfLeaves];

    // Accumulate the leaf-to-root path length for each leaf.
    for (int leaf = 0; leaf < _weights.Length; ++leaf)
    {
        for (BinaryGuideTreeNode current = tree.Nodes[leaf]; !current.IsRoot; current = current.Parent)
        {
            BinaryGuideTreeEdge upward = current.ParentEdge;
            _weights[leaf] += upward.Length;
        }
    }

    // Total path length over all leaves, used as the normalisation denominator.
    float total = 0;
    foreach (float pathLength in _weights)
    {
        total += pathLength;
    }

    for (int leaf = 0; leaf < _weights.Length; ++leaf)
    {
        // Scale so that the path lengths average to 1 ...
        _weights[leaf] = _weights[leaf] * _weights.Length / total;

        // ... then invert. The value is read back through the Weights member,
        // exactly as before (presumably a view over _weights; verify).
        _weights[leaf] = 1 / Weights[leaf];
    }
}
/// <summary>
/// Compare two guide (sub)trees and mark the nodes of tree A that need to be re-aligned.
///
/// The internal nodes of tree A (list positions NumberOfLeaves .. NumberOfNodes-1) are
/// visited in list order, which is expected to place children before their parents
/// (post-order). For each internal node:
///   - if either child is already marked for re-alignment, the node is marked too;
///   - if the ID of either child does not occur in tree B, the node is marked;
///   - otherwise the node in tree B that carries the left child's ID is looked up. If
///     its parent has two children and one of them carries the right child's ID, the
///     node is left unmarked and takes over the ID of that parent in tree B;
///   - in every other case (no parent in tree B, a parent with a missing child, or a
///     different sibling) the node is marked.
/// Only nodes of tree A are modified.
/// </summary>
/// <param name="treeA">binary guide (sub)tree whose internal nodes are marked</param>
/// <param name="treeB">binary guide (sub)tree used as the reference</param>
/// <exception cref="ArgumentException">the trees differ in their number of nodes or leaves</exception>
public static void CompareTwoTrees(BinaryGuideTree treeA, BinaryGuideTree treeB)
{
    if (treeA.NumberOfNodes != treeB.NumberOfNodes || treeA.NumberOfLeaves != treeB.NumberOfLeaves)
    {
        throw new ArgumentException("The two trees are not comparable");
    }

    // Map node ID -> position in treeB.Nodes so siblings can be located in O(1).
    Dictionary<int, int> nodeID2ListIndex = new Dictionary<int, int>(treeB.NumberOfNodes);
    for (int i = 0; i < treeB.NumberOfNodes; ++i)
    {
        nodeID2ListIndex[treeB.Nodes[i].ID] = i;
    }

    for (int i = treeA.NumberOfLeaves; i < treeA.NumberOfNodes; ++i)
    {
        BinaryGuideTreeNode node = treeA.Nodes[i];
        BinaryGuideTreeNode left = node.LeftChildren;
        BinaryGuideTreeNode right = node.RightChildren;

        // A changed child always forces its parent to be re-aligned.
        if (left.NeedReAlignment || right.NeedReAlignment)
        {
            node.NeedReAlignment = true;
            continue;
        }

        // Both children must exist in tree B for the merge to be comparable.
        int leftIndexInB;
        if (!nodeID2ListIndex.TryGetValue(left.ID, out leftIndexInB)
            || !nodeID2ListIndex.ContainsKey(right.ID))
        {
            node.NeedReAlignment = true;
            continue;
        }

        BinaryGuideTreeNode nodeB = treeB.Nodes[leftIndexInB].Parent;

        // Explicit null checks replace the former catch of NullReferenceException.
        // The outcome is the same: a missing parent (root of tree B) or a parent
        // with a missing child never counts as "same merge".
        bool sameMerge =
            nodeB != null
            && nodeB.LeftChildren != null
            && (nodeB.LeftChildren.ID == right.ID
                || (nodeB.RightChildren != null && nodeB.RightChildren.ID == right.ID));

        if (sameMerge)
        {
            node.NeedReAlignment = false;
            node.ID = nodeB.ID;
        }
        else
        {
            node.NeedReAlignment = true;
        }
    }
}
/// <summary>
/// Main progressive alignment algorithm: aligns a set of sequences guided by
/// a binary tree.
///
/// Steps performed:
///   1. every leaf node i receives a profile built from sequences[i] (weighted when
///      PAMSAMMultipleSequenceAligner.UseWeights is set);
///   2. internal nodes flagged NeedReAlignment are processed in list order: the
///      profiles of the two children are aligned (the profile with fewer sequences
///      is passed first), the result is stored on the node, an eString is stored
///      on each child, and the children's profiles are cleared;
///   3. each input sequence is converted to its aligned form by applying the
///      eStrings found on the path from its leaf to the root.
/// The aligned sequences are stored in the _alignedSequences field.
/// </summary>
/// <remarks>
/// NOTE(review): internal nodes whose NeedReAlignment flag is false are skipped, so
/// their profile and their children's eStrings must already be present on the tree
/// for steps 2 and 3 to succeed - confirm against the callers.
/// Exceptions thrown inside the parallel loops are surfaced by Parallel.For as an
/// AggregateException.
/// </remarks>
/// <param name="sequences">input sequences</param>
/// <param name="tree">a binary guide tree</param>
/// <exception cref="ArgumentException">the set of sequences is empty</exception>
public void Align(IList<ISequence> sequences, BinaryGuideTree tree)
{
    // Validate before doing any work on the tree.
    if (sequences.Count == 0)
    {
        throw new ArgumentException("Empty set of sequences");
    }

    // Read the static switch once so the whole run uses one consistent setting.
    bool useWeights = PAMSAMMultipleSequenceAligner.UseWeights;

    SequenceWeighting sequenceWeighting = null;
    if (useWeights)
    {
        sequenceWeighting = new SequenceWeighting(tree);
    }

    // All sequences must share the alphabet of the first one.
    IAlphabet alphabet = sequences[0].Alphabet;
    Parallel.For(1, sequences.Count, PAMSAMMultipleSequenceAligner.parallelOption, i =>
    {
        if (sequences[i].Alphabet != alphabet)
        {
            throw new ArgumentException("Inconsistent sequence alphabet");
        }
    });

    // Generate profiles for the leaf nodes
    if (useWeights)
    {
        Parallel.For(0, sequences.Count, PAMSAMMultipleSequenceAligner.parallelOption, i =>
        {
            tree.Nodes[i].ProfileAlignment = ProfileAlignment.GenerateProfileAlignment(sequences[i], sequenceWeighting.Weights[i]);
            tree.Nodes[i].Weight = sequenceWeighting.Weights[i];
        });
    }
    else
    {
        Parallel.For(0, sequences.Count, PAMSAMMultipleSequenceAligner.parallelOption, i =>
        {
            tree.Nodes[i].ProfileAlignment = ProfileAlignment.GenerateProfileAlignment(sequences[i]);
        });
    }

    // Iterate internal nodes;
    // as defined in the tree, the last node is the root
    for (int i = sequences.Count; i < tree.Nodes.Count; ++i)
    {
        if (!tree.Nodes[i].NeedReAlignment)
        {
            continue;
        }

        // pull out its children
        _nodeA = tree.Nodes[i].LeftChildren;
        _nodeB = tree.Nodes[i].RightChildren;

        if (useWeights)
        {
            _profileAligner.Weights = new float[2];
            _profileAligner.Weights[0] = _nodeA.Weight;
            _profileAligner.Weights[1] = _nodeB.Weight;

            // The weight of an internal node is the sum of its children's weights.
            tree.Nodes[i].Weight = _nodeA.Weight + _nodeB.Weight;
        }

        // Align the two profiles; the one holding fewer sequences is passed first,
        // so AlignedA/AlignedB have to be mapped back to the matching child.
        ProfileAlignment result = null;
        if (_nodeA.ProfileAlignment.NumberOfSequences < _nodeB.ProfileAlignment.NumberOfSequences)
        {
            result = (ProfileAlignment)_profileAligner.Align(_nodeA.ProfileAlignment, _nodeB.ProfileAlignment);

            // assign aligned profiles to the current internal node
            tree.Nodes[i].ProfileAlignment = result;

            // generate eString for the children nodes
            _nodeA.EString = _profileAligner.GenerateEString(_profileAligner.AlignedA);
            _nodeB.EString = _profileAligner.GenerateEString(_profileAligner.AlignedB);
        }
        else
        {
            result = (ProfileAlignment)_profileAligner.Align(_nodeB.ProfileAlignment, _nodeA.ProfileAlignment);

            // assign aligned profiles to the current internal node
            tree.Nodes[i].ProfileAlignment = result;

            // generate eString for the children nodes (operands were swapped)
            _nodeA.EString = _profileAligner.GenerateEString(_profileAligner.AlignedB);
            _nodeB.EString = _profileAligner.GenerateEString(_profileAligner.AlignedA);
        }

        // children node profiles can be deleted
        _nodeA.ProfileAlignment.Clear();
        _nodeB.ProfileAlignment.Clear();
    }

    // Convert original unaligned sequences to aligned ones by applying alignment paths in eStrings
    try
    {
        _alignedSequences = new List<ISequence>(sequences.Count);
    }
    catch (OutOfMemoryException ex)
    {
        // Chain the caught exception itself; its InnerException is normally null,
        // which used to discard the real cause.
        throw new Exception("Out of memory", ex);
    }

    // Pre-fill with placeholders so the parallel loop can assign by index.
    for (int i = 0; i < sequences.Count; ++i)
    {
        _alignedSequences.Add(null);
    }

    Parallel.For(0, sequences.Count, PAMSAMMultipleSequenceAligner.parallelOption, i =>
    {
        ISequence seq = sequences[i];
        BinaryGuideTreeNode node = tree.Nodes[i];

        // Walk from the leaf to the root, applying each node's eString in turn.
        while (!node.IsRoot)
        {
            seq = _profileAligner.GenerateSequenceFromEString(node.EString, seq);
            node = node.Parent;
        }

        _alignedSequences[i] = seq;
    });
}
/// <summary>
/// Cut a tree at an edge to generate 2 subtrees.
///
/// The cut is performed on this tree itself: the child node of the edge loses its
/// parent reference and the parent node loses the corresponding child reference.
/// Both returned trees reference this tree's edge list; their node lists are
/// extracted from their respective roots and their node/leaf counts recomputed.
/// </summary>
/// <param name="edgeIndex">zero-based edge index</param>
/// <returns>return[0] is the subtree with the same root as the original tree;
/// return[1] is the subtree rooted below the cutting edge</returns>
/// <exception cref="ArgumentException">edgeIndex is outside the range of available edges</exception>
/// <exception cref="Exception">the selected edge has no child node</exception>
public BinaryGuideTree[] CutTree(int edgeIndex)
{
    if (edgeIndex < 0 || edgeIndex >= _edges.Count)
    {
        throw new ArgumentException(string.Format("The edge ID provided when cutting the binary tree was not available. Given edge ID: {0}, available edges: {1}", edgeIndex, _edges.Count));
    }

    var edge = _edges[edgeIndex];
    var child = edge.ChildNode;
    if (child == null)
    {
        throw new Exception("The edge specified was not properly extended to a child node.Edge ID: " + edgeIndex);
    }

    // Detach the child from its parent, in both directions.
    child.Parent = null;

    var parent = edge.ParentNode;

    // The left slot may already be empty from an earlier cut on the sibling edge;
    // guard it so that case falls through to the right slot instead of throwing
    // a NullReferenceException.
    if (parent.LeftChildren != null && parent.LeftChildren.ID == child.ID)
    {
        parent.LeftChildren = null;
    }
    else
    {
        parent.RightChildren = null;
    }

    // generate two new trees
    BinaryGuideTree treeA = new BinaryGuideTree(_root);
    BinaryGuideTree treeB = new BinaryGuideTree(child);

    // Both subtrees keep referring to the complete edge list of this tree.
    treeA.Edges = _edges;
    treeB.Edges = _edges;

    // pull the subtree nodes out for the two new roots
    treeA.Nodes = (List<BinaryGuideTreeNode>)ExtractSubTreeNodes(treeA.Root);
    treeB.Nodes = (List<BinaryGuideTreeNode>)ExtractSubTreeNodes(treeB.Root);

    treeA.NumberOfNodes = treeA.Nodes.Count;
    treeB.NumberOfNodes = treeB.Nodes.Count;

    // Recount the leaves of each subtree.
    int leavesInA = 0;
    foreach (BinaryGuideTreeNode candidate in treeA.Nodes)
    {
        if (candidate.IsLeaf)
        {
            ++leavesInA;
        }
    }

    int leavesInB = 0;
    foreach (BinaryGuideTreeNode candidate in treeB.Nodes)
    {
        if (candidate.IsLeaf)
        {
            ++leavesInB;
        }
    }

    treeA.NumberOfLeaves = leavesInA;
    treeB.NumberOfLeaves = leavesInB;

    return new BinaryGuideTree[2] { treeA, treeB };
}
/// <summary>
/// Performs Stage 1, 2, and 3 as described in class description.
///
/// Stage 1 builds a guide tree from k-mer distances and runs a progressive alignment.
/// Unless PAMSAMMultipleSequenceAligner.FasterVersion is set, Stage 2 (only when
/// PAMSAMMultipleSequenceAligner.UseStageB is set) rebuilds the tree from Kimura
/// distances of the current alignment and re-aligns once, and Stage 3 refines the
/// alignment by cutting the tree at each edge and re-aligning the two halves.
/// After every stage the stage result replaces the overall best alignment
/// (_alignedSequences / _alignmentScore) when its score is higher.
/// </summary>
/// <remarks>
/// NOTE(review): the score fields (_alignmentScore, _alignmentScoreA/B/C) are only
/// compared and raised here, never reset - verify how they are initialised if the
/// same instance is used for more than one call.
/// </remarks>
/// <param name="sequences">input unaligned sequences</param>
/// <returns>A newly created list that is returned empty; the alignment itself is
/// stored in the fields of this instance.</returns>
/// <exception cref="ArgumentException">the configured profile aligner name is not recognised</exception>
public IList<MBF.Algorithms.Alignment.ISequenceAlignment> Align(IList<ISequence> sequences)
{
    // Initializations
    if (sequences.Count > 0)
    {
        if (ConsensusResolver == null)
        {
            ConsensusResolver = new SimpleConsensusResolver(sequences[0].Alphabet);
        }
        else
        {
            ConsensusResolver.SequenceAlphabet = sequences[0].Alphabet;
        }
    }

    // Get ProfileAligner ready
    IProfileAligner profileAligner = CreateConfiguredProfileAligner();

    _alignedSequences = new List<ISequence>(sequences.Count);

    // STAGE 1
    Performance.Snapshot("Stage 1");

    // Generate DistanceMatrix
    KmerDistanceMatrixGenerator kmerDistanceMatrixGenerator =
        new KmerDistanceMatrixGenerator(sequences, _kmerLength, _moleculeType, _distanceFunctionName);

    // Hierarchical clustering
    IHierarchicalClustering hierarcicalClustering =
        new HierarchicalClusteringParallel(kmerDistanceMatrixGenerator.DistanceMatrix, _hierarchicalClusteringMethodName);

    // Generate Guide Tree
    BinaryGuideTree binaryGuideTree = new BinaryGuideTree(hierarcicalClustering);

    // Progressive Alignment
    IProgressiveAligner progressiveAlignerA = new ProgressiveAligner(profileAligner);
    progressiveAlignerA.Align(sequences, binaryGuideTree);

    float currentScore = MsaUtils.MultipleAlignmentScoreFunction(progressiveAlignerA.AlignedSequences, SimilarityMatrix, GapOpenCost, GapExtensionCost);
    if (currentScore > _alignmentScoreA)
    {
        _alignmentScoreA = currentScore;
        _alignedSequencesA = progressiveAlignerA.AlignedSequences;
    }
    if (_alignmentScoreA > _alignmentScore)
    {
        _alignmentScore = _alignmentScoreA;
        _alignedSequences = _alignedSequencesA;
    }

    if (PAMSAMMultipleSequenceAligner.FasterVersion)
    {
        // Stages 2 and 3 are skipped; their slots simply mirror the Stage 1 result.
        _alignedSequencesB = _alignedSequencesA;
        _alignedSequencesC = _alignedSequencesA;
        _alignmentScoreB = _alignmentScoreA;
        _alignmentScoreC = _alignmentScoreA;
    }
    else
    {
        BinaryGuideTree binaryGuideTreeB = null;
        IHierarchicalClustering hierarcicalClusteringB = null;
        KimuraDistanceMatrixGenerator kimuraDistanceMatrixGenerator = new KimuraDistanceMatrixGenerator();

        if (PAMSAMMultipleSequenceAligner.UseStageB)
        {
            // STAGE 2
            Performance.Snapshot("Stage 2");

            // Generate DistanceMatrix from Multiple Sequence Alignment.
            // This stage runs exactly once (the former loop always exited after
            // its first pass).
            kimuraDistanceMatrixGenerator.GenerateDistanceMatrix(_alignedSequences);

            // Hierarchical clustering
            hierarcicalClusteringB = new HierarchicalClusteringParallel(kimuraDistanceMatrixGenerator.DistanceMatrix, _hierarchicalClusteringMethodName);

            // Generate Guide Tree
            binaryGuideTreeB = new BinaryGuideTree(hierarcicalClusteringB);

            // Mark the nodes of the new tree that differ from the Stage 1 tree.
            BinaryGuideTree.CompareTwoTrees(binaryGuideTreeB, binaryGuideTree);
            binaryGuideTree = binaryGuideTreeB;

            // Progressive Alignment
            IProgressiveAligner progressiveAlignerB = new ProgressiveAligner(profileAligner);
            progressiveAlignerB.Align(sequences, binaryGuideTreeB);

            currentScore = MsaUtils.MultipleAlignmentScoreFunction(progressiveAlignerB.AlignedSequences, SimilarityMatrix, GapOpenCost, GapExtensionCost);
            if (currentScore > _alignmentScoreB)
            {
                _alignmentScoreB = currentScore;
                _alignedSequencesB = progressiveAlignerB.AlignedSequences;
            }

            if (_alignmentScoreB > _alignmentScore)
            {
                _alignmentScore = _alignmentScoreB;
                _alignedSequences = _alignedSequencesB;
            }
        }
        else
        {
            binaryGuideTreeB = binaryGuideTree;
        }

        // STAGE 3
        Performance.Snapshot("Stage 3");

        // refinement
        //int maxRefineMentTime = sequences.Count * 2 - 2;
        int maxRefineMentTime = 1;
        if (sequences.Count == 2)
        {
            maxRefineMentTime = 0;
        }

        int refinementTime = 0;

        // Work on a copy of the current best alignment.
        _alignedSequencesC = new List<ISequence>(sequences.Count);
        for (int i = 0; i < sequences.Count; ++i)
        {
            _alignedSequencesC.Add(new Sequence(_alphabet, _alignedSequences[i].ToString()));
        }

        while (refinementTime < maxRefineMentTime)
        {
            ++refinementTime;
            Performance.Snapshot("Refinement iter " + refinementTime.ToString());

            bool needRefinement = false;

            for (int edgeIndex = 0; edgeIndex < binaryGuideTreeB.NumberOfEdges; ++edgeIndex)
            {
                // Split the sequences into the two sets on either side of the edge.
                List<int>[] leafNodeIndices = binaryGuideTreeB.SeparateSequencesByCuttingTree(edgeIndex);

                List<int>[] allIndelPositions;
                IProfileAlignment[] separatedProfileAlignments = ProfileAlignment.ProfileExtraction(_alignedSequencesC, leafNodeIndices[0], leafNodeIndices[1], out allIndelPositions);

                // Re-align the two profiles, passing the one with fewer sequences
                // first; eStrings[k] always belongs to sequence set k.
                List<int>[] eStrings = new List<int>[2];
                if (separatedProfileAlignments[0].NumberOfSequences < separatedProfileAlignments[1].NumberOfSequences)
                {
                    profileAligner.Align(separatedProfileAlignments[0], separatedProfileAlignments[1]);
                    eStrings[0] = profileAligner.GenerateEString(profileAligner.AlignedA);
                    eStrings[1] = profileAligner.GenerateEString(profileAligner.AlignedB);
                }
                else
                {
                    profileAligner.Align(separatedProfileAlignments[1], separatedProfileAlignments[0]);
                    eStrings[0] = profileAligner.GenerateEString(profileAligner.AlignedB);
                    eStrings[1] = profileAligner.GenerateEString(profileAligner.AlignedA);
                }

                for (int set = 0; set < 2; ++set)
                {
                    Parallel.ForEach(leafNodeIndices[set], PAMSAMMultipleSequenceAligner.parallelOption, i =>
                    {
                        Sequence rebuilt = new Sequence(_alphabet, "");
                        rebuilt.IsReadOnly = false;

                        // Copy the sequence, skipping the positions listed in
                        // allIndelPositions[set]. The list is consumed front to
                        // back, so it is assumed to be in ascending order.
                        int nextIndel = 0;
                        for (int column = 0; column < _alignedSequencesC[i].Count; ++column)
                        {
                            if (nextIndel < allIndelPositions[set].Count && column == allIndelPositions[set][nextIndel])
                            {
                                ++nextIndel;
                            }
                            else
                            {
                                rebuilt.Add(_alignedSequencesC[i][column]);
                            }
                        }

                        // Apply the new alignment path for this set.
                        rebuilt = profileAligner.GenerateSequenceFromEString(eStrings[set], rebuilt);
                        rebuilt.IsReadOnly = true;
                        _alignedSequencesC[i] = rebuilt;
                    });
                }

                currentScore = MsaUtils.MultipleAlignmentScoreFunction(_alignedSequencesC, SimilarityMatrix, GapOpenCost, GapExtensionCost);

                if (currentScore > _alignmentScoreC)
                {
                    _alignmentScoreC = currentScore;
                    needRefinement = true;

                    // recreate the tree from the improved alignment and stop
                    // scanning the edges of the now outdated tree
                    kimuraDistanceMatrixGenerator.GenerateDistanceMatrix(_alignedSequencesC);
                    hierarcicalClusteringB = new HierarchicalClusteringParallel(kimuraDistanceMatrixGenerator.DistanceMatrix, _hierarchicalClusteringMethodName);
                    binaryGuideTreeB = new BinaryGuideTree(hierarcicalClusteringB);
                    break;
                }
            }

            // No edge produced a better score: further passes cannot help.
            if (!needRefinement)
            {
                break;
            }
        }

        if (_alignmentScoreC > _alignmentScore)
        {
            _alignmentScore = _alignmentScoreC;
            _alignedSequences = _alignedSequencesC;
        }

        Performance.Snapshot("Stop Stage 3");
    }

    //just for the purpose of integrating PW and MSA with the same output
    IList<MBF.Algorithms.Alignment.ISequenceAlignment> results = new List<MBF.Algorithms.Alignment.ISequenceAlignment>();
    return results;
}

/// <summary>
/// Creates the profile aligner selected by _profileAlignerName; the serial variant
/// is chosen when _degreeOfParallelism is 1, the parallel variant otherwise.
/// </summary>
/// <returns>a new profile aligner configured with the current similarity matrix,
/// profile function, gap costs and number of partitions</returns>
/// <exception cref="ArgumentException">the configured aligner name is not recognised</exception>
private IProfileAligner CreateConfiguredProfileAligner()
{
    bool serial = _degreeOfParallelism == 1;

    switch (_profileAlignerName)
    {
        case ProfileAlignerNames.NeedlemanWunschProfileAligner:
            if (serial)
            {
                return new NeedlemanWunschProfileAlignerSerial(
                    SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);
            }
            return new NeedlemanWunschProfileAlignerParallel(
                SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);

        case ProfileAlignerNames.SmithWatermanProfileAligner:
            if (serial)
            {
                return new SmithWatermanProfileAlignerSerial(
                    SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);
            }
            return new SmithWatermanProfileAlignerParallel(
                SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);

        default:
            throw new ArgumentException("Invalid profile aligner name");
    }
}