/// <summary>
/// Runs every progressive-alignment step, in the order given, through a
/// <see cref="MyersMillerProfileAlign"/> instance.
/// </summary>
/// <param name="alignmentObject">Alignment passed by reference to each profile-alignment call.</param>
/// <param name="subMatrix">Substitution matrix series passed to each profile-alignment call.</param>
/// <param name="similarityMatrix">Not read by this method at present (see the Clustal notes in the body).</param>
/// <param name="leavesList">Not read by this method at present.</param>
/// <param name="steps">Alignment steps; each tuple is handed unchanged to the alignment algorithm.</param>
/// <exception cref="ArgumentNullException"><paramref name="steps"/> is null.</exception>
public static void AlignSequences(ref Alignment alignmentObject, SubstitutionMatrix.SubstitutionMatrixSeries subMatrix, ref double[,] similarityMatrix, ReadOnlyCollection<Tree.GuideTree<AlignedMacromolecule>.TreeLeaf> leavesList, ReadOnlyCollection<Tuple<ReadOnlyCollection<AlignedMacromolecule>, ReadOnlyCollection<AlignedMacromolecule>, double>> steps)
{
    // Explicit guard: the previous null check was only a side effect of an otherwise unused Count() call.
    if (steps == null)
    {
        throw new ArgumentNullException("steps");
    }

    IMultipleAlignmentAlgorithm alignmentAlgorithm = new MyersMillerProfileAlign();

    // Here, Clustal checks the similarity matrix and looks up the most closely related sequence
    // for each sequence. It could instead have used the distance matrix and found the minimum
    // distance; consider switching to avoid unnecessary calculations.
    foreach (Tuple<ReadOnlyCollection<AlignedMacromolecule>, ReadOnlyCollection<AlignedMacromolecule>, double> step in steps)
    {
        // Here, Clustal would check to make sure that at least one pair of sequences isn't too divergent.
        // It also records which ones are too divergent so that the alignment doesn't use them.

        // The value returned by Align is not used by this method.
        alignmentAlgorithm.Align(ref alignmentObject, subMatrix, step);
    }
}
char[] residueCodes = Routines.nucleotideCodesPlusGaps; // Should be the code set that includes gaps; needs to be selected programmatically.

/// <summary>
/// Profile-aligns the two sequence groups of a single progressive-alignment step.
/// </summary>
/// <remarks>
/// The group with more sequences is treated as group 1. Working buffers, both profiles and the
/// resulting alignment length are written to instance fields (maxLength, displ, alnPath1, alnPath2,
/// group1Profile, group2Profile, alignmentLength); ProgDiff, ProgTracepath and addGGaps presumably
/// consume them - confirm against those methods.
/// </remarks>
/// <param name="alignmentObject">Alignment being built; only its IsNucleicAcid flag is read here.</param>
/// <param name="subMatrixClass">Supplies the substitution matrix and the gap-penalty scale factor.</param>
/// <param name="step">The step to align: first group, second group, and the value used as mean similarity.</param>
/// <returns>
/// Always 0 at present, both when a group is empty and after a completed alignment.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="step"/> is null.</exception>
public double Align(ref Alignment alignmentObject, SubstitutionMatrix.SubstitutionMatrixSeries subMatrixClass, Tuple<ReadOnlyCollection<AlignedMacromolecule>, ReadOnlyCollection<AlignedMacromolecule>, double> step)
{
    if (step == null)
    {
        throw new ArgumentNullException("step");
    }

    double meanSimilarity = step.Item3;
    int numSeqs1 = step.Item1.Count;
    int numSeqs2 = step.Item2.Count;

    // If one of the groups has no alignable sequences, can't do anything!
    if (numSeqs1 == 0 || numSeqs2 == 0)
    {
        return 0;
    }

    // Make the group with the most sequences group 1.
    // TODO: figure out the structure penalties here.
    ReadOnlyCollection<AlignedMacromolecule> group1;
    ReadOnlyCollection<AlignedMacromolecule> group2;
    if (numSeqs2 > numSeqs1)
    {
        group1 = step.Item2;
        group2 = step.Item1;
    }
    else
    {
        group1 = step.Item1;
        group2 = step.Item2;
    }

    // Length of the first profile: one past the highest aligned position in the group.
    int group1MaxLength = 0;
    foreach (AlignedMacromolecule macromolecule in group1)
    {
        group1MaxLength = Math.Max(group1MaxLength, macromolecule.AlignedPositions.Last() + 1);
    }

    // Length of the second profile, computed the same way.
    int group2MaxLength = 0;
    foreach (AlignedMacromolecule macromolecule in group2)
    {
        group2MaxLength = Math.Max(group2MaxLength, macromolecule.AlignedPositions.Last() + 1);
    }

    maxLength = group1MaxLength + group2MaxLength + 2;

    // Clustal sets the Alignment parameter here.
    displ = new int[maxLength + 1];
    alnPath1 = new int[maxLength + 1];
    alnPath2 = new int[maxLength + 1];

    // Calculate the real length of the profiles, removing gaps: the mean Sequence length per group,
    // truncated to an int. The divisor must be the size of the group actually being summed;
    // using the pre-swap counts gave wrong means whenever the groups had been switched above.
    int group1NoGapsLength = 0;
    foreach (AlignedMacromolecule macromolecule in group1)
    {
        group1NoGapsLength += macromolecule.Sequence.Length;
    }
    group1NoGapsLength = (int)(group1NoGapsLength / (double)group1.Count);

    int group2NoGapsLength = 0;
    foreach (AlignedMacromolecule macromolecule in group2)
    {
        group2NoGapsLength += macromolecule.Sequence.Length;
    }
    group2NoGapsLength = (int)(group2NoGapsLength / (double)group2.Count);

    int minNoGapsLength = Math.Min(group1NoGapsLength, group2NoGapsLength);

    // This reproduces the Clustal code directly; it may be recoded once fully understood.
    // Clustal keeps a pair of scale values here (scale = 1.0, intScale = 100.0). The scale value
    // serves no purpose; the literal 100.0 factors below presumably correspond to intScale.
    double gapOpeningCoefficient, gapExtensionCoefficient;
    double gapOpenPenalty = 10.0;   // Will be a user-selectable parameter.
    double gapExtendPenalty = 0.2;  // Will be a user-selectable parameter.
    bool useNegative = false;       // Will be a user option.

    SubstitutionMatrix.SubstitutionMatrix subMatrix = subMatrixClass.GetMatrix(meanSimilarity, minNoGapsLength, useNegative);
    double gapPenaltyScaleFactor = subMatrixClass.GetScaleFactor(meanSimilarity, useNegative);

    // Then Clustal sets the DNA vs. protein parameters.
    if (alignmentObject.IsNucleicAcid)
    {
        gapOpeningCoefficient = 100.0 * gapOpenPenalty * gapPenaltyScaleFactor;
        gapExtensionCoefficient = 100.0 * gapExtendPenalty * gapPenaltyScaleFactor;
    }
    else
    {
        gapOpeningCoefficient = CalculateProteinGapOpeningCoefficient(gapOpenPenalty, group1NoGapsLength, group2NoGapsLength, subMatrix.AverageScore, gapPenaltyScaleFactor);
        gapExtensionCoefficient = 100.0 * gapExtendPenalty;
    }

    // We need one profile with substitution matrix information and one without!
    // But this will change when we have the LE scoring function. (Whatever that is.)
    // Calculate the profile arrays. The first group, but not the second, incorporates substitution information.
    group1Profile = Profile.Calculate(group1, group1MaxLength, gapOpeningCoefficient, gapExtensionCoefficient, subMatrix);
    group2Profile = Profile.Calculate(group2, group2MaxLength, gapOpeningCoefficient, gapExtensionCoefficient, null);

    // Use Myers and Miller to align the two profiles.
    // NOTE(review): the score returned by ProgDiff is discarded and the method returns 0 instead.
    // Returning the score (possibly rescaled) looks like the intent - confirm before changing.
    ProgDiff(0, 0, group1MaxLength, group2MaxLength, group1Profile.GapOpeningPenalties[0], group1Profile.GapExtensionPenalties[0]);
    alignmentLength = ProgTracepath();
    addGGaps();

    // TODO: Clustal (DoRemoveFirstIteration() == TREE ???) combines the sequences here and
    // submits them for iterations; not implemented yet.

    return 0;
}