/// <summary>
/// Fills <c>residueWeights</c> for every profile column that contains at least one residue,
/// combining the per-column residue weighting with the scores of <paramref name="subMatrix"/>.
/// </summary>
/// <param name="residues">Residue codes to score; each is used both as the observed and as the scored residue.</param>
/// <param name="subMatrix">Substitution matrix indexed as <c>subMatrix[observed][scored]</c>. It is only read here.</param>
/// <remarks>
/// The weights for profile column <c>i</c> are stored at index <c>i + 1</c>.
/// Columns in which every sequence has a gap are left untouched.
/// </remarks>
protected void CalculateResidueWeights(char[] residues, ref SubstitutionMatrix.SubstitutionMatrix subMatrix)
{
    double maxScore;
    Dictionary<char, double[]> weighting;
    CalculateResidueWeightsCommon(residues, out maxScore, out weighting);

    int sequenceCount = profileMacromolecules.Count;

    for (int column = 0; column < profileLength; column++)
    {
        var gapCount = gapPositions[column];
        if (gapCount >= sequenceCount)
        {
            // No sequence has a residue in this column, so there is nothing to score.
            continue;
        }

        // Fraction of the sequences that have a residue (not a gap) in this column.
        double occupancy = (double)(sequenceCount - gapCount) / (double)(sequenceCount);
        int slot = column + 1;

        foreach (char scored in residues)
        {
            double total = 0;
            foreach (char observed in residues)
            {
                total += weighting[observed][column] * subMatrix[observed][scored];
            }

            residueWeights[scored][slot] = (total / maxScore) * occupancy;
        }

        // Clustal also looks the two gap symbols up in the substitution matrix, although those
        // entries appear to always be 0; if that holds, these sums could be dropped entirely.
        double oldGapTotal = 0;
        double newGapTotal = 0;
        foreach (char observed in residues)
        {
            double weight = weighting[observed][column];
            oldGapTotal += weight * subMatrix[observed][Routines.oldGap];
            newGapTotal += weight * subMatrix[observed][Routines.newGap];
        }

        residueWeights[Routines.oldGap][slot] = (oldGapTotal / maxScore) * occupancy;
        residueWeights[Routines.newGap][slot] = (newGapTotal / maxScore) * occupancy;
    }
}
char[] residueCodes = Routines.nucleotideCodesPlusGaps; // should be one with gaps, need to do programmatically

/// <summary>
/// Profile-aligns the two groups of already aligned macromolecules held in <paramref name="step"/>.
/// The larger group becomes group 1. Both groups are turned into profiles, which are then aligned
/// with <c>ProgDiff</c>, <c>ProgTracepath</c> and <c>addGGaps</c>.
/// </summary>
/// <param name="alignmentObject">Alignment being built; only <c>IsNucleicAcid</c> is read here.</param>
/// <param name="subMatrixClass">Matrix series from which the substitution matrix and the gap-penalty scale factor are taken.</param>
/// <param name="step">The two groups to align (<c>Item1</c>, <c>Item2</c>) and their mean similarity (<c>Item3</c>).</param>
/// <returns>
/// Always 0 for now. Also 0, without doing any work, when either group is empty.
/// </returns>
public double Align(ref Alignment alignmentObject, SubstitutionMatrix.SubstitutionMatrixSeries subMatrixClass, Tuple<ReadOnlyCollection<AlignedMacromolecule>, ReadOnlyCollection<AlignedMacromolecule>, double> step)
{
    double meanSimilarity = step.Item3;

    // If one of the groups has no alignable sequences, can't do anything!
    if (step.Item1.Count == 0 || step.Item2.Count == 0)
    {
        return 0;
    }

    // Make the group with the most sequences group 1.
    // Figure out the structure penalties here
    bool switchGroups = step.Item2.Count > step.Item1.Count;
    ReadOnlyCollection<AlignedMacromolecule> group1 = switchGroups ? step.Item2 : step.Item1;
    ReadOnlyCollection<AlignedMacromolecule> group2 = switchGroups ? step.Item1 : step.Item2;

    int group1MaxLength = LongestAlignedLength(group1);
    int group2MaxLength = LongestAlignedLength(group2);

    maxLength = group1MaxLength + group2MaxLength + 2;

    // Clustal sets the Alignment parameter here.
    displ = new int[maxLength + 1];
    alnPath1 = new int[maxLength + 1];
    alnPath2 = new int[maxLength + 1];

    // Calculate the real length of the profiles, removing gaps. Each mean is divided by the
    // size of the group it was summed over, so it stays correct when the groups were swapped.
    int group1NoGapsLength = MeanUngappedLength(group1);
    int group2NoGapsLength = MeanUngappedLength(group2);
    int minNoGapsLength = Math.Min(group1NoGapsLength, group2NoGapsLength);

    // I'm just going to reproduce the Clustal code directly. May recode later once I fully understand what it does.
    // There is quite literally no point to the scaleVals.scale value, and no point to setting this here.
    // Tuple<double, double> scaleVals = Tuple.Create(1.0, 100.0); // scaleVals.scale and scaleVals.intScale, respectively.
    const double gapOpenPenalty = 10.0;   // Will be a user-selectable parameter
    const double gapExtendPenalty = 0.2;  // Will be a user-selectable parameter
    const bool useNegative = false;       // Will be a user option

    SubstitutionMatrix.SubstitutionMatrix subMatrix = subMatrixClass.GetMatrix(meanSimilarity, minNoGapsLength, useNegative);
    double gapPenaltyScaleFactor = subMatrixClass.GetScaleFactor(meanSimilarity, useNegative);

    // Then it sets DNA vs. protein parameters.
    double gapOpeningCoefficient;
    double gapExtensionCoefficient;
    if (alignmentObject.IsNucleicAcid)
    {
        gapOpeningCoefficient = 100.0 * gapOpenPenalty * gapPenaltyScaleFactor;
        gapExtensionCoefficient = 100.0 * gapExtendPenalty * gapPenaltyScaleFactor;
    }
    else
    {
        gapOpeningCoefficient = CalculateProteinGapOpeningCoefficient(gapOpenPenalty, group1NoGapsLength, group2NoGapsLength, subMatrix.AverageScore, gapPenaltyScaleFactor);
        gapExtensionCoefficient = 100.0 * gapExtendPenalty;
    }

    // We need one profile with substitution matrix information and one without!
    // But this will change when we have the LE scoring function. (Whatever that is.)
    // Calculate the profile arrays. The first group, but not the second, incorporates substitution information.
    group1Profile = Profile.Calculate(group1, group1MaxLength, gapOpeningCoefficient, gapExtensionCoefficient, subMatrix);
    group2Profile = Profile.Calculate(group2, group2MaxLength, gapOpeningCoefficient, gapExtensionCoefficient, null);

    // Use Myers and Miller to align the two profiles.
    // NOTE(review): the score returned by ProgDiff is discarded and the method returns 0 below,
    // exactly as before. Confirm the intended scaling before returning the score to callers.
    ProgDiff(0, 0, group1MaxLength, group2MaxLength, group1Profile.GapOpeningPenalties[0], group1Profile.GapExtensionPenalties[0]);
    alignmentLength = ProgTracepath();
    addGGaps();

    // Group 1 continues with the aligned length; nothing below reads this value yet.
    group1MaxLength = alignmentLength;

    // TODO: Clustal (DoRemoveFirstIteration() == TREE ???): combine the sequences and submit for iterations.

    return 0;
}

// Returns the largest (last aligned position + 1) found in the group, or 0 for an empty group.
private static int LongestAlignedLength(ReadOnlyCollection<AlignedMacromolecule> group)
{
    int longest = 0;
    foreach (AlignedMacromolecule macromolecule in group)
    {
        longest = Math.Max(longest, macromolecule.AlignedPositions.Last() + 1);
    }

    return longest;
}

// Returns the mean Sequence.Length over the group, truncated to an int. The group must not be empty.
private static int MeanUngappedLength(ReadOnlyCollection<AlignedMacromolecule> group)
{
    int total = 0;
    foreach (AlignedMacromolecule macromolecule in group)
    {
        total += macromolecule.Sequence.Length;
    }

    return (int)(total / (double)group.Count);
}
/// <summary>
/// Builds a <see cref="Profile"/> for a group of aligned macromolecules: counts gaps, computes the
/// gap penalties and then the residue weights.
/// </summary>
/// <param name="macromolecules">The group to profile. The first element decides whether the nucleotide or the amino-acid residue codes are used.</param>
/// <param name="length">Profile length passed to the <see cref="Profile"/> constructor.</param>
/// <param name="gapOpeningCoefficient">Passed through to <c>CalculateGapPenalties</c>.</param>
/// <param name="gapExtensionCoefficient">Passed through to <c>CalculateGapPenalties</c>.</param>
/// <param name="subMatrix">Substitution matrix used for the residue weights, or <c>null</c> to compute them without one.</param>
/// <returns>The populated profile.</returns>
public static Profile Calculate(ReadOnlyCollection<AlignedMacromolecule> macromolecules, int length, double gapOpeningCoefficient, double gapExtensionCoefficient, SubstitutionMatrix.SubstitutionMatrix subMatrix)
{
    // For the moment, structure penalties and gap penalty masks are ignored.
    // The residue type is taken from the first macromolecule (a more global parameter may be better).
    char[] alphabet = macromolecules[0].IsNucleicAcid
        ? Routines.nucleotideCodesPlusGaps
        : Routines.aminoAcidCodesPlusGaps;

    Profile result = new Profile(length, alphabet, macromolecules);
    result.CountGaps();
    result.CalculateGapPenalties(gapOpeningCoefficient, gapExtensionCoefficient);

    if (subMatrix == null)
    {
        // No substitution matrix was supplied: use the overload that takes only the residue codes.
        result.CalculateResidueWeights(alphabet);
        return result;
    }

    result.CalculateResidueWeights(alphabet, ref subMatrix);
    return result;
}