Example #1
0
        /// <summary>
        /// Performs the progressive alignment by running one profile alignment per entry of
        /// <paramref name="steps"/>, in the order given.
        /// </summary>
        /// <param name="alignmentObject">Alignment handed by reference to every alignment step.</param>
        /// <param name="subMatrix">Substitution matrix series forwarded to the alignment algorithm.</param>
        /// <param name="similarityMatrix">Not used by the current implementation.</param>
        /// <param name="leavesList">Not used by the current implementation.</param>
        /// <param name="steps">The alignment steps; each one holds the two groups of sequences to align and a double value, and is forwarded as-is to the alignment algorithm.</param>
        /// <exception cref="ArgumentNullException"><paramref name="steps"/> is null.</exception>
        public static void AlignSequences(ref Alignment alignmentObject, SubstitutionMatrix.SubstitutionMatrixSeries subMatrix, ref double[,] similarityMatrix, ReadOnlyCollection <Tree.GuideTree <AlignedMacromolecule> .TreeLeaf> leavesList, ReadOnlyCollection <Tuple <ReadOnlyCollection <AlignedMacromolecule>, ReadOnlyCollection <AlignedMacromolecule>, double> > steps)
        {
            if (steps == null)
            {
                throw new ArgumentNullException("steps");
            }

            IMultipleAlignmentAlgorithm alignmentAlgorithm = new MyersMillerProfileAlign();

            // Here, Clustal checks the similarity matrix and looks up the most closely-related sequence
            // for each sequence.
            // But it could have used the distance matrix and found the minimum distance. Consider switching to avoid unnecessary calculations

            foreach (Tuple <ReadOnlyCollection <AlignedMacromolecule>, ReadOnlyCollection <AlignedMacromolecule>, double> step in steps)
            {
                // Here, Clustal would check to make sure that at least one pair of sequences isn't too divergent.
                // It also records which ones are too divergent so that the alignment doesn't use them.

                // The value returned for each step is not used by this method.
                alignmentAlgorithm.Align(ref alignmentObject, subMatrix, step);
            }
        }
Example #2
0
        // Residue alphabet for this aligner, currently hard-wired to Routines.nucleotideCodesPlusGaps.
        // TODO: this should be the gap-inclusive alphabet for the molecule type in use; select it programmatically.
        char[] residueCodes   = Routines.nucleotideCodesPlusGaps;

        /// <summary>
        /// Aligns the two groups of sequences held in <paramref name="step"/> against each other as profiles,
        /// using this class's ProgDiff, ProgTracepath and addGGaps routines.
        /// </summary>
        /// <param name="alignmentObject">Alignment being built; its IsNucleicAcid flag selects the gap-penalty formulas.</param>
        /// <param name="subMatrixClass">Source of the substitution matrix and of the gap-penalty scale factor.</param>
        /// <param name="step">Item1 and Item2 are the two groups to align; Item3 is their mean similarity.</param>
        /// <returns>0 when either group is empty; otherwise the score returned by ProgDiff.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="step"/> is null.</exception>
        public double Align(ref Alignment alignmentObject, SubstitutionMatrix.SubstitutionMatrixSeries subMatrixClass, Tuple <ReadOnlyCollection <AlignedMacromolecule>, ReadOnlyCollection <AlignedMacromolecule>, double> step)
        {
            if (step == null)
            {
                throw new ArgumentNullException("step");
            }

            double meanSimilarity = step.Item3;

            // If one of the groups has no alignable sequences, can't do anything!
            if (step.Item1.Count == 0 || step.Item2.Count == 0)
            {
                return 0;
            }

            // Make the group with the most sequences group 1
            // Figure out the structure penalties here
            bool switchGroups = step.Item2.Count > step.Item1.Count;
            ReadOnlyCollection<AlignedMacromolecule> group1 = switchGroups ? step.Item2 : step.Item1;
            ReadOnlyCollection<AlignedMacromolecule> group2 = switchGroups ? step.Item1 : step.Item2;

            int group1MaxLength = MaxAlignedLength(group1);
            int group2MaxLength = MaxAlignedLength(group2);

            maxLength = group1MaxLength + group2MaxLength + 2; // Clustal sets the Alignment parameter here.
            displ     = new int[maxLength + 1];
            alnPath1  = new int[maxLength + 1];
            alnPath2  = new int[maxLength + 1];

            // Calculate the real length of profiles, removing gaps.
            // Each mean is taken over the group's own size, so it stays correct when the groups were switched above.
            int group1NoGapsLength = MeanUngappedLength(group1);
            int group2NoGapsLength = MeanUngappedLength(group2);
            int minNoGapsLength    = Math.Min(group1NoGapsLength, group2NoGapsLength);

            // I'm just going to reproduce the Clustal code directly. May recode later once I fully understand what it does.

            // There is quite literally no point to the scaleVals.scale value, and no point to setting this here.
            // Tuple<double, double> scaleVals = Tuple.Create(1.0, 100.0); // scaleVals.scale and scaleVals.intScale, respectively.

            double gapOpeningCoefficient, gapExtensionCoefficient;
            double gapOpenPenalty   = 10.0; // Will be user selectable parameter
            double gapExtendPenalty = 0.2;  // Will be user selectable parameter

            // Will be user option
            bool useNegative = false;

            SubstitutionMatrix.SubstitutionMatrix subMatrix = subMatrixClass.GetMatrix(meanSimilarity, minNoGapsLength, useNegative);

            double gapPenaltyScaleFactor = subMatrixClass.GetScaleFactor(meanSimilarity, useNegative);

            // Then it sets DNA vs. Protein parameters
            if (alignmentObject.IsNucleicAcid)
            {
                gapOpeningCoefficient   = 100.0 * gapOpenPenalty * gapPenaltyScaleFactor;
                gapExtensionCoefficient = 100.0 * gapExtendPenalty * gapPenaltyScaleFactor;
            }
            else
            {
                gapOpeningCoefficient   = CalculateProteinGapOpeningCoefficient(gapOpenPenalty, group1NoGapsLength, group2NoGapsLength, subMatrix.AverageScore, gapPenaltyScaleFactor);
                gapExtensionCoefficient = 100.0 * gapExtendPenalty;
            }

            // We need one profile with substitution matrix information and one without!
            // But this will change when we have the LE scoring function. (Whatever that is.)

            // Calculate the profile arrays. The first group, but not the second, incorporates substitution information.
            group1Profile = Profile.Calculate(group1, group1MaxLength, gapOpeningCoefficient, gapExtensionCoefficient, subMatrix);
            group2Profile = Profile.Calculate(group2, group2MaxLength, gapOpeningCoefficient, gapExtensionCoefficient, null);

            // Use Myers and Miller to align the two profiles
            double score = ProgDiff(0, 0, group1MaxLength, group2MaxLength, group1Profile.GapOpeningPenalties[0], group1Profile.GapExtensionPenalties[0]);

            alignmentLength = ProgTracepath();

            addGGaps();

            // TODO: combine the sequences and submit for iterations (Clustal: DoRemoveFirstIteration() == TREE ???).

            // NOTE(review): the gap coefficients above carry a factor of 100, so this score is presumably
            // on the same scaled range; confirm whether callers expect it rescaled.
            return score;
        }

        // Returns the largest (last aligned position + 1) over all members, or 0 when that would be smaller.
        private static int MaxAlignedLength(ReadOnlyCollection<AlignedMacromolecule> members)
        {
            int longest = 0;

            foreach (AlignedMacromolecule macromolecule in members)
            {
                longest = Math.Max(longest, macromolecule.AlignedPositions.Last() + 1);
            }

            return longest;
        }

        // Returns the mean Sequence.Length of the members, truncated to an int. Callers pass a non-empty group.
        private static int MeanUngappedLength(ReadOnlyCollection<AlignedMacromolecule> members)
        {
            int totalLength = 0;

            foreach (AlignedMacromolecule macromolecule in members)
            {
                totalLength += macromolecule.Sequence.Length;
            }

            return (int)(totalLength / (double)members.Count);
        }