Ejemplo n.º 1
0
        /// <summary>
        /// Computes the percent identity of two aligned macromolecules: the number of compared
        /// positions whose sequence elements are equal, as a percentage of the length of
        /// <paramref name="macromolecule1"/>'s sequence.
        /// </summary>
        /// <remarks>
        /// The denominator is always the first sequence's length, so swapping the arguments can
        /// change the result.
        /// </remarks>
        /// <param name="macromolecule1">First macromolecule; its sequence length is the denominator.</param>
        /// <param name="macromolecule2">Second macromolecule.</param>
        /// <returns>
        /// 100 * matched / (length of the first sequence), or 0 when the first sequence is empty
        /// (previously this case evaluated 0.0 / 0.0 and returned NaN).
        /// </returns>
        static double CalculatePercentIdentity(AlignedMacromolecule macromolecule1, AlignedMacromolecule macromolecule2)
        {
            int length1 = macromolecule1.Sequence.Length;

            // Nothing can match against an empty sequence; avoid the 0/0 division below.
            if (length1 == 0)
            {
                return 0;
            }

            int length2 = macromolecule2.Sequence.Length;
            int matched = 0;

            for (int i = 0, j = 0; i < length1 && j < length2; i++, j++)
            {
                bool endFound = false;

                // The two cursors point at different aligned positions: let the helper move them.
                // NOTE(review): presumably this advances i and j to the next pair of equal aligned
                // positions and returns true when no such pair remains - confirm against
                // FindNextAlignedPositions.
                if (macromolecule1.AlignedPositions[i] - macromolecule2.AlignedPositions[j] != 0)
                {
                    endFound = FindNextAlignedPositions(ref i, ref j, macromolecule1.AlignedPositions, macromolecule2.AlignedPositions);
                }

                // endFound is tested first so the sequences are not indexed once the helper
                // reports that the end was reached.
                if (!endFound && macromolecule1.Sequence[i] == macromolecule2.Sequence[j])
                {
                    matched++;
                }
            }

            return 100 * (double)matched / (double)length1;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Remove-first iteration strategy (work in progress): for up to a fixed number of
        /// iterations, each macromolecule is taken out of a working copy of the alignment in turn,
        /// gaps are cleaned up, a simple distance matrix is computed, and the macromolecule is put
        /// back. The re-alignment, scoring and commit steps are not implemented yet.
        /// </summary>
        /// <remarks>
        /// Because nothing sets the improvement flags yet, the outer loop currently stops after
        /// its first pass and nothing is committed.
        /// </remarks>
        /// <param name="macromolecules">The aligned macromolecules; each one is copied via Copy() before processing.</param>
        public void DoRemoveFirstIteration(List <AlignedMacromolecule> macromolecules)
        {
            // Remove-first iteration strategy
            // Optimize the alignment score by progressively removing sequences
            // each time a sequence is removed, remove all-gap columns and do the profileAlignment again
            // if the fit is better, keep the new alignment. If not, discard it.


            // Rewritten:
            // 1. "Remove" a sequence from the profile by placing it in the first position
            //    We can do without this because we'll be using lists.
            // 2. Remove any gap-only columns from both profiles
            // 3. Calculate a simple distance matrix (find the percent identity and then convert to proportion nonidentical)
            // 4. Clustal here uses a temporary tree file, but unclear what it is doing with it.
            // 5. Call GetWeightsFromProfileAlign from the Tree Interface
            //    This function takes the alignment, the simple distance matrix, the profile tree names,
            //    the profile weights, and "false" for useTree1 and useTree2
            //    a. call GenerateTree(distanceMatrix) for each profile
            //    b. generate a new MultipleAlignment object
            //    c. Call MultipleAlignment.CalcPairwiseForProfileAlign
            //       i. uses MyersMillerProfileAlign
            //

            // Oh wait, in this case, profile1 is the removed sequence, profile2 is the rest

            //*****************
            // Okay, here's what needs to happen.
            // I need to remove a macromolecule from the list and then remove all-gap columns from both.
            // Then I need to calculate a simple distance matrix for them, get their weights,
            // and then do an alignment on them and compare the new score to the old score.
            // ---> If the new score is better, save this state and score value.
            // Now we start fresh from the original alignment and remove the 2nd macromolecule...
            //
            // So I can loop through the main list of macromolecules
            // I will also maintain a second list of the same that I will remove and add
            // from each cycle.
            // Then within the loop I will make copies of everything and process them.
            // The copies are saved as "best so far" if the score improves.
            // If the score doesn't improve, they are left untouched.
            //
            // What I can't tell though is what is being saved between iterations that
            // makes it an iteration, rather than just a repeat.
            //     So this module receives a new Alignment object which is shared across iterations,
            //     Note that the alignments are "reset" prior to the MSA
            // It *seems* like the strategy should be:
            //     if removing one improves, then try again up to the number of iterations
            //     Alternatively, it might actually mean removing each of the sequences in turn
            //     When no more improvement occurs, it stops and reverts to the alignment from
            //     the best-scoring round. The latter appears to make the most sense.
            //
            // So really, I shouldn't be starting fresh each round.
            // Which means it's okay to add back the altered removed sequence, but how to
            // keep track of which are being removed?
            // Well as long as I'm not altering the main collection, the details will just
            // be the thing to change.
            // And I'll only need to make copies of the current alignment when I find a new best score.



            int iterations = 3; // will be a user parameter

            // First, make a copy of the alignment part of the macromolecules:
            List <AlignedMacromolecule> activeMacromolecules = new List <AlignedMacromolecule>();

            foreach (AlignedMacromolecule macromolecule in macromolecules)
            {
                activeMacromolecules.Add(macromolecule.Copy());
            }

            // Snapshot of every copied macromolecule in its original order. This fixed array is
            // what gets enumerated below, because activeMacromolecules is mutated inside the loop.
            AlignedMacromolecule[] allMacromolecules = new AlignedMacromolecule[activeMacromolecules.Count];
            activeMacromolecules.CopyTo(allMacromolecules);

            int numSequences = allMacromolecules.Length;

            bool improved = false; // Indicates whether any improvement has been made

            for (int i = 0; i < iterations; i++)
            {
                bool improvedThisIteration = false; // Indicates whether any improvement has been made in the current iteration

                // Enumerate the snapshot, not activeMacromolecules: removing from and adding to a
                // List<T> while a foreach is running over it throws InvalidOperationException.
                foreach (AlignedMacromolecule removedMacromolecule in allMacromolecules)
                {
                    activeMacromolecules.Remove(removedMacromolecule);

                    // Remove gaps from the removed sequence and gaps made unnecessary by the removal in the rest.
                    Alignment.RemoveRedundantGaps(activeMacromolecules);
                    removedMacromolecule.ClearGaps();

                    // Calculate simple distance matrix: proportion of non-identical positions
                    // for every ordered pair (j, k), including the diagonal.
                    double[,] distanceMatrix = new double[numSequences, numSequences];
                    for (int j = 0; j < numSequences; j++)
                    {
                        for (int k = 0; k < numSequences; k++)
                        {
                            double percentIdentity = CalculatePercentIdentity(allMacromolecules[j], allMacromolecules[k]);
                            distanceMatrix[j, k] = (100.0 - percentIdentity) / 100.0;
                        }
                    }

                    // Calculate weights

                    // Here Clustal calls TreeInterface.GetWeightsForProfileAlign(Alignment, DistMatrix, treeName1, treeWeights1, treeName2, treeWeights2, numSeqs, profile1_numSeqs, useTree1, useTree2, success)
                    //     doesn't itself do anything but call next subroutine
                    //     --> GetWeightsForProfileAlignNJ(same as above)
                    //         if (!useTree1 && profile1_numSeqs >= 2) --> GenerateTreeFromDistMatNJ(DistMatrix, Alignment, Profile1_sequences, treeName1, success)
                    //         if (!useTree2 && profile2_numSeqs >= 2) --> GenerateTreeFromDistMatNJ(DistMatrix, Alignment, Profile2_sequences, treeName2, success)
                    //         --> MSA.CalcPairwiseForProfileAlign(Alignment, DistMatrix)
                    //             Does a few things and then
                    //             --> MyersMillerProfileAlign.ProfileAlign -- this would be the normal one
                    //             And wraps up
                    //         if (profile1_numSeqs >= 2) --> Tree1.ReadTree
                    //         --> Tree1.CalcSeqWeights
                    //         if (profile2_numSeqs >= 2) --> Tree2.ReadTree
                    //         --> Tree2.CalcSeqWeights
                    //         convert distances to similarities

                    // Tree.GuideTree<AlignedMacromolecule> tree = Tree.GuideTree<AlignedMacromolecule>.GetWeights(ref distanceMatrix, allMacromolecules);
                    // MultipleAlignment.PairwiseAlign(tree, distanceMatrix, allMacromolecules); // Right name? I think it is supposed to return aligned macromolecules



                    // "Reset" the profiles (does this do anything?)


                    // Do multiple sequence alignment


                    // Check score
                    //     If better, save current alignments
                    //     TODO: set improved / improvedThisIteration here once scoring exists.



                    // Put the removed macromolecule back (it is appended at the end of the list).
                    activeMacromolecules.Add(removedMacromolecule);
                }

                if (!improvedThisIteration)
                {
                    // If we haven't improved the score this past iteration, no point in continuing
                    break;
                }
            }

            if (improved)
            {
                // If we've found an improved alignment, commit the best one.
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Opens the file given by <paramref name="fileName"/>, reads it with the parser chosen by
        /// DetermineFileType, and passes the loaded macromolecules back as an output parameter.
        /// In addition, it removes empty sequences and replaces blank names with "Unnamed Sequence".
        /// </summary>
        /// <param name="fileName">Path of the sequence file to load.</param>
        /// <param name="loadedAlignedMacromolecules">
        /// Receives the loaded macromolecules; it is assigned an empty list before the file is opened.
        /// </param>
        /// <returns>
        /// NoFileName when <paramref name="fileName"/> is null or empty; FileNotFound or IOException
        /// for the corresponding caught exceptions; any non-OK code reported by the parser;
        /// NoSequencesInFile when nothing was loaded; NoNonEmptySequencesInFile when every loaded
        /// sequence was empty; EmptySequencesRemoved when some (but not all) were empty;
        /// otherwise the parser's return code.
        /// </returns>
        static ReturnCodes LoadSequencesFromFile(string fileName, out List <AlignedMacromolecule> loadedAlignedMacromolecules)
        {
            ReturnCodes        returnCode = ReturnCodes.OK;
            SequenceFileParser fileParser;
            StreamReader       fileReader;

            loadedAlignedMacromolecules = new List <AlignedMacromolecule>();

            // Check that fileName was actually supplied (null is treated like an empty name).
            if (string.IsNullOrEmpty(fileName))
            {
                return(ReturnCodes.NoFileName);
            }

            // IO operations are vulnerable to file system errors.
            try
            {
                // Open the file
                fileReader = new StreamReader(fileName);

                try
                {
                    // Determine the file type to choose the correct parser.
                    // The reader is passed by ref, so a using statement cannot be used here.
                    fileParser = DetermineFileType(ref fileReader);

                    // Read the file
                    returnCode = fileParser.ReadFile(out loadedAlignedMacromolecules);
                }
                catch (FileNotFoundException)
                {
                    return(ReturnCodes.FileNotFound);
                }
                catch (IOException)
                {
                    return(ReturnCodes.IOException);
                }
                finally
                {
                    // The file is open, so we need to close it
                    fileReader.Close();
                }
            }
            catch (FileNotFoundException)
            {
                // File wasn't opened, so just return the error code.
                // NOTE(review): other failures while opening (e.g. DirectoryNotFoundException)
                // still propagate to the caller - confirm whether that is intended.
                return(ReturnCodes.FileNotFound);
            }

            if (returnCode != ReturnCodes.OK)
            {
                return(returnCode);
            }

            // Check that at least 1 sequence was loaded
            if (loadedAlignedMacromolecules.Count == 0)
            {
                return(ReturnCodes.NoSequencesInFile);
            }

            // Remove every empty sequence and set the return code to reflect this.
            // Walk backwards so that removing an entry does not shift the entries still to be
            // visited; a forward loop would skip the element following each removal.
            for (int i = loadedAlignedMacromolecules.Count - 1; i >= 0; i--)
            {
                if (loadedAlignedMacromolecules[i].Sequence.Length == 0) // empty sequence
                {
                    returnCode = ReturnCodes.EmptySequencesRemoved;
                    loadedAlignedMacromolecules.RemoveAt(i);
                }
            }

            // Check that we still have at least 1 sequence
            if (loadedAlignedMacromolecules.Count == 0)
            {
                return(ReturnCodes.NoNonEmptySequencesInFile);
            }

            // Set any no-name molecules to "Unnamed Sequence"
            foreach (AlignedMacromolecule alignedMacromolecule in loadedAlignedMacromolecules)
            {
                if (string.IsNullOrEmpty(alignedMacromolecule.Name)) // no name
                {
                    alignedMacromolecule.Name = "Unnamed Sequence";
                }
            }

            return(returnCode);
        }