Ejemplo n.º 1
0
        /// <summary>
        /// Generate a symmetric distance matrix from a set of unaligned sequences.
        /// </summary>
        /// <remarks>
        /// Works in two passes. First a normalized k-mer count dictionary is built for every
        /// sequence and stored in _allCountsDictionary. Then the distance score of every pair
        /// of sequences is computed and written to both [row, col] and [col, row] of
        /// _distanceMatrix. Both passes run under PAMSAMMultipleSequenceAligner.ParallelOption.
        /// </remarks>
        /// <param name="sequences">a set of unaligned sequences</param>
        /// <exception cref="ArgumentNullException">sequences is null</exception>
        /// <exception cref="Exception">
        /// memory ran out while the k-mer counts were generated; the original
        /// OutOfMemoryException is available as the inner exception
        /// </exception>
        public void GenerateDistanceMatrix(IList <ISequence> sequences)
        {
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            // Generate k-mer counting dictionary for each sequence
            try
            {
                _allCountsDictionary = new Dictionary <string, float> [sequences.Count];

                Parallel.For(0, sequences.Count, PAMSAMMultipleSequenceAligner.ParallelOption, i =>
                {
                    Dictionary <string, float> currentDictionary = KmerDistanceScoreCalculator.CalculateKmerCounting(sequences[i], _kmerLength);
                    MsaUtils.Normalize(currentDictionary);

                    // Each iteration writes only its own slot, so no locking is needed.
                    _allCountsDictionary[i] = currentDictionary;
                });
            }
            catch (OutOfMemoryException ex)
            {
                // Raised on the calling thread (e.g. by the array allocation above).
                // Keep the original exception as the inner exception so the cause is not lost.
                throw new Exception("Out of memory when generating kmer counting", ex);
            }
            catch (AggregateException ex)
            {
                // Exceptions thrown inside the Parallel.For body arrive wrapped in an
                // AggregateException; translate an out-of-memory failure the same way.
                foreach (Exception inner in ex.Flatten().InnerExceptions)
                {
                    if (inner is OutOfMemoryException)
                    {
                        throw new Exception("Out of memory when generating kmer counting", inner);
                    }
                }

                throw;
            }

            // Construct a SymmetricDistanceMatrix
            // with dimension equals to the number of sequences
            _distanceMatrix = new SymmetricDistanceMatrix(sequences.Count);

            // Fill in DistanceMatrix: only pairs with col < row are computed, and each
            // score is mirrored so the matrix stays symmetric.
            Parallel.For(1, sequences.Count, PAMSAMMultipleSequenceAligner.ParallelOption, row =>
            {
                for (int col = 0; col < row; ++col)
                {
                    float distanceScore = _kmerScoreCalculator.CalculateDistanceScore
                                              (_allCountsDictionary[row], _allCountsDictionary[col]);
                    _distanceMatrix[row, col] = distanceScore;
                    _distanceMatrix[col, row] = distanceScore;
                }
            });
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Generate IProfiles from a set of aligned sequences.
        /// </summary>
        /// <remarks>
        /// For every alignment column the weight of each sequence is added to the profile
        /// entry of the symbol found in that column. An ambiguous symbol contributes the
        /// weight to every symbol listed for it in AmbiguousCharactersMap. Each column is
        /// normalized afterwards. The weights array is normalized in place.
        /// </remarks>
        /// <param name="sequences">a set of aligned sequences</param>
        /// <param name="weights">sequence weights, one per sequence in enumeration order</param>
        /// <returns>one profile vector per alignment column</returns>
        /// <exception cref="ArgumentNullException">sequences or weights is null</exception>
        /// <exception cref="ArgumentException">
        /// the number of weights differs from the number of sequences, no sequences were
        /// given, or the sequences differ in length or alphabet
        /// </exception>
        public static IProfiles GenerateProfiles(ICollection <ISequence> sequences, float[] weights)
        {
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }
            if (weights == null)
            {
                throw new ArgumentNullException("weights");
            }
            if (sequences.Count != weights.Length)
            {
                throw new ArgumentException("Invalid inputs");
            }
            if (sequences.Count == 0)
            {
                throw new ArgumentException("Empty input sequences");
            }

            MsaUtils.Normalize(weights);

            // The first sequence defines the expected length and alphabet; all the
            // others must match it.
            int       sequenceLength = 0;
            IAlphabet alphabet       = null;
            bool      isFirst        = true;

            foreach (ISequence sequence in sequences)
            {
                if (isFirst)
                {
                    sequenceLength = (int)sequence.Count;
                    alphabet       = sequence.Alphabet;
                    isFirst        = false;
                    continue;
                }

                if (sequence.Count != sequenceLength)
                {
                    throw new ArgumentException("Input sequences are not aligned");
                }
                if (sequence.Alphabet != alphabet)
                {
                    throw new ArgumentException("Input sequences use different alphabets");
                }
            }

            // each row is a column; each column is a profile
            int colSize = (ItemSet.Count + 1) / 2;

            IProfiles profiles = new Profiles(sequenceLength, colSize);

            for (int col = 0; col < sequenceLength; ++col)
            {
                // Weights are per sequence, so they are looked up by the position of the
                // sequence in the collection, not by the column index.
                int sequenceIndex = 0;

                foreach (ISequence sequence in sequences)
                {
                    float weight = weights[sequenceIndex];
                    ++sequenceIndex;

                    var symbol = sequence[col];

                    if (sequence.Alphabet.CheckIsAmbiguous(symbol))
                    {
                        // Spread the weight over every symbol the ambiguous one stands for.
                        var baseSymbols = AmbiguousCharactersMap[symbol];
                        for (int b = 0; b < baseSymbols.Count; ++b)
                        {
                            profiles[col][ItemSet[baseSymbols[b]]] += weight;
                        }
                    }
                    else
                    {
                        profiles[col][ItemSet[symbol]] += weight;
                    }
                }

                MsaUtils.Normalize(profiles[col]);
            }

            profiles.ColumnSize = colSize;
            profiles.RowSize    = sequenceLength;
            return(profiles);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Combine two profiles with alignment array results from dynamic programming algorithm.
        /// The dynamic programming algorithm returns two arrays containing the alignment operations
        /// on the two profiles. This method applies the operation information in the two arrays to
        /// the two original profiles, and combines them into a new aligned profile.
        /// </summary>
        /// <remarks>
        /// Every output column is
        /// (columnA * numberOfSequencesA * weights[0] + columnB * numberOfSequencesB * weights[1])
        /// / (numberOfSequencesA + numberOfSequencesB),
        /// where a side that holds the gap code at that position contributes a gap-only
        /// column (1 in the last entry, 0 elsewhere). The weights array is normalized in place.
        /// </remarks>
        /// <param name="profileA">first profile</param>
        /// <param name="profileB">second profile</param>
        /// <param name="numberOfSequencesA">the number of sequences in the first profile</param>
        /// <param name="numberOfSequencesB">the number of sequences in the second profile</param>
        /// <param name="aAligned">aligned integer array generated by dynamic programming</param>
        /// <param name="bAligned">aligned integer array generated by dynamic programming</param>
        /// <param name="gapCode">the gap integer code defined in dynamic programming class</param>
        /// <param name="weights">the weights of two profiles: [0] for the first, [1] for the second</param>
        /// <returns>the combined profile, one column vector per aligned position</returns>
        /// <exception cref="ArgumentNullException">a profile or array argument is null</exception>
        /// <exception cref="ArgumentException">
        /// the aligned arrays differ in length, or fewer than two weights were given
        /// </exception>
        /// <exception cref="Exception">both arrays hold the gap code at the same position</exception>
        public static IProfiles GenerateProfiles(
            IProfiles profileA,
            IProfiles profileB,
            int numberOfSequencesA,
            int numberOfSequencesB,
            int[] aAligned,
            int[] bAligned,
            int gapCode,
            float[] weights)
        {
            if (profileA == null)
            {
                throw new ArgumentNullException("profileA");
            }
            if (profileB == null)
            {
                throw new ArgumentNullException("profileB");
            }
            if (aAligned == null)
            {
                throw new ArgumentNullException("aAligned");
            }
            if (bAligned == null)
            {
                throw new ArgumentNullException("bAligned");
            }
            if (weights == null)
            {
                throw new ArgumentNullException("weights");
            }
            if (aAligned.Length != bAligned.Length)
            {
                throw new ArgumentException("not aligned sequences");
            }
            if (weights.Length < 2)
            {
                throw new ArgumentException("Two profile weights are required", "weights");
            }

            IProfiles profiles = new Profiles(aAligned.Length, profileA.ColumnSize);

            MsaUtils.Normalize(weights);

            // a profile with gap only
            float[] gapProfile = new float[profiles.ColumnSize];
            gapProfile[gapProfile.Length - 1] = 1;

            int totalSequences = numberOfSequencesA + numberOfSequencesB;

            for (int i = 0; i < aAligned.Length; ++i)
            {
                bool gapInA = aAligned[i] == gapCode;
                bool gapInB = bAligned[i] == gapCode;

                if (gapInA && gapInB)
                {
                    throw new Exception("Both positions are gap between two sets of sequences");
                }

                for (int j = 0; j < profiles.ColumnSize; ++j)
                {
                    // Whichever side is gapped contributes the gap-only column, but always
                    // scaled by its OWN sequence count and weight.
                    float valueA = gapInA ? gapProfile[j] : profileA[aAligned[i]][j];
                    float valueB = gapInB ? gapProfile[j] : profileB[bAligned[i]][j];

                    profiles[i][j] = ((valueA * numberOfSequencesA * weights[0]) + (valueB * numberOfSequencesB * weights[1]))
                                     / totalSequences;
                }
            }
            return(profiles);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Runs the whole alignment pipeline (Stage 1, 2, and 3 as described in class
        /// description) on the given sequences and wraps the outcome in a single
        /// sequence alignment.
        /// </summary>
        /// <remarks>
        /// Results of any earlier run on this instance are cleared first. A GapOpenCost or
        /// GapExtensionCost of 0 is treated as "not assigned" and replaced by -4 and -1
        /// respectively.
        /// </remarks>
        /// <param name="inputSequences">Input sequences</param>
        /// <returns>
        /// A list holding one alignment that contains the input sequences and one aligned
        /// sequence group built from AlignedSequences.
        /// </returns>
        /// <exception cref="ArgumentException">no input sequences were supplied</exception>
        public IList <Alignment.ISequenceAlignment> Align(IEnumerable <ISequence> inputSequences)
        {
            // Forget everything a previous run on this instance may have left behind.
            _alignedSequencesC = null;
            _alignedSequencesB = null;
            _alignedSequencesA = null;
            _alignedSequences  = null;

            _alignmentScoreC = float.MinValue;
            _alignmentScoreB = float.MinValue;
            _alignmentScoreA = float.MinValue;
            _alignmentScore  = float.MinValue;

            // Materialize the input once so it can be counted and enumerated repeatedly.
            List <ISequence> sequences = inputSequences.ToList();
            if (sequences.Count < 1)
            {
                throw new ArgumentException("Empty input sequences");
            }

            // A cost of zero means the caller never set it: fall back to the defaults.
            if (GapOpenCost == 0)
            {
                GapOpenCost = -4;
            }

            if (GapExtensionCost == 0)
            {
                GapExtensionCost = -1;
            }

            Performance.Start();

            // Determine the alphabet and prepare the profile item sets for it.
            SetAlphabet(sequences, SimilarityMatrix, true);
            MsaUtils.SetProfileItemSets(_alphabet);

            Performance.Snapshot("Start Aligning");

            // The actual alignment work.
            DoAlignment(sequences);

            // Package the outcome the same way the pairwise aligners do.
            var result = new Alignment.SequenceAlignment();
            IAlignedSequence alignedGroup = new AlignedSequence();

            foreach (var aligned in AlignedSequences)
            {
                alignedGroup.Sequences.Add(aligned);
            }

            foreach (var original in sequences)
            {
                result.Sequences.Add(original);
            }

            result.AlignedSequences.Add(alignedGroup);

            var results = new List <Alignment.ISequenceAlignment>();
            results.Add(result);
            return(results);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Performs Stage 1, 2, and 3 as described in class description.
        /// </summary>
        /// <remarks>
        /// Stage 1 builds a guide tree from k-mer distances and aligns progressively (result A).
        /// When PAMSAMMultipleSequenceAligner.FasterVersion is set, results B and C are copies
        /// of A. Otherwise Stage 2 (only when UseStageB is set) rebuilds the tree from Kimura
        /// distances of the current alignment and aligns again (result B), and Stage 3 cuts the
        /// tree at each edge, re-aligns the two resulting profiles and keeps the outcome in
        /// result C. After each stage the best-scoring alignment so far is stored in
        /// _alignedSequences and _alignmentScore.
        /// </remarks>
        /// <param name="inputSequences">the sequences to align</param>
        /// <returns>
        /// An empty list. This implementation does not populate the returned list; the
        /// alignment is available through the instance fields described above.
        /// </returns>
        /// <exception cref="ArgumentException">the configured profile aligner name is not supported</exception>
        public IList <Bio.Algorithms.Alignment.ISequenceAlignment> Align(IEnumerable <ISequence> inputSequences)
        {
            List <ISequence> sequences = inputSequences.ToList();

            // Start every run from a clean slate so that scores and alignments left over
            // from an earlier call on the same instance cannot win the comparisons below.
            _alignedSequencesA = null;
            _alignedSequencesB = null;
            _alignedSequencesC = null;
            _alignmentScore    = float.MinValue;
            _alignmentScoreA   = float.MinValue;
            _alignmentScoreB   = float.MinValue;
            _alignmentScoreC   = float.MinValue;

            // Initializations
            if (sequences.Count > 0)
            {
                if (ConsensusResolver == null)
                {
                    ConsensusResolver = new SimpleConsensusResolver(_alphabet);
                }
                else
                {
                    ConsensusResolver.SequenceAlphabet = _alphabet;
                }
            }

            // Get ProfileAligner ready: serial flavour when the degree of parallelism is 1,
            // parallel flavour otherwise.
            IProfileAligner profileAligner;

            switch (_profileAlignerName)
            {
            case (ProfileAlignerNames.NeedlemanWunschProfileAligner):
                if (_degreeOfParallelism == 1)
                {
                    profileAligner = new NeedlemanWunschProfileAlignerSerial(
                        SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);
                }
                else
                {
                    profileAligner = new NeedlemanWunschProfileAlignerParallel(
                        SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);
                }
                break;

            case (ProfileAlignerNames.SmithWatermanProfileAligner):
                if (_degreeOfParallelism == 1)
                {
                    profileAligner = new SmithWatermanProfileAlignerSerial(
                        SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);
                }
                else
                {
                    profileAligner = new SmithWatermanProfileAlignerParallel(
                        SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);
                }
                break;

            default:
                throw new ArgumentException("Invalid profile aligner name");
            }

            _alignedSequences = new List <ISequence>(sequences.Count);
            float currentScore;

            // STAGE 1

            Performance.Snapshot("Stage 1");
            // Generate DistanceMatrix
            KmerDistanceMatrixGenerator kmerDistanceMatrixGenerator =
                new KmerDistanceMatrixGenerator(sequences, _kmerLength, _alphabet, _distanceFunctionName);

            // Hierarchical clustering
            IHierarchicalClustering hierarchicalClustering =
                new HierarchicalClusteringParallel
                    (kmerDistanceMatrixGenerator.DistanceMatrix, _hierarchicalClusteringMethodName);

            // Generate Guide Tree
            BinaryGuideTree binaryGuideTree =
                new BinaryGuideTree(hierarchicalClustering);

            // Progressive Alignment
            IProgressiveAligner progressiveAlignerA = new ProgressiveAligner(profileAligner);

            progressiveAlignerA.Align(sequences, binaryGuideTree);

            currentScore = MsaUtils.MultipleAlignmentScoreFunction(progressiveAlignerA.AlignedSequences, SimilarityMatrix, GapOpenCost, GapExtensionCost);
            if (currentScore > _alignmentScoreA)
            {
                _alignmentScoreA   = currentScore;
                _alignedSequencesA = progressiveAlignerA.AlignedSequences;
            }
            if (_alignmentScoreA > _alignmentScore)
            {
                _alignmentScore   = _alignmentScoreA;
                _alignedSequences = _alignedSequencesA;
            }

            if (PAMSAMMultipleSequenceAligner.FasterVersion)
            {
                // Stages 2 and 3 are skipped: their results simply mirror Stage 1.
                _alignedSequencesB = _alignedSequencesA;
                _alignedSequencesC = _alignedSequencesA;
                _alignmentScoreB   = _alignmentScoreA;
                _alignmentScoreC   = _alignmentScoreA;
            }
            else
            {
                BinaryGuideTree               binaryGuideTreeB;
                IHierarchicalClustering       hierarchicalClusteringB;
                KimuraDistanceMatrixGenerator kimuraDistanceMatrixGenerator = new KimuraDistanceMatrixGenerator();

                if (PAMSAMMultipleSequenceAligner.UseStageB)
                {
                    // STAGE 2
                    Performance.Snapshot("Stage 2");

                    // Generate DistanceMatrix from Multiple Sequence Alignment
                    kimuraDistanceMatrixGenerator.GenerateDistanceMatrix(_alignedSequences);

                    // Hierarchical clustering
                    hierarchicalClusteringB = new HierarchicalClusteringParallel
                                                  (kimuraDistanceMatrixGenerator.DistanceMatrix, _hierarchicalClusteringMethodName);

                    // Generate Guide Tree
                    binaryGuideTreeB = new BinaryGuideTree(hierarchicalClusteringB);

                    // The return value is not used here; the call is kept from the original.
                    // NOTE(review): presumably it updates the new tree in place — confirm.
                    BinaryGuideTree.CompareTwoTrees(binaryGuideTreeB, binaryGuideTree);
                    binaryGuideTree = binaryGuideTreeB;

                    // Progressive Alignment
                    IProgressiveAligner progressiveAlignerB = new ProgressiveAligner(profileAligner);
                    progressiveAlignerB.Align(sequences, binaryGuideTreeB);

                    currentScore = MsaUtils.MultipleAlignmentScoreFunction(progressiveAlignerB.AlignedSequences, SimilarityMatrix, GapOpenCost, GapExtensionCost);

                    if (currentScore > _alignmentScoreB)
                    {
                        _alignmentScoreB   = currentScore;
                        _alignedSequencesB = progressiveAlignerB.AlignedSequences;
                    }
                    if (_alignmentScoreB > _alignmentScore)
                    {
                        _alignmentScore   = _alignmentScoreB;
                        _alignedSequences = _alignedSequencesB;
                    }
                }
                else
                {
                    binaryGuideTreeB = binaryGuideTree;
                }


                // STAGE 3
                Performance.Snapshot("Stage 3");

                // Refinement is limited to a single pass, and skipped for two sequences.
                int maxRefineMentTime = 1;
                if (sequences.Count == 2)
                {
                    maxRefineMentTime = 0;
                }

                // Work on a copy of the best alignment so far.
                _alignedSequencesC = new List <ISequence>(sequences.Count);
                for (int i = 0; i < sequences.Count; ++i)
                {
                    _alignedSequencesC.Add(
                        new Sequence(Alphabets.GetAmbiguousAlphabet(_alphabet),
                                     _alignedSequences[i].ToArray())
                    {
                        ID       = _alignedSequences[i].ID,
                        Metadata = _alignedSequences[i].Metadata
                    });
                }

                int refinementTime = 0;
                while (refinementTime < maxRefineMentTime)
                {
                    ++refinementTime;
                    Performance.Snapshot("Refinement iter " + refinementTime.ToString());
                    bool needRefinement = false;
                    for (int edgeIndex = 0; edgeIndex < binaryGuideTreeB.NumberOfEdges; ++edgeIndex)
                    {
                        // Cutting the edge splits the sequences into two sets.
                        List <int>[] leafNodeIndices = binaryGuideTreeB.SeparateSequencesByCuttingTree(edgeIndex);

                        List <int>[]        allIndelPositions;
                        IProfileAlignment[] separatedProfileAlignments = ProfileAlignment.ProfileExtraction(_alignedSequencesC, leafNodeIndices[0], leafNodeIndices[1], out allIndelPositions);
                        List <int>[]        eStrings = new List <int> [2];

                        // The profile with fewer sequences is passed first; eStrings[k]
                        // always belongs to set k regardless of the argument order.
                        if (separatedProfileAlignments[0].NumberOfSequences < separatedProfileAlignments[1].NumberOfSequences)
                        {
                            profileAligner.Align(separatedProfileAlignments[0], separatedProfileAlignments[1]);
                            eStrings[0] = profileAligner.GenerateEString(profileAligner.AlignedA);
                            eStrings[1] = profileAligner.GenerateEString(profileAligner.AlignedB);
                        }
                        else
                        {
                            profileAligner.Align(separatedProfileAlignments[1], separatedProfileAlignments[0]);
                            eStrings[0] = profileAligner.GenerateEString(profileAligner.AlignedB);
                            eStrings[1] = profileAligner.GenerateEString(profileAligner.AlignedA);
                        }

                        for (int set = 0; set < 2; ++set)
                        {
                            Parallel.ForEach(leafNodeIndices[set], PAMSAMMultipleSequenceAligner.parallelOption, i =>
                            {
                                // Keep a handle on the current sequence: its ID and Metadata
                                // must be carried over to the re-aligned replacement.
                                ISequence previous = _alignedSequencesC[i];

                                // Drop the columns recorded in allIndelPositions for this set.
                                List <byte> seqBytes = new List <byte>();

                                int indexAllIndel = 0;
                                for (int j = 0; j < previous.Count; ++j)
                                {
                                    if (indexAllIndel < allIndelPositions[set].Count && j == allIndelPositions[set][indexAllIndel])
                                    {
                                        ++indexAllIndel;
                                    }
                                    else
                                    {
                                        seqBytes.Add(previous[j]);
                                    }
                                }

                                ISequence realigned = profileAligner.GenerateSequenceFromEString(eStrings[set], new Sequence(Alphabets.GetAmbiguousAlphabet(_alphabet), seqBytes.ToArray()));
                                realigned.ID = previous.ID;
                                ((Sequence)realigned).Metadata = previous.Metadata;

                                _alignedSequencesC[i] = realigned;
                            });
                        }

                        currentScore = MsaUtils.MultipleAlignmentScoreFunction(_alignedSequencesC, SimilarityMatrix, GapOpenCost, GapExtensionCost);

                        if (currentScore > _alignmentScoreC)
                        {
                            _alignmentScoreC = currentScore;
                            needRefinement   = true;

                            // recreate the tree
                            kimuraDistanceMatrixGenerator.GenerateDistanceMatrix(_alignedSequencesC);
                            hierarchicalClusteringB = new HierarchicalClusteringParallel
                                                          (kimuraDistanceMatrixGenerator.DistanceMatrix, _hierarchicalClusteringMethodName);

                            binaryGuideTreeB = new BinaryGuideTree(hierarchicalClusteringB);
                            break;
                        }
                    }
                    if (!needRefinement)
                    {
                        // No edge improved the score: further passes cannot help.
                        break;
                    }
                }
                if (_alignmentScoreC > _alignmentScore)
                {
                    _alignmentScore   = _alignmentScoreC;
                    _alignedSequences = _alignedSequencesC;
                }
                Performance.Snapshot("Stop Stage 3");
            }

            //just for the purpose of integrating PW and MSA with the same output
            IList <Bio.Algorithms.Alignment.ISequenceAlignment> results = new List <Bio.Algorithms.Alignment.ISequenceAlignment>();

            return(results);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Construct an aligner, validate its inputs and immediately run the alignment
        /// on the given sequences.
        /// </summary>
        /// <remarks>
        /// The alphabet of the first sequence becomes the alphabet of the aligner; every other
        /// sequence must be from the same base alphabet, and the similarity matrix name must be
        /// one of the names accepted for that alphabet. The degree of parallelism is stored in
        /// the static parallelOption member, so it is shared by all instances.
        /// </remarks>
        /// <param name="sequences">input sequences</param>
        /// <param name="kmerLength">positive integer of kmer length</param>
        /// <param name="distanceFunctionName">enum: distance function name</param>
        /// <param name="hierarchicalClusteringMethodName">enum: cluster update method</param>
        /// <param name="profileAlignerMethodName">enum: profile-profile aligner name</param>
        /// <param name="profileFunctionName">enum: profile-profile distance function</param>
        /// <param name="similarityMatrix">similarity matrix</param>
        /// <param name="gapOpenPenalty">negative gapOpenPenalty</param>
        /// <param name="gapExtendPenalty">negative gapExtendPenalty</param>
        /// <param name="numberOfPartitions">the number of partitions in dynamic programming</param>
        /// <param name="degreeOfParallelism">degree of parallelism option for parallel extension</param>
        /// <exception cref="ArgumentNullException">sequences or similarityMatrix is null</exception>
        /// <exception cref="ArgumentException">
        /// sequences is empty, degreeOfParallelism or numberOfPartitions is not positive, the
        /// sequences use inconsistent alphabets, the alphabet is not DNA, RNA or protein, or
        /// the similarity matrix does not fit the alphabet
        /// </exception>
        public PAMSAMMultipleSequenceAligner(
            IList <ISequence> sequences,
            int kmerLength,
            DistanceFunctionTypes distanceFunctionName,
            UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
            ProfileAlignerNames profileAlignerMethodName,
            ProfileScoreFunctionNames profileFunctionName,
            SimilarityMatrix similarityMatrix,
            int gapOpenPenalty,
            int gapExtendPenalty,
            int numberOfPartitions,
            int degreeOfParallelism)
        {
            Performance.Start();

            if (null == sequences)
            {
                throw new ArgumentNullException("sequences");
            }

            if (sequences.Count == 0)
            {
                throw new ArgumentException("Empty input sequences");
            }

            if (null == similarityMatrix)
            {
                throw new ArgumentNullException("similarityMatrix");
            }

            // Set parallel extension option
            if (degreeOfParallelism <= 0)
            {
                throw new ArgumentException("Invalid parallel degree parameter");
            }
            PAMSAMMultipleSequenceAligner.parallelOption = new ParallelOptions {
                MaxDegreeOfParallelism = degreeOfParallelism
            };

            if (numberOfPartitions <= 0)
            {
                throw new ArgumentException("Invalid number of partition parameter");
            }
            _numberOfPartitions = numberOfPartitions;

            // Validate data type. This is a plain loop on purpose: an exception thrown from
            // inside Parallel.For would reach the caller wrapped in an AggregateException
            // instead of as the ArgumentException raised here.
            _alphabet = sequences[0].Alphabet;
            for (int i = 1; i < sequences.Count; ++i)
            {
                if (!Alphabets.CheckIsFromSameBase(sequences[i].Alphabet, _alphabet))
                {
                    throw new ArgumentException("Inconsistent sequence alphabet");
                }
            }

            // Pick the similarity matrix names that are acceptable for the alphabet.
            List <String> acceptedMatrixNames;

            if (_alphabet is DnaAlphabet)
            {
                acceptedMatrixNames = new List <String> { "AmbiguousDNA" };
            }
            else if (_alphabet is ProteinAlphabet)
            {
                acceptedMatrixNames = new List <String>
                {
                    "BLOSUM45", "BLOSUM50", "BLOSUM62", "BLOSUM80", "BLOSUM90",
                    "PAM250", "PAM30", "PAM70"
                };
            }
            else if (_alphabet is RnaAlphabet)
            {
                acceptedMatrixNames = new List <String> { "AmbiguousRNA" };
            }
            else
            {
                throw new ArgumentException("Invalid alphabet");
            }

            if (!acceptedMatrixNames.Contains(similarityMatrix.Name))
            {
                throw new ArgumentException("Inconsistent similarity matrix");
            }

            // Initialize parameters
            _kmerLength                       = kmerLength;
            _distanceFunctionName             = distanceFunctionName;
            _hierarchicalClusteringMethodName = hierarchicalClusteringMethodName;
            _profileAlignerName               = profileAlignerMethodName;
            _profileProfileFunctionName       = profileFunctionName;
            SimilarityMatrix                  = similarityMatrix;
            GapOpenCost                       = gapOpenPenalty;
            GapExtensionCost                  = gapExtendPenalty;

            MsaUtils.SetProfileItemSets(_alphabet);

            Performance.Snapshot("Start Aligning");

            // Work...
            Align(sequences);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Creates an aligner from the supplied settings and aligns the given
        /// sequences right away.
        /// </summary>
        /// <remarks>
        /// Arguments are validated in the order sequences, degreeOfParallelism,
        /// numberOfPartitions; the alphabet is then assigned through SetAlphabet before
        /// the remaining settings are stored and DoAlignment is started.
        /// </remarks>
        /// <param name="sequences">input sequences</param>
        /// <param name="kmerLength">positive integer of kmer length</param>
        /// <param name="distanceFunctionName">enum: distance function name</param>
        /// <param name="hierarchicalClusteringMethodName">enum: cluster update method</param>
        /// <param name="profileAlignerMethodName">enum: profile-profile aligner name</param>
        /// <param name="profileFunctionName">enum: profile-profile distance function</param>
        /// <param name="similarityMatrix">similarity matrix</param>
        /// <param name="gapOpenPenalty">negative gapOpenPenalty</param>
        /// <param name="gapExtendPenalty">negative gapExtendPenalty</param>
        /// <param name="numberOfPartitions">the number of partitions in dynamic programming</param>
        /// <param name="degreeOfParallelism">degree of parallelism option for parallel extension</param>
        /// <exception cref="ArgumentNullException">sequences is null</exception>
        /// <exception cref="ArgumentException">
        /// sequences is empty, or degreeOfParallelism or numberOfPartitions is not positive
        /// </exception>
        public PAMSAMMultipleSequenceAligner(
            IList <ISequence> sequences,
            int kmerLength,
            DistanceFunctionTypes distanceFunctionName,
            UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
            ProfileAlignerNames profileAlignerMethodName,
            ProfileScoreFunctionNames profileFunctionName,
            SimilarityMatrix similarityMatrix,
            int gapOpenPenalty,
            int gapExtendPenalty,
            int numberOfPartitions,
            int degreeOfParallelism)
        {
            Performance.Start();

            // --- argument validation ---
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            if (sequences.Count < 1)
            {
                throw new ArgumentException("Empty input sequences");
            }

            if (degreeOfParallelism < 1)
            {
                throw new ArgumentException("Invalid parallel degree parameter");
            }

            // The parallel options carry the requested degree of parallelism.
            var options = new ParallelOptions();
            options.MaxDegreeOfParallelism = degreeOfParallelism;
            parallelOption = options;

            if (numberOfPartitions < 1)
            {
                throw new ArgumentException("Invalid number of partition parameter");
            }

            _numberOfPartitions = numberOfPartitions;

            // --- alphabet ---
            SetAlphabet(sequences, similarityMatrix, false);

            // --- algorithm settings ---
            KmerLength = kmerLength;
            DistanceFunctionName = distanceFunctionName;
            HierarchicalClusteringMethodName = hierarchicalClusteringMethodName;
            ProfileAlignerName = profileAlignerMethodName;
            ProfileProfileFunctionName = profileFunctionName;
            SimilarityMatrix = similarityMatrix;
            GapOpenCost = gapOpenPenalty;
            GapExtensionCost = gapExtendPenalty;

            MsaUtils.SetProfileItemSets(_alphabet);

            Performance.Snapshot("Start Aligning");

            // --- run the alignment ---
            DoAlignment(sequences);
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Generate IProfiles from a subset of aligned sequences.
        /// In the subset of sequences, those columns containing no residues,
        /// i.e. indels only, are discarded.
        /// </summary>
        /// <remarks>
        /// <paramref name="weights"/> is passed to MsaUtils.Normalize before it is used;
        /// presumably this normalizes the caller's array in place (confirm against MsaUtils).
        /// </remarks>
        /// <param name="sequences">a set of aligned sequences</param>
        /// <param name="sequenceIndices">the subset indices of the aligned sequences</param>
        /// <param name="allIndelPositions">receives the column positions that hold only indels within the subset and were therefore left out of the generated profiles</param>
        /// <param name="weights">sequence weights, looked up by the same index used into <paramref name="sequences"/></param>
        /// <returns>profiles holding one normalized weight vector per retained column</returns>
        /// <exception cref="ArgumentException">
        /// the input is empty, there are more subset indices than sequences, the subset is empty,
        /// the selected sequences differ in length, or they use different alphabets
        /// </exception>
        /// <exception cref="Exception">an array index was out of range while building the profiles</exception>
        public static IProfiles GenerateProfiles(List <ISequence> sequences, List <int> sequenceIndices, out List <int> allIndelPositions, float[] weights)
        {
            IProfiles profiles;

            if (sequences.Count <= 0)
            {
                throw new ArgumentException("Empty input sequences");
            }
            if (sequenceIndices.Count > sequences.Count)
            {
                throw new ArgumentException("Invalid subset indices");
            }
            if (sequenceIndices.Count == 0)
            {
                // Without this guard the first use of sequenceIndices[0] below fails with the
                // same exception type but a generic indexer message.
                throw new ArgumentOutOfRangeException("sequenceIndices", "Subset indices must not be empty");
            }

            MsaUtils.Normalize(weights);

            try
            {
                // The first selected sequence defines the expected length and alphabet.
                ISequence reference      = sequences[sequenceIndices[0]];
                int       sequenceLength = (int)reference.Count;
                IAlphabet alphabet       = reference.Alphabet;

                foreach (int i in sequenceIndices)
                {
                    if (sequences[i].Count != sequenceLength)
                    {
                        throw new ArgumentException("Input sequences are not aligned");
                    }
                    if (sequences[i].Alphabet != alphabet)
                    {
                        throw new ArgumentException("Input sequences use different alphabets");
                    }
                }

                allIndelPositions = new List <int>();

                profiles = new Profiles();
                int colSize = (ItemSet.Count + 1) / 2;

                // Discard all indels columns.
                for (int col = 0; col < sequenceLength; ++col)
                {
                    float[] vector      = new float[colSize];
                    bool    isAllIndels = true;
                    foreach (int i in sequenceIndices)
                    {
                        ISequence sequence = sequences[i];
                        var       symbol   = sequence[col];

                        if (!sequence.Alphabet.CheckIsGap(symbol))
                        {
                            isAllIndels = false;
                        }
                        if (sequence.Alphabet.CheckIsAmbiguous(symbol))
                        {
                            // An ambiguous symbol adds the sequence weight to every entry
                            // listed for it in AmbiguousCharactersMap.
                            var mappedSymbols = AmbiguousCharactersMap[symbol];
                            for (int b = 0; b < mappedSymbols.Count; ++b)
                            {
                                vector[ItemSet[mappedSymbols[b]]] += weights[i];
                            }
                        }
                        else
                        {
                            vector[ItemSet[symbol]] += weights[i];
                        }
                    }
                    if (!isAllIndels)
                    {
                        MsaUtils.Normalize(vector);
                        profiles.ProfilesMatrix.Add(vector);
                    }
                    else
                    {
                        allIndelPositions.Add(col);
                    }
                }
                profiles.ColumnSize = colSize;
                profiles.RowSize    = profiles.ProfilesMatrix.Count;
            }
            catch (IndexOutOfRangeException ex)
            {
                // Chain the caught exception itself: its own InnerException is normally null,
                // which would drop the original stack trace and message.
                throw new Exception("Invalid index", ex);
            }
            return(profiles);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Performs Stage 1, 2, and 3 as described in class description.
        /// Nothing is returned: the best alignment found is stored in AlignedSequences /
        /// AlignmentScore, and the per-stage results in AlignedSequencesA/B/C and
        /// AlignmentScoreA/B/C.
        /// </summary>
        /// <param name="sequences">Input sequences</param>
        /// <exception cref="ArgumentException">ProfileAlignerName is not one of the handled profile aligners</exception>
        private void DoAlignment(IList <ISequence> sequences)
        {
            Debug.Assert(this.alphabet != null);
            Debug.Assert(sequences.Count > 0);

            // Initializations
            if (ConsensusResolver == null)
            {
                ConsensusResolver = new SimpleConsensusResolver(this.alphabet);
            }
            else
            {
                ConsensusResolver.SequenceAlphabet = this.alphabet;
            }

            // Get ProfileAligner ready: the serial implementation is chosen when the
            // degree of parallelism is exactly 1, the parallel one otherwise.
            IProfileAligner profileAligner = null;

            switch (ProfileAlignerName)
            {
            case (ProfileAlignerNames.NeedlemanWunschProfileAligner):
                if (this.degreeOfParallelism == 1)
                {
                    profileAligner = new NeedlemanWunschProfileAlignerSerial(
                        SimilarityMatrix, ProfileProfileFunctionName, GapOpenCost, GapExtensionCost, this.numberOfPartitions);
                }
                else
                {
                    profileAligner = new NeedlemanWunschProfileAlignerParallel(
                        SimilarityMatrix, ProfileProfileFunctionName, GapOpenCost, GapExtensionCost, this.numberOfPartitions);
                }
                break;

            case (ProfileAlignerNames.SmithWatermanProfileAligner):
                if (this.degreeOfParallelism == 1)
                {
                    profileAligner = new SmithWatermanProfileAlignerSerial(
                        SimilarityMatrix, ProfileProfileFunctionName, GapOpenCost, GapExtensionCost, this.numberOfPartitions);
                }
                else
                {
                    profileAligner = new SmithWatermanProfileAlignerParallel(
                        SimilarityMatrix, ProfileProfileFunctionName, GapOpenCost, GapExtensionCost, this.numberOfPartitions);
                }
                break;

            default:
                throw new ArgumentException("Invalid profile aligner name");
            }

            this.AlignedSequences = new List <ISequence>(sequences.Count);
            float currentScore = 0;

            // STAGE 1

            ReportLog("Stage 1");
            // Generate DistanceMatrix
            var kmerDistanceMatrixGenerator = new KmerDistanceMatrixGenerator(sequences, KmerLength, this.alphabet, DistanceFunctionName);

            // Hierarchical clustering
            IHierarchicalClustering hierarchicalClustering =
                new HierarchicalClusteringParallel
                    (kmerDistanceMatrixGenerator.DistanceMatrix, HierarchicalClusteringMethodName);

            // Generate Guide Tree
            var binaryGuideTree = new BinaryGuideTree(hierarchicalClustering);

            // Progressive Alignment
            IProgressiveAligner progressiveAlignerA = new ProgressiveAligner(profileAligner);

            progressiveAlignerA.Align(sequences, binaryGuideTree);

            currentScore = MsaUtils.MultipleAlignmentScoreFunction(progressiveAlignerA.AlignedSequences, SimilarityMatrix, GapOpenCost, GapExtensionCost);
            if (currentScore > this.AlignmentScoreA)
            {
                this.AlignmentScoreA   = currentScore;
                this.AlignedSequencesA = progressiveAlignerA.AlignedSequences;
            }
            if (this.AlignmentScoreA > this.AlignmentScore)
            {
                this.AlignmentScore   = this.AlignmentScoreA;
                this.AlignedSequences = this.AlignedSequencesA;
            }

            if (PAMSAMMultipleSequenceAligner.FasterVersion)
            {
                // Faster version: stages 2 and 3 are skipped and simply mirror stage 1.
                this.AlignedSequencesB = this.AlignedSequencesA;
                this.AlignedSequencesC = this.AlignedSequencesA;
                this.AlignmentScoreB   = this.AlignmentScoreA;
                this.AlignmentScoreC   = this.AlignmentScoreA;
            }
            else
            {
                BinaryGuideTree               binaryGuideTreeB              = null;
                IHierarchicalClustering       hierarchicalClusteringB       = null;
                KimuraDistanceMatrixGenerator kimuraDistanceMatrixGenerator = new KimuraDistanceMatrixGenerator();

                if (UseStageB)
                {
                    // STAGE 2
                    ReportLog("Stage 2");

                    // Generate DistanceMatrix from Multiple Sequence Alignment
                    kimuraDistanceMatrixGenerator.GenerateDistanceMatrix(this.AlignedSequences);

                    // Hierarchical clustering
                    hierarchicalClusteringB = new HierarchicalClusteringParallel
                                                  (kimuraDistanceMatrixGenerator.DistanceMatrix, HierarchicalClusteringMethodName);

                    // Generate Guide Tree
                    binaryGuideTreeB = new BinaryGuideTree(hierarchicalClusteringB);

                    // The return value (if any) is not used here; the call is kept for
                    // whatever effect it has on the trees.
                    BinaryGuideTree.CompareTwoTrees(binaryGuideTreeB, binaryGuideTree);
                    binaryGuideTree = binaryGuideTreeB;

                    // Progressive Alignment
                    IProgressiveAligner progressiveAlignerB = new ProgressiveAligner(profileAligner);
                    progressiveAlignerB.Align(sequences, binaryGuideTreeB);

                    currentScore = MsaUtils.MultipleAlignmentScoreFunction(progressiveAlignerB.AlignedSequences, SimilarityMatrix, GapOpenCost, GapExtensionCost);

                    if (currentScore > this.AlignmentScoreB)
                    {
                        this.AlignmentScoreB   = currentScore;
                        this.AlignedSequencesB = progressiveAlignerB.AlignedSequences;
                    }

                    if (this.AlignmentScoreB > this.AlignmentScore)
                    {
                        this.AlignmentScore   = this.AlignmentScoreB;
                        this.AlignedSequences = this.AlignedSequencesB;
                    }
                }
                else
                {
                    binaryGuideTreeB = binaryGuideTree;
                }


                // STAGE 3
                ReportLog("Stage 3");
                // refinement: at most one pass, and none at all when there are exactly two sequences
                int maxRefinementTime = 1;
                if (sequences.Count == 2)
                {
                    maxRefinementTime = 0;
                }

                int refinementTime = 0;

                // Stage 3 works on its own copies of the best alignment found so far.
                this.AlignedSequencesC = new List <ISequence>(this.AlignedSequences.Count);
                foreach (ISequence t in this.AlignedSequences)
                {
                    this.AlignedSequencesC.Add(new Sequence(Alphabets.GetAmbiguousAlphabet(this.alphabet), t.ToArray())
                    {
                        ID = t.ID,
                        // Do not shallow copy dictionary
                        //Metadata = t.Metadata
                    });
                }

                while (refinementTime < maxRefinementTime)
                {
                    ++refinementTime;
                    ReportLog("Refinement iter " + refinementTime);
                    bool needRefinement = false;
                    for (int edgeIndex = 0; edgeIndex < binaryGuideTreeB.NumberOfEdges; ++edgeIndex)
                    {
                        // Cutting the tree at this edge yields two groups of leaf (sequence) indices.
                        List <int>[] leafNodeIndices = binaryGuideTreeB.SeparateSequencesByCuttingTree(edgeIndex);

                        // Filled in by ProfileExtraction through the out argument.
                        List <int>[] allIndelPositions;

                        IProfileAlignment[] separatedProfileAlignments = ProfileAlignment.ProfileExtraction(this.AlignedSequencesC, leafNodeIndices[0], leafNodeIndices[1], out allIndelPositions);
                        List <int>[]        eStrings = new List <int> [2];

                        // The profile with fewer sequences is passed first; eStrings stay in group order.
                        if (separatedProfileAlignments[0].NumberOfSequences < separatedProfileAlignments[1].NumberOfSequences)
                        {
                            profileAligner.Align(separatedProfileAlignments[0], separatedProfileAlignments[1]);
                            eStrings[0] = profileAligner.GenerateEString(profileAligner.AlignedA);
                            eStrings[1] = profileAligner.GenerateEString(profileAligner.AlignedB);
                        }
                        else
                        {
                            profileAligner.Align(separatedProfileAlignments[1], separatedProfileAlignments[0]);
                            eStrings[0] = profileAligner.GenerateEString(profileAligner.AlignedB);
                            eStrings[1] = profileAligner.GenerateEString(profileAligner.AlignedA);
                        }

                        for (int set = 0; set < 2; ++set)
                        {
                            Parallel.ForEach(leafNodeIndices[set], ParallelOption, i =>
                            {
                                // Capture the ID before the slot is overwritten below; reading it
                                // back from the replaced element would only see the new sequence.
                                var originalId = this.AlignedSequencesC[i].ID;

                                List <byte> seqBytes = new List <byte>();

                                // Copy the sequence, skipping the all-indel columns of this group.
                                int indexAllIndel = 0;
                                for (int j = 0; j < this.AlignedSequencesC[i].Count; ++j)
                                {
                                    if (indexAllIndel < allIndelPositions[set].Count && j == allIndelPositions[set][indexAllIndel])
                                    {
                                        ++indexAllIndel;
                                    }
                                    else
                                    {
                                        seqBytes.Add(this.AlignedSequencesC[i][j]);
                                    }
                                }

                                ISequence realigned = profileAligner.GenerateSequenceFromEString(eStrings[set], new Sequence(Alphabets.GetAmbiguousAlphabet(this.alphabet), seqBytes.ToArray()));
                                realigned.ID = originalId;
                                this.AlignedSequencesC[i] = realigned;
                                // Do not shallow copy dictionary
                                //(_alignedSequencesC[i] as Sequence).Metadata = _alignedSequencesC[i].Metadata;
                            });
                        }

                        currentScore = MsaUtils.MultipleAlignmentScoreFunction(this.AlignedSequencesC, SimilarityMatrix, GapOpenCost, GapExtensionCost);

                        if (currentScore > this.AlignmentScoreC)
                        {
                            this.AlignmentScoreC = currentScore;
                            needRefinement       = true;

                            // recreate the tree
                            kimuraDistanceMatrixGenerator.GenerateDistanceMatrix(this.AlignedSequencesC);
                            hierarchicalClusteringB = new HierarchicalClusteringParallel
                                                          (kimuraDistanceMatrixGenerator.DistanceMatrix, HierarchicalClusteringMethodName);

                            binaryGuideTreeB = new BinaryGuideTree(hierarchicalClusteringB);
                            break;
                        }
                    }
                    if (!needRefinement)
                    {
                        // No edge improved the score in this pass, so stop refining.
                        break;
                    }
                }
                if (this.AlignmentScoreC > this.AlignmentScore)
                {
                    this.AlignmentScore   = this.AlignmentScoreC;
                    this.AlignedSequences = this.AlignedSequencesC;
                }
                ReportLog("Stop Stage 3");
            }
        }