/// <summary>
        /// Re-selects one medoid per cluster: for every group of points that share an
        /// assignment label, picks the member whose summed distance to all members of
        /// that group is smallest (the first such member wins ties).
        /// </summary>
        /// <param name="distance">pairwise distances, indexed by point index</param>
        /// <param name="assignments">cluster label of each point; position = point index</param>
        /// <returns>One medoid index per distinct label, in order of the label's first appearance.</returns>
        internal static int[] SelectMedoids(IDistanceMatrix distance, int[] assignments)
        {
            // Group point indices by label. GroupBy yields groups in order of first
            // appearance, which fixes the order of the returned medoids.
            var clusters = assignments.Select((x, i) => new { Value = x, Index = i })
                           .GroupBy(obj => obj.Value)
                           .Select(grp => grp.Select(obj => obj.Index).ToArray()).ToArray();
            var medoids = new List<int>(clusters.Length);

            foreach (var cluster in clusters)
            {
                var withinClusterDistance = double.PositiveInfinity;

                // Seed with an actual member of the cluster (groups are never empty).
                // Seeding with 0 would return point 0 for a cluster it does not belong to
                // whenever no candidate sum compares below +Infinity (NaN or infinite distances).
                var medoid = cluster[0];
                foreach (var newMedoid in cluster)
                {
                    var newWithinClusterDistance = 0.0;
                    foreach (var point in cluster)
                    {
                        newWithinClusterDistance += distance[newMedoid, point];
                    }
                    if (newWithinClusterDistance < withinClusterDistance)
                    {
                        withinClusterDistance = newWithinClusterDistance;
                        medoid = newMedoid;
                    }
                }
                medoids.Add(medoid);
            }
            return medoids.ToArray();
        }
 /// <summary>
 /// Chooses the k starting medoids. Each point j gets the score
 /// sum over i of distance[i, j] / (sum over l of distance[i, l]);
 /// the k points with the smallest scores are returned.
 /// </summary>
 /// <param name="distance">pairwise distances, indexed by point index</param>
 /// <param name="k">number of medoids to return</param>
 /// <param name="n">number of points</param>
 /// <returns>Indices of the k lowest-scoring points, in ascending score order.</returns>
 internal static int[] SelectInitialMedoids(IDistanceMatrix distance, int k, int n)
 {
     // rowTotals[row] = total distance from point `row` to every point.
     var rowTotals = new double[n];
     for (var row = 0; row < n; row++)
     {
         var total = 0.0;
         for (var col = 0; col < n; col++)
         {
             total += distance[row, col];
         }
         rowTotals[row] = total;
     }

     // Score every candidate and remember which point each score belongs to.
     var scores = new double[n];
     var order  = new int[n];
     for (var candidate = 0; candidate < n; candidate++)
     {
         var score = 0.0;
         for (var row = 0; row < n; row++)
         {
             score += distance[row, candidate] / rowTotals[row];
         }
         scores[candidate] = score;
         order[candidate]  = candidate;
     }

     // Sort the point indices by their score, then keep the first k.
     Array.Sort(scores, order);
     var chosen = new int[k];
     Array.Copy(order, chosen, k);
     return chosen;
 }
示例#3
0
 /// <summary>
 /// Creates the service and stores the four collaborators it is given.
 /// </summary>
 /// <param name="gpsDistanceHelper">distance helper kept for later use</param>
 /// <param name="gpsTimeHelper">time helper kept for later use</param>
 /// <param name="distanceMatrix">distance matrix kept for later use</param>
 /// <param name="context">GPS context kept for later use</param>
 public GpsService(IGpsDistanceHelper gpsDistanceHelper, IGpsTimeHelper gpsTimeHelper, IDistanceMatrix distanceMatrix, IGpsContext context)
 {
     this.context = context;
     this.distanceMatrix = distanceMatrix;
     this.gpsTimeHelper = gpsTimeHelper;
     this.gpsDistanceHelper = gpsDistanceHelper;
 }
示例#4
0
        /// <summary>
        /// Generate a symmetric distance matrix from a set of unaligned sequences.
        /// A normalized k-mer count dictionary is built for every sequence, then the
        /// distance score of every pair is written to both halves of the matrix.
        /// </summary>
        /// <param name="sequences">a set of unaligned sequences</param>
        /// <exception cref="Exception">
        /// An <see cref="OutOfMemoryException"/> was caught while generating the k-mer counts;
        /// it is attached as the inner exception.
        /// </exception>
        public void GenerateDistanceMatrix(IList <ISequence> sequences)
        {
            // Generate k-mer counting dictionary for each sequence
            try
            {
                _allCountsDictionary = new Dictionary <string, float> [sequences.Count];

                Parallel.For(0, sequences.Count, i =>
                {
                    Dictionary <string, float> currentDictionary = KmerDistanceScoreCalculator.CalculateKmerCounting(sequences[i], _kmerLength);
                    MsaUtils.Normalize(currentDictionary);
                    _allCountsDictionary[i] = currentDictionary;
                });
            }
            catch (OutOfMemoryException ex)
            {
                // Chain the caught exception itself. Its InnerException is normally null,
                // so passing ex.InnerException discarded the original stack trace.
                throw new Exception("Out of memory when generating kmer counting", ex);
            }

            // Construct a SymmetricDistanceMatrix
            // with dimension equals to the number of sequences
            _distanceMatrix = new SymmetricDistanceMatrix(sequences.Count);

            // Fill in DistanceMatrix: only pairs with col < row are computed,
            // and each score is mirrored into [col, row].
            Parallel.For(1, sequences.Count, PAMSAMMultipleSequenceAligner.parallelOption, row =>
            {
                for (int col = 0; col < row; ++col)
                {
                    float distanceScore = _kmerScoreCalculator.CalculateDistanceScore
                                              (_allCountsDictionary[row], _allCountsDictionary[col]);
                    _distanceMatrix[row, col] = distanceScore;
                    _distanceMatrix[col, row] = distanceScore;
                }
            });
        }
示例#5
0
        /// <summary>
        /// Update the distance between the new cluster with the rest clusters
        /// </summary>
        /// <remarks>
        /// Also resets <c>_smallestDistance</c> and the nearest distance recorded for the
        /// new cluster's sequence to <see cref="float.MaxValue"/>, and refreshes the
        /// nearest-neighbour entry of every cluster whose nearest neighbour was one of
        /// the two merged clusters.
        /// </remarks>
        /// <param name="distanceMatrix">distance matrix</param>
        private void UpdateDistance(IDistanceMatrix distanceMatrix)
        {
            _smallestDistance = float.MaxValue;

            int nextIndex = Nodes[_currentClusterID].SequenceID;

            // Sequence ids of the two merged clusters; they do not change during the loop.
            int mergedA = Nodes[_nextA].SequenceID;
            int mergedB = Nodes[_nextB].SequenceID;

            distanceMatrix.NearestDistances[nextIndex] = float.MaxValue;

            Parallel.ForEach(_clusters, PAMSAMMultipleSequenceAligner.parallelOption, i =>
            {
                if (i != _currentClusterID)
                {
                    int currentIndex = Nodes[i].SequenceID;

                    // Update distance of the newly merged cluster with another cluster.
                    // Kept in a local: the loop body runs on several threads at once, and
                    // routing the value through the shared _currentDistance field let one
                    // thread store a distance computed by another.
                    float updatedDistance = _updateDistanceMethod(distanceMatrix, mergedA, mergedB, currentIndex);

                    // Update distance matrix
                    distanceMatrix[currentIndex, nextIndex] = updatedDistance;

                    // Update distance matrix NearestNeighbors
                    int nearest = distanceMatrix.NearestNeighbors[currentIndex];
                    if (nearest == mergedA || nearest == mergedB)
                    {
                        // Update NearestDistance and NearestNeighbors for column currentIndex
                        UpdateNearestColumn(distanceMatrix, currentIndex);
                    }
                }
            });
        }
        /// <summary>
        /// Adds up, for every point i, the distance between i and the entry stored for it
        /// in <paramref name="assignments"/>.
        /// </summary>
        /// <param name="distance">pairwise distances, indexed by point index</param>
        /// <param name="assignments">for each point, the index it is assigned to</param>
        /// <returns>The summed distance over all points.</returns>
        internal static double CalculateCost(IDistanceMatrix distance, int[] assignments)
        {
            double total = 0.0;
            int point = 0;

            foreach (int assigned in assignments)
            {
                total += distance[point, assigned];
                point++;
            }

            return total;
        }
示例#7
0
 /// <summary>
 /// Recomputes the distance between the newly created cluster and every
 /// remaining cluster other than the two that were just merged.
 /// </summary>
 /// <param name="distanceMatrix">distance matrix that is read and updated in place</param>
 private void UpdateDistance(IDistanceMatrix distanceMatrix)
 {
     foreach (int cluster in _clusters)
     {
         // The merged pair itself is not updated.
         if (cluster == _nextA || cluster == _nextB)
         {
             continue;
         }

         int mergedSequence = Nodes[_currentClusterID].SequenceID;
         int otherSequence  = Nodes[cluster].SequenceID;

         var updated = _updateDistanceMethod(distanceMatrix, Nodes[_nextA].SequenceID, Nodes[_nextB].SequenceID, Nodes[cluster].SequenceID);
         distanceMatrix[mergedSequence, otherSequence] = updated;
     }
 }
示例#8
0
        /// <summary>
        /// Construct clusters using different update methods.
        /// Allocates the node, edge and cluster collections, initialises the
        /// sequence-index-to-cluster map to the identity, and selects the distance
        /// update delegate.
        /// </summary>
        /// <param name="distanceMatrix">IDistanceMatrix; its Dimension must be positive</param>
        /// <param name="updateDistanceMethodName">which UpdateDistanceMethodsTypes linkage to use</param>
        /// <exception cref="Exception">
        /// The dimension is not positive, the update method is not recognised,
        /// or an <see cref="OutOfMemoryException"/> was caught during allocation
        /// (attached as the inner exception).
        /// </exception>
        public HierarchicalClustering(IDistanceMatrix distanceMatrix, UpdateDistanceMethodsTypes updateDistanceMethodName)
        {
            if (distanceMatrix.Dimension <= 0)
            {
                throw new Exception("Invalid distance matrix dimension");
            }

            try
            {
                // Capacity hints: room for 2N-1 nodes and 2N-2 edges,
                // where N is the dimension of the distance matrix.
                _nodes = new List <BinaryGuideTreeNode>(distanceMatrix.Dimension * 2 - 1);
                _edges = new List <BinaryGuideTreeEdge>(distanceMatrix.Dimension * 2 - 2);

                // The number of clusters is the number of leaves at the beginning
                // As the algorithm merges clusters, only one cluster remains.
                _clusters = new List <int>(distanceMatrix.Dimension);

                // Construct _indexToCluster: initially index i maps to cluster i
                _indexToCluster = new int[distanceMatrix.Dimension];
                for (int i = 0; i < distanceMatrix.Dimension; ++i)
                {
                    _indexToCluster[i] = i;
                }
            }
            catch (OutOfMemoryException ex)
            {
                // Chain the caught exception itself; ex.InnerException is normally null,
                // which threw away the original failure.
                throw new Exception("Out of memory", ex);
            }

            // Choose a update-distance method
            switch (updateDistanceMethodName)
            {
            case (UpdateDistanceMethodsTypes.Average):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateAverage);
                break;

            case (UpdateDistanceMethodsTypes.Single):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateSingle);
                break;

            case (UpdateDistanceMethodsTypes.Complete):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateComplete);
                break;

            case (UpdateDistanceMethodsTypes.WeightedMAFFT):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateWeightedMAFFT);
                break;

            default:
                throw new Exception("invalid update method");
            }
        }
示例#9
0
 /// <summary>
 /// Starting state: every sequence becomes its own single-node cluster.
 /// </summary>
 /// <param name="distanceMatrix">distance matrix; its Dimension gives the number of sequences</param>
 private void Initialize(IDistanceMatrix distanceMatrix)
 {
     int leafCount = distanceMatrix.Dimension;
     _numberOfClusters = leafCount;

     int leaf = 0;
     while (leaf < leafCount)
     {
         // The node is created from the sequence index, and the cluster id is that same index.
         _nodes.Add(new BinaryGuideTreeNode(leaf));
         _clusters.Add(leaf);
         ++leaf;
     }

     // Highest cluster id handed out so far.
     _currentClusterID = leafCount - 1;
 }
示例#10
0
        /// <summary>
        /// Construct clusters using different update methods.
        /// Selects the distance update delegate, turns every sequence into a cluster,
        /// then repeatedly merges the closest pair until a single cluster remains.
        /// </summary>
        /// <param name="distanceMatrix">IDistanceMatrix; its Dimension must be positive</param>
        /// <param name="updateDistanceMethodName">which UpdateDistanceMethodsTypes linkage to use</param>
        /// <exception cref="Exception">The dimension is not positive or the update method is not recognised.</exception>
        public HierarchicalClusteringParallel(IDistanceMatrix distanceMatrix, UpdateDistanceMethodsTypes updateDistanceMethodName)
        {
            if (distanceMatrix.Dimension <= 0)
            {
                throw new Exception("Invalid distance matrix dimension");
            }

            // Reserve room for 2N-1 nodes, where N is the number of input sequences:
            // Initialize adds N leaves and the clustering loop below runs until one
            // cluster is left. (The previous hint of 2N-2 was one short.)
            // NOTE(review): assumes each loop iteration adds exactly one node and removes
            // exactly one cluster - confirm against CreateCluster/UpdateClusters.
            _nodes = new List <BinaryGuideTreeNode>(distanceMatrix.Dimension * 2 - 1);
            _edges = new List <BinaryGuideTreeEdge>();

            // The number of clusters is the number of leaves at the beginning
            // As the algorithm merges clusters, only one cluster remains.
            _clusters = new List <int>(distanceMatrix.Dimension);

            // Choose a update-distance method
            switch (updateDistanceMethodName)
            {
            case (UpdateDistanceMethodsTypes.Aaverage):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateAverage);
                break;

            case (UpdateDistanceMethodsTypes.Single):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateSingle);
                break;

            case (UpdateDistanceMethodsTypes.Complete):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateComplete);
                break;

            case (UpdateDistanceMethodsTypes.WeightedMAFFT):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateWeightedMAFFT);
                break;

            default:
                throw new Exception("invalid update method");
            }

            // Initialize the clusters
            Initialize(distanceMatrix);

            // Clustering...
            while (_numberOfClusters > 1)
            {
                GetNextPairOfCluster(distanceMatrix);
                CreateCluster();
                UpdateDistance(distanceMatrix);
                UpdateClusters();
            }
        }
示例#11
0
        /// <summary>
        /// O(N) algorithm to get the next closest pair of clusters.
        /// Scans the nearest-distance entry of every current cluster and stores the
        /// smallest one in <c>_smallestDistance</c>, with the pair in <c>_nextA</c>/<c>_nextB</c>.
        /// </summary>
        /// <remarks>
        /// If several clusters share the smallest distance, which of them is kept depends
        /// on the order in which the parallel iterations reach the lock.
        /// </remarks>
        /// <param name="distanceMatrix">distance matrix</param>
        private void GetNextPairOfCluster(IDistanceMatrix distanceMatrix)
        {
            _smallestDistance = float.MaxValue;

            // Serialises the compare-and-update of the running minimum. Without it two
            // threads could both pass the comparison and leave _smallestDistance, _nextA
            // and _nextB describing different candidates.
            object gate = new object();

            Parallel.ForEach(_clusters, PAMSAMMultipleSequenceAligner.parallelOption, i =>
            {
                int currentIndex = _nodes[i].SequenceID;

                // Local, not the shared _currentDistance field: another thread could
                // overwrite the field between the read and the comparison.
                float candidateDistance = distanceMatrix.NearestDistances[currentIndex];

                lock (gate)
                {
                    if (candidateDistance < _smallestDistance)
                    {
                        _smallestDistance = candidateDistance;
                        _nextA            = i;
                        _nextB            = _indexToCluster[distanceMatrix.NearestNeighbors[currentIndex]];
                    }
                }
            });
        }
示例#12
0
        /// <summary>
        /// Update the nearest neighbor and nearest distance of a column in a distance matrix.
        /// Every current cluster other than the column itself is examined; the smallest
        /// distance found and the sequence id it belongs to are recorded for the column.
        /// If no distance is below <see cref="float.MaxValue"/>, the entries are left unchanged.
        /// </summary>
        /// <param name="distanceMatrix">distance matrix</param>
        /// <param name="column">zero-based integer</param>
        private void UpdateNearestColumn(IDistanceMatrix distanceMatrix, int column)
        {
            float min = float.MaxValue;

            foreach (int i in _clusters)
            {
                int currentIndex = Nodes[i].SequenceID;
                if (currentIndex == column)
                {
                    continue;
                }

                float candidate = distanceMatrix[currentIndex, column];
                if (candidate < min)
                {
                    // Track the running minimum. Previously `min` was never updated, so the
                    // stored nearest distance was always float.MaxValue and the stored
                    // neighbour was simply the last cluster visited.
                    min = candidate;
                    distanceMatrix.NearestDistances[column] = candidate;
                    distanceMatrix.NearestNeighbors[column] = currentIndex;
                }
            }
        }
示例#13
0
        /// <summary>
        /// Merges clusters nextA and nextB under a freshly created parent node and
        /// records the two edges that connect them to it.
        /// </summary>
        /// <param name="distanceMatrix">distance matrix (not read by this method)</param>
        private void CreateCluster(IDistanceMatrix distanceMatrix)
        {
            // Allocate the next cluster id and the node that carries it.
            ++_currentClusterID;
            BinaryGuideTreeNode parent = new BinaryGuideTreeNode(_currentClusterID);

            var childA = Nodes[_nextA];
            var childB = Nodes[_nextB];

            // Wire parent and children together.
            parent.LeftChildren  = childA;
            parent.RightChildren = childB;
            childA.Parent = parent;
            childB.Parent = parent;

            // The parent takes the sequence id of whichever child has the smaller cluster id.
            parent.SequenceID = Nodes[Math.Min(_nextA, _nextB)].SequenceID;
            _indexToCluster[parent.SequenceID] = _currentClusterID;

            Nodes.Add(parent);

            // One edge per child, each pointing from the new parent down to that child.
            BinaryGuideTreeEdge toChildA = new BinaryGuideTreeEdge(childA.ID);
            BinaryGuideTreeEdge toChildB = new BinaryGuideTreeEdge(childB.ID);

            toChildA.ParentNode = parent;
            toChildB.ParentNode = parent;
            toChildA.ChildNode  = childA;
            toChildB.ChildNode  = childB;

            childA.ParentEdge = toChildA;
            childB.ParentEdge = toChildB;

            // Both edges get the distance at which the pair was merged.
            toChildA.Length = _smallestDistance;
            toChildB.Length = _smallestDistance;

            _edges.Add(toChildA);
            _edges.Add(toChildB);
        }
示例#14
0
        /// <summary>
        /// Run K-medoid clustering: pick initial medoids, then alternate between
        /// re-selecting medoids and re-assigning points until the total cost changes
        /// by less than 1E-10 between two rounds.
        /// </summary>
        /// <param name="data">data matrix with n rows; only its row count is used</param>
        /// <param name="distance">pairwise distances between the rows of <paramref name="data"/></param>
        /// <param name="k">number of clusters k &lt; n</param>
        /// <returns>Array of length n. <code>assignment[i]</code> returns the index of the cluster medoid in the data matrix.</returns>
        public static int[] GenerateClusters(MatrixIndexer data, IDistanceMatrix distance, int k)
        {
            var n           = data.RowCount;
            var medoids     = SelectInitialMedoids(distance, k, n);
            var assignments = AssignClusters(distance, medoids, n);
            var cost        = CalculateCost(distance, assignments);

            while (true)
            {
                medoids     = SelectMedoids(distance, assignments);
                assignments = AssignClusters(distance, medoids, n);
                var newCost = CalculateCost(distance, assignments);

                // A NaN change (NaN cost, or infinite cost on both sides) can never satisfy
                // "< 1E-10", which used to spin this loop forever. Treat it as "no further
                // progress possible" and stop.
                var change = Math.Abs(newCost - cost);
                if (double.IsNaN(change) || change < 1E-10)
                {
                    break;
                }
                cost = newCost;
            }
            return assignments;
        }
示例#15
0
 /// <summary>
 /// Assigns each of the n points to the medoid it is closest to
 /// (the first such medoid wins ties).
 /// </summary>
 /// <param name="distance">pairwise distances, indexed by point index</param>
 /// <param name="medoids">point indices of the current medoids</param>
 /// <param name="n">number of points</param>
 /// <returns>Array of length n holding, per point, the chosen medoid's index (0 if no medoid compared below +Infinity).</returns>
 internal static int[] AssignClusters(IDistanceMatrix distance, int[] medoids, int n)
 {
     var result = new int[n];

     for (var p = 0; p < n; p++)
     {
         var best  = double.PositiveInfinity;
         var owner = 0;

         for (var m = 0; m < medoids.Length; m++)
         {
             double candidate = distance[p, medoids[m]];
             if (candidate < best)
             {
                 best  = candidate;
                 owner = medoids[m];
             }
         }

         result[p] = owner;
     }

     return result;
 }
示例#16
0
        /// <summary>
        /// Exhaustive O(N^2) search for the closest pair among the current clusters.
        /// The result is left in <c>_nextA</c>, <c>_nextB</c> and <c>_smallestDistance</c>.
        /// </summary>
        /// <param name="distanceMatrix">distance matrix</param>
        private void GetNextPairOfCluster(IDistanceMatrix distanceMatrix)
        {
            // Seed the search with the first two clusters.
            _nextA            = _clusters[0];
            _nextB            = _clusters[1];
            _smallestDistance = distanceMatrix[Nodes[_nextA].SequenceID, Nodes[_nextB].SequenceID];

            int clusterCount = _clusters.Count;
            for (int first = 0; first + 1 < clusterCount; first++)
            {
                int firstCluster  = _clusters[first];
                int firstSequence = Nodes[firstCluster].SequenceID;

                for (int second = first + 1; second < clusterCount; second++)
                {
                    int secondCluster = _clusters[second];

                    _currentDistance = distanceMatrix[firstSequence, Nodes[secondCluster].SequenceID];
                    if (_currentDistance < _smallestDistance)
                    {
                        // Strictly closer pair found: remember it.
                        _smallestDistance = _currentDistance;
                        _nextA            = firstCluster;
                        _nextB            = secondCluster;
                    }
                }
            }
        }
        /// <summary>
        /// Builds the symmetric distance matrix for a set of aligned sequences,
        /// scoring every pair with the Kimura distance calculator.
        /// </summary>
        /// <param name="sequences">a set of aligned sequences</param>
        /// <exception cref="ArgumentException">The sequence list is empty.</exception>
        public void GenerateDistanceMatrix(IList<ISequence> sequences)
        {
            int sequenceCount = sequences.Count;
            if (sequenceCount <= 0)
            {
                throw new ArgumentException("empty sequence dataset");
            }

            // One row and one column per sequence.
            _distanceMatrix = new SymmetricDistanceMatrix(sequenceCount);

            // Score each pair below the diagonal and mirror it above the diagonal.
            Parallel.For(1, sequenceCount, PAMSAMMultipleSequenceAligner.ParallelOption, i =>
            {
                ISequence current = sequences[i];
                for (int j = 0; j < i; j++)
                {
                    float score = KimuraDistanceScoreCalculator.CalculateDistanceScore(current, sequences[j]);
                    _distanceMatrix[i, j] = score;
                    _distanceMatrix[j, i] = score;
                }
            });
        }
示例#18
0
        /// <summary>
        /// Fills a new symmetric distance matrix with the Kimura distance score of
        /// every pair of aligned sequences.
        /// </summary>
        /// <param name="sequences">a set of aligned sequences</param>
        /// <exception cref="ArgumentException">The sequence list is empty.</exception>
        public void GenerateDistanceMatrix(IList <ISequence> sequences)
        {
            int total = sequences.Count;
            if (total <= 0)
            {
                throw new ArgumentException("empty sequence dataset");
            }

            // The matrix dimension equals the number of sequences.
            _distanceMatrix = new SymmetricDistanceMatrix(total);

            // Rows are processed in parallel; each row scores itself against all earlier
            // sequences and writes the score to both [row, earlier] and [earlier, row].
            Parallel.For(1, total, PAMSAMMultipleSequenceAligner.ParallelOption, later =>
            {
                int earlier = 0;
                while (earlier < later)
                {
                    float pairScore = KimuraDistanceScoreCalculator.CalculateDistanceScore(sequences[later], sequences[earlier]);
                    _distanceMatrix[later, earlier] = pairScore;
                    _distanceMatrix[earlier, later] = pairScore;
                    ++earlier;
                }
            });
        }
示例#19
0
        /// <summary>
        /// Construct clusters using different update methods.
        /// After the base constructor has run, every sequence is turned into a cluster
        /// and the closest pair is merged repeatedly until a single cluster remains.
        /// </summary>
        /// <param name="distanceMatrix">IDistanceMatrix</param>
        /// <param name="updateDistanceMethodName">which UpdateDistanceMethodsTypes linkage to use</param>
        /// <exception cref="Exception">
        /// An <see cref="OutOfMemoryException"/> was caught during clustering;
        /// it is attached as the inner exception.
        /// </exception>
        public HierarchicalClusteringParallel(IDistanceMatrix distanceMatrix, UpdateDistanceMethodsTypes updateDistanceMethodName)
            : base(distanceMatrix, updateDistanceMethodName)
        {
            // Initialize the clusters
            Initialize(distanceMatrix);

            // Clustering...
            while (_numberOfClusters > 1)
            {
                try
                {
                    GetNextPairOfCluster(distanceMatrix);
                    CreateCluster(distanceMatrix);
                    UpdateClusters();
                    UpdateDistance(distanceMatrix);
                }
                catch (OutOfMemoryException ex)
                {
                    // Chain the caught exception itself (ex.InnerException is normally null)
                    // and spell the message correctly ("Our" -> "Out").
                    throw new Exception("Out of memory", ex);
                }
            }
        }
示例#20
0
 //*****************************************
 // MAFFT: multiple sequence alignment program
 // Copyright (c) 2006 Kazutaka Katoh
 //
 // Redistribution and use in source and binary forms,
 // with or without modification, are permitted provided
 // that the following conditions are met:
 //
 // Redistributions of source code must retain the
 // above copyright notice, this list of conditions
 // and the following disclaimer.  Redistributions in
 // binary form must reproduce the above copyright
 // notice, this list of conditions and the following
 // disclaimer in the documentation and/or other
 // materials provided with the distribution.
 //
 // The name of the author may not be used to endorse
 // or promote products derived from this software without
 // specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
 // BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 // AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 // OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 // OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 // AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 // STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
 // OF SUCH DAMAGE.
 //*********************************************
 /// <summary>
 /// Adapted from MAFFT software:
 /// weighted mixture of minimum and average linkage
 /// d = (1-s)*d_min + s*d_avg
 /// with s fixed at 0.1 in this implementation
 /// </summary>
 ///
 /// <param name="distanceMatrix">distance matrix for the cluster</param>
 /// <param name="nextA">integer number of sequence 1 to be clustered next</param>
 /// <param name="nextB">integer number of sequence 2 to be clustered next</param>
 /// <param name="other">the other cluster whose distance will be updated</param>
 /// <returns>0.9 * min(dA, dB) + 0.1 * (dA + dB) / 2, where dA and dB are the distances from nextA and nextB to other.</returns>
 protected float UpdateWeightedMAFFT(IDistanceMatrix distanceMatrix, int nextA, int nextB, int other)
 {
     var toA = distanceMatrix[nextA, other];
     var toB = distanceMatrix[nextB, other];

     // Minimum-linkage share and average-linkage share of the mixture.
     double minimumPart = 0.9 * Math.Min(toA, toB);
     double averagePart = 0.1 * (toA + toB) / 2;

     return (float)(minimumPart + averagePart);
 }
示例#21
0
 /// <summary>
 /// Construct clusters based on distance matrix.
 /// Delegates to the two-argument constructor with
 /// UpdateDistanceMethodsTypes.Single as the distance update method.
 /// NOTE(review): this comment previously said the default was 'average',
 /// but the code passes Single - confirm which one is intended.
 /// </summary>
 /// <param name="distanceMatrix">IDistanceMatrix</param>
 public HierarchicalClustering(IDistanceMatrix distanceMatrix) 
                 : this(distanceMatrix, UpdateDistanceMethodsTypes.Single)
 {
 }
示例#22
0
 /// <summary>
 /// Construct clusters based on distance matrix.
 /// Delegates to the two-argument constructor; the default distance update
 /// method is 'average' (the enum member is spelled Aaverage here).
 /// </summary>
 /// <param name="distanceMatrix">IDistanceMatrix</param>
 public HierarchicalClusteringParallel(IDistanceMatrix distanceMatrix)
     : this(distanceMatrix, UpdateDistanceMethodsTypes.Aaverage)
 {
 }
 /// <summary>
 /// Lance-Williams update for single linkage: the distance from the cluster formed
 /// by joining <paramref name="joinedA"/> and <paramref name="joinedB"/> to
 /// <paramref name="other"/>.
 /// </summary>
 /// <param name="dm">distance matrix to read the current distances from</param>
 /// <param name="joinedA">first of the two clusters being joined</param>
 /// <param name="joinedB">second of the two clusters being joined</param>
 /// <param name="other">cluster whose distance to the joined cluster is wanted</param>
 /// <returns>0.5*dA + 0.5*dB - 0.5*|dA - dB|, which is algebraically the smaller of dA and dB.</returns>
 public static double LanceWillamsSingleLinkage(this IDistanceMatrix dm, HCCluster joinedA, HCCluster joinedB, HCCluster other)
 {
     // Look each distance up once; the formula uses both of them twice.
     var toA = dm.GetDistance(joinedA, other);
     var toB = dm.GetDistance(joinedB, other);

     return 0.5 * toA
            + 0.5 * toB
            - 0.5 * Math.Abs(toA - toB);
 }
示例#24
0
 /// <summary>
 /// Creates the algorithm and stores the distance matrix it is given.
 /// </summary>
 /// <param name="distanceMatrix">distance matrix kept in the instance field of the same name</param>
 public HCBaseAlgorithm(IDistanceMatrix distanceMatrix)
 {
     this.distanceMatrix = distanceMatrix;
 }
示例#25
0
 /// <summary>
 /// Construct clusters based on distance matrix.
 /// Delegates to the two-argument constructor with
 /// UpdateDistanceMethodsTypes.Single as the distance update method.
 /// NOTE(review): this comment previously said the default was 'average',
 /// but the code passes Single - confirm which one is intended.
 /// </summary>
 /// <param name="distanceMatrix">IDistanceMatrix</param>
 public HierarchicalClustering(IDistanceMatrix distanceMatrix)
     : this(distanceMatrix, UpdateDistanceMethodsTypes.Single)
 {
 }
示例#26
0
 /// <summary>
 /// Complete linkage: the larger of distance[nextA,other] and distance[nextB,other].
 /// </summary>
 /// <param name="distanceMatrix">distance matrix for the cluster</param>
 /// <param name="nextA">integer number of sequence 1 to be clustered next</param>
 /// <param name="nextB">integer number of sequence 2 to be clustered next</param>
 /// <param name="other">the other cluster whose distance will be updated</param>
 /// <returns>The maximum of the two distances.</returns>
 protected float UpdateComplete(IDistanceMatrix distanceMatrix, int nextA, int nextB, int other)
 {
     var toA = distanceMatrix[nextA, other];
     var toB = distanceMatrix[nextB, other];

     return Math.Max(toA, toB);
 }
示例#27
0
        /// <summary>
        /// Construct clusters using different update methods.
        /// Allocates the node, edge and cluster collections, initialises the
        /// sequence-index-to-cluster map to the identity, and selects the distance
        /// update delegate.
        /// </summary>
        /// <param name="distanceMatrix">IDistanceMatrix; its Dimension must be positive</param>
        /// <param name="updateDistanceMethodName">which UpdateDistanceMethodsTypes linkage to use</param>
        /// <exception cref="Exception">
        /// The dimension is not positive, the update method is not recognised,
        /// or an <see cref="OutOfMemoryException"/> was caught during allocation
        /// (attached as the inner exception).
        /// </exception>
        public HierarchicalClustering(IDistanceMatrix distanceMatrix, UpdateDistanceMethodsTypes updateDistanceMethodName)
        {
            if (distanceMatrix.Dimension <= 0)
            {
                throw new Exception("Invalid distance matrix dimension");
            }

            try
            {
                // Capacity hints: room for 2N-1 nodes and 2N-2 edges,
                // where N is the dimension of the distance matrix.
                _nodes = new List<BinaryGuideTreeNode>(distanceMatrix.Dimension * 2 - 1);
                _edges = new List<BinaryGuideTreeEdge>(distanceMatrix.Dimension * 2 - 2);

                // The number of clusters is the number of leaves at the beginning
                // As the algorithm merges clusters, only one cluster remains.
                _clusters = new List<int>(distanceMatrix.Dimension);

                // Construct _indexToCluster: initially index i maps to cluster i
                _indexToCluster = new int[distanceMatrix.Dimension];
                for (int i = 0; i < distanceMatrix.Dimension; ++i)
                {
                    _indexToCluster[i] = i;
                }
            }
            catch (OutOfMemoryException ex)
            {
                // Chain the caught exception itself; ex.InnerException is normally null,
                // which threw away the original failure.
                throw new Exception("Out of memory", ex);
            }

            // Choose a update-distance method
            switch(updateDistanceMethodName)
            {
                case(UpdateDistanceMethodsTypes.Average):
                    _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateAverage);
                    break;
                case(UpdateDistanceMethodsTypes.Single):
                    _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateSingle);
                    break;
                case(UpdateDistanceMethodsTypes.Complete):
                    _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateComplete);
                    break;
                case(UpdateDistanceMethodsTypes.WeightedMAFFT):
                    _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateWeightedMAFFT);
                    break;
                default:
                    throw new Exception("invalid update method");
            }
        }
        /// <summary>
        /// Generate a symmetric distance matrix from a set of unaligned sequences.
        /// A normalized k-mer count dictionary is built for every sequence, then the
        /// distance score of every pair is written to both halves of the matrix.
        /// </summary>
        /// <param name="sequences">a set of unaligned sequences</param>
        /// <exception cref="Exception">
        /// An <see cref="OutOfMemoryException"/> was caught while generating the k-mer counts;
        /// it is attached as the inner exception.
        /// </exception>
        public void GenerateDistanceMatrix(IList<ISequence> sequences)
        {
            // Generate k-mer counting dictionary for each sequence
            try
            {
                _allCountsDictionary = new Dictionary<string,float>[sequences.Count];

                Parallel.For(0, sequences.Count, i =>
                {
                    Dictionary<string, float> currentDictionary = KmerDistanceScoreCalculator.CalculateKmerCounting(sequences[i], _kmerLength);
                    MsaUtils.Normalize(currentDictionary);
                    _allCountsDictionary[i] = currentDictionary;
                });
            }
            catch (OutOfMemoryException ex)
            {
                // Chain the caught exception itself. Its InnerException is normally null,
                // so passing ex.InnerException discarded the original stack trace.
                throw new Exception("Out of memory when generating kmer counting", ex);
            }

            // Construct a SymmetricDistanceMatrix
            // with dimension equals to the number of sequences
            _distanceMatrix = new SymmetricDistanceMatrix(sequences.Count);

            // Fill in DistanceMatrix: only pairs with col < row are computed,
            // and each score is mirrored into [col, row].
            Parallel.For(1, sequences.Count, PAMSAMMultipleSequenceAligner.ParallelOption, row =>
            {
                for (int col = 0; col < row; ++col)
                {
                    float distanceScore = _kmerScoreCalculator.CalculateDistanceScore
                                (_allCountsDictionary[row], _allCountsDictionary[col]);
                    _distanceMatrix[row, col] = distanceScore;
                    _distanceMatrix[col, row] = distanceScore;
                }
            });
        }
示例#29
0
        /// <summary>
        ///     Checks a distance matrix against the expected values stored under an xml node:
        ///     the dimension must match exactly, while the minimum value and every nearest
        ///     distance must occur as a substring of the corresponding expected text.
        /// </summary>
        /// <param name="nodeName">xml node name.</param>
        /// <param name="matrix">distance matrix</param>
        private void ValidateDistanceMatrix(string nodeName, IDistanceMatrix matrix)
        {
            // Expected values from config
            string dimensionText = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Dimension);
            string minimumText = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.MinimumValue);
            string nearestText = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.NearestDistances);

            // Dimension: exact match on the formatted value
            string actualDimension = matrix.Dimension.ToString((IFormatProvider) null);
            Assert.AreEqual(dimensionText, actualDimension);

            // Minimum value: must appear in the expected text
            string actualMinimum = matrix.MinimumValue.ToString((IFormatProvider) null);
            Assert.IsTrue(minimumText.Contains(actualMinimum));

            // Nearest distances: each one must appear in the expected text
            foreach (var nearest in matrix.NearestDistances)
            {
                string actualNearest = nearest.ToString((IFormatProvider) null);
                Assert.IsTrue(nearestText.Contains(actualNearest));
            }
        }
示例#30
0
        /// <summary>
        ///     Builds a HierarchicalClusteringParallel from a distance matrix
        ///     (kmer or kimura) and the given update-distance method.
        /// </summary>
        /// <param name="distanceMatrix">distance matrix.</param>
        /// <param name="hierarchicalClusteringMethodName">Hierarchical clustering method name.</param>
        /// <returns>Hierarchical clustering</returns>
        private static IHierarchicalClustering GetHierarchicalClustering(IDistanceMatrix distanceMatrix,
                                                                         UpdateDistanceMethodsTypes
                                                                             hierarchicalClusteringMethodName)
        {
            return new HierarchicalClusteringParallel(distanceMatrix, hierarchicalClusteringMethodName);
        }
示例#31
0
        /// <summary>
        ///     Creates a hierarchical clustering from a kmer\kimura distance matrix,
        ///     letting the clustering pick its default update-distance method.
        /// </summary>
        /// <param name="distanceMatrix">distance matrix to cluster.</param>
        /// <returns>Hierarchical clustering</returns>
        private static IHierarchicalClustering GetHierarchicalClustering(IDistanceMatrix distanceMatrix)
        {
            // No method name is passed, so the single-argument constructor decides it.
            return new HierarchicalClusteringParallel(distanceMatrix);
        }
示例#32
0
 /// <summary>
 /// Returns the smaller of distance[nextA,other] and distance[nextB,other].
 /// </summary>
 /// <param name="distanceMatrix">distance matrix for the cluster</param>
 /// <param name="nextA">integer number of sequence 1 to be clustered next</param>
 /// <param name="nextB">integer number of sequence 2 to be clustered next</param>
 /// <param name="other">the other cluster whose distance will be updated</param>
 /// <returns>the minimum of the two distances to <paramref name="other"/></returns>
 protected float UpdateSingle(IDistanceMatrix distanceMatrix, int nextA, int nextB, int other)
 {
     var distanceFromA = distanceMatrix[nextA, other];
     var distanceFromB = distanceMatrix[nextB, other];

     return Math.Min(distanceFromA, distanceFromB);
 }
示例#33
0
 // Check out enum UpdateDistanceMethodsTypes for details
 /// <summary>
 /// Returns the arithmetic mean of distance[nextA,other] and distance[nextB,other].
 /// </summary>
 /// <param name="distanceMatrix">distance matrix for the cluster</param>
 /// <param name="nextA">integer number of sequence 1 to be clustered next</param>
 /// <param name="nextB">integer number of sequence 2 to be clustered next</param>
 /// <param name="other">the other cluster whose distance will be updated</param>
 /// <returns>half the sum of the two distances to <paramref name="other"/></returns>
 protected float UpdateAverage(IDistanceMatrix distanceMatrix, int nextA, int nextB, int other)
 {
     var distanceFromA = distanceMatrix[nextA, other];
     var distanceFromB = distanceMatrix[nextB, other];

     return (distanceFromA + distanceFromB) / 2;
 }
示例#34
0
 /// <summary>
 /// Picks the minimum of distance[nextA,other] and distance[nextB,other].
 /// </summary>
 /// <param name="distanceMatrix">distance matrix for the cluster</param>
 /// <param name="nextA">integer number of sequence 1 to be clustered next</param>
 /// <param name="nextB">integer number of sequence 2 to be clustered next</param>
 /// <param name="other">the other cluster whose distance will be updated</param>
 /// <returns>the smaller of the two looked-up distances</returns>
 protected float UpdateSingle(IDistanceMatrix distanceMatrix, int nextA, int nextB, int other)
 {
     var viaA = distanceMatrix[nextA, other];
     var viaB = distanceMatrix[nextB, other];
     var closest = Math.Min(viaA, viaB);

     return closest;
 }
示例#35
0
 // Check out enum UpdateDistanceMethodsTypes for details
 /// <summary>
 /// Returns the arithmetic mean of dm[next1,other] and dm[next2,other].
 /// </summary>
 /// <param name="dm">distance matrix that is read</param>
 /// <param name="next1">first row index looked up against <paramref name="other"/></param>
 /// <param name="next2">second row index looked up against <paramref name="other"/></param>
 /// <param name="other">column index shared by both lookups</param>
 /// <returns>half the sum of the two looked-up distances</returns>
 private float UpdateAverage(IDistanceMatrix dm, int next1, int next2, int other)
 {
     var first = dm[next1, other];
     var second = dm[next2, other];

     return (first + second) / 2;
 }
示例#36
0
 /// <summary>
 /// Adapted from MAFFT software:
 /// weighted mixture of minimum and average linkage
 /// d = (1-s)*d_min + s*d_avg
 /// with s fixed at 0.1 in this implementation
 /// </summary>
 /// <param name="distanceMatrix">distance matrix for the cluster</param>
 /// <param name="nextA">integer number of sequence 1 to be clustered next</param>
 /// <param name="nextB">integer number of sequence 2 to be clustered next</param>
 /// <param name="other">the other cluster whose distance will be updated</param>
 /// <returns>0.9 times the minimum plus 0.1 times the average, narrowed to float</returns>
 protected float UpdateWeightedMAFFT(IDistanceMatrix distanceMatrix, int nextA, int nextB, int other)
 {
     // (1 - s) * d_min
     var minimumTerm = 0.9 * Math.Min(distanceMatrix[nextA, other], distanceMatrix[nextB, other]);

     // s * d_avg
     var averageTerm = 0.1 * (distanceMatrix[nextA, other] + distanceMatrix[nextB, other]) / 2;

     return (float)(minimumTerm + averageTerm);
 }
示例#37
0
 /// <summary>
 /// Returns the smaller of dm[next1,other] and dm[next2,other].
 /// </summary>
 /// <param name="dm">distance matrix that is read</param>
 /// <param name="next1">first row index looked up against <paramref name="other"/></param>
 /// <param name="next2">second row index looked up against <paramref name="other"/></param>
 /// <param name="other">column index shared by both lookups</param>
 /// <returns>the minimum of the two looked-up distances</returns>
 private float UpdateSingle(IDistanceMatrix dm, int next1, int next2, int other)
 {
     var first = dm[next1, other];
     var second = dm[next2, other];

     return Math.Min(first, second);
 }
示例#38
0
 /// <summary>
 /// Returns the larger of distance[nextA,other] and distance[nextB,other].
 /// </summary>
 /// <param name="distanceMatrix">distance matrix for the cluster</param>
 /// <param name="nextA">integer number of sequence 1 to be clustered next</param>
 /// <param name="nextB">integer number of sequence 2 to be clustered next</param>
 /// <param name="other">the other cluster whose distance will be updated</param>
 /// <returns>the maximum of the two distances to <paramref name="other"/></returns>
 protected float UpdateComplete(IDistanceMatrix distanceMatrix, int nextA, int nextB, int other)
 {
     var distanceFromA = distanceMatrix[nextA, other];
     var distanceFromB = distanceMatrix[nextB, other];

     return Math.Max(distanceFromA, distanceFromB);
 }
示例#39
0
 /// <summary>
 /// Returns the larger of dm[next1,other] and dm[next2,other].
 /// </summary>
 /// <param name="dm">distance matrix that is read</param>
 /// <param name="next1">first row index looked up against <paramref name="other"/></param>
 /// <param name="next2">second row index looked up against <paramref name="other"/></param>
 /// <param name="other">column index shared by both lookups</param>
 /// <returns>the maximum of the two looked-up distances</returns>
 private float UpdateComplete(IDistanceMatrix dm, int next1, int next2, int other)
 {
     var first = dm[next1, other];
     var second = dm[next2, other];

     return Math.Max(first, second);
 }
示例#40
0
 // Check out enum UpdateDistanceMethodsTypes for details
 /// <summary>
 /// Averages distance[nextA,other] and distance[nextB,other].
 /// </summary>
 /// <param name="distanceMatrix">distance matrix for the cluster</param>
 /// <param name="nextA">integer number of sequence 1 to be clustered next</param>
 /// <param name="nextB">integer number of sequence 2 to be clustered next</param>
 /// <param name="other">the other cluster whose distance will be updated</param>
 /// <returns>the sum of the two looked-up distances divided by two</returns>
 protected float UpdateAverage(IDistanceMatrix distanceMatrix, int nextA, int nextB, int other)
 {
     var viaA = distanceMatrix[nextA, other];
     var viaB = distanceMatrix[nextB, other];

     return (viaA + viaB) / 2;
 }
示例#41
0
 /// <summary>
 /// Weighted mixture of the minimum and the average of dm[next1,other] and
 /// dm[next2,other]: 0.9 * minimum + 0.1 * average.
 /// </summary>
 /// <param name="dm">distance matrix that is read</param>
 /// <param name="next1">first row index looked up against <paramref name="other"/></param>
 /// <param name="next2">second row index looked up against <paramref name="other"/></param>
 /// <param name="other">column index shared by all lookups</param>
 /// <returns>the weighted mixture, narrowed to float</returns>
 private float UpdateWeightedMAFFT(IDistanceMatrix dm, int next1, int next2, int other)
 {
     const double minimumWeight = 0.9;
     const double averageWeight = 0.1;

     return (float)(minimumWeight * Math.Min(dm[next1, other], dm[next2, other])
                    + averageWeight * (dm[next1, other] + dm[next2, other]) / 2);
 }