Beispiel #1
0
        public void testBug2()
        {
            // Read the DNA benchmark alignment from the test data directory.
            string benchmarkPath = @"TestUtils\122_raw.afa".TestDir();
            FastAParser fastaReader = new FastAParser();
            IList<ISequence> referenceAlignment = fastaReader.Parse(benchmarkPath).ToList();

            // The aligner is fed the un-aligned form of the parsed sequences.
            List<ISequence> rawSequences = MsaUtils.UnAlign(referenceAlignment);

            // Static aligner switches used by this regression test.
            PAMSAMMultipleSequenceAligner.FasterVersion = false;
            PAMSAMMultipleSequenceAligner.UseWeights = false;
            PAMSAMMultipleSequenceAligner.UseStageB = false;
            PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

            // Scoring and k-mer settings.
            int openCost = -13;
            int extendCost = -5;
            int kmerSize = 2;

            // Fixed values instead of Environment.ProcessorCount (degrees) and
            // Environment.ProcessorCount * 2 (partitions).
            int degrees = 2;
            int partitions = 16;

            SimilarityMatrix dnaMatrix =
                new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);

            var aligner = new PAMSAMMultipleSequenceAligner(
                rawSequences,
                kmerSize,
                DistanceFunctionTypes.EuclideanDistance,
                UpdateDistanceMethodsTypes.Average,
                ProfileAlignerNames.NeedlemanWunschProfileAligner,
                ProfileScoreFunctionNames.InnerProductFast,
                dnaMatrix,
                openCost,
                extendCost,
                partitions,
                degrees);

            // The test only checks that the aligner exposes a result.
            Assert.IsNotNull(aligner.AlignedSequences);
        }
Beispiel #2
0
        public void testBug3()
        {
            // Test on DNA benchmark dataset
            ISequenceParser parser = new FastaParser();

            try
            {
                string filepath = @"TestUtils\122_raw.afa";

                MoleculeType mt = MoleculeType.DNA;

                IList<ISequence> orgSequences = parser.Parse(filepath);

                List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

                PAMSAMMultipleSequenceAligner.FasterVersion = false;
                PAMSAMMultipleSequenceAligner.UseWeights    = false;
                PAMSAMMultipleSequenceAligner.UseStageB     = false;
                PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

                int gapOpenPenalty   = -13;
                int gapExtendPenalty = -5;
                int kmerLength       = 2;

                int numberOfDegrees    = 2;  // fixed rather than Environment.ProcessorCount
                int numberOfPartitions = 16; // fixed rather than Environment.ProcessorCount * 2

                DistanceFunctionTypes      distanceFunctionName             = DistanceFunctionTypes.EuclideanDistance;
                UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
                ProfileAlignerNames        profileAlignerName               = ProfileAlignerNames.NeedlemanWunschProfileAligner;
                ProfileScoreFunctionNames  profileProfileFunctionName       = ProfileScoreFunctionNames.InnerProductFast;

                // Pick the similarity matrix that matches the molecule type.
                SimilarityMatrix similarityMatrix = null;

                switch (mt)
                {
                case (MoleculeType.DNA):
                    similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
                    break;

                case (MoleculeType.RNA):
                    similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
                    break;

                case (MoleculeType.Protein):
                    similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
                    break;

                default:
                    throw new InvalidDataException("Invalid molecular type");
                }

                PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner
                                                        (sequences, mt, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                                                        profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
                                                        numberOfPartitions, numberOfDegrees);

                Assert.IsNotNull(msa.AlignedSequences);
            }
            finally
            {
                // Release the parser even when parsing, alignment or the assertion
                // throws; previously it was only disposed on the success path.
                ((FastaParser)parser).Dispose();
            }
        }
Beispiel #3
0
        /// <summary>
        /// Allocates the node, edge and cluster containers for the given distance
        /// matrix and selects the update-distance method used by the clustering.
        /// </summary>
        /// <param name="distanceMatrix">Distance matrix; its Dimension must be greater than zero.</param>
        /// <param name="updateDistanceMethodName">Which update-distance method to use.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="distanceMatrix"/> is null.</exception>
        /// <exception cref="Exception">
        /// Thrown when the matrix dimension is not positive, when the containers cannot
        /// be allocated, or when the update method is not one of the handled values.
        /// </exception>
        public HierarchicalClustering(IDistanceMatrix distanceMatrix, UpdateDistanceMethodsTypes updateDistanceMethodName)
        {
            if (distanceMatrix == null)
            {
                throw new ArgumentNullException("distanceMatrix");
            }

            if (distanceMatrix.Dimension <= 0)
            {
                throw new Exception("Invalid distance matrix dimension");
            }

            try
            {
                // With N = distanceMatrix.Dimension leaves, a rooted binary tree has
                // N-1 internal nodes, i.e. 2N-1 nodes and 2N-2 edges in total.
                // These are capacity hints only; the lists start empty.
                _nodes = new List<BinaryGuideTreeNode>(distanceMatrix.Dimension * 2 - 1);
                _edges = new List<BinaryGuideTreeEdge>(distanceMatrix.Dimension * 2 - 2);

                // The number of clusters is the number of leaves at the beginning.
                // As the algorithm merges clusters, only one cluster remains.
                _clusters = new List<int>(distanceMatrix.Dimension);

                // Initially every matrix index maps to the cluster with the same id.
                _indexToCluster = new int[distanceMatrix.Dimension];
                for (int i = 0; i < distanceMatrix.Dimension; ++i)
                {
                    _indexToCluster[i] = i;
                }
            }
            catch (OutOfMemoryException ex)
            {
                // Chain the caught exception itself so the original stack trace is
                // preserved; ex.InnerException is normally null and would lose it.
                throw new Exception("Out of memory", ex);
            }

            // Choose an update-distance method.
            switch (updateDistanceMethodName)
            {
            case (UpdateDistanceMethodsTypes.Average):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateAverage);
                break;

            case (UpdateDistanceMethodsTypes.Single):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateSingle);
                break;

            case (UpdateDistanceMethodsTypes.Complete):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateComplete);
                break;

            case (UpdateDistanceMethodsTypes.WeightedMAFFT):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateWeightedMAFFT);
                break;

            default:
                throw new Exception("invalid update method");
            }
        }
Beispiel #4
0
        /// <summary>
        /// Selects the update-distance method, initializes the clusters from the
        /// distance matrix and then repeats the merge steps until a single cluster
        /// remains.
        /// </summary>
        /// <param name="distanceMatrix">Distance matrix; its Dimension must be greater than zero.</param>
        /// <param name="updateDistanceMethodName">Which update-distance method to use.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="distanceMatrix"/> is null.</exception>
        /// <exception cref="Exception">
        /// Thrown when the matrix dimension is not positive or when the update method
        /// is not one of the handled values.
        /// </exception>
        public HierarchicalClusteringParallel(IDistanceMatrix distanceMatrix, UpdateDistanceMethodsTypes updateDistanceMethodName)
        {
            if (distanceMatrix == null)
            {
                throw new ArgumentNullException("distanceMatrix");
            }

            if (distanceMatrix.Dimension <= 0)
            {
                throw new Exception("Invalid distance matrix dimension");
            }

            // With N = distanceMatrix.Dimension leaves, a rooted binary tree has
            // N-1 internal nodes, i.e. 2N-1 nodes and 2N-2 edges in total.
            // These are capacity hints only; the lists start empty.
            _nodes = new List<BinaryGuideTreeNode>(distanceMatrix.Dimension * 2 - 1);
            _edges = new List<BinaryGuideTreeEdge>(distanceMatrix.Dimension * 2 - 2);

            // The number of clusters is the number of leaves at the beginning.
            // As the algorithm merges clusters, only one cluster remains.
            _clusters = new List<int>(distanceMatrix.Dimension);

            // Choose an update-distance method.
            // NOTE(review): "Aaverage" is presumably the spelling declared by the enum
            // in this version of the code base - confirm before renaming.
            switch (updateDistanceMethodName)
            {
            case (UpdateDistanceMethodsTypes.Aaverage):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateAverage);
                break;

            case (UpdateDistanceMethodsTypes.Single):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateSingle);
                break;

            case (UpdateDistanceMethodsTypes.Complete):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateComplete);
                break;

            case (UpdateDistanceMethodsTypes.WeightedMAFFT):
                _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateWeightedMAFFT);
                break;

            default:
                throw new Exception("invalid update method");
            }

            // Initialize the clusters
            Initialize(distanceMatrix);

            // Clustering: keep merging until one cluster is left.
            while (_numberOfClusters > 1)
            {
                GetNextPairOfCluster(distanceMatrix);
                CreateCluster();
                UpdateDistance(distanceMatrix);
                UpdateClusters();
            }
        }
Beispiel #5
0
        public void TestMuscleMultipleSequenceAlignment()
        {
            // Map every item of the template sequence to its position and install
            // that map as the shared Profiles.ItemSet.
            ISequence alphabetTemplate = new Sequence(Alphabets.DNA, "ATGCSWRYKMBVHDN-");
            var itemPositions = new Dictionary<ISequenceItem, int>();
            for (int position = 0; position < alphabetTemplate.Count; position++)
            {
                itemPositions.Add(alphabetTemplate[position], position);
            }

            Profiles.ItemSet = itemPositions;

            // Scoring configuration.
            SimilarityMatrix dnaMatrix =
                new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrices.AmbiguousDna);
            int openCost = -8;
            int extendCost = -1;
            int kmerSize = 3;

            // Three short DNA inputs.
            ISequence first = new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT");
            ISequence second = new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG");
            ISequence third = new Sequence(Alphabets.DNA, "GGGACAAAATCAG");
            List<ISequence> inputs = new List<ISequence> { first, second, third };

            var aligner = new MuscleMultipleSequenceAlignment(
                inputs,
                MoleculeType.DNA,
                kmerSize,
                DistanceFunctionTypes.EuclieanDistance,
                UpdateDistanceMethodsTypes.Aaverage,
                ProfileAlignerNames.NeedlemanWunschProfileAligner,
                ProfileScoreFunctionNames.WeightedInnerProduct,
                dnaMatrix,
                openCost,
                extendCost);

            // Expected gapped rows, in the same order as the inputs.
            ISequence[] expectedRows =
            {
                new Sequence(Alphabets.DNA, "GGGA---AAAATCAGATT"),
                new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG---"),
                new Sequence(Alphabets.DNA, "GGGA--CAAAATCAG---")
            };

            for (int row = 0; row < expectedRows.Length; row++)
            {
                Assert.AreEqual(expectedRows[row].ToString(), aligner.AlignedSequences[row].ToString());
            }

            Assert.AreEqual(46, aligner.AlignmentScore);
        }
Beispiel #6
0
        /// <summary>
        /// Runs the clustering on top of the base-class setup: initializes the
        /// clusters from the distance matrix and repeats the merge steps until a
        /// single cluster remains.
        /// </summary>
        /// <param name="distanceMatrix">Distance matrix, also passed to the base constructor.</param>
        /// <param name="updateDistanceMethodName">Which update-distance method to use; passed to the base constructor.</param>
        /// <exception cref="Exception">
        /// Thrown when a merge step runs out of memory; the original
        /// OutOfMemoryException is attached as the inner exception.
        /// </exception>
        public HierarchicalClusteringParallel(IDistanceMatrix distanceMatrix, UpdateDistanceMethodsTypes updateDistanceMethodName)
            : base(distanceMatrix, updateDistanceMethodName)
        {
            // Initialize the clusters
            Initialize(distanceMatrix);

            // Clustering: keep merging until one cluster is left.
            while (_numberOfClusters > 1)
            {
                try
                {
                    GetNextPairOfCluster(distanceMatrix);
                    CreateCluster(distanceMatrix);
                    UpdateClusters();
                    UpdateDistance(distanceMatrix);
                }
                catch (OutOfMemoryException ex)
                {
                    // Chain the caught exception itself so the original stack trace is
                    // preserved; ex.InnerException is normally null and would lose it.
                    throw new Exception("Out of memory", ex);
                }
            }
        }
        // $TODO: Change the above namespace after PhaseOne changes
        /// <summary>
        /// Runs the PAMSAM multiple sequence aligner over the given sequences with a
        /// fixed set of sample parameters (ambiguous-DNA similarity matrix, gap open
        /// -4, gap extend -1, k-mer length 3).
        /// </summary>
        /// <param name="sequences">Sequences to align.</param>
        /// <returns>The AlignedSequences of the constructed aligner.</returns>
        static IList<ISequence> DoMultipleSequenceAlignment(List<ISequence> sequences)
        {
            // $TODO: Change the signature after PAMSAM PhaseOne is checked in

            // Constructor arguments.
            // $TODO: Change this after PAMSAM PhaseOne is checked in
            SimilarityMatrix dnaMatrix =
                new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
            int openCost = -4;
            int extendCost = -1;
            int kmerSize = 3;

            // Partition and degree counts are derived from the processor count.
            int processors = Environment.ProcessorCount;

            // Build the aligner and hand back its result.
            var aligner = new PAMSAMMultipleSequenceAligner(
                sequences,
                kmerSize,
                DistanceFunctionTypes.EuclideanDistance,
                UpdateDistanceMethodsTypes.Average,
                ProfileAlignerNames.NeedlemanWunschProfileAligner,
                ProfileScoreFunctionNames.WeightedInnerProduct,
                dnaMatrix,
                openCost,
                extendCost,
                processors * 2,
                processors);

            return aligner.AlignedSequences;
        }
Beispiel #8
0
        /// <summary>
        ///     Validate PAMSAM multiple sequence alignment with explicit gap open and
        ///     gap extend penalties.
        /// </summary>
        /// <param name="nodeName">xml node name</param>
        /// <param name="moleculeType">molecule type; only used in the log message</param>
        /// <param name="expectedScoreNode">Expected score node</param>
        /// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
        /// <param name="distanceFunctionName">kmerdistancematrix method name.</param>
        /// <param name="profileAlignerName">SW/NW profiler</param>
        /// <param name="profileScoreName">Profile score function name.</param>
        /// <param name="gpOpenPenalty">Gap open penalty</param>
        /// <param name="gpExtendPenalty">Gap extended penalty</param>
        /// <param name="IsAlignedLargeSeq">
        ///     When true, every aligned sequence is also compared with the expected
        ///     sequence at the same index; when false only the score is checked.
        /// </param>
        private void ValidatePamsamAlignWithGapCost(
            string nodeName, MoleculeType moleculeType, string expectedScoreNode,
            UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
            DistanceFunctionTypes distanceFunctionName,
            ProfileAlignerNames profileAlignerName,
            ProfileScoreFunctionNames profileScoreName,
            int gpOpenPenalty, int gpExtendPenalty, bool IsAlignedLargeSeq)
        {
            // Presumably loads lstSequences, expectedSequences, expectedScore and the
            // shared aligner settings used below - verify against Initialize.
            Initialize(nodeName, expectedScoreNode);

            // MSA aligned sequences with specified gap costs.
            var msa = new PAMSAMMultipleSequenceAligner(lstSequences,
                                                        kmerLength, distanceFunctionName,
                                                        hierarchicalClusteringMethodName,
                                                        profileAlignerName, profileScoreName, similarityMatrix,
                                                        gpOpenPenalty,
                                                        gpExtendPenalty, 2, 2);

            // Validate the aligned sequences. The flag cannot change while looping,
            // so it is tested once instead of on every iteration.
            if (IsAlignedLargeSeq)
            {
                int index = 0;
                foreach (ISequence seq in msa.AlignedSequences)
                {
                    string expected = new string(expectedSequences[index].Select(a => (char) a).ToArray());
                    string actual   = new string(seq.Select(a => (char) a).ToArray());

                    // Expected value first, so a failure message labels the two
                    // strings correctly.
                    Assert.AreEqual(expected, actual);
                    index++;
                }
            }

            // Validate the score.
            Assert.IsTrue(expectedScore.Contains(msa.AlignmentScore.ToString((IFormatProvider) null)));
            ApplicationLog.WriteLine(String.Format(null,
                                                   "PamsamP1Test:: Pamsam alignment completed successfully with equal gap cost for {0} moleculetype with all default params",
                                                   moleculeType.ToString()));
        }
Beispiel #9
0
        /// <summary>
        ///     Validate PAMSAM multiple sequence alignment.
        /// </summary>
        /// <param name="nodeName">xml node name</param>
        /// <param name="expectedScoreNode">Expected score node</param>
        /// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
        /// <param name="distanceFunctionName">kmerdistancematrix method name.</param>
        /// <param name="profileAlignerName">SW/NW profiler</param>
        /// <param name="profileScoreName">Profile score function name.</param>
        /// <param name="isWeightedProduct">
        ///     When true, every aligned sequence is also compared with the expected
        ///     sequence at the same index; when false only the score is checked.
        /// </param>
        private void ValidatePamsamAlign(string nodeName,
                                         string expectedScoreNode,
                                         UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
                                         DistanceFunctionTypes distanceFunctionName,
                                         ProfileAlignerNames profileAlignerName,
                                         ProfileScoreFunctionNames profileScoreName,
                                         bool isWeightedProduct)
        {
            // Presumably loads lstSequences, expectedSequences, expectedScore and the
            // shared aligner settings used below - verify against Initialize.
            Initialize(nodeName, expectedScoreNode);

            // MSA aligned sequences.
            var msa = new PAMSAMMultipleSequenceAligner(lstSequences,
                                                        kmerLength, distanceFunctionName,
                                                        hierarchicalClusteringMethodName,
                                                        profileAlignerName, profileScoreName,
                                                        similarityMatrix, gapOpenPenalty, gapExtendPenalty, 2, 2);

            // The flag cannot change while looping, so it is tested once instead of
            // on every iteration.
            if (isWeightedProduct)
            {
                int index = 0;
                foreach (ISequence seq in msa.AlignedSequences)
                {
                    string expected = new string(expectedSequences[index].Select(a => (char) a).ToArray());
                    string actual   = new string(seq.Select(a => (char) a).ToArray());

                    // Expected value first, so a failure message labels the two
                    // strings correctly.
                    Assert.AreEqual(expected, actual);
                    index++;
                }
            }

            Assert.IsTrue(expectedScore.Contains(msa.AlignmentScore.ToString((IFormatProvider) null)));
        }
Beispiel #10
0
        public void TestMsaBenchMark()
        {
            // Every file in this directory is aligned and scored against its own
            // reference alignment.
            string        fileDirectory = @"TestUtils\FASTA\Protein\Balibase\RV911\";
            DirectoryInfo iD            = new DirectoryInfo(fileDirectory);

            PAMSAMMultipleSequenceAligner.FasterVersion = false;
            PAMSAMMultipleSequenceAligner.UseWeights    = false;
            PAMSAMMultipleSequenceAligner.UseStageB     = true;
            PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

            int gapOpenPenalty   = -20;
            int gapExtendPenalty = -5;
            int kmerLength       = 4;

            int numberOfDegrees    = 2;  // fixed rather than Environment.ProcessorCount
            int numberOfPartitions = 16; // fixed rather than Environment.ProcessorCount * 2

            DistanceFunctionTypes      distanceFunctionName             = DistanceFunctionTypes.EuclideanDistance;
            UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
            ProfileAlignerNames        profileAlignerName               = ProfileAlignerNames.NeedlemanWunschProfileAligner;
            ProfileScoreFunctionNames  profileProfileFunctionName       = ProfileScoreFunctionNames.WeightedInnerProductCached;

            SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

            // Per-file Q and TC scores, averaged at the end.
            List<float> allQ  = new List<float>();
            List<float> allTC = new List<float>();

            foreach (FileInfo fi in iD.GetFiles())
            {
                String filePath = fi.FullName;
                Console.WriteLine(filePath);
                FastAParser parser = new FastAParser(filePath);

                try
                {
                    parser.Alphabet = AmbiguousProteinAlphabet.Instance;
                    IList<ISequence> orgSequences = parser.Parse().ToList();

                    List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

                    // The header lines are kept so the console output is unchanged;
                    // the sequence dumps that used to follow them were disabled.
                    Console.WriteLine("The number of sequences is: {0}", orgSequences.Count);
                    Console.WriteLine("Original unaligned sequences are:");
                    Console.WriteLine("Original aligned sequences are:");

                    PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner
                                                            (sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                                                            profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
                                                            numberOfPartitions, numberOfDegrees);

                    Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
                    Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
                    Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
                    Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

                    // Score the computed alignment against the reference alignment
                    // that was read from the file.
                    float scoreQ  = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
                    float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
                    allQ.Add(scoreQ);
                    allTC.Add(scoreTC);
                    Console.WriteLine("Alignment score Q is: {0}", scoreQ);
                    Console.WriteLine("Alignment score TC is: {0}", scoreTC);
                }
                finally
                {
                    // Release the parser for this file even when parsing or alignment
                    // throws, so a failing data set does not leak it.
                    parser.Dispose();
                }
            }

            Console.WriteLine("Number of datasets is: {0}", allQ.Count);
            Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
            Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
        }
Beispiel #11
0
        public void TestMsaBenchMarkOnBralibase()
        {
            // Per-file Q and TC scores, averaged at the end.
            List<float> allQ  = new List<float>();
            List<float> allTC = new List<float>();

            // Every file in every sub-directory is aligned and scored against its
            // own reference alignment.
            string        fileDirectory = @"testData\FASTA\RNA\k10";
            DirectoryInfo iD            = new DirectoryInfo(fileDirectory);

            PAMSAMMultipleSequenceAligner.FasterVersion = false;
            PAMSAMMultipleSequenceAligner.UseWeights    = false;
            PAMSAMMultipleSequenceAligner.UseStageB     = false;
            PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

            MoleculeType     mt = MoleculeType.RNA;
            SimilarityMatrix similarityMatrix;
            int gapOpenPenalty   = -20;
            int gapExtendPenalty = -5;
            int kmerLength       = 4;

            int numberOfDegrees    = 2;  // fixed rather than Environment.ProcessorCount
            int numberOfPartitions = 16; // fixed rather than Environment.ProcessorCount * 2

            DistanceFunctionTypes      distanceFunctionName             = DistanceFunctionTypes.EuclideanDistance;
            UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
            ProfileAlignerNames        profileAlignerName               = ProfileAlignerNames.NeedlemanWunschProfileAligner;
            ProfileScoreFunctionNames  profileProfileFunctionName       = ProfileScoreFunctionNames.WeightedInnerProductCached;

            // Pick the similarity matrix that matches the molecule type.
            switch (mt)
            {
            case (MoleculeType.DNA):
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
                break;

            case (MoleculeType.RNA):
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
                break;

            case (MoleculeType.Protein):
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
                break;

            default:
                throw new Exception("Invalid molecular type");
            }

            foreach (DirectoryInfo fi in iD.GetDirectories())
            {
                foreach (FileInfo fiii in fi.GetFiles())
                {
                    String filePath = fiii.FullName;
                    Console.WriteLine(filePath);
                    ISequenceParser parser = new FastaParser();

                    try
                    {
                        IList<ISequence> orgSequences = parser.Parse(filePath);

                        List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

                        // The header line is kept so the console output is unchanged.
                        Console.WriteLine("The number of sequences is: {0}", orgSequences.Count);
                        Console.WriteLine("Original unaligned sequences are:");

                        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner
                                                                (sequences, mt, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                                                                profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
                                                                numberOfPartitions, numberOfDegrees);

                        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

                        // Score the computed alignment against the reference alignment
                        // that was read from the file.
                        float scoreQ  = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
                        float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
                        allQ.Add(scoreQ);
                        allTC.Add(scoreTC);
                        Console.WriteLine("Alignment score Q is: {0}", scoreQ);
                        Console.WriteLine("Alignment score TC is: {0}", scoreTC);

                        // Print running averages every 1000 data sets.
                        if (allQ.Count % 1000 == 0)
                        {
                            Console.WriteLine(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
                            Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
                            Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
                        }
                    }
                    finally
                    {
                        // A parser is created per file, so release it per file, even
                        // when parsing or alignment throws.
                        // NOTE(review): assumes FastaParser exposes Dispose() in this
                        // version of the library - confirm.
                        ((FastaParser)parser).Dispose();
                    }
                }
            }

            Console.WriteLine("number of datasets is: {0}", allQ.Count);
            Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
            Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
        }
Beispiel #12
0
        public void TestMuscleMultipleSequenceAlignment()
        {
            SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
            int gapOpenPenalty   = -4;
            int gapExtendPenalty = -1;
            int kmerLength       = 3;

            ISequence        seqA      = new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT");
            ISequence        seqB      = new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG");
            ISequence        seqC      = new Sequence(Alphabets.DNA, "GGGACAAAATCAG");
            List <ISequence> sequences = new List <ISequence>();

            sequences.Add(seqA);
            sequences.Add(seqB);
            sequences.Add(seqC);

            DistanceFunctionTypes      distanceFunctionName             = DistanceFunctionTypes.EuclideanDistance;
            UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
            ProfileAlignerNames        profileAlignerName         = ProfileAlignerNames.NeedlemanWunschProfileAligner;
            ProfileScoreFunctionNames  profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

            PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner
                                                    (sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                                                    profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
                                                    Environment.ProcessorCount * 2, Environment.ProcessorCount);

            Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
            for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
            {
                Console.WriteLine(new string(msa.AlignedSequencesA[i].Select(a => (char)a).ToArray()));
            }

            Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
            for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
            {
                Console.WriteLine(new string(msa.AlignedSequencesC[i].Select(a => (char)a).ToArray()));
            }
            Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

            for (int i = 0; i < msa.AlignedSequences.Count; ++i)
            {
                Console.WriteLine(new string(msa.AlignedSequences[i].Select(a => (char)a).ToArray()));
            }

            // Test case 2
            Console.WriteLine("Example 2");
            sequences = new List <ISequence>();
            sequences.Add(new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGAAATCG"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGAATCAATCAG"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGAATCTTATCAG"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGACAAAATCAG"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGACAAAATCAG"));


            msa = new PAMSAMMultipleSequenceAligner
                      (sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                      profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
                      Environment.ProcessorCount * 2, Environment.ProcessorCount);

            Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
            for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
            {
                Console.WriteLine(new string(msa.AlignedSequencesA[i].Select(a => (char)a).ToArray()));
            }

            Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
            for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
            {
                Console.WriteLine(new string(msa.AlignedSequencesC[i].Select(a => (char)a).ToArray()));
            }
            Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
            for (int i = 0; i < msa.AlignedSequences.Count; ++i)
            {
                Console.WriteLine(new string(msa.AlignedSequences[i].Select(a => (char)a).ToArray()));
            }

            // Test case e
            Console.WriteLine("Example 2");
            sequences = new List <ISequence>();
            sequences.Add(new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGAAATCG"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGAATCAATCAG"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGACAAAATCAG"));
            sequences.Add(new Sequence(Alphabets.DNA, "GGGAATCTTATCAG"));


            msa = new PAMSAMMultipleSequenceAligner
                      (sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                      profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
                      Environment.ProcessorCount * 2, Environment.ProcessorCount);

            Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
            for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
            {
                Console.WriteLine(new string(msa.AlignedSequencesA[i].Select(a => (char)a).ToArray()));
            }

            Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
            for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
            {
                Console.WriteLine(new string(msa.AlignedSequencesC[i].Select(a => (char)a).ToArray()));
            }
            Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
            for (int i = 0; i < msa.AlignedSequences.Count; ++i)
            {
                Console.WriteLine(new string(msa.AlignedSequences[i].Select(a => (char)a).ToArray()));
            }
        }
Beispiel #13
0
        /// <summary>
        /// Runs the PAMSAM multiple sequence aligner on the sequences of the configured
        /// reference file, measures elapsed time and the change in managed memory around
        /// the aligner construction, and prints the score and the aligned sequences.
        /// </summary>
        public void PerformPAMSAMPerf()
        {
            // Input file locations come from the XML test configuration.
            string refPath = Utility._xmlUtil.GetTextValue(
                Constants.PamsamNode, Constants.RefFilePathNode);
            string queryPath = Utility._xmlUtil.GetTextValue(
                Constants.PamsamNode, Constants.QueryFilePathNode);

            // Both file names are reported in the test case header.
            List<string> lstInputFiles = new List<string> { refPath, queryPath };

            // Parse the query file first, then the reference file.
            // Only the reference sequences are used for the alignment below.
            ISequenceParser parser = new FastaParser();
            IList<ISequence> querySequences = parser.Parse(queryPath);
            IList<ISequence> orgSequences = parser.Parse(refPath);

            // Un-align the reference sequences before handing them to the aligner.
            List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

            // Static aligner switches.
            PAMSAMMultipleSequenceAligner.FasterVersion = true;
            PAMSAMMultipleSequenceAligner.UseWeights = false;
            PAMSAMMultipleSequenceAligner.UseStageB = false;
            PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

            // Alignment parameters.
            int gapOpenPenalty = -13;
            int gapExtendPenalty = -5;
            int kmerLength = 2;
            int numberOfDegrees = 2;
            int numberOfPartitions = 4;

            DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
            UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
            ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
            ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.InnerProduct;

            SimilarityMatrix similarityMatrix =
                new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);

            // Start timing, then take the memory baseline.
            Stopwatch stopwatch = new Stopwatch();
            stopwatch.Reset();
            stopwatch.Start();
            long memoryStart = GC.GetTotalMemory(true);

            // The parallel option is only set when the aligner is constructed through
            // this constructor, so use it even for a plain performance run.
            msa = new PAMSAMMultipleSequenceAligner(
                sequences, MoleculeType.DNA, kmerLength, distanceFunctionName,
                hierarchicalClusteringMethodName, profileAlignerName,
                profileProfileFunctionName, similarityMatrix, gapOpenPenalty,
                gapExtendPenalty, numberOfPartitions, numberOfDegrees);

            // Stop timing, then take the final memory reading.
            stopwatch.Stop();
            long memoryEnd = GC.GetTotalMemory(true);
            long memoryDelta = memoryEnd - memoryStart;
            string memoryUsed = memoryDelta.ToString();

            // Report input files, elapsed time and memory usage.
            DisplayTestCaseHeader(lstInputFiles, stopwatch, memoryUsed, "PAMSAM");

            Console.WriteLine(
                "PAMSAM SequenceAligner method, Alignment Score is : {0}",
                msa.AlignmentScore.ToString());

            int position = 0;
            foreach (ISequence alignedSequence in msa.AlignedSequences)
            {
                Console.WriteLine("PAMSAM Aligned Seq {0}:{1}", position, alignedSequence.ToString());
                position++;
            }
        }
Beispiel #14
0
        /// <summary>
        /// Construct an aligner, validate the arguments and immediately align the
        /// input sequences (the constructor ends by calling <c>Align</c>).
        /// </summary>
        /// <param name="sequences">input sequences</param>
        /// <param name="kmerLength">positive integer of kmer length</param>
        /// <param name="distanceFunctionName">enum: distance function name</param>
        /// <param name="hierarchicalClusteringMethodName">enum: cluster update method</param>
        /// <param name="profileAlignerMethodName">enum: profile-profile aligner name</param>
        /// <param name="profileFunctionName">enum: profile-profile distance function</param>
        /// <param name="similarityMatrix">similarity matrix; its name must be one accepted for the alphabet of the sequences</param>
        /// <param name="gapOpenPenalty">negative gapOpenPenalty</param>
        /// <param name="gapExtendPenalty">negative gapExtendPenalty</param>
        /// <param name="numberOfPartitions">the number of partitions in dynamic programming; must be positive</param>
        /// <param name="degreeOfParallelism">degree of parallelism option for parallel extension; must be positive</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="sequences"/> or <paramref name="similarityMatrix"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// The sequence list is empty, <paramref name="degreeOfParallelism"/> or
        /// <paramref name="numberOfPartitions"/> is not positive, the alphabet of the first
        /// sequence is not DNA, RNA or protein, or the similarity matrix does not match
        /// that alphabet.
        /// </exception>
        public PAMSAMMultipleSequenceAligner(
            IList <ISequence> sequences,
            int kmerLength,
            DistanceFunctionTypes distanceFunctionName,
            UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
            ProfileAlignerNames profileAlignerMethodName,
            ProfileScoreFunctionNames profileFunctionName,
            SimilarityMatrix similarityMatrix,
            int gapOpenPenalty,
            int gapExtendPenalty,
            int numberOfPartitions,
            int degreeOfParallelism)
        {
            Performance.Start();

            if (null == sequences)
            {
                throw new ArgumentNullException("sequences");
            }

            if (sequences.Count == 0)
            {
                throw new ArgumentException("Empty input sequences");
            }

            // Set parallel extension option.
            // The option is stored in a static field, so it is shared by every aligner instance.
            if (degreeOfParallelism <= 0)
            {
                throw new ArgumentException("Invalid parallel degree parameter");
            }
            PAMSAMMultipleSequenceAligner.parallelOption = new ParallelOptions {
                MaxDegreeOfParallelism = degreeOfParallelism
            };

            if (numberOfPartitions <= 0)
            {
                throw new ArgumentException("Invalid number of partition parameter");
            }
            _numberOfPartitions = numberOfPartitions;

            // Validate data type: every sequence must share the base alphabet of the first one.
            // NOTE(review): the exception is thrown inside Parallel.For, so it presumably
            // reaches the caller wrapped in an AggregateException - confirm before relying on it.
            _alphabet = sequences[0].Alphabet;
            Parallel.For(1, sequences.Count, PAMSAMMultipleSequenceAligner.parallelOption, i =>
            {
                if (!Alphabets.CheckIsFromSameBase(sequences[i].Alphabet, _alphabet))
                {
                    throw new ArgumentException("Inconsistent sequence alphabet");
                }
            });

            // Names of the similarity matrices accepted for each alphabet family.
            List <String> similarityMatrixDNA = new List <String> { "AmbiguousDNA" };
            List <String> similarityMatrixRNA = new List <String> { "AmbiguousRNA" };
            List <String> similarityMatrixProtein = new List <String>
            {
                "BLOSUM45",
                "BLOSUM50",
                "BLOSUM62",
                "BLOSUM80",
                "BLOSUM90",
                "PAM250",
                "PAM30",
                "PAM70"
            };

            // Pick the accepted names for the detected alphabet (same test order as before).
            List <String> acceptedMatrixNames;
            if (_alphabet is DnaAlphabet)
            {
                acceptedMatrixNames = similarityMatrixDNA;
            }
            else if (_alphabet is ProteinAlphabet)
            {
                acceptedMatrixNames = similarityMatrixProtein;
            }
            else if (_alphabet is RnaAlphabet)
            {
                acceptedMatrixNames = similarityMatrixRNA;
            }
            else
            {
                throw new ArgumentException("Invalid alphabet");
            }

            // Fail with a descriptive exception instead of a NullReferenceException
            // when no similarity matrix was supplied.
            if (null == similarityMatrix)
            {
                throw new ArgumentNullException("similarityMatrix");
            }

            if (!acceptedMatrixNames.Contains(similarityMatrix.Name))
            {
                throw new ArgumentException("Inconsistent similarity matrix");
            }

            // Initialize parameters
            _kmerLength                       = kmerLength;
            _distanceFunctionName             = distanceFunctionName;
            _hierarchicalClusteringMethodName = hierarchicalClusteringMethodName;
            _profileAlignerName               = profileAlignerMethodName;
            _profileProfileFunctionName       = profileFunctionName;
            SimilarityMatrix                  = similarityMatrix;
            GapOpenCost                       = gapOpenPenalty;
            GapExtensionCost                  = gapExtendPenalty;

            MsaUtils.SetProfileItemSets(_alphabet);

            Performance.Snapshot("Start Aligning");

            // Work...
            Align(sequences);
        }
Beispiel #15
0
        /// <summary>
        /// Aligns the un-aligned BOX032Small benchmark sequences with PAMSAM (stage B enabled)
        /// and prints, for every stage and for the final result, the aligned sequences
        /// together with the Q and TC scores against the benchmark alignment.
        /// </summary>
        public void TestMsaBenchMarkLargeDataset()
        {
            // Load the benchmark alignment and strip it down to plain sequences.
            string benchmarkPath = @"TestUtils\BOX032Small.xml.afa".TestDir();
            var orgSequences = new FastAParser().Parse(benchmarkPath).ToList();
            var sequences = MsaUtils.UnAlign(orgSequences);

            int numberOfSequences = orgSequences.Count;
            Assert.AreEqual(numberOfSequences, sequences.Count);

            // Round-trip the un-aligned sequences through a temporary FASTA file.
            string outputFilePath = Path.GetTempFileName();
            try
            {
                using (StreamWriter fastaWriter = new StreamWriter(outputFilePath, true))
                {
                    foreach (ISequence current in sequences)
                    {
                        // Header line followed by the residues in lines of at most 60 symbols.
                        fastaWriter.WriteLine(">" + current.ID);
                        for (int offset = 0; offset < current.Count; offset += 60)
                        {
                            int chunkLength = (int)Math.Min(60, current.Count - offset);
                            char[] chunk = current.Skip(offset).Take(chunkLength).Select(a => (char)a).ToArray();
                            fastaWriter.WriteLine(new String(chunk));
                        }
                        fastaWriter.Flush();
                    }
                }
                sequences = new FastAParser().Parse(outputFilePath).ToList();
            }
            finally
            {
                File.Delete(outputFilePath);
            }

            Console.WriteLine("Original sequences are:");
            foreach (ISequence unaligned in sequences)
            {
                Console.WriteLine(unaligned);
            }

            Console.WriteLine("Benchmark sequences are:");
            foreach (ISequence benchmark in orgSequences)
            {
                Console.WriteLine(benchmark);
            }

            // Aligner configuration.
            PAMSAMMultipleSequenceAligner.FasterVersion = false;
            PAMSAMMultipleSequenceAligner.UseWeights = false;
            PAMSAMMultipleSequenceAligner.UseStageB = true;
            PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

            int gapOpenPenalty = -13;
            int gapExtendPenalty = -5;
            int kmerLength = 3;
            int numberOfDegrees = 2;
            int numberOfPartitions = 16;

            SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

            DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
            UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
            ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
            ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

            PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
                sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
                numberOfPartitions, numberOfDegrees);

            Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

            // Stage 1
            Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
            foreach (ISequence aligned in msa.AlignedSequencesA)
            {
                Console.WriteLine(aligned);
            }
            Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
            Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

            // Stage 2
            Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
            foreach (ISequence aligned in msa.AlignedSequencesB)
            {
                Console.WriteLine(aligned);
            }
            Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
            Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

            // Stage 3
            Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
            foreach (ISequence aligned in msa.AlignedSequencesC)
            {
                Console.WriteLine(aligned);
            }
            Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
            Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

            // Final result
            Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
            foreach (ISequence aligned in msa.AlignedSequences)
            {
                Console.WriteLine(aligned);
            }
            Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
            Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
        }
Beispiel #16
0
        /// <summary>
        /// Runs PAMSAM (stage B enabled) on every file found two directory levels below the
        /// SABmark protein test folder and prints the Q and TC scores of each alignment
        /// against the parsed benchmark file, plus running and final averages.
        /// </summary>
        public void TestMsaBenchMarkOnSABmark()
        {
            List <float> allQ  = new List <float>();
            List <float> allTC = new List <float>();

            string        fileDirectory = @"TestUtils\Fasta\Protein\SABmark".TestDir();
            DirectoryInfo iD            = new DirectoryInfo(fileDirectory);

            PAMSAMMultipleSequenceAligner.FasterVersion = false;
            PAMSAMMultipleSequenceAligner.UseWeights    = false;
            PAMSAMMultipleSequenceAligner.UseStageB     = true;
            PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

            int gapOpenPenalty   = -13;
            int gapExtendPenalty = -5;
            int kmerLength       = 3;

            int numberOfDegrees    = 2;  //Environment.ProcessorCount;
            int numberOfPartitions = 16; // Environment.ProcessorCount * 2;

            DistanceFunctionTypes      distanceFunctionName             = DistanceFunctionTypes.EuclideanDistance;
            UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
            ProfileAlignerNames        profileAlignerName         = ProfileAlignerNames.NeedlemanWunschProfileAligner;
            ProfileScoreFunctionNames  profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

            SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

            // The data set is laid out as <root>/<dir>/<dir>/<file>; visit every file.
            foreach (DirectoryInfo fi in iD.GetDirectories())
            {
                foreach (DirectoryInfo fii in fi.GetDirectories())
                {
                    foreach (FileInfo fiii in fii.GetFiles())
                    {
                        String filePath = fiii.FullName;
                        Console.WriteLine(filePath);
                        FastAParser parser = new FastAParser();

                        // The parsed file serves as the benchmark; its un-aligned form is the aligner input.
                        IList <ISequence> orgSequences = parser.Parse(filePath).ToList();

                        List <ISequence> sequences = MsaUtils.UnAlign(orgSequences);

                        int numberOfSequences = orgSequences.Count;

                        Console.WriteLine("The number of sequences is: {0}", numberOfSequences);
                        Console.WriteLine("Original unaligned sequences are:");

                        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner
                                                                (sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                                                                profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
                                                                numberOfPartitions, numberOfDegrees);

                        // Only the score is printed; the aligned sequences themselves are not dumped.
                        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

                        float scoreQ  = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
                        float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
                        allQ.Add(scoreQ);
                        allTC.Add(scoreTC);
                        Console.WriteLine("Alignment score Q is: {0}", scoreQ);
                        Console.WriteLine("Alignment score TC is: {0}", scoreTC);

                        // Periodic progress report every 1000 processed files.
                        if (allQ.Count % 1000 == 0)
                        {
                            Console.WriteLine(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
                            Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
                            Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
                        }
                    }
                }
            }

            Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
            Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
        }
Beispiel #17
0
        /// <summary>
        ///     Creates a <c>HierarchicalClusteringParallel</c> from the supplied distance matrix
        ///     and cluster update method.
        /// </summary>
        /// <param name="distanceMatrix">Distance matrix the clustering is built from.</param>
        /// <param name="hierarchicalClusteringMethodName">Update method passed to the clustering.</param>
        /// <returns>The newly created hierarchical clustering.</returns>
        private static IHierarchicalClustering GetHierarchicalClustering(IDistanceMatrix distanceMatrix,
                                                                         UpdateDistanceMethodsTypes
                                                                             hierarchicalClusteringMethodName)
        {
            return new HierarchicalClusteringParallel(distanceMatrix, hierarchicalClusteringMethodName);
        }
Beispiel #18
0
        /// <summary>
        ///     Validate Muscle multiple sequence alignment with static properties
        ///     of PamsamMultipleSequenceAligner.
        ///     The three static flags are set for the duration of the run and restored
        ///     to their previous values afterwards, even when an assertion fails.
        /// </summary>
        /// <param name="nodeName">xml node name</param>
        /// <param name="expectedScoreNode">Expected score node</param>
        /// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
        /// <param name="distanceFunctionName">kmerdistancematrix method name.</param>
        /// <param name="profileAlignerName">profile aligner name</param>
        /// <param name="profileScoreName">Profile score function name.</param>
        /// <param name="useweights">use sequence weights true\false</param>
        /// <param name="fasterVersion">fasterversion true\false</param>
        /// <param name="useStageB">stage2 computation true\false</param>
        private void ValidatePamsamAlign(string nodeName,
                                         string expectedScoreNode,
                                         UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
                                         DistanceFunctionTypes distanceFunctionName,
                                         ProfileAlignerNames profileAlignerName,
                                         ProfileScoreFunctionNames profileScoreName,
                                         bool useweights,
                                         bool fasterVersion,
                                         bool useStageB)
        {
            Initialize(nodeName, expectedScoreNode);

            // get old properties
            bool prevVersion = PAMSAMMultipleSequenceAligner.FasterVersion;
            bool prevUseWeights = PAMSAMMultipleSequenceAligner.UseWeights;
            bool prevUseStageB = PAMSAMMultipleSequenceAligner.UseStageB;

            try
            {
                // Set static properties
                PAMSAMMultipleSequenceAligner.FasterVersion = fasterVersion;
                PAMSAMMultipleSequenceAligner.UseWeights = useweights;
                PAMSAMMultipleSequenceAligner.UseStageB = useStageB;

                // MSA aligned sequences.
                // The constructor takes the partition count first and the degree of
                // parallelism last; pass the locals in that order.
                int numberOfDegrees = 2;
                int numberOfPartitions = 2;
                var msa =
                    new PAMSAMMultipleSequenceAligner(lstSequences,
                                                      kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                                                      profileAlignerName, profileScoreName, similarityMatrix,
                                                      gapOpenPenalty,
                                                      gapExtendPenalty, numberOfPartitions, numberOfDegrees);

                // Validate the aligned Sequence and score
                if (fasterVersion)
                {
                    // The faster version is compared against the stage 1 expectations.
                    InitializeStage1Variables(nodeName);
                    Assert.AreEqual(stage1ExpectedSequences.Count, msa.AlignedSequences.Count);
                    int index = 0;
                    foreach (ISequence seq in msa.AlignedSequences)
                    {
                        // Expected value first, actual value second.
                        Assert.AreEqual(new string(stage1ExpectedSequences[index].Select(a => (char) a).ToArray()),
                                        new string(seq.Select(a => (char) a).ToArray()));
                        index++;
                    }
                    Assert.IsTrue(stage1ExpectedScore.Contains(msa.AlignmentScore.ToString((IFormatProvider) null)));
                }
                else
                {
                    int index = 0;
                    foreach (ISequence seq in msa.AlignedSequences)
                    {
                        // Expected value first, actual value second.
                        Assert.AreEqual(new string(expectedSequences[index].Select(a => (char) a).ToArray()),
                                        new string(seq.Select(a => (char) a).ToArray()));
                        index++;
                    }
                    Assert.AreEqual(expectedScore, msa.AlignmentScore.ToString((IFormatProvider) null));
                }
            }
            finally
            {
                // Reset it back
                PAMSAMMultipleSequenceAligner.FasterVersion = prevVersion;
                PAMSAMMultipleSequenceAligner.UseWeights = prevUseWeights;
                PAMSAMMultipleSequenceAligner.UseStageB = prevUseStageB;
            }

            // Each placeholder maps to exactly one of the three flags.
            ApplicationLog.WriteLine(
                String.Format(null,
                              "Validation of pamsam alignment completed successfully with static property fasterversion {0}, usestageb {1} and useweights {2}",
                              fasterVersion, useStageB, useweights));
        }
Beispiel #19
0
        /// <summary>
        ///     Builds a hierarchical clustering from the Kimura distance matrix of the stage 1
        ///     aligned sequences and validates its nodes and edges against the given xml node.
        /// </summary>
        /// <param name="nodeName">xml node name used for the node and edge validation</param>
        /// <param name="hierarchicalMethodName">cluster update method used to build the clustering</param>
        private void ValidateHierarchicalClusteringStage2(string nodeName,
                                                          UpdateDistanceMethodsTypes hierarchicalMethodName)
        {
            // The input data always comes from the Muscle DNA sequence node.
            Initialize(Constants.MuscleDnaSequenceNode, Constants.ExpectedScoreNode);

            List<ISequence> alignedInStage1 = GetStage1AlignedSequence();
            IDistanceMatrix kimuraMatrix = GetKimuraDistanceMatrix(alignedInStage1);
            IHierarchicalClustering clustering = GetHierarchicalClustering(kimuraMatrix, hierarchicalMethodName);

            ValidateHierarchicalClustering(nodeName, clustering.Nodes, clustering.Edges);

            string logMessage = String.Format(null,
                                              @"PamsamBvtTest:: herarchical clustering stage2 nodes and edges generation and 
          validation completed success with different 
          hierarchical clustering method name {0}",
                                              hierarchicalMethodName.ToString());
            ApplicationLog.WriteLine(logMessage);
        }
Beispiel #20
0
        /// <summary>
        ///     Runs the PAMSAM aligner and checks that every stage 3 aligned sequence occurs in
        ///     the expected sequences and that the stage 3 score occurs in the expected score.
        /// </summary>
        /// <param name="nodeName">xml node name</param>
        /// <param name="expectedScoreNode">Expected score node</param>
        /// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
        /// <param name="distanceFunctionName">kmerdistancematrix method name.</param>
        /// <param name="profileAlignerName">SW/NW profiler</param>
        /// <param name="profileScoreName">Profile score function name.</param>
        private void ValidatePamsamAlignStage3(string nodeName,
                                               string expectedScoreNode,
                                               UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
                                               DistanceFunctionTypes distanceFunctionName,
                                               ProfileAlignerNames profileAlignerName,
                                               ProfileScoreFunctionNames profileScoreName)
        {
            Initialize(nodeName, expectedScoreNode);

            // Align with two partitions and a degree of parallelism of two.
            var msa = new PAMSAMMultipleSequenceAligner(
                lstSequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                profileAlignerName, profileScoreName, similarityMatrix, gapOpenPenalty,
                gapExtendPenalty, 2, 2);

            // Concatenate all expected sequences, each one followed by a comma.
            string expectedSeqString = string.Empty;
            foreach (ISequence expected in expectedSequences)
            {
                expectedSeqString = expectedSeqString + (new string(expected.Select(a => (char) a).ToArray()) + ",");
            }

            // Each stage 3 sequence only has to appear somewhere in the expected text.
            foreach (ISequence aligned in msa.AlignedSequencesC)
            {
                string alignedText = new string(aligned.Select(a => (char) a).ToArray());
                Assert.IsTrue(expectedSeqString.Contains(alignedText));
            }

            string stage3Score = msa.AlignmentScoreC.ToString((IFormatProvider) null);
            Assert.IsTrue(expectedScore.Contains(stage3Score));

            ApplicationLog.WriteLine(
                "PamsamBvtTest:: Pamsam stage3 alignment completed successfully with all default params");
        }
Beispiel #21
0
        /// <summary>
        ///     Runs the PAMSAM (Muscle) aligner and, when stage 2 produced aligned sequences,
        ///     compares them one by one (and the stage 2 score) with the expected stage 2 values.
        /// </summary>
        /// <param name="nodeName">xml node name</param>
        /// <param name="expectedScoreNode">Expected score node</param>
        /// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
        /// <param name="distanceFunctionName">kmer distance matrix method name</param>
        /// <param name="profileAlignerName">profile aligner to use (SW/NW)</param>
        /// <param name="profileScoreName">profile score function name</param>
        private void ValidatePamsamAlignStage2(string nodeName,
                                               string expectedScoreNode,
                                               UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
                                               DistanceFunctionTypes distanceFunctionName,
                                               ProfileAlignerNames profileAlignerName,
                                               ProfileScoreFunctionNames profileScoreName)
        {
            Initialize(nodeName, expectedScoreNode);
            InitializeStage2Variables(nodeName);

            // Run the aligner; the alignment is performed by the constructor.
            var aligner = new PAMSAMMultipleSequenceAligner(
                lstSequences,
                kmerLength,
                distanceFunctionName,
                hierarchicalClusteringMethodName,
                profileAlignerName,
                profileScoreName,
                similarityMatrix,
                gapOpenPenalty,
                gapExtendPenalty,
                2,
                2);

            // Stage 2 results are only checked when they exist; a null result skips validation.
            var stage2Aligned = aligner.AlignedSequencesB;
            if (stage2Aligned != null)
            {
                Assert.AreEqual(stage2ExpectedSequences.Count, stage2Aligned.Count);

                int position = 0;
                foreach (ISequence actual in stage2Aligned)
                {
                    string expectedText =
                        new string(stage2ExpectedSequences[position].Select(symbol => (char) symbol).ToArray());
                    string actualText = new string(actual.Select(symbol => (char) symbol).ToArray());

                    Assert.AreEqual(expectedText, actualText);
                    position++;
                }

                string actualScore = aligner.AlignmentScoreB.ToString((IFormatProvider) null);
                Assert.AreEqual(stage2ExpectedScore, actualScore);
            }

            string completionMessage = String.Format(
                null,
                "PamsamBvtTest:: Pamsam stage2 alignment completed successfully with all default params");
            ApplicationLog.WriteLine(completionMessage);
        }
        /// <summary>
        /// Construct an aligner and run the alignment.
        /// The constructor resets the stage scores, starts logging, validates the arguments,
        /// stores the settings and then calls DoAlignment on the input sequences before returning.
        /// </summary>
        /// <param name="sequences">input sequences; must be non-null and non-empty</param>
        /// <param name="kmerLength">positive integer of kmer length (not checked in this constructor)</param>
        /// <param name="distanceFunctionName">enum: distance function name</param>
        /// <param name="hierarchicalClusteringMethodName">enum: cluster update method</param>
        /// <param name="profileAlignerMethodName">enum: profile-profile aligner name</param>
        /// <param name="profileFunctionName">enum: profile-profile distance function</param>
        /// <param name="similarityMatrix">similarity matrix; also passed to SetAlphabet</param>
        /// <param name="gapOpenPenalty">negative gapOpenPenalty</param>
        /// <param name="gapExtendPenalty">negative gapExtendPenalty</param>
        /// <param name="numberOfPartitions">the number of partitions in dynamic programming; must be greater than zero</param>
        /// <param name="degreeOfParallelism">degree of parallelism option for parallel extension; must be greater than zero</param>
        /// <exception cref="ArgumentNullException">sequences is null</exception>
        /// <exception cref="ArgumentException">
        /// sequences is empty, or degreeOfParallelism or numberOfPartitions is zero or negative
        /// </exception>
        public PAMSAMMultipleSequenceAligner(
                    IList<ISequence> sequences,
                    int kmerLength,
                    DistanceFunctionTypes distanceFunctionName,
                    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
                    ProfileAlignerNames profileAlignerMethodName,
                    ProfileScoreFunctionNames profileFunctionName,
                    SimilarityMatrix similarityMatrix,
                    int gapOpenPenalty,
                    int gapExtendPenalty,
                    int numberOfPartitions,
                    int degreeOfParallelism)
        {
            // Reset every stage score to the lowest float value before any work is done.
            AlignmentScoreC = float.MinValue;
            AlignmentScoreB = float.MinValue;
            AlignmentScoreA = float.MinValue;
            AlignmentScore = float.MinValue;

            // Logging starts before the arguments are validated, so it also runs
            // when one of the checks below throws.
            StartLog();

            if (null == sequences)
            {
                throw new ArgumentNullException("sequences");
            }

            if (sequences.Count == 0)
            {
                throw new ArgumentException("Empty input sequences");
            }

            // Set parallel extension option
            if (degreeOfParallelism <= 0)
            {
                throw new ArgumentException("Invalid parallel degree parameter");
            }

            this.degreeOfParallelism = degreeOfParallelism;
            ParallelOption = new ParallelOptions { MaxDegreeOfParallelism = degreeOfParallelism };

            // The partition count is validated only after the parallel options were assigned above.
            if (numberOfPartitions <= 0)
            {
                throw new ArgumentException("Invalid number of partition parameter");
            }
            this.numberOfPartitions = numberOfPartitions;

            // Assign the alphabet
            // (runs before the SimilarityMatrix property is set, hence the matrix is passed explicitly)
            SetAlphabet(sequences, similarityMatrix, false);

            // Initialize parameters
            // No range checks are made here; any validation would have to live in the property setters.
            KmerLength = kmerLength;
            DistanceFunctionName = distanceFunctionName;
            HierarchicalClusteringMethodName = hierarchicalClusteringMethodName;
            ProfileAlignerName = profileAlignerMethodName;
            ProfileProfileFunctionName = profileFunctionName;
            SimilarityMatrix = similarityMatrix;
            GapOpenCost = gapOpenPenalty;
            GapExtensionCost = gapExtendPenalty;

            // Uses the alphabet field, which is presumably populated by SetAlphabet above - TODO confirm.
            MsaUtils.SetProfileItemSets(this.alphabet);

            ReportLog("Start Aligning");

            // Work...
            // The alignment itself is executed as part of construction.
            DoAlignment(sequences);
        }
Beispiel #23
0
 /// <summary>
 ///     Validates the Muscle multiple sequence alignment for a given profile aligner and
 ///     hierarchical clustering method by delegating to ValidatePamsamAlign with the
 ///     Euclidean distance function and the weighted inner product profile score.
 /// </summary>
 /// <param name="nodeName">xml node name.</param>
 /// <param name="moleculeType">molecule type of sequences</param>
 /// <param name="expectedScoreNode">Expected score node</param>
 /// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
 /// <param name="profileName">profile aligner to use (SW/NW)</param>
 private void ValidatePamsamAlignWithUpdateDistanceMethodTypes(string nodeName,
                                                               MoleculeType moleculeType,
                                                               string expectedScoreNode,
                                                               UpdateDistanceMethodsTypes
                                                                   hierarchicalClusteringMethodName,
                                                               ProfileAlignerNames profileName)
 {
     // Both optional behaviours of ValidatePamsamAlign are switched off for this scenario.
     const bool addOneLineSequences = false;
     const bool alignForMoreSequences = false;

     ValidatePamsamAlign(
         nodeName,
         moleculeType,
         expectedScoreNode,
         hierarchicalClusteringMethodName,
         DistanceFunctionTypes.EuclideanDistance,
         profileName,
         ProfileScoreFunctionNames.WeightedInnerProduct,
         kmerLength,
         addOneLineSequences,
         alignForMoreSequences);

     string completionMessage = String.Format(
         null,
         "PamsamP1Test:: Pamsam alignment validation completed successfully for {0} moleculetype with different hierarchical clustering method name {1}",
         moleculeType.ToString(),
         hierarchicalClusteringMethodName.ToString());

     ApplicationLog.WriteLine(completionMessage);
 }
Beispiel #24
0
        /// <summary>
        ///     Runs the PAMSAM aligner with the given settings and validates the final
        ///     aligned sequences (optionally) and the final alignment score.
        /// </summary>
        /// <param name="nodeName">xml node name</param>
        /// <param name="moleculeType">molecule type of sequences (not used by this method itself)</param>
        /// <param name="expectedScoreNode">Expected score node</param>
        /// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
        /// <param name="distanceFunctionName">kmer distance matrix method name</param>
        /// <param name="profileAlignerName">profile aligner to use</param>
        /// <param name="profileScoreName">profile score function name</param>
        /// <param name="kmrlength">kmer length handed to the aligner</param>
        /// <param name="addOnelineSequences">
        ///     when true, AddOneLineSequences is called for the node before aligning
        /// </param>
        /// <param name="IsAlignForMoreSeq">
        ///     when true, every aligned sequence must be contained in the expected sequences;
        ///     when false, only the score is validated
        /// </param>
        private void ValidatePamsamAlign(
            string nodeName, MoleculeType moleculeType, string expectedScoreNode,
            UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
            DistanceFunctionTypes distanceFunctionName,
            ProfileAlignerNames profileAlignerName,
            ProfileScoreFunctionNames profileScoreName, int kmrlength,
            bool addOnelineSequences, bool IsAlignForMoreSeq)
        {
            Initialize(nodeName, expectedScoreNode);
            if (addOnelineSequences)
            {
                AddOneLineSequences(nodeName);
            }

            // MSA aligned sequences (the alignment is performed by the constructor).
            var msa = new PAMSAMMultipleSequenceAligner(lstSequences,
                                                        kmrlength, distanceFunctionName,
                                                        hierarchicalClusteringMethodName,
                                                        profileAlignerName, profileScoreName, similarityMatrix,
                                                        gapOpenPenalty,
                                                        gapExtendPenalty, 2, 2);

            // Validate the aligned sequences. The loop is kept even when the flag is off so
            // that the aligned sequences are still enumerated exactly as before.
            // NOTE(review): Contains relies on the equality semantics of ISequence
            // implementations - confirm this compares content and not references.
            foreach (ISequence seq in msa.AlignedSequences)
            {
                if (IsAlignForMoreSeq)
                {
                    Assert.IsTrue(expectedSequences.Contains(seq));
                }
            }

            // Validate the score: its text form must occur in the expected score.
            Assert.IsTrue(expectedScore.Contains(msa.AlignmentScore.ToString((IFormatProvider) null)));
        }
Beispiel #25
0
        /// <summary>
        ///     Aligns the BOX246 benchmark sequences with PAMSAM and prints the input sequences,
        ///     the per-stage alignments and their Q/TC scores to the console.
        ///     The method makes no assertions; it only fails when something throws.
        /// </summary>
        public void TestMuscleMultipleSequenceAlignmentRunningTime()
        {
            // Test on a protein benchmark dataset; the molecule type below has to match
            // the data set because it selects the similarity matrix.
            ISequenceParser parser = new FastaParser();
            string filepath = @"testdata\FASTA\RunningTime\BOX246.xml.afa";

            MoleculeType mt = MoleculeType.Protein;

            IList <ISequence> orgSequences = parser.Parse(filepath);

            List <ISequence> sequences = MsaUtils.UnAlign(orgSequences);

            int numberOfSequences = orgSequences.Count;

            Console.WriteLine("Original sequences are:");
            for (int i = 0; i < numberOfSequences; ++i)
            {
                Console.WriteLine(sequences[i].ToString());
            }

            Console.WriteLine("Benchmark sequences are:");
            for (int i = 0; i < numberOfSequences; ++i)
            {
                Console.WriteLine(orgSequences[i].ToString());
            }

            // Static aligner switches; they affect every aligner created afterwards.
            PAMSAMMultipleSequenceAligner.FasterVersion = true;
            PAMSAMMultipleSequenceAligner.UseWeights    = false;
            PAMSAMMultipleSequenceAligner.UseStageB     = false;
            PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

            int gapOpenPenalty   = -13;
            int gapExtendPenalty = -5;
            int kmerLength       = 2;

            int numberOfDegrees    = 2;  //Environment.ProcessorCount;
            int numberOfPartitions = 16; // Environment.ProcessorCount * 2;

            DistanceFunctionTypes      distanceFunctionName             = DistanceFunctionTypes.EuclideanDistance;
            UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
            ProfileAlignerNames        profileAlignerName         = ProfileAlignerNames.NeedlemanWunschProfileAligner;
            ProfileScoreFunctionNames  profileProfileFunctionName = ProfileScoreFunctionNames.InnerProductFast;

            SimilarityMatrix similarityMatrix = null;

            // Pick the standard similarity matrix that belongs to the molecule type.
            switch (mt)
            {
            case (MoleculeType.DNA):
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
                break;

            case (MoleculeType.RNA):
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
                break;

            case (MoleculeType.Protein):
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
                break;

            default:
                // A specific exception type instead of the reserved System.Exception.
                throw new InvalidOperationException("Invalid molecular type");
            }

            // The alignment is performed by the constructor.
            PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner
                                                    (sequences, mt, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                                                    profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
                                                    numberOfPartitions, numberOfDegrees);

            Console.WriteLine("The number of partitions is: {0}", numberOfPartitions);
            Console.WriteLine("The number of degrees is: {0}", numberOfDegrees);
            Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
            Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));

            Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

            // Stage 1
            Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
            foreach (ISequence aligned in msa.AlignedSequencesA)
            {
                Console.WriteLine(aligned.ToString());
            }
            Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
            Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

            // Stage 2
            Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
            foreach (ISequence aligned in msa.AlignedSequencesB)
            {
                Console.WriteLine(aligned.ToString());
            }
            Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
            Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

            // Stage 3
            Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
            foreach (ISequence aligned in msa.AlignedSequencesC)
            {
                Console.WriteLine(aligned.ToString());
            }
            Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
            Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

            // Final result
            Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
            foreach (ISequence aligned in msa.AlignedSequences)
            {
                Console.WriteLine(aligned.ToString());
            }
            Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
            Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
        }
Beispiel #26
0
        /// <summary>
        /// Construct an aligner and run the alignment.
        /// The constructor starts the performance tracker, validates the arguments, stores the
        /// settings and then calls DoAlignment on the input sequences before returning.
        /// </summary>
        /// <param name="sequences">input sequences; must be non-null and non-empty</param>
        /// <param name="kmerLength">positive integer of kmer length (not checked in this constructor)</param>
        /// <param name="distanceFunctionName">enum: distance function name</param>
        /// <param name="hierarchicalClusteringMethodName">enum: cluster update method</param>
        /// <param name="profileAlignerMethodName">enum: profile-profile aligner name</param>
        /// <param name="profileFunctionName">enum: profile-profile distance function</param>
        /// <param name="similarityMatrix">similarity matrix; also passed to SetAlphabet</param>
        /// <param name="gapOpenPenalty">negative gapOpenPenalty</param>
        /// <param name="gapExtendPenalty">negative gapExtendPenalty</param>
        /// <param name="numberOfPartitions">the number of partitions in dynamic programming; must be greater than zero</param>
        /// <param name="degreeOfParallelism">degree of parallelism option for parallel extension; must be greater than zero</param>
        /// <exception cref="ArgumentNullException">sequences is null</exception>
        /// <exception cref="ArgumentException">
        /// sequences is empty, or degreeOfParallelism or numberOfPartitions is zero or negative
        /// </exception>
        public PAMSAMMultipleSequenceAligner(
            IList <ISequence> sequences,
            int kmerLength,
            DistanceFunctionTypes distanceFunctionName,
            UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
            ProfileAlignerNames profileAlignerMethodName,
            ProfileScoreFunctionNames profileFunctionName,
            SimilarityMatrix similarityMatrix,
            int gapOpenPenalty,
            int gapExtendPenalty,
            int numberOfPartitions,
            int degreeOfParallelism)
        {
            // Performance tracking starts before the arguments are validated, so it also
            // runs when one of the checks below throws.
            Performance.Start();

            if (null == sequences)
            {
                throw new ArgumentNullException("sequences");
            }

            if (sequences.Count == 0)
            {
                throw new ArgumentException("Empty input sequences");
            }

            // Set parallel extension option
            if (degreeOfParallelism <= 0)
            {
                throw new ArgumentException("Invalid parallel degree parameter");
            }

            // The degree of parallelism is not stored in a field of its own here;
            // it is only carried by the parallel options object.
            parallelOption = new ParallelOptions {
                MaxDegreeOfParallelism = degreeOfParallelism
            };

            // The partition count is validated only after the parallel options were assigned above.
            if (numberOfPartitions <= 0)
            {
                throw new ArgumentException("Invalid number of partition parameter");
            }
            _numberOfPartitions = numberOfPartitions;

            // Assign the alphabet
            // (runs before the SimilarityMatrix property is set, hence the matrix is passed explicitly)
            SetAlphabet(sequences, similarityMatrix, false);

            // Initialize parameters
            // No range checks are made here; any validation would have to live in the property setters.
            KmerLength                       = kmerLength;
            DistanceFunctionName             = distanceFunctionName;
            HierarchicalClusteringMethodName = hierarchicalClusteringMethodName;
            ProfileAlignerName               = profileAlignerMethodName;
            ProfileProfileFunctionName       = profileFunctionName;
            SimilarityMatrix                 = similarityMatrix;
            GapOpenCost                      = gapOpenPenalty;
            GapExtensionCost                 = gapExtendPenalty;

            // Uses the _alphabet field, which is presumably populated by SetAlphabet above - TODO confirm.
            MsaUtils.SetProfileItemSets(_alphabet);

            Performance.Snapshot("Start Aligning");

            // Work...
            // The alignment itself is executed as part of construction.
            DoAlignment(sequences);
        }
Beispiel #27
0
        /// <summary>
        /// Prepares a hierarchical clustering over the given distance matrix: allocates the
        /// node, edge and cluster containers, maps every matrix index to its own cluster and
        /// selects the update-distance method.
        /// </summary>
        /// <param name="distanceMatrix">IDistanceMatrix; its Dimension must be greater than zero</param>
        /// <param name="updateDistanceMethodName">enum EUpdateDistanceMethods</param>
        /// <exception cref="ArgumentNullException">distanceMatrix is null</exception>
        /// <exception cref="Exception">
        /// the matrix dimension is not positive, the update method is unknown, or the
        /// containers could not be allocated (the OutOfMemoryException is the inner exception)
        /// </exception>
        public HierarchicalClustering(IDistanceMatrix distanceMatrix, UpdateDistanceMethodsTypes updateDistanceMethodName)
        {
            if (distanceMatrix == null)
            {
                throw new ArgumentNullException("distanceMatrix");
            }

            int dimension = distanceMatrix.Dimension;

            // The exception type is kept as System.Exception so existing callers and tests
            // that expect exactly this type continue to work.
            if (dimension <= 0)
            {
                throw new Exception("Invalid distance matrix dimension");
            }

            try
            {
                // A binary tree with N leaves (the input sequences) has N-1 internal nodes,
                // i.e. 2N-1 nodes and 2N-2 edges in total.
                _nodes = new List<BinaryGuideTreeNode>(dimension * 2 - 1);
                _edges = new List<BinaryGuideTreeEdge>(dimension * 2 - 2);

                // The number of clusters is the number of leaves at the beginning
                // As the algorithm merges clusters, only one cluster remains.
                _clusters = new List<int>(dimension);

                // Construct _indexToCluster: initially every index is its own cluster
                _indexToCluster = new int[dimension];
                for (int i = 0; i < dimension; ++i)
                {
                    _indexToCluster[i] = i;
                }
            }
            catch (OutOfMemoryException ex)
            {
                // Wrap the caught exception itself; its InnerException is normally null
                // and would drop the original failure and stack trace.
                throw new Exception("Out of memory", ex);
            }

            // Choose a update-distance method
            switch (updateDistanceMethodName)
            {
                case UpdateDistanceMethodsTypes.Average:
                    _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateAverage);
                    break;
                case UpdateDistanceMethodsTypes.Single:
                    _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateSingle);
                    break;
                case UpdateDistanceMethodsTypes.Complete:
                    _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateComplete);
                    break;
                case UpdateDistanceMethodsTypes.WeightedMAFFT:
                    _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateWeightedMAFFT);
                    break;
                default:
                    throw new Exception("invalid update method");
            }
        }
Beispiel #28
0
        /// <summary>
        ///     Un-aligns the Balibase BOX032 benchmark sequences, round-trips them through a
        ///     temporary FASTA file, aligns them with PAMSAM and prints the per-stage alignments
        ///     and their Q/TC scores to the console.
        ///     The method makes no assertions; it only fails when something throws.
        /// </summary>
        public void TestMsaBenchMarkLargeDataset()
        {
            // Test on a protein benchmark dataset
            ISequenceParser   parser       = new FastaParser();
            string            filepath     = @"testdata\FASTA\Protein\Balibase\RV913\BOX032.xml.afa";
            IList <ISequence> orgSequences = parser.Parse(filepath);

            IList <ISequence> sequences = MsaUtils.UnAlign(orgSequences);
            int numberOfSequences       = orgSequences.Count;

            String outputFilePath = @"tempBOX032.xml.afa";

            try
            {
                // Overwrite instead of append: a file left behind by an aborted run must not
                // leak stale sequences into this run. The using block closes the writer even
                // when writing fails.
                using (StreamWriter writer = new StreamWriter(outputFilePath, false))
                {
                    foreach (ISequence sequence in sequences)
                    {
                        writer.WriteLine(">" + sequence.ID);
                        // write sequence in lines of at most 60 symbols
                        BasicDerivedSequence derivedSeq = new BasicDerivedSequence(sequence, false, false, 0, 0);
                        for (int lineStart = 0; lineStart < sequence.Count; lineStart += 60)
                        {
                            derivedSeq.RangeStart  = lineStart;
                            derivedSeq.RangeLength = Math.Min(60, sequence.Count - lineStart);
                            writer.WriteLine(derivedSeq.ToString());
                        }
                        writer.Flush();
                    }
                }

                // Replace the in-memory sequences with the ones parsed back from the file.
                sequences.Clear();
                sequences = parser.Parse(outputFilePath);

                Console.WriteLine("Original sequences are:");
                for (int i = 0; i < numberOfSequences; ++i)
                {
                    Console.WriteLine(sequences[i].ToString());
                }

                Console.WriteLine("Benchmark sequences are:");
                for (int i = 0; i < numberOfSequences; ++i)
                {
                    Console.WriteLine(orgSequences[i].ToString());
                }

                // Static aligner switches; they affect every aligner created afterwards.
                PAMSAMMultipleSequenceAligner.FasterVersion = false;
                PAMSAMMultipleSequenceAligner.UseWeights    = false;
                PAMSAMMultipleSequenceAligner.UseStageB     = true;
                PAMSAMMultipleSequenceAligner.NumberOfCores = 2;
                int gapOpenPenalty   = -13;
                int gapExtendPenalty = -5;
                int kmerLength       = 3;

                int numberOfDegrees    = 2;  //Environment.ProcessorCount;
                int numberOfPartitions = 16; // Environment.ProcessorCount * 2;

                SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

                DistanceFunctionTypes      distanceFunctionName             = DistanceFunctionTypes.EuclideanDistance;
                UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
                ProfileAlignerNames        profileAlignerName         = ProfileAlignerNames.NeedlemanWunschProfileAligner;
                ProfileScoreFunctionNames  profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

                // The alignment is performed by the constructor.
                PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner
                                                        (sequences, MoleculeType.Protein, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                                                        profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
                                                        numberOfPartitions, numberOfDegrees);

                Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));
                Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
                for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
                {
                    Console.WriteLine(msa.AlignedSequencesA[i].ToString());
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));
                Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
                for (int i = 0; i < msa.AlignedSequencesB.Count; ++i)
                {
                    Console.WriteLine(msa.AlignedSequencesB[i].ToString());
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));
                Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
                for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
                {
                    Console.WriteLine(msa.AlignedSequencesC[i].ToString());
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));
                Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

                for (int i = 0; i < msa.AlignedSequences.Count; ++i)
                {
                    Console.WriteLine(msa.AlignedSequences[i].ToString());
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
            }
            finally
            {
                // Remove the temporary file even when the test body throws.
                if (File.Exists(outputFilePath))
                {
                    File.Delete(outputFilePath);
                }
            }
        }
Beispiel #29
0
        /// <summary>
        ///     Validates the nodes and edges of the stage 2 hierarchical clustering, which is
        ///     built from the kimura distance matrix of the stage 1 aligned sequences using
        ///     the given hierarchical method.
        /// </summary>
        /// <param name="nodeName">xml node name handed to the clustering validation</param>
        /// <param name="moleculeType">molecule type of sequences; selects the sequence node to initialize from</param>
        /// <param name="hierarchicalMethodName">hierarchical method name</param>
        private void ValidateHierarchicalClusteringStage2(string nodeName, MoleculeType moleculeType,
                                                          UpdateDistanceMethodsTypes hierarchicalMethodName)
        {
            // Initialize from the sequence node that matches the molecule type;
            // any other molecule type skips initialization.
            if (moleculeType == MoleculeType.DNA)
            {
                Initialize(Constants.MuscleDnaSequenceNode, Constants.ExpectedScoreNode);
            }
            else if (moleculeType == MoleculeType.Protein)
            {
                Initialize(Constants.MuscleProteinSequenceNode, Constants.ExpectedScoreNode);
            }
            else if (moleculeType == MoleculeType.RNA)
            {
                Initialize(Constants.MuscleRnaSequenceNode, Constants.ExpectedScoreNode);
            }

            // Stage 1 alignment -> kimura distance matrix -> hierarchical clustering.
            IHierarchicalClustering clustering = GetHierarchicalClustering(
                GetKimuraDistanceMatrix(GetStage1AlignedSequence(moleculeType)),
                hierarchicalMethodName);

            ValidateHierarchicalClustering(nodeName, clustering.Nodes, clustering.Edges);

            string completionMessage = String.Format(null,
                                                   @"PamsamP1Test:: hierarchical clustering stage2 nodes and edges generation and 
                    validation completed success for {0} moleculetype with different 
                    hierarchical clustering method name {1}",
                                                   moleculeType.ToString(),
                                                   hierarchicalMethodName.ToString());
            ApplicationLog.WriteLine(completionMessage);
        }
Beispiel #30
0
        /// <summary>
        ///     Parses the BOX246 benchmark alignment, strips it back to unaligned sequences, re-aligns them
        ///     with PAMSAM and prints the sequences plus the Q, TC and SPS scores of every stage to the console.
        ///     The method makes no assertions; it only produces a console report.
        /// </summary>
        public void TestMuscleMultipleSequenceAlignmentRunningTime()
        {
            string filepath = @"TestUtils\FASTA\RunningTime\BOX246.xml.afa";

            // Renders one sequence as text by widening each symbol to a char.
            Func<ISequence, string> toText = sequence => new string(sequence.Select(a => (char)a).ToArray());

            // The parser is disposable. Wrapping the whole test in a using block keeps its lifetime
            // identical to the original (disposed at the very end) but also releases it when the
            // alignment or scoring code throws.
            // NOTE(review): the original comment called this a DNA benchmark dataset while the
            // similarity matrix below is Blosum62 - confirm the intended molecule type.
            using (FastAParser parser = new FastAParser(filepath))
            {
                IList<ISequence> orgSequences = parser.Parse().ToList();

                List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

                int numberOfSequences = orgSequences.Count;

                Console.WriteLine("Original sequences are:");
                for (int i = 0; i < numberOfSequences; ++i)
                {
                    Console.WriteLine(toText(sequences[i]));
                }

                Console.WriteLine("Benchmark sequences are:");
                for (int i = 0; i < numberOfSequences; ++i)
                {
                    Console.WriteLine(toText(orgSequences[i]));
                }

                // Static aligner switches; they must be set before the aligner is constructed below.
                PAMSAMMultipleSequenceAligner.FasterVersion = true;
                PAMSAMMultipleSequenceAligner.UseWeights    = false;
                PAMSAMMultipleSequenceAligner.UseStageB     = false;
                PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

                int gapOpenPenalty   = -13;
                int gapExtendPenalty = -5;
                int kmerLength       = 2;

                int numberOfDegrees    = 2;  //Environment.ProcessorCount;
                int numberOfPartitions = 16; // Environment.ProcessorCount * 2;

                DistanceFunctionTypes      distanceFunctionName             = DistanceFunctionTypes.EuclideanDistance;
                UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
                ProfileAlignerNames        profileAlignerName               = ProfileAlignerNames.NeedlemanWunschProfileAligner;
                ProfileScoreFunctionNames  profileProfileFunctionName       = ProfileScoreFunctionNames.InnerProductFast;

                SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

                PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner
                                                        (sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                                                        profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
                                                        numberOfPartitions, numberOfDegrees);

                Console.WriteLine("The number of partitions is: {0}", numberOfPartitions);
                Console.WriteLine("The number of degrees is: {0}", numberOfDegrees);
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));

                Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

                // Stage 1 result.
                Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
                for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
                {
                    Console.WriteLine(toText(msa.AlignedSequencesA[i]));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

                // Stage 2 result.
                Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
                for (int i = 0; i < msa.AlignedSequencesB.Count; ++i)
                {
                    Console.WriteLine(toText(msa.AlignedSequencesB[i]));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

                // Stage 3 result.
                Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
                for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
                {
                    Console.WriteLine(toText(msa.AlignedSequencesC[i]));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

                // Final alignment.
                Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
                for (int i = 0; i < msa.AlignedSequences.Count; ++i)
                {
                    Console.WriteLine(toText(msa.AlignedSequences[i]));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
            }
        }
Beispiel #31
0
        /// <summary>
        ///     Parses the BOX032Small benchmark alignment, un-aligns it, round-trips the unaligned sequences
        ///     through a temporary FASTA file, re-aligns them with PAMSAM (stage B enabled) and prints the
        ///     sequences plus the Q, TC and SPS scores of every stage to the console.
        ///     The method makes no assertions; it only produces a console report.
        /// </summary>
        public void TestMsaBenchMarkLargeDataset()
        {
            string filepath       = @"\TestUtils\BOX032Small.xml.afa";
            string filePathObj    = Directory.GetCurrentDirectory() + filepath;
            String outputFilePath = @"tempBOX032.xml.afa";

            // Renders one sequence as text by widening each symbol to a char.
            Func<ISequence, string> toText = sequence => new string(sequence.Select(a => (char)a).ToArray());

            // Two parsers are used (benchmark file, then the temporary file). The original reused one
            // variable and disposed only the second instance, leaking the first. Both are now tracked
            // and released in the finally block, which also removes the temporary file on failure.
            // NOTE(review): the original comment called this a DNA benchmark dataset while the
            // similarity matrix below is Blosum62 - confirm the intended molecule type.
            FastAParser benchmarkParser = new FastAParser(filePathObj);
            FastAParser parser          = null;
            try
            {
                IList<ISequence> orgSequences = benchmarkParser.Parse().ToList();

                IList<ISequence> sequences = MsaUtils.UnAlign(orgSequences);
                int numberOfSequences      = orgSequences.Count;

                // Overwrite instead of append: a file left behind by an aborted run would otherwise
                // receive a second copy of every record and be parsed back with duplicates.
                using (StreamWriter writer = new StreamWriter(outputFilePath, false))
                {
                    foreach (ISequence sequence in sequences)
                    {
                        writer.WriteLine(">" + sequence.ID);
                        // Write the sequence body wrapped at 60 symbols per line.
                        for (int lineStart = 0; lineStart < sequence.Count; lineStart += 60)
                        {
                            writer.WriteLine(new String(sequence.Skip(lineStart).Take((int)Math.Min(60, sequence.Count - lineStart)).Select(a => (char)a).ToArray()));
                        }
                    }
                }

                // Read the unaligned sequences back from the temporary file.
                parser    = new FastAParser(outputFilePath);
                sequences = parser.Parse().ToList();

                Console.WriteLine("Original sequences are:");
                for (int i = 0; i < numberOfSequences; ++i)
                {
                    Console.WriteLine(toText(sequences[i]));
                }

                Console.WriteLine("Benchmark sequences are:");
                for (int i = 0; i < numberOfSequences; ++i)
                {
                    Console.WriteLine(toText(orgSequences[i]));
                }

                // Static aligner switches; they must be set before the aligner is constructed below.
                PAMSAMMultipleSequenceAligner.FasterVersion = false;
                PAMSAMMultipleSequenceAligner.UseWeights    = false;
                PAMSAMMultipleSequenceAligner.UseStageB     = true;
                PAMSAMMultipleSequenceAligner.NumberOfCores = 2;
                int gapOpenPenalty   = -13;
                int gapExtendPenalty = -5;
                int kmerLength       = 3;

                int numberOfDegrees    = 2;  //Environment.ProcessorCount;
                int numberOfPartitions = 16; // Environment.ProcessorCount * 2;

                SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

                DistanceFunctionTypes      distanceFunctionName             = DistanceFunctionTypes.EuclideanDistance;
                UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
                ProfileAlignerNames        profileAlignerName               = ProfileAlignerNames.NeedlemanWunschProfileAligner;
                ProfileScoreFunctionNames  profileProfileFunctionName       = ProfileScoreFunctionNames.WeightedInnerProduct;

                PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner
                                                        (sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                                                        profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
                                                        numberOfPartitions, numberOfDegrees);

                Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

                // Stage 1 result.
                Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
                for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
                {
                    Console.WriteLine(toText(msa.AlignedSequencesA[i]));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

                // Stage 2 result.
                Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
                for (int i = 0; i < msa.AlignedSequencesB.Count; ++i)
                {
                    Console.WriteLine(toText(msa.AlignedSequencesB[i]));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

                // Stage 3 result.
                Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
                for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
                {
                    Console.WriteLine(toText(msa.AlignedSequencesC[i]));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

                // Final alignment.
                Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
                for (int i = 0; i < msa.AlignedSequences.Count; ++i)
                {
                    Console.WriteLine(toText(msa.AlignedSequences[i]));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
            }
            finally
            {
                // Release the parsers before deleting, since the second one was opened on the temp file.
                if (parser != null)
                {
                    parser.Dispose();
                }
                benchmarkParser.Dispose();

                if (File.Exists(outputFilePath))
                {
                    File.Delete(outputFilePath);
                }
            }
        }
Beispiel #32
0
        /// <summary>
        ///     Walks every file in each sub-directory of the RNA "k10" test folder, un-aligns the parsed
        ///     sequences, re-aligns them with PAMSAM and prints the Q and TC scores per file.
        ///     Running averages are printed after every 1000 files, and overall averages at the end.
        /// </summary>
        public void TestMsaBenchMarkOnBralibase()
        {
            List<float> qScores  = new List<float>();
            List<float> tcScores = new List<float>();

            string benchmarkRoot = @"TestUtils\Fasta\RNA\k10".TestDir();
            var rootDirectory    = new DirectoryInfo(benchmarkRoot);

            // Static aligner switches shared by every alignment in this run.
            PAMSAMMultipleSequenceAligner.FasterVersion = false;
            PAMSAMMultipleSequenceAligner.UseWeights    = false;
            PAMSAMMultipleSequenceAligner.UseStageB     = false;
            PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

            SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);

            // Alignment parameters.
            const int gapOpenPenalty     = -20;
            const int gapExtendPenalty   = -5;
            const int kmerLength         = 4;
            const int numberOfDegrees    = 2;
            const int numberOfPartitions = 16;

            var distanceFunctionName             = DistanceFunctionTypes.EuclideanDistance;
            var hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
            var profileAlignerName               = ProfileAlignerNames.NeedlemanWunschProfileAligner;
            var profileProfileFunctionName       = ProfileScoreFunctionNames.WeightedInnerProductCached;

            // Flattened walk: files of the first sub-directory, then the second, and so on.
            IEnumerable<FileInfo> benchmarkFiles = rootDirectory.GetDirectories().SelectMany(subDirectory => subDirectory.GetFiles());

            foreach (FileInfo benchmarkFile in benchmarkFiles)
            {
                String filePath = benchmarkFile.FullName;
                Console.WriteLine($"Loading: {filePath}");

                FastAParser rnaParser = new FastAParser()
                {
                    Alphabet = AmbiguousRnaAlphabet.Instance
                };
                var orgSequences = rnaParser.Parse(filePath).ToList();
                var sequences    = MsaUtils.UnAlign(orgSequences);

                Console.WriteLine("The number of sequences is: {0}", orgSequences.Count);

                var msa = new PAMSAMMultipleSequenceAligner(
                    sequences,
                    kmerLength,
                    distanceFunctionName,
                    hierarchicalClusteringMethodName,
                    profileAlignerName,
                    profileProfileFunctionName,
                    similarityMatrix,
                    gapOpenPenalty,
                    gapExtendPenalty,
                    numberOfPartitions,
                    numberOfDegrees);

                Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

                // Score the produced alignment against the parsed reference alignment.
                float scoreQ  = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
                float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
                Console.WriteLine("Alignment score Q is: {0}", scoreQ);
                Console.WriteLine("Alignment score TC is: {0}", scoreTC);

                qScores.Add(scoreQ);
                tcScores.Add(scoreTC);

                // Periodic progress report every 1000 processed files.
                bool atCheckpoint = qScores.Count % 1000 == 0;
                if (atCheckpoint)
                {
                    Console.WriteLine(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
                    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(qScores.ToArray()));
                    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(tcScores.ToArray()));
                }
            }

            // Final summary over all processed files.
            Console.WriteLine("number of datasets is: {0}", qScores.Count);
            Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(qScores.ToArray()));
            Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(tcScores.ToArray()));
        }
Beispiel #33
0
        /// <summary>
        ///     Delegates to ValidatePamsamAlign for the given hierarchical clustering method and profile
        ///     aligner, with the distance function fixed to EuclideanDistance and the profile score
        ///     function fixed to InnerProduct, then writes a completion message to the application log.
        /// </summary>
        /// <param name="nodeName">XML node name, forwarded to ValidatePamsamAlign.</param>
        /// <param name="expectedScoreNode">Expected score node, forwarded to ValidatePamsamAlign.</param>
        /// <param name="hierarchicalClusteringMethodName">Hierarchical clustering method; also reported in the log message.</param>
        /// <param name="profileName">Profile aligner to use (SW/NW).</param>
        /// <param name="isWeightedProduct">True if the WeightedProduct type is used; otherwise false.</param>
        private void ValidatePamsamAlignWithUpdateDistanceMethodTypes(
            string nodeName,
            string expectedScoreNode,
            UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
            ProfileAlignerNames profileName,
            bool isWeightedProduct)
        {
            ValidatePamsamAlign(
                nodeName,
                expectedScoreNode,
                hierarchicalClusteringMethodName,
                DistanceFunctionTypes.EuclideanDistance,
                profileName,
                ProfileScoreFunctionNames.InnerProduct,
                isWeightedProduct);

            // Build the message first, then log it, so the log call stays a single short statement.
            string completionMessage = String.Format(
                null,
                @"PamsamBvtTest:: Pamsam alignment validation completed successfully with different hierarchical clustering method name {0}",
                hierarchicalClusteringMethodName.ToString());

            ApplicationLog.WriteLine(completionMessage);
        }