/// <summary>
/// Smoke test on the DNA benchmark data set: the benchmark file is parsed,
/// passed through MsaUtils.UnAlign, aligned with PAMSAM, and the resulting
/// alignment is required to be non-null.
/// </summary>
public void testBug2()
{
    // Read the benchmark alignment and obtain the aligner input from it.
    string benchmarkPath = @"TestUtils\122_raw.afa".TestDir();
    FastAParser fastaParser = new FastAParser();
    IList<ISequence> benchmarkSequences = fastaParser.Parse(benchmarkPath).ToList();
    List<ISequence> alignerInput = MsaUtils.UnAlign(benchmarkSequences);

    // Static aligner switches for this run.
    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = false;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    // Fixed values are used instead of Environment.ProcessorCount so the
    // run does not depend on the machine executing the test.
    const int GapOpen = -13;
    const int GapExtend = -5;
    const int KmerSize = 2;
    const int Parallelism = 2;
    const int Partitions = 16;

    SimilarityMatrix dnaMatrix =
        new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);

    PAMSAMMultipleSequenceAligner aligner = new PAMSAMMultipleSequenceAligner(
        alignerInput,
        KmerSize,
        DistanceFunctionTypes.EuclideanDistance,
        UpdateDistanceMethodsTypes.Average,
        ProfileAlignerNames.NeedlemanWunschProfileAligner,
        ProfileScoreFunctionNames.InnerProductFast,
        dnaMatrix,
        GapOpen,
        GapExtend,
        Partitions,
        Parallelism);

    Assert.IsNotNull(aligner.AlignedSequences);
}
/// <summary>
/// Smoke test on the DNA benchmark data set using the aligner constructor
/// that takes an explicit molecule type. Passes when the aligner produces
/// a non-null alignment.
/// </summary>
public void testBug3()
{
    // Test on DNA benchmark dataset
    ISequenceParser parser = new FastaParser();
    try
    {
        string filepath = @"TestUtils\122_raw.afa";
        MoleculeType mt = MoleculeType.DNA;

        IList<ISequence> orgSequences = parser.Parse(filepath);
        List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

        PAMSAMMultipleSequenceAligner.FasterVersion = false;
        PAMSAMMultipleSequenceAligner.UseWeights = false;
        PAMSAMMultipleSequenceAligner.UseStageB = false;
        PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

        int gapOpenPenalty = -13;
        int gapExtendPenalty = -5;
        int kmerLength = 2;

        // Fixed values rather than Environment.ProcessorCount (and twice that).
        int numberOfDegrees = 2;
        int numberOfPartitions = 16;

        DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
        UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
        ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
        ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.InnerProductFast;

        // Pick the similarity matrix that matches the molecule type.
        SimilarityMatrix similarityMatrix = null;
        switch (mt)
        {
            case MoleculeType.DNA:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
                break;
            case MoleculeType.RNA:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
                break;
            case MoleculeType.Protein:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
                break;
            default:
                throw new InvalidDataException("Invalid molecular type");
        }

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
            sequences,
            mt,
            kmerLength,
            distanceFunctionName,
            hierarchicalClusteringMethodName,
            profileAlignerName,
            profileProfileFunctionName,
            similarityMatrix,
            gapOpenPenalty,
            gapExtendPenalty,
            numberOfPartitions,
            numberOfDegrees);

        Assert.IsNotNull(msa.AlignedSequences);
    }
    finally
    {
        // Release the parser even when parsing, alignment or the assertion
        // throws; previously it was only disposed on the success path.
        ((FastaParser)parser).Dispose();
    }
}
/// <summary>
/// Allocates the node, edge and cluster storage for hierarchical clustering
/// and selects the rule used to update distances when clusters are merged.
/// </summary>
/// <param name="distanceMatrix">Distance matrix; its Dimension must be positive.</param>
/// <param name="updateDistanceMethodName">
/// Update rule to use: Average, Single, Complete or WeightedMAFFT.
/// </param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="distanceMatrix"/> is null.
/// </exception>
/// <exception cref="Exception">
/// Thrown when the matrix dimension is not positive, when allocating the
/// storage runs out of memory, or when the update method is not recognised.
/// </exception>
public HierarchicalClustering(IDistanceMatrix distanceMatrix, UpdateDistanceMethodsTypes updateDistanceMethodName)
{
    if (distanceMatrix == null)
    {
        throw new ArgumentNullException("distanceMatrix");
    }

    if (distanceMatrix.Dimension <= 0)
    {
        throw new Exception("Invalid distance matrix dimension");
    }

    try
    {
        // A rooted binary tree over N leaves has 2N-1 nodes
        // (N leaves plus N-1 internal nodes) joined by 2N-2 edges,
        // where N is the number of input sequences.
        _nodes = new List<BinaryGuideTreeNode>(distanceMatrix.Dimension * 2 - 1);
        _edges = new List<BinaryGuideTreeEdge>(distanceMatrix.Dimension * 2 - 2);

        // The number of clusters is the number of leaves at the beginning.
        // As the algorithm merges clusters, only one cluster remains.
        _clusters = new List<int>(distanceMatrix.Dimension);

        // Initially every matrix index maps to its own cluster.
        _indexToCluster = new int[distanceMatrix.Dimension];
        for (int i = 0; i < distanceMatrix.Dimension; ++i)
        {
            _indexToCluster[i] = i;
        }
    }
    catch (OutOfMemoryException ex)
    {
        // Keep the original exception as the inner exception; passing
        // ex.InnerException (normally null) would discard it.
        throw new Exception("Out of memory", ex);
    }

    // Choose an update-distance method
    switch (updateDistanceMethodName)
    {
        case UpdateDistanceMethodsTypes.Average:
            _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateAverage);
            break;
        case UpdateDistanceMethodsTypes.Single:
            _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateSingle);
            break;
        case UpdateDistanceMethodsTypes.Complete:
            _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateComplete);
            break;
        case UpdateDistanceMethodsTypes.WeightedMAFFT:
            _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateWeightedMAFFT);
            break;
        default:
            throw new Exception("invalid update method");
    }
}
/// <summary>
/// Builds the cluster hierarchy for the given distance matrix: allocates
/// storage, selects the distance-update rule, initializes the clusters and
/// then merges clusters until a single one remains.
/// </summary>
/// <param name="distanceMatrix">Distance matrix; its Dimension must be positive.</param>
/// <param name="updateDistanceMethodName">
/// Update rule to use: average, Single, Complete or WeightedMAFFT.
/// </param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="distanceMatrix"/> is null.
/// </exception>
/// <exception cref="Exception">
/// Thrown when the matrix dimension is not positive or the update method
/// is not recognised.
/// </exception>
public HierarchicalClusteringParallel(IDistanceMatrix distanceMatrix, UpdateDistanceMethodsTypes updateDistanceMethodName)
{
    if (distanceMatrix == null)
    {
        throw new ArgumentNullException("distanceMatrix");
    }

    if (distanceMatrix.Dimension <= 0)
    {
        throw new Exception("Invalid distance matrix dimension");
    }

    // A rooted binary tree over N leaves has 2N-1 nodes
    // (N leaves plus N-1 internal nodes) joined by 2N-2 edges,
    // where N is the number of input sequences. Sizing the lists to these
    // counts avoids a re-allocation when the last node or edge is added.
    _nodes = new List<BinaryGuideTreeNode>(distanceMatrix.Dimension * 2 - 1);
    _edges = new List<BinaryGuideTreeEdge>(distanceMatrix.Dimension * 2 - 2);

    // The number of clusters is the number of leaves at the beginning.
    // As the algorithm merges clusters, only one cluster remains.
    _clusters = new List<int>(distanceMatrix.Dimension);

    // Choose an update-distance method.
    // NOTE(review): "Aaverage" is the member name this code compiles against;
    // the enum declaration is not visible here, so the spelling is left as is.
    switch (updateDistanceMethodName)
    {
        case UpdateDistanceMethodsTypes.Aaverage:
            _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateAverage);
            break;
        case UpdateDistanceMethodsTypes.Single:
            _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateSingle);
            break;
        case UpdateDistanceMethodsTypes.Complete:
            _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateComplete);
            break;
        case UpdateDistanceMethodsTypes.WeightedMAFFT:
            _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateWeightedMAFFT);
            break;
        default:
            throw new Exception("invalid update method");
    }

    // Initialize the clusters
    Initialize(distanceMatrix);

    // Merge clusters until only one is left. The order of the four steps
    // is kept exactly as in the original implementation.
    while (_numberOfClusters > 1)
    {
        GetNextPairOfCluster(distanceMatrix);
        CreateCluster();
        UpdateDistance(distanceMatrix);
        UpdateClusters();
    }
}
/// <summary>
/// Aligns three short DNA sequences with MuscleMultipleSequenceAlignment and
/// compares the aligned rows and the alignment score with fixed expected values.
/// </summary>
public void TestMuscleMultipleSequenceAlignment()
{
    // Register each symbol of the template sequence under its position
    // and publish the mapping through Profiles.ItemSet.
    ISequence symbolTemplate = new Sequence(Alphabets.DNA, "ATGCSWRYKMBVHDN-");
    Dictionary<ISequenceItem, int> symbolPositions = new Dictionary<ISequenceItem, int>();
    int position = 0;
    while (position < symbolTemplate.Count)
    {
        symbolPositions.Add(symbolTemplate[position], position);
        ++position;
    }
    Profiles.ItemSet = symbolPositions;

    SimilarityMatrix dnaMatrix =
        new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrices.AmbiguousDna);
    const int GapOpen = -8;
    const int GapExtend = -1;
    const int KmerSize = 3;

    // Input sequences.
    List<ISequence> input = new List<ISequence>
    {
        new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT"),
        new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"),
        new Sequence(Alphabets.DNA, "GGGACAAAATCAG"),
    };

    MuscleMultipleSequenceAlignment msa = new MuscleMultipleSequenceAlignment(
        input,
        MoleculeType.DNA,
        KmerSize,
        DistanceFunctionTypes.EuclieanDistance,
        UpdateDistanceMethodsTypes.Aaverage,
        ProfileAlignerNames.NeedlemanWunschProfileAligner,
        ProfileScoreFunctionNames.WeightedInnerProduct,
        dnaMatrix,
        GapOpen,
        GapExtend);

    // Expected rows, in the same order as the input sequences.
    ISequence[] expectedRows =
    {
        new Sequence(Alphabets.DNA, "GGGA---AAAATCAGATT"),
        new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG---"),
        new Sequence(Alphabets.DNA, "GGGA--CAAAATCAG---"),
    };

    for (int row = 0; row < expectedRows.Length; ++row)
    {
        Assert.AreEqual(expectedRows[row].ToString(), msa.AlignedSequences[row].ToString());
    }

    Assert.AreEqual(46, msa.AlignmentScore);
}
/// <summary>
/// Builds the cluster hierarchy for the given distance matrix. The base
/// constructor receives both arguments; this constructor then initializes
/// the clusters and merges them until a single cluster remains.
/// </summary>
/// <param name="distanceMatrix">Distance matrix to cluster.</param>
/// <param name="updateDistanceMethodName">Distance-update rule, forwarded to the base constructor.</param>
/// <exception cref="Exception">
/// Thrown when a clustering step runs out of memory; the original
/// <see cref="OutOfMemoryException"/> is attached as the inner exception.
/// </exception>
public HierarchicalClusteringParallel(IDistanceMatrix distanceMatrix, UpdateDistanceMethodsTypes updateDistanceMethodName)
    : base(distanceMatrix, updateDistanceMethodName)
{
    // Initialize the clusters
    Initialize(distanceMatrix);

    // Clustering...
    try
    {
        while (_numberOfClusters > 1)
        {
            GetNextPairOfCluster(distanceMatrix);
            CreateCluster(distanceMatrix);
            UpdateClusters();
            UpdateDistance(distanceMatrix);
        }
    }
    catch (OutOfMemoryException ex)
    {
        // Pass the caught exception itself as the inner exception; its own
        // InnerException is normally null and would hide the failure site.
        throw new Exception("Out of memory", ex);
    }
}
// $TODO: Change the above namespace after PhaseOne changes
/// <summary>
/// Runs a PAMSAM multiple sequence alignment over the given sequences
/// using a fixed set of DNA-oriented default parameters.
/// </summary>
/// <param name="sequences">Sequences to align.</param>
/// <returns>The aligned sequences reported by the aligner.</returns>
static IList<ISequence> DoMultipleSequenceAlignment(List<ISequence> sequences)
{
    // $TODO: Change the signature after PAMSAM PhaseOne is checked in
    // $TODO: Change this after PAMSAM PhaseOne is checked in
    const int GapOpen = -4;
    const int GapExtend = -1;
    const int KmerSize = 3;

    SimilarityMatrix dnaMatrix =
        new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);

    // Partition count is twice the processor count; the degree of
    // parallelism equals the processor count.
    PAMSAMMultipleSequenceAligner aligner = new PAMSAMMultipleSequenceAligner(
        sequences,
        KmerSize,
        DistanceFunctionTypes.EuclideanDistance,
        UpdateDistanceMethodsTypes.Average,
        ProfileAlignerNames.NeedlemanWunschProfileAligner,
        ProfileScoreFunctionNames.WeightedInnerProduct,
        dnaMatrix,
        GapOpen,
        GapExtend,
        Environment.ProcessorCount * 2,
        Environment.ProcessorCount);

    return aligner.AlignedSequences;
}
/// <summary>
/// Validate PAMSAM multiple sequence alignment with explicit gap open and
/// gap extend penalties.
/// </summary>
/// <param name="nodeName">xml node name</param>
/// <param name="moleculeType">molecule type; only used in the log message</param>
/// <param name="expectedScoreNode">Expected score node</param>
/// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
/// <param name="distanceFunctionName">kmerdistancematrix method name.</param>
/// <param name="profileAlignerName">SW/NW profiler</param>
/// <param name="profileScoreName">Profile score function name.</param>
/// <param name="gpOpenPenalty">Gap open penalty</param>
/// <param name="gpExtendPenalty">Gap extended penalty</param>
/// <param name="IsAlignedLargeSeq">
/// When true, every aligned sequence is compared with the expected sequence
/// at the same position; when false only the score is checked.
/// </param>
private void ValidatePamsamAlignWithGapCost(
    string nodeName,
    MoleculeType moleculeType,
    string expectedScoreNode,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    DistanceFunctionTypes distanceFunctionName,
    ProfileAlignerNames profileAlignerName,
    ProfileScoreFunctionNames profileScoreName,
    int gpOpenPenalty,
    int gpExtendPenalty,
    bool IsAlignedLargeSeq)
{
    // NOTE(review): lstSequences, kmerLength, similarityMatrix, expectedSequences
    // and expectedScore are not declared in this method; presumably Initialize
    // populates them - confirm against its definition.
    Initialize(nodeName, expectedScoreNode);

    // MSA aligned sequences with specified gap costs.
    var msa = new PAMSAMMultipleSequenceAligner(
        lstSequences,
        kmerLength,
        distanceFunctionName,
        hierarchicalClusteringMethodName,
        profileAlignerName,
        profileScoreName,
        similarityMatrix,
        gpOpenPenalty,
        gpExtendPenalty,
        2,
        2);

    // Validate the aligned Sequence and score
    int index = 0;
    foreach (ISequence seq in msa.AlignedSequences)
    {
        if (IsAlignedLargeSeq)
        {
            string actual = new string(seq.Select(a => (char)a).ToArray());
            string expected = new string(expectedSequences[index].Select(a => (char)a).ToArray());

            // Expected value goes first so a failure message reports the
            // two values under the right labels.
            Assert.AreEqual(expected, actual);
            index++;
        }
    }

    Assert.IsTrue(expectedScore.Contains(msa.AlignmentScore.ToString((IFormatProvider)null)));

    ApplicationLog.WriteLine(String.Format(null,
        "PamsamP1Test:: Pamsam alignment completed successfully with equal gap cost for {0} moleculetype with all default params",
        moleculeType.ToString()));
}
/// <summary>
/// Validate PAMSAM multiple sequence alignment.
/// </summary>
/// <param name="nodeName">xml node name</param>
/// <param name="expectedScoreNode">Expected score node</param>
/// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
/// <param name="distanceFunctionName">kmerdistancematrix method name.</param>
/// <param name="profileAlignerName">SW/NW profiler</param>
/// <param name="profileScoreName">Profile score function name.</param>
/// <param name="isWeightedProduct">
/// When true, every aligned sequence is compared with the expected sequence
/// at the same position; when false only the score is checked.
/// </param>
private void ValidatePamsamAlign(
    string nodeName,
    string expectedScoreNode,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    DistanceFunctionTypes distanceFunctionName,
    ProfileAlignerNames profileAlignerName,
    ProfileScoreFunctionNames profileScoreName,
    bool isWeightedProduct)
{
    // NOTE(review): lstSequences, kmerLength, similarityMatrix, gapOpenPenalty,
    // gapExtendPenalty, expectedSequences and expectedScore are not declared in
    // this method; presumably Initialize populates them - confirm.
    Initialize(nodeName, expectedScoreNode);

    // MSA aligned sequences.
    var msa = new PAMSAMMultipleSequenceAligner(
        lstSequences,
        kmerLength,
        distanceFunctionName,
        hierarchicalClusteringMethodName,
        profileAlignerName,
        profileScoreName,
        similarityMatrix,
        gapOpenPenalty,
        gapExtendPenalty,
        2,
        2);

    int index = 0;
    foreach (ISequence seq in msa.AlignedSequences)
    {
        if (isWeightedProduct)
        {
            string actual = new string(seq.Select(a => (char)a).ToArray());
            string expected = new string(expectedSequences[index].Select(a => (char)a).ToArray());

            // Expected value goes first so a failure message reports the
            // two values under the right labels.
            Assert.AreEqual(expected, actual);
            index++;
        }
    }

    Assert.IsTrue(expectedScore.Contains(msa.AlignmentScore.ToString((IFormatProvider)null)));
}
/// <summary>
/// Benchmark run over every file in the RV911 protein directory: each file
/// is parsed, passed through MsaUtils.UnAlign, aligned with PAMSAM, and the
/// Q and TC scores against the original file content are written to the
/// console together with their averages. The method makes no assertions.
/// </summary>
public void TestMsaBenchMark()
{
    string fileDirectory = @"TestUtils\FASTA\Protein\Balibase\RV911\";
    DirectoryInfo iD = new DirectoryInfo(fileDirectory);

    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = true;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    SimilarityMatrix similarityMatrix;
    int gapOpenPenalty = -20;
    int gapExtendPenalty = -5;
    int kmerLength = 4;

    // Fixed values rather than Environment.ProcessorCount (and twice that).
    int numberOfDegrees = 2;
    int numberOfPartitions = 16;

    DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProductCached;

    similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

    List<float> allQ = new List<float>();
    List<float> allTC = new List<float>();

    foreach (FileInfo fi in iD.GetFiles())
    {
        String filePath = fi.FullName;
        Console.WriteLine(filePath);

        FastAParser parser = new FastAParser(filePath);
        try
        {
            parser.Alphabet = AmbiguousProteinAlphabet.Instance;
            IList<ISequence> orgSequences = parser.Parse().ToList();

            List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

            int numberOfSequences = orgSequences.Count;

            // Only the section headers are printed; the per-sequence dumps
            // that used to follow them were already disabled.
            Console.WriteLine("The number of sequences is: {0}", numberOfSequences);
            Console.WriteLine("Original unaligned sequences are:");
            Console.WriteLine("Original aligned sequences are:");

            PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
                sequences,
                kmerLength,
                distanceFunctionName,
                hierarchicalClusteringMethodName,
                profileAlignerName,
                profileProfileFunctionName,
                similarityMatrix,
                gapOpenPenalty,
                gapExtendPenalty,
                numberOfPartitions,
                numberOfDegrees);

            Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
            Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
            Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
            Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

            float scoreQ = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
            float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
            allQ.Add(scoreQ);
            allTC.Add(scoreTC);

            Console.WriteLine("Alignment score Q is: {0}", scoreQ);
            Console.WriteLine("Alignment score TC is: {0}", scoreTC);
        }
        finally
        {
            // Release the parser for this file even when parsing or
            // alignment throws; previously it leaked on any failure.
            parser.Dispose();
        }
    }

    Console.WriteLine("Number of datasets is: {0}", allQ.Count);
    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
}
/// <summary>
/// Benchmark run over the RNA k10 data: every file in every sub-directory
/// is parsed, passed through MsaUtils.UnAlign, aligned with PAMSAM, and its
/// Q and TC scores are written to the console. Running averages are printed
/// after every 1000 data sets and once more at the end. No assertions are made.
/// </summary>
public void TestMsaBenchMarkOnBralibase()
{
    List<float> qScores = new List<float>();
    List<float> tcScores = new List<float>();

    string rootPath = @"testData\FASTA\RNA\k10";
    DirectoryInfo rootDirectory = new DirectoryInfo(rootPath);

    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = false;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    MoleculeType molecule = MoleculeType.RNA;

    const int GapOpen = -20;
    const int GapExtend = -5;
    const int KmerSize = 4;

    // Fixed values rather than Environment.ProcessorCount (and twice that).
    const int Parallelism = 2;
    const int Partitions = 16;

    // Similarity matrix matching the molecule type.
    SimilarityMatrix matrix;
    if (molecule == MoleculeType.DNA)
    {
        matrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
    }
    else if (molecule == MoleculeType.RNA)
    {
        matrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
    }
    else if (molecule == MoleculeType.Protein)
    {
        matrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
    }
    else
    {
        throw new Exception("Invalid molecular type");
    }

    foreach (DirectoryInfo subDirectory in rootDirectory.GetDirectories())
    {
        foreach (FileInfo dataFile in subDirectory.GetFiles())
        {
            String dataPath = dataFile.FullName;
            Console.WriteLine(dataPath);

            ISequenceParser fastaParser = new FastaParser();
            IList<ISequence> referenceAlignment = fastaParser.Parse(dataPath);
            List<ISequence> alignerInput = MsaUtils.UnAlign(referenceAlignment);

            int sequenceCount = referenceAlignment.Count;
            Console.WriteLine("The number of sequences is: {0}", sequenceCount);
            Console.WriteLine("Original unaligned sequences are:");

            PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
                alignerInput,
                molecule,
                KmerSize,
                DistanceFunctionTypes.EuclideanDistance,
                UpdateDistanceMethodsTypes.Average,
                ProfileAlignerNames.NeedlemanWunschProfileAligner,
                ProfileScoreFunctionNames.WeightedInnerProductCached,
                matrix,
                GapOpen,
                GapExtend,
                Partitions,
                Parallelism);

            Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

            // The per-sequence dump is disabled; the loop only walks the result list.
            for (int row = 0; row < msa.AlignedSequences.Count; ++row)
            {
            }

            float q = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, referenceAlignment);
            float tc = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, referenceAlignment);
            qScores.Add(q);
            tcScores.Add(tc);

            Console.WriteLine("Alignment score Q is: {0}", q);
            Console.WriteLine("Alignment score TC is: {0}", tc);

            // Progress report after every 1000 data sets.
            bool atCheckpoint = (qScores.Count % 1000) == 0;
            if (atCheckpoint)
            {
                Console.WriteLine(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
                Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(qScores.ToArray()));
                Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(tcScores.ToArray()));
            }
        }
    }

    Console.WriteLine("number of datasets is: {0}", qScores.Count);
    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(qScores.ToArray()));
    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(tcScores.ToArray()));
}
/// <summary>
/// Aligns three small DNA data sets with PAMSAM and writes the stage 1,
/// stage 3 and final alignments with their scores to the console.
/// The method makes no assertions; it only fails if the aligner throws.
/// </summary>
public void TestMuscleMultipleSequenceAlignment()
{
    SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
    int gapOpenPenalty = -4;
    int gapExtendPenalty = -1;
    int kmerLength = 3;

    DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

    // One entry per test case; every case is aligned with the same parameters.
    string[][] examples =
    {
        // Test case 1
        new string[]
        {
            "GGGAAAAATCAGATT",
            "GGGAATCAAAATCAG",
            "GGGACAAAATCAG",
        },

        // Test case 2
        new string[]
        {
            "GGGAAAAATCAGATT",
            "GGGAATCAAAATCAG",
            "GGGAATCAAAATCAG",
            "GGGAAATCG",
            "GGGAATCAATCAG",
            "GGGAATCTTATCAG",
            "GGGACAAAATCAG",
            "GGGAAAAATCAGATT",
            "GGGAATCAAAATCAG",
            "GGGACAAAATCAG",
        },

        // Test case 3
        new string[]
        {
            "GGGAAAAATCAGATT",
            "GGGAATCAAAATCAG",
            "GGGAATCAAAATCAG",
            "GGGAAATCG",
            "GGGAATCAATCAG",
            "GGGACAAAATCAG",
            "GGGAATCTTATCAG",
        },
    };

    for (int example = 0; example < examples.Length; ++example)
    {
        // The first case is printed without a label, as before. The third
        // case used to be announced as "Example 2" (copy-paste slip); it
        // is now labelled with its own number.
        if (example > 0)
        {
            Console.WriteLine("Example {0}", example + 1);
        }

        List<ISequence> sequences = new List<ISequence>();
        foreach (string residues in examples[example])
        {
            sequences.Add(new Sequence(Alphabets.DNA, residues));
        }

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
            sequences,
            kmerLength,
            distanceFunctionName,
            hierarchicalClusteringMethodName,
            profileAlignerName,
            profileProfileFunctionName,
            similarityMatrix,
            gapOpenPenalty,
            gapExtendPenalty,
            Environment.ProcessorCount * 2,
            Environment.ProcessorCount);

        WriteAlignmentStage("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA, msa.AlignedSequencesA);
        WriteAlignmentStage("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC, msa.AlignedSequencesC);
        WriteAlignmentStage("Aligned sequences final: {0}", msa.AlignmentScore, msa.AlignedSequences);
    }
}

/// <summary>
/// Writes one header line containing the score, followed by one line per
/// aligned sequence with each sequence item converted to a character.
/// </summary>
/// <param name="headerFormat">Composite format string with a single {0} placeholder for the score.</param>
/// <param name="score">Score inserted into the header.</param>
/// <param name="alignedSequences">Sequences to print, in order.</param>
private static void WriteAlignmentStage(string headerFormat, object score, IEnumerable<ISequence> alignedSequences)
{
    Console.WriteLine(headerFormat, score);
    foreach (ISequence aligned in alignedSequences)
    {
        Console.WriteLine(new string(aligned.Select(a => (char)a).ToArray()));
    }
}
/// <summary>
/// Performance run for PAMSAM: reads the reference and query file paths from
/// the XML configuration, aligns the sequences parsed from the reference file
/// while measuring elapsed time and the change in GC.GetTotalMemory, and
/// writes the measurements, the alignment score and every aligned sequence
/// to the console.
/// </summary>
public void PerformPAMSAMPerf()
{
    Stopwatch timer = new Stopwatch();

    // Input file locations come from the XML configuration.
    string referencePath = Utility._xmlUtil.GetTextValue(Constants.PamsamNode, Constants.RefFilePathNode);
    string queryFilePath = Utility._xmlUtil.GetTextValue(Constants.PamsamNode, Constants.QueryFilePathNode);

    // Both paths are reported in the test-case header.
    List<string> inputFiles = new List<string> { referencePath, queryFilePath };

    // The query file is parsed first and its result is not used afterwards;
    // only the sequences from the reference file are aligned.
    ISequenceParser fastaParser = new FastaParser();
    fastaParser.Parse(queryFilePath);
    IList<ISequence> referenceSequences = fastaParser.Parse(referencePath);

    // Aligner input obtained through MsaUtils.UnAlign.
    List<ISequence> alignerInput = MsaUtils.UnAlign(referenceSequences);

    // Static aligner switches.
    PAMSAMMultipleSequenceAligner.FasterVersion = true;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = false;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    // Alignment parameters.
    const int GapOpen = -13;
    const int GapExtend = -5;
    const int KmerSize = 2;
    const int Parallelism = 2;
    const int Partitions = 4;

    SimilarityMatrix dnaMatrix =
        new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);

    // Start timing, then take the memory baseline.
    timer.Reset();
    timer.Start();
    long memoryBefore = GC.GetTotalMemory(true);

    // The parallel option is only set when the aligner constructor runs, so
    // the whole pipeline is measured through this constructor call.
    msa = new PAMSAMMultipleSequenceAligner(
        alignerInput,
        MoleculeType.DNA,
        KmerSize,
        DistanceFunctionTypes.EuclideanDistance,
        UpdateDistanceMethodsTypes.Average,
        ProfileAlignerNames.NeedlemanWunschProfileAligner,
        ProfileScoreFunctionNames.InnerProduct,
        dnaMatrix,
        GapOpen,
        GapExtend,
        Partitions,
        Parallelism);

    timer.Stop();
    long memoryAfter = GC.GetTotalMemory(true);
    long memoryDelta = memoryAfter - memoryBefore;
    string memoryUsed = memoryDelta.ToString();

    // Report timing, memory, score and the aligned sequences.
    DisplayTestCaseHeader(inputFiles, timer, memoryUsed, "PAMSAM");

    string scoreLine = string.Format(
        "PAMSAM SequenceAligner method, Alignment Score is : {0}",
        msa.AlignmentScore.ToString());
    Console.WriteLine(scoreLine);

    int position = 0;
    foreach (ISequence aligned in msa.AlignedSequences)
    {
        string sequenceLine = string.Format(
            "PAMSAM Aligned Seq {0}:{1}",
            position,
            aligned.ToString());
        Console.WriteLine(sequenceLine);
        position++;
    }
}
/// <summary>
/// Construct an aligner, validate its arguments and immediately align the
/// given sequences.
/// </summary>
/// <param name="sequences">input sequences; must not be null or empty</param>
/// <param name="kmerLength">positive integer of kmer length</param>
/// <param name="distanceFunctionName">enum: distance function name</param>
/// <param name="hierarchicalClusteringMethodName">enum: cluster update method</param>
/// <param name="profileAlignerMethodName">enum: profile-profile aligner name</param>
/// <param name="profileFunctionName">enum: profile-profile distance function</param>
/// <param name="similarityMatrix">similarity matrix; its Name must be one accepted for the alphabet of the sequences</param>
/// <param name="gapOpenPenalty">negative gapOpenPenalty</param>
/// <param name="gapExtendPenalty">negative gapExtendPenalty</param>
/// <param name="numberOfPartitions">the number of partitions in dynamic programming; must be positive</param>
/// <param name="degreeOfParallelism">degree of parallelism option for parallel extension; must be positive</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="sequences"/> or <paramref name="similarityMatrix"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// Thrown when the sequence list is empty, when
/// <paramref name="degreeOfParallelism"/> or <paramref name="numberOfPartitions"/>
/// is not positive, when the alphabet is not DNA, RNA or protein, or when
/// the similarity matrix does not match the alphabet.
/// </exception>
public PAMSAMMultipleSequenceAligner(
    IList<ISequence> sequences,
    int kmerLength,
    DistanceFunctionTypes distanceFunctionName,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    ProfileAlignerNames profileAlignerMethodName,
    ProfileScoreFunctionNames profileFunctionName,
    SimilarityMatrix similarityMatrix,
    int gapOpenPenalty,
    int gapExtendPenalty,
    int numberOfPartitions,
    int degreeOfParallelism)
{
    Performance.Start();

    if (null == sequences)
    {
        throw new ArgumentNullException("sequences");
    }

    if (sequences.Count == 0)
    {
        throw new ArgumentException("Empty input sequences");
    }

    // Set parallel extension option (stored in a static member, so it
    // affects every aligner instance).
    if (degreeOfParallelism <= 0)
    {
        throw new ArgumentException("Invalid parallel degree parameter");
    }

    PAMSAMMultipleSequenceAligner.parallelOption = new ParallelOptions { MaxDegreeOfParallelism = degreeOfParallelism };

    if (numberOfPartitions <= 0)
    {
        throw new ArgumentException("Invalid number of partition parameter");
    }

    _numberOfPartitions = numberOfPartitions;

    // Validate data type: every sequence must share a base alphabet with the first one.
    // NOTE(review): an exception thrown inside a Parallel.For body reaches the
    // caller wrapped in an AggregateException, not as a bare ArgumentException.
    // Left unchanged so existing callers see the same exception type.
    _alphabet = sequences[0].Alphabet;
    Parallel.For(1, sequences.Count, PAMSAMMultipleSequenceAligner.parallelOption, i =>
    {
        if (!Alphabets.CheckIsFromSameBase(sequences[i].Alphabet, _alphabet))
        {
            throw new ArgumentException("Inconsistent sequence alphabet");
        }
    });

    // Fail with a descriptive argument error instead of a NullReferenceException
    // when the matrix name is read below.
    if (null == similarityMatrix)
    {
        throw new ArgumentNullException("similarityMatrix");
    }

    // Similarity matrix names accepted for each alphabet family.
    List<String> similarityMatrixDNA = new List<String> { "AmbiguousDNA" };
    List<String> similarityMatrixRNA = new List<String> { "AmbiguousRNA" };
    List<String> similarityMatrixProtein = new List<String>
    {
        "BLOSUM45",
        "BLOSUM50",
        "BLOSUM62",
        "BLOSUM80",
        "BLOSUM90",
        "PAM250",
        "PAM30",
        "PAM70",
    };

    // The alphabet checks keep their original order: DNA, protein, RNA.
    List<String> acceptedMatrixNames;
    if (_alphabet is DnaAlphabet)
    {
        acceptedMatrixNames = similarityMatrixDNA;
    }
    else if (_alphabet is ProteinAlphabet)
    {
        acceptedMatrixNames = similarityMatrixProtein;
    }
    else if (_alphabet is RnaAlphabet)
    {
        acceptedMatrixNames = similarityMatrixRNA;
    }
    else
    {
        throw new ArgumentException("Invalid alphabet");
    }

    if (!acceptedMatrixNames.Contains(similarityMatrix.Name))
    {
        throw new ArgumentException("Inconsistent similarity matrix");
    }

    // Initialize parameters
    _kmerLength = kmerLength;
    _distanceFunctionName = distanceFunctionName;
    _hierarchicalClusteringMethodName = hierarchicalClusteringMethodName;
    _profileAlignerName = profileAlignerMethodName;
    _profileProfileFunctionName = profileFunctionName;
    SimilarityMatrix = similarityMatrix;
    GapOpenCost = gapOpenPenalty;
    GapExtensionCost = gapExtendPenalty;

    MsaUtils.SetProfileItemSets(_alphabet);

    Performance.Snapshot("Start Aligning");

    // Work...
    Align(sequences);
}
public void TestMsaBenchMarkLargeDataset()
{
    // Benchmark run on the BOX032Small dataset, scored with the BLOSUM62 matrix.
    // The benchmark alignment is un-aligned, round-tripped through a temporary
    // FASTA file, re-aligned with PAMSAM, and the stage scores are printed.
    string benchmarkPath = @"TestUtils\BOX032Small.xml.afa".TestDir();
    var orgSequences = new FastAParser().Parse(benchmarkPath).ToList();
    var sequences = MsaUtils.UnAlign(orgSequences);

    int numberOfSequences = orgSequences.Count;
    Assert.AreEqual(numberOfSequences, sequences.Count);

    string outputFilePath = Path.GetTempFileName();
    try
    {
        using (StreamWriter writer = new StreamWriter(outputFilePath, true))
        {
            foreach (ISequence sequence in sequences)
            {
                // Header line, then the residues in lines of at most 60 symbols.
                writer.WriteLine(">" + sequence.ID);
                for (int lineStart = 0; lineStart < sequence.Count; lineStart += 60)
                {
                    int lineLength = (int)Math.Min(60, sequence.Count - lineStart);
                    char[] lineSymbols = sequence.Skip(lineStart)
                                                 .Take(lineLength)
                                                 .Select(a => (char)a)
                                                 .ToArray();
                    writer.WriteLine(new String(lineSymbols));
                }

                writer.Flush();
            }
        }

        // Read the freshly written file back as the input of the alignment.
        sequences = new FastAParser().Parse(outputFilePath).ToList();
    }
    finally
    {
        File.Delete(outputFilePath);
    }

    Console.WriteLine("Original sequences are:");
    foreach (ISequence unaligned in sequences)
    {
        Console.WriteLine(unaligned);
    }

    Console.WriteLine("Benchmark sequences are:");
    foreach (ISequence benchmark in orgSequences)
    {
        Console.WriteLine(benchmark);
    }

    // Begin alignment
    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = true;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    int gapOpenPenalty = -13;
    int gapExtendPenalty = -5;
    int kmerLength = 3;
    int numberOfDegrees = 2;
    int numberOfPartitions = 16;

    SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
    DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

    PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
        sequences,
        kmerLength,
        distanceFunctionName,
        hierarchicalClusteringMethodName,
        profileAlignerName,
        profileProfileFunctionName,
        similarityMatrix,
        gapOpenPenalty,
        gapExtendPenalty,
        numberOfPartitions,
        numberOfDegrees);

    Console.WriteLine(
        "Benchmark SPS score is: {0}",
        MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

    // Stage 1
    Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
    foreach (var aligned in msa.AlignedSequencesA)
    {
        Console.WriteLine(aligned);
    }

    Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
    Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

    // Stage 2
    Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
    foreach (var aligned in msa.AlignedSequencesB)
    {
        Console.WriteLine(aligned);
    }

    Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
    Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

    // Stage 3
    Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
    foreach (var aligned in msa.AlignedSequencesC)
    {
        Console.WriteLine(aligned);
    }

    Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
    Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

    // Final result
    Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
    foreach (var aligned in msa.AlignedSequences)
    {
        Console.WriteLine(aligned);
    }

    Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
    Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
}
public void TestMsaBenchMarkOnSABmark()
{
    // Aligns every file found two directory levels below the SABmark folder,
    // compares each result with the alignment stored in the file, and prints
    // the Q and TC scores together with their running averages.
    List<float> allQ = new List<float>();
    List<float> allTC = new List<float>();

    string fileDirectory = @"TestUtils\Fasta\Protein\SABmark".TestDir();
    DirectoryInfo iD = new DirectoryInfo(fileDirectory);

    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = true;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    int gapOpenPenalty = -13;
    int gapExtendPenalty = -5;
    int kmerLength = 3;
    int numberOfDegrees = 2;
    int numberOfPartitions = 16;

    DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;
    SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

    foreach (DirectoryInfo topLevelDirectory in iD.GetDirectories())
    {
        foreach (DirectoryInfo subDirectory in topLevelDirectory.GetDirectories())
        {
            foreach (FileInfo alignmentFile in subDirectory.GetFiles())
            {
                String filePath = alignmentFile.FullName;
                Console.WriteLine(filePath);

                FastAParser parser = new FastAParser();
                IList<ISequence> orgSequences = parser.Parse(filePath).ToList();
                List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

                int numberOfSequences = orgSequences.Count;
                Console.WriteLine("The number of sequences is: {0}", numberOfSequences);
                Console.WriteLine("Original unaligned sequences are:");

                PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
                    sequences,
                    kmerLength,
                    distanceFunctionName,
                    hierarchicalClusteringMethodName,
                    profileAlignerName,
                    profileProfileFunctionName,
                    similarityMatrix,
                    gapOpenPenalty,
                    gapExtendPenalty,
                    numberOfPartitions,
                    numberOfDegrees);

                Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

                float scoreQ = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
                float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
                allQ.Add(scoreQ);
                allTC.Add(scoreTC);

                Console.WriteLine("Alignment score Q is: {0}", scoreQ);
                Console.WriteLine("Alignment score TC is: {0}", scoreTC);

                // Print an intermediate average after every 1000 processed files.
                if (allQ.Count % 1000 == 0)
                {
                    Console.WriteLine(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
                    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
                    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
                }
            }
        }
    }

    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
}
/// <summary>
/// Builds a hierarchical clustering from a distance matrix (kmer or kimura)
/// using the requested cluster-update method.
/// </summary>
/// <param name="distanceMatrix">distance matrix.</param>
/// <param name="hierarchicalClusteringMethodName">Hierarchical clustering method name.</param>
/// <returns>A HierarchicalClusteringParallel built from the two arguments</returns>
private static IHierarchicalClustering GetHierarchicalClustering(
    IDistanceMatrix distanceMatrix,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName)
{
    return new HierarchicalClusteringParallel(distanceMatrix, hierarchicalClusteringMethodName);
}
/// <summary>
/// Validate Muscle multiple sequence alignment with static properties
/// of PamsamMultipleSequenceAligner. The static properties are set for the
/// duration of the alignment and restored afterwards.
/// </summary>
/// <param name="nodeName">xml node name</param>
/// <param name="expectedScoreNode">Expected score node</param>
/// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
/// <param name="distanceFunctionName">kmerdistancematrix method name.</param>
/// <param name="profileAlignerName">profile aligner name</param>
/// <param name="profileScoreName">Profile score function name.</param>
/// <param name="useweights">use sequence weights true\false</param>
/// <param name="fasterVersion">fasterversion true\false</param>
/// <param name="useStageB">stage2 computation true\false</param>
private void ValidatePamsamAlign(
    string nodeName,
    string expectedScoreNode,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    DistanceFunctionTypes distanceFunctionName,
    ProfileAlignerNames profileAlignerName,
    ProfileScoreFunctionNames profileScoreName,
    bool useweights,
    bool fasterVersion,
    bool useStageB)
{
    Initialize(nodeName, expectedScoreNode);

    // get old properties
    bool prevVersion = PAMSAMMultipleSequenceAligner.FasterVersion;
    bool prevUseWeights = PAMSAMMultipleSequenceAligner.UseWeights;
    bool prevUseStageB = PAMSAMMultipleSequenceAligner.UseStageB;

    try
    {
        // Set static properties
        PAMSAMMultipleSequenceAligner.FasterVersion = fasterVersion;
        PAMSAMMultipleSequenceAligner.UseWeights = useweights;
        PAMSAMMultipleSequenceAligner.UseStageB = useStageB;

        // MSA aligned sequences.
        // The constructor expects numberOfPartitions before degreeOfParallelism.
        int numberOfDegrees = 2;
        int numberOfPartitions = 2;
        var msa = new PAMSAMMultipleSequenceAligner(
            lstSequences,
            kmerLength,
            distanceFunctionName,
            hierarchicalClusteringMethodName,
            profileAlignerName,
            profileScoreName,
            similarityMatrix,
            gapOpenPenalty,
            gapExtendPenalty,
            numberOfPartitions,
            numberOfDegrees);

        // Validate the aligned Sequence and score
        if (fasterVersion)
        {
            InitializeStage1Variables(nodeName);
            Assert.AreEqual(stage1ExpectedSequences.Count, msa.AlignedSequences.Count);

            int index = 0;
            foreach (ISequence seq in msa.AlignedSequences)
            {
                // Expected value first, so a failure message reads correctly.
                Assert.AreEqual(
                    new string(stage1ExpectedSequences[index].Select(a => (char) a).ToArray()),
                    new string(seq.Select(a => (char) a).ToArray()));
                index++;
            }

            Assert.IsTrue(stage1ExpectedScore.Contains(msa.AlignmentScore.ToString((IFormatProvider) null)));
        }
        else
        {
            int index = 0;
            foreach (ISequence seq in msa.AlignedSequences)
            {
                Assert.AreEqual(
                    new string(expectedSequences[index].Select(a => (char) a).ToArray()),
                    new string(seq.Select(a => (char) a).ToArray()));
                index++;
            }

            Assert.AreEqual(expectedScore, msa.AlignmentScore.ToString((IFormatProvider) null));
        }
    }
    finally
    {
        // Reset it back
        PAMSAMMultipleSequenceAligner.FasterVersion = prevVersion;
        PAMSAMMultipleSequenceAligner.UseWeights = prevUseWeights;
        PAMSAMMultipleSequenceAligner.UseStageB = prevUseStageB;
    }

    // This method has no molecule type; each placeholder maps to exactly one flag.
    ApplicationLog.WriteLine(
        String.Format(
            null,
            "Validation of pamsam alignment completed successfully with static property fasterversion {0}, usestageb {1} and useweights {2}",
            fasterVersion,
            useStageB,
            useweights));
}
/// <summary>
/// Validates the stage2 hierarchical clustering: the stage1 aligned sequences
/// are turned into a kimura distance matrix, clustered with the given method,
/// and the resulting nodes and edges are checked against the xml node.
/// </summary>
/// <param name="nodeName">xml node name holding the expected nodes and edges</param>
/// <param name="hierarchicalMethodName">hierarchical clustering method name</param>
private void ValidateHierarchicalClusteringStage2(
    string nodeName,
    UpdateDistanceMethodsTypes hierarchicalMethodName)
{
    // The input sequences always come from the DNA sequence node.
    Initialize(Constants.MuscleDnaSequenceNode, Constants.ExpectedScoreNode);

    List<ISequence> alignedStage1 = GetStage1AlignedSequence();
    IDistanceMatrix kimuraMatrix = GetKimuraDistanceMatrix(alignedStage1);
    IHierarchicalClustering clustering = GetHierarchicalClustering(kimuraMatrix, hierarchicalMethodName);

    ValidateHierarchicalClustering(nodeName, clustering.Nodes, clustering.Edges);

    string logMessage = String.Format(
        null,
        @"PamsamBvtTest:: herarchical clustering stage2 nodes and edges generation and validation completed success with different hierarchical clustering method name {0}",
        hierarchicalMethodName.ToString());
    ApplicationLog.WriteLine(logMessage);
}
/// <summary>
/// Runs the PAMSAM alignment and validates the stage 3 aligned sequences
/// and stage 3 score against the expected values.
/// </summary>
/// <param name="nodeName">xml node name</param>
/// <param name="expectedScoreNode">Expected score node</param>
/// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
/// <param name="distanceFunctionName">kmerdistancematrix method name.</param>
/// <param name="profileAlignerName">SW/NW profiler</param>
/// <param name="profileScoreName">Profile score function name.</param>
private void ValidatePamsamAlignStage3(
    string nodeName,
    string expectedScoreNode,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    DistanceFunctionTypes distanceFunctionName,
    ProfileAlignerNames profileAlignerName,
    ProfileScoreFunctionNames profileScoreName)
{
    Initialize(nodeName, expectedScoreNode);

    // MSA aligned sequences.
    var msa = new PAMSAMMultipleSequenceAligner(
        lstSequences,
        kmerLength,
        distanceFunctionName,
        hierarchicalClusteringMethodName,
        profileAlignerName,
        profileScoreName,
        similarityMatrix,
        gapOpenPenalty,
        gapExtendPenalty,
        2,
        2);

    // Join all expected sequences into one comma-terminated string.
    string expectedSeqString = string.Empty;
    foreach (var expected in expectedSequences)
    {
        expectedSeqString = expectedSeqString + (new string(expected.Select(a => (char) a).ToArray()) + ",");
    }

    // Each stage 3 sequence must occur somewhere in the joined string.
    foreach (ISequence aligned in msa.AlignedSequencesC)
    {
        string alignedText = new string(aligned.Select(a => (char) a).ToArray());
        Assert.IsTrue(expectedSeqString.Contains(alignedText));
    }

    string stage3Score = msa.AlignmentScoreC.ToString((IFormatProvider) null);
    Assert.IsTrue(expectedScore.Contains(stage3Score));

    ApplicationLog.WriteLine(
        String.Format(null, "PamsamBvtTest:: Pamsam stage3 alignment completed successfully with all default params"));
}
/// <summary>
/// Runs the PAMSAM alignment and validates the stage 2 aligned sequences
/// and stage 2 score. Nothing is validated when the aligner produced no
/// stage 2 result (AlignedSequencesB is null).
/// </summary>
/// <param name="nodeName">xml node name</param>
/// <param name="expectedScoreNode">Expected score node</param>
/// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
/// <param name="distanceFunctionName">kmerdistancematrix method name.</param>
/// <param name="profileAlignerName">SW/NW profiler</param>
/// <param name="profileScoreName">Profile score function name.</param>
private void ValidatePamsamAlignStage2(
    string nodeName,
    string expectedScoreNode,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    DistanceFunctionTypes distanceFunctionName,
    ProfileAlignerNames profileAlignerName,
    ProfileScoreFunctionNames profileScoreName)
{
    Initialize(nodeName, expectedScoreNode);
    InitializeStage2Variables(nodeName);

    // MSA aligned sequences.
    var msa = new PAMSAMMultipleSequenceAligner(
        lstSequences,
        kmerLength,
        distanceFunctionName,
        hierarchicalClusteringMethodName,
        profileAlignerName,
        profileScoreName,
        similarityMatrix,
        gapOpenPenalty,
        gapExtendPenalty,
        2,
        2);

    // Validate the aligned Sequence and score of stage2
    if (msa.AlignedSequencesB != null)
    {
        Assert.AreEqual(stage2ExpectedSequences.Count, msa.AlignedSequencesB.Count);

        int position = 0;
        foreach (ISequence aligned in msa.AlignedSequencesB)
        {
            string expectedText = new string(stage2ExpectedSequences[position].Select(a => (char) a).ToArray());
            string alignedText = new string(aligned.Select(a => (char) a).ToArray());
            Assert.AreEqual(expectedText, alignedText);
            position++;
        }

        string stage2Score = msa.AlignmentScoreB.ToString((IFormatProvider) null);
        Assert.AreEqual(stage2ExpectedScore, stage2Score);
    }

    ApplicationLog.WriteLine(
        String.Format(null, "PamsamBvtTest:: Pamsam stage2 alignment completed successfully with all default params"));
}
/// <summary>
/// Creates the aligner, checks the arguments, stores the alignment
/// parameters and immediately performs the alignment.
/// </summary>
/// <param name="sequences">input sequences</param>
/// <param name="kmerLength">positive integer of kmer length</param>
/// <param name="distanceFunctionName">enum: distance function name</param>
/// <param name="hierarchicalClusteringMethodName">enum: cluster update method</param>
/// <param name="profileAlignerMethodName">enum: profile-profile aligner name</param>
/// <param name="profileFunctionName">enum: profile-profile distance function</param>
/// <param name="similarityMatrix">similarity matrix</param>
/// <param name="gapOpenPenalty">negative gapOpenPenalty</param>
/// <param name="gapExtendPenalty">negative gapExtendPenalty</param>
/// <param name="numberOfPartitions">the number of partitions in dynamic programming</param>
/// <param name="degreeOfParallelism">degree of parallelism option for parallel extension</param>
public PAMSAMMultipleSequenceAligner(
    IList<ISequence> sequences,
    int kmerLength,
    DistanceFunctionTypes distanceFunctionName,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    ProfileAlignerNames profileAlignerMethodName,
    ProfileScoreFunctionNames profileFunctionName,
    SimilarityMatrix similarityMatrix,
    int gapOpenPenalty,
    int gapExtendPenalty,
    int numberOfPartitions,
    int degreeOfParallelism)
{
    // All scores start at the lowest float value.
    AlignmentScoreC = float.MinValue;
    AlignmentScoreB = float.MinValue;
    AlignmentScoreA = float.MinValue;
    AlignmentScore = float.MinValue;

    StartLog();

    // The input must be a non-empty list.
    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (sequences.Count == 0)
    {
        throw new ArgumentException("Empty input sequences");
    }

    // Parallel extension option: only positive degrees are accepted.
    if (degreeOfParallelism <= 0)
    {
        throw new ArgumentException("Invalid parallel degree parameter");
    }

    this.degreeOfParallelism = degreeOfParallelism;
    ParallelOption = new ParallelOptions { MaxDegreeOfParallelism = degreeOfParallelism };

    // Partition count: only positive values are accepted.
    if (numberOfPartitions <= 0)
    {
        throw new ArgumentException("Invalid number of partition parameter");
    }

    this.numberOfPartitions = numberOfPartitions;

    // Assign the alphabet
    SetAlphabet(sequences, similarityMatrix, false);

    // Store the alignment parameters
    KmerLength = kmerLength;
    DistanceFunctionName = distanceFunctionName;
    HierarchicalClusteringMethodName = hierarchicalClusteringMethodName;
    ProfileAlignerName = profileAlignerMethodName;
    ProfileProfileFunctionName = profileFunctionName;
    SimilarityMatrix = similarityMatrix;
    GapOpenCost = gapOpenPenalty;
    GapExtensionCost = gapExtendPenalty;

    MsaUtils.SetProfileItemSets(this.alphabet);

    ReportLog("Start Aligning");

    // Run the alignment
    DoAlignment(sequences);
}
/// <summary>
/// Validates the PAMSAM alignment for a given profile aligner and hierarchical
/// clustering method. The distance function is fixed to EuclideanDistance and
/// the profile score function to WeightedInnerProduct.
/// </summary>
/// <param name="nodeName">xml node name.</param>
/// <param name="moleculeType">molecule type of sequences</param>
/// <param name="expectedScoreNode">Expected score node</param>
/// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
/// <param name="profileName">SW/NW profiler</param>
private void ValidatePamsamAlignWithUpdateDistanceMethodTypes(
    string nodeName,
    MoleculeType moleculeType,
    string expectedScoreNode,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    ProfileAlignerNames profileName)
{
    DistanceFunctionTypes distanceFunction = DistanceFunctionTypes.EuclideanDistance;
    ProfileScoreFunctionNames scoreFunction = ProfileScoreFunctionNames.WeightedInnerProduct;

    ValidatePamsamAlign(
        nodeName,
        moleculeType,
        expectedScoreNode,
        hierarchicalClusteringMethodName,
        distanceFunction,
        profileName,
        scoreFunction,
        kmerLength,
        false,
        false);

    string logMessage = String.Format(
        null,
        "PamsamP1Test:: Pamsam alignment validation completed successfully for {0} moleculetype with different hierarchical clustering method name {1}",
        moleculeType.ToString(),
        hierarchicalClusteringMethodName.ToString());
    ApplicationLog.WriteLine(logMessage);
}
/// <summary>
/// Runs a PAMSAM alignment over the initialized test sequences and validates
/// the final alignment score, and optionally the aligned sequences.
/// </summary>
/// <param name="nodeName">xml node name</param>
/// <param name="moleculeType">molecule type of sequences (not read by this method)</param>
/// <param name="expectedScoreNode">Expected score node</param>
/// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
/// <param name="distanceFunctionName">distance function name</param>
/// <param name="profileAlignerName">profile aligner name</param>
/// <param name="profileScoreName">Profile score function name.</param>
/// <param name="kmrlength">kmer length passed to the aligner</param>
/// <param name="addOnelineSequences">when true, AddOneLineSequences is called for the node before aligning</param>
/// <param name="IsAlignForMoreSeq">when true, every aligned sequence must be contained in the expected sequences</param>
private void ValidatePamsamAlign(
    string nodeName,
    MoleculeType moleculeType,
    string expectedScoreNode,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    DistanceFunctionTypes distanceFunctionName,
    ProfileAlignerNames profileAlignerName,
    ProfileScoreFunctionNames profileScoreName,
    int kmrlength,
    bool addOnelineSequences,
    bool IsAlignForMoreSeq)
{
    Initialize(nodeName, expectedScoreNode);
    if (addOnelineSequences)
    {
        AddOneLineSequences(nodeName);
    }

    // MSA aligned sequences.
    var msa = new PAMSAMMultipleSequenceAligner(
        lstSequences,
        kmrlength,
        distanceFunctionName,
        hierarchicalClusteringMethodName,
        profileAlignerName,
        profileScoreName,
        similarityMatrix,
        gapOpenPenalty,
        gapExtendPenalty,
        2,
        2);

    // Validate the aligned sequences only when requested; the flag does not
    // change per sequence, so it is tested once outside the loop.
    if (IsAlignForMoreSeq)
    {
        foreach (ISequence seq in msa.AlignedSequences)
        {
            Assert.IsTrue(expectedSequences.Contains(seq));
        }
    }

    // Validate the score
    Assert.IsTrue(expectedScore.Contains(msa.AlignmentScore.ToString((IFormatProvider) null)));
}
public void TestMuscleMultipleSequenceAlignmentRunningTime()
{
    // Runs the aligner on the BOX246 benchmark file, treated as protein data
    // (see mt below), and prints the scores of every alignment stage.
    ISequenceParser parser = new FastaParser();
    try
    {
        string filepath = @"testdata\FASTA\RunningTime\BOX246.xml.afa";
        MoleculeType mt = MoleculeType.Protein;

        IList<ISequence> orgSequences = parser.Parse(filepath);
        List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

        int numberOfSequences = orgSequences.Count;

        Console.WriteLine("Original sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(sequences[i].ToString());
        }

        Console.WriteLine("Benchmark sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(orgSequences[i].ToString());
        }

        PAMSAMMultipleSequenceAligner.FasterVersion = true;
        PAMSAMMultipleSequenceAligner.UseWeights = false;
        PAMSAMMultipleSequenceAligner.UseStageB = false;
        PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

        int gapOpenPenalty = -13;
        int gapExtendPenalty = -5;
        int kmerLength = 2;
        int numberOfDegrees = 2;
        int numberOfPartitions = 16;

        DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
        UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
        ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
        ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.InnerProductFast;

        // Pick the similarity matrix that belongs to the molecule type.
        SimilarityMatrix similarityMatrix = null;
        switch (mt)
        {
            case MoleculeType.DNA:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
                break;
            case MoleculeType.RNA:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
                break;
            case MoleculeType.Protein:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
                break;
            default:
                // Specific exception type, matching the sibling test testBug3.
                throw new System.IO.InvalidDataException("Invalid molecular type");
        }

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
            sequences,
            mt,
            kmerLength,
            distanceFunctionName,
            hierarchicalClusteringMethodName,
            profileAlignerName,
            profileProfileFunctionName,
            similarityMatrix,
            gapOpenPenalty,
            gapExtendPenalty,
            numberOfPartitions,
            numberOfDegrees);

        Console.WriteLine("The number of partitions is: {0}", numberOfPartitions);
        Console.WriteLine("The number of degrees is: {0}", numberOfDegrees);
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));

        Console.WriteLine(
            "Benchmark SPS score is: {0}",
            MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

        // Stage 1
        Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
        for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesA[i].ToString());
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

        // Stage 2
        Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
        for (int i = 0; i < msa.AlignedSequencesB.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesB[i].ToString());
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

        // Stage 3
        Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
        for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesC[i].ToString());
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

        // Final result
        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
        for (int i = 0; i < msa.AlignedSequences.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequences[i].ToString());
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
    }
    finally
    {
        // Release the parser once the sequences are no longer needed, also when
        // the test fails. The runtime check keeps this valid whether or not
        // the parser type is disposable.
        IDisposable disposableParser = parser as IDisposable;
        if (disposableParser != null)
        {
            disposableParser.Dispose();
        }
    }
}
/// <summary>
/// Creates the aligner, checks the arguments, stores the alignment
/// parameters and immediately performs the alignment.
/// </summary>
/// <param name="sequences">input sequences</param>
/// <param name="kmerLength">positive integer of kmer length</param>
/// <param name="distanceFunctionName">enum: distance function name</param>
/// <param name="hierarchicalClusteringMethodName">enum: cluster update method</param>
/// <param name="profileAlignerMethodName">enum: profile-profile aligner name</param>
/// <param name="profileFunctionName">enum: profile-profile distance function</param>
/// <param name="similarityMatrix">similarity matrix</param>
/// <param name="gapOpenPenalty">negative gapOpenPenalty</param>
/// <param name="gapExtendPenalty">negative gapExtendPenalty</param>
/// <param name="numberOfPartitions">the number of partitions in dynamic programming</param>
/// <param name="degreeOfParallelism">degree of parallelism option for parallel extension</param>
public PAMSAMMultipleSequenceAligner(
    IList<ISequence> sequences,
    int kmerLength,
    DistanceFunctionTypes distanceFunctionName,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    ProfileAlignerNames profileAlignerMethodName,
    ProfileScoreFunctionNames profileFunctionName,
    SimilarityMatrix similarityMatrix,
    int gapOpenPenalty,
    int gapExtendPenalty,
    int numberOfPartitions,
    int degreeOfParallelism)
{
    Performance.Start();

    // The input must be a non-empty list.
    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (sequences.Count == 0)
    {
        throw new ArgumentException("Empty input sequences");
    }

    // Parallel extension option: only positive degrees are accepted.
    if (degreeOfParallelism <= 0)
    {
        throw new ArgumentException("Invalid parallel degree parameter");
    }

    parallelOption = new ParallelOptions { MaxDegreeOfParallelism = degreeOfParallelism };

    // Partition count: only positive values are accepted.
    if (numberOfPartitions <= 0)
    {
        throw new ArgumentException("Invalid number of partition parameter");
    }

    _numberOfPartitions = numberOfPartitions;

    // Assign the alphabet
    SetAlphabet(sequences, similarityMatrix, false);

    // Store the alignment parameters
    KmerLength = kmerLength;
    DistanceFunctionName = distanceFunctionName;
    HierarchicalClusteringMethodName = hierarchicalClusteringMethodName;
    ProfileAlignerName = profileAlignerMethodName;
    ProfileProfileFunctionName = profileFunctionName;
    SimilarityMatrix = similarityMatrix;
    GapOpenCost = gapOpenPenalty;
    GapExtensionCost = gapExtendPenalty;

    MsaUtils.SetProfileItemSets(_alphabet);

    Performance.Snapshot("Start Aligning");

    // Run the alignment
    DoAlignment(sequences);
}
/// <summary>
/// Construct clusters using different update methods.
/// Allocates the node, edge and cluster containers sized from the matrix
/// dimension and selects the update-distance method.
/// </summary>
/// <param name="distanceMatrix">IDistanceMatrix</param>
/// <param name="updateDistanceMethodName">enum EUpdateDistanceMethods</param>
/// <exception cref="ArgumentNullException">distanceMatrix is null</exception>
/// <exception cref="Exception">
/// the matrix dimension is not positive, the containers cannot be allocated,
/// or the update method is not one of the supported values
/// </exception>
public HierarchicalClustering(IDistanceMatrix distanceMatrix, UpdateDistanceMethodsTypes updateDistanceMethodName)
{
    if (distanceMatrix == null)
    {
        throw new ArgumentNullException("distanceMatrix");
    }

    if (distanceMatrix.Dimension <= 0)
    {
        throw new Exception("Invalid distance matrix dimension");
    }

    try
    {
        // With N input sequences (leaves) the containers are sized for
        // 2N-1 nodes and 2N-2 edges.
        _nodes = new List<BinaryGuideTreeNode>(distanceMatrix.Dimension * 2 - 1);
        _edges = new List<BinaryGuideTreeEdge>(distanceMatrix.Dimension * 2 - 2);

        // The number of clusters is the number of leaves at the beginning.
        // As the algorithm merges clusters, only one cluster remains.
        _clusters = new List<int>(distanceMatrix.Dimension);

        // Construct _indexToCluster: initially every index maps to itself.
        _indexToCluster = new int[distanceMatrix.Dimension];
        for (int i = 0; i < distanceMatrix.Dimension; ++i)
        {
            _indexToCluster[i] = i;
        }
    }
    catch (OutOfMemoryException ex)
    {
        // Keep the caught exception as the inner exception so the original
        // failure and its stack trace are not lost.
        throw new Exception("Out of memory", ex);
    }

    // Choose a update-distance method
    switch (updateDistanceMethodName)
    {
        case UpdateDistanceMethodsTypes.Average:
            _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateAverage);
            break;
        case UpdateDistanceMethodsTypes.Single:
            _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateSingle);
            break;
        case UpdateDistanceMethodsTypes.Complete:
            _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateComplete);
            break;
        case UpdateDistanceMethodsTypes.WeightedMAFFT:
            _updateDistanceMethod = new UpdateDistanceMethodSelector(UpdateWeightedMAFFT);
            break;
        default:
            throw new Exception("invalid update method");
    }
}
public void TestMsaBenchMarkLargeDataset()
{
    // Benchmark run on the Balibase BOX032 protein dataset. The benchmark
    // alignment is un-aligned, round-tripped through a temporary FASTA file,
    // re-aligned with PAMSAM, and the stage scores are printed.
    ISequenceParser parser = new FastaParser();
    string filepath = @"testdata\FASTA\Protein\Balibase\RV913\BOX032.xml.afa";

    IList<ISequence> orgSequences = parser.Parse(filepath);
    IList<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

    int numberOfSequences = orgSequences.Count;

    String outputFilePath = @"tempBOX032.xml.afa";

    try
    {
        // Overwrite rather than append: a file left behind by an earlier
        // failed run must not leak its sequences into this run. The using
        // block closes the writer even when writing fails.
        using (StreamWriter writer = new StreamWriter(outputFilePath, false))
        {
            foreach (ISequence sequence in sequences)
            {
                writer.WriteLine(">" + sequence.ID);

                // write sequence in lines of at most 60 symbols
                BasicDerivedSequence derivedSeq = new BasicDerivedSequence(sequence, false, false, 0, 0);
                for (int lineStart = 0; lineStart < sequence.Count; lineStart += 60)
                {
                    derivedSeq.RangeStart = lineStart;
                    derivedSeq.RangeLength = Math.Min(60, sequence.Count - lineStart);
                    writer.WriteLine(derivedSeq.ToString());
                }

                writer.Flush();
            }
        }

        // Read the freshly written file back as the input of the alignment.
        sequences = parser.Parse(outputFilePath);

        Console.WriteLine("Original sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(sequences[i].ToString());
        }

        Console.WriteLine("Benchmark sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(orgSequences[i].ToString());
        }

        PAMSAMMultipleSequenceAligner.FasterVersion = false;
        PAMSAMMultipleSequenceAligner.UseWeights = false;
        PAMSAMMultipleSequenceAligner.UseStageB = true;
        PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

        int gapOpenPenalty = -13;
        int gapExtendPenalty = -5;
        int kmerLength = 3;
        int numberOfDegrees = 2;
        int numberOfPartitions = 16;

        SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
        DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
        UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
        ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
        ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
            sequences,
            MoleculeType.Protein,
            kmerLength,
            distanceFunctionName,
            hierarchicalClusteringMethodName,
            profileAlignerName,
            profileProfileFunctionName,
            similarityMatrix,
            gapOpenPenalty,
            gapExtendPenalty,
            numberOfPartitions,
            numberOfDegrees);

        Console.WriteLine(
            "Benchmark SPS score is: {0}",
            MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

        // Stage 1
        Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
        for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesA[i].ToString());
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

        // Stage 2
        Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
        for (int i = 0; i < msa.AlignedSequencesB.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesB[i].ToString());
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

        // Stage 3
        Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
        for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesC[i].ToString());
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

        // Final result
        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
        for (int i = 0; i < msa.AlignedSequences.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequences[i].ToString());
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
    }
    finally
    {
        // Remove the temporary file even when the test fails part-way.
        if (File.Exists(outputFilePath))
        {
            File.Delete(outputFilePath);
        }
    }
}
/// <summary>
/// Validates stage 2 hierarchical clustering: a kimura distance matrix is built
/// from the stage 1 aligned sequences, clustered with the given method, and the
/// resulting nodes and edges are validated against the given xml node.
/// </summary>
/// <param name="nodeName">xml node name</param>
/// <param name="moleculeType">molecule type of sequences</param>
/// <param name="hierarchicalMethodName">hierarchical method name</param>
private void ValidateHierarchicalClusteringStage2(
    string nodeName,
    MoleculeType moleculeType,
    UpdateDistanceMethodsTypes hierarchicalMethodName)
{
    // Initialize with the sequence node matching the molecule type.
    // Any other molecule type skips initialization entirely.
    if (moleculeType == MoleculeType.DNA)
    {
        Initialize(Constants.MuscleDnaSequenceNode, Constants.ExpectedScoreNode);
    }
    else if (moleculeType == MoleculeType.Protein)
    {
        Initialize(Constants.MuscleProteinSequenceNode, Constants.ExpectedScoreNode);
    }
    else if (moleculeType == MoleculeType.RNA)
    {
        Initialize(Constants.MuscleRnaSequenceNode, Constants.ExpectedScoreNode);
    }

    // Stage 1 alignment -> kimura distance matrix -> hierarchical clustering.
    List<ISequence> alignedInStage1 = GetStage1AlignedSequence(moleculeType);
    IDistanceMatrix kimuraMatrix = GetKimuraDistanceMatrix(alignedInStage1);
    IHierarchicalClustering clustering =
        GetHierarchicalClustering(kimuraMatrix, hierarchicalMethodName);

    ValidateHierarchicalClustering(nodeName, clustering.Nodes, clustering.Edges);

    string logMessage = String.Format(
        null,
        @"PamsamP1Test:: hierarchical clustering stage2 nodes and edges generation and validation completed success for {0} moleculetype with different hierarchical clustering method name {1}",
        moleculeType.ToString(),
        hierarchicalMethodName.ToString());
    ApplicationLog.WriteLine(logMessage);
}
/// <summary>
/// Aligns the BOX246 benchmark file with <c>FasterVersion</c> enabled and prints
/// the unaligned input, the benchmark alignment, and every alignment stage
/// together with its Q and TC scores measured against the benchmark.
/// </summary>
public void TestMuscleMultipleSequenceAlignmentRunningTime()
{
    // NOTE(review): the original comment called this a DNA benchmark dataset,
    // but the Blosum62 matrix is used below - confirm which is intended.
    string filepath = @"TestUtils\FASTA\RunningTime\BOX246.xml.afa";

    // The using block disposes the parser even when alignment or scoring throws;
    // previously Dispose() was only reached on the success path.
    using (FastAParser parser = new FastAParser(filepath))
    {
        IList<ISequence> orgSequences = parser.Parse().ToList();
        List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);
        int numberOfSequences = orgSequences.Count;

        Console.WriteLine("Original sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(new string(sequences[i].Select(a => (char)a).ToArray()));
        }

        Console.WriteLine("Benchmark sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(new string(orgSequences[i].Select(a => (char)a).ToArray()));
        }

        // Static aligner settings; these are shared by every aligner instance.
        PAMSAMMultipleSequenceAligner.FasterVersion = true;
        PAMSAMMultipleSequenceAligner.UseWeights = false;
        PAMSAMMultipleSequenceAligner.UseStageB = false;
        PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

        int gapOpenPenalty = -13;
        int gapExtendPenalty = -5;
        int kmerLength = 2;
        int numberOfDegrees = 2;      // alternative: Environment.ProcessorCount
        int numberOfPartitions = 16;  // alternative: Environment.ProcessorCount * 2

        DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
        UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
        ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
        ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.InnerProductFast;
        SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
            sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
            profileAlignerName, profileProfileFunctionName, similarityMatrix,
            gapOpenPenalty, gapExtendPenalty, numberOfPartitions, numberOfDegrees);

        Console.WriteLine("The number of partitions is: {0}", numberOfPartitions);
        Console.WriteLine("The number of degrees is: {0}", numberOfDegrees);
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

        Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
        foreach (ISequence aligned in msa.AlignedSequencesA)
        {
            Console.WriteLine(new string(aligned.Select(a => (char)a).ToArray()));
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

        Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
        foreach (ISequence aligned in msa.AlignedSequencesB)
        {
            Console.WriteLine(new string(aligned.Select(a => (char)a).ToArray()));
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

        Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
        foreach (ISequence aligned in msa.AlignedSequencesC)
        {
            Console.WriteLine(new string(aligned.Select(a => (char)a).ToArray()));
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
        foreach (ISequence aligned in msa.AlignedSequences)
        {
            Console.WriteLine(new string(aligned.Select(a => (char)a).ToArray()));
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
    }
}
/// <summary>
/// Reads the BOX032Small benchmark alignment, writes its unaligned sequences to a
/// temporary FASTA file, re-parses that file, aligns the result with stage B
/// enabled, and prints every alignment stage with its Q and TC scores measured
/// against the benchmark.
/// </summary>
public void TestMsaBenchMarkLargeDataset()
{
    // Path.Combine replaces concatenation with a leading-backslash literal.
    string filePathObj = Path.Combine(Directory.GetCurrentDirectory(), "TestUtils", "BOX032Small.xml.afa");
    string outputFilePath = @"tempBOX032.xml.afa";

    try
    {
        // NOTE(review): the original comment called this a DNA benchmark dataset,
        // but the Blosum62 matrix is used below - confirm which is intended.
        // Both parsers stay open until the end of the test, as the second one did
        // originally, but are now disposed on every exit path. The first parser
        // used to be overwritten by reassignment and was never disposed.
        using (FastAParser benchmarkParser = new FastAParser(filePathObj))
        {
            IList<ISequence> orgSequences = benchmarkParser.Parse().ToList();
            IList<ISequence> unaligned = MsaUtils.UnAlign(orgSequences);
            int numberOfSequences = orgSequences.Count;

            // Overwrite rather than append: a stale temp file left behind by an
            // earlier failed run would otherwise duplicate every sequence.
            using (StreamWriter writer = new StreamWriter(outputFilePath, false))
            {
                foreach (ISequence sequence in unaligned)
                {
                    writer.WriteLine(">" + sequence.ID);

                    // Write the residues in lines of at most 60 symbols.
                    for (int lineStart = 0; lineStart < sequence.Count; lineStart += 60)
                    {
                        writer.WriteLine(new String(sequence.Skip(lineStart).Take((int)Math.Min(60, sequence.Count - lineStart)).Select(a => (char)a).ToArray()));
                    }
                }
            }

            using (FastAParser unalignedParser = new FastAParser(outputFilePath))
            {
                IList<ISequence> sequences = unalignedParser.Parse().ToList();

                Console.WriteLine("Original sequences are:");
                for (int i = 0; i < numberOfSequences; ++i)
                {
                    Console.WriteLine(new string(sequences[i].Select(a => (char)a).ToArray()));
                }

                Console.WriteLine("Benchmark sequences are:");
                for (int i = 0; i < numberOfSequences; ++i)
                {
                    Console.WriteLine(new string(orgSequences[i].Select(a => (char)a).ToArray()));
                }

                // Static aligner settings; these are shared by every aligner instance.
                PAMSAMMultipleSequenceAligner.FasterVersion = false;
                PAMSAMMultipleSequenceAligner.UseWeights = false;
                PAMSAMMultipleSequenceAligner.UseStageB = true;
                PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

                int gapOpenPenalty = -13;
                int gapExtendPenalty = -5;
                int kmerLength = 3;
                int numberOfDegrees = 2;      // alternative: Environment.ProcessorCount
                int numberOfPartitions = 16;  // alternative: Environment.ProcessorCount * 2

                SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
                DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
                UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
                ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
                ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

                PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
                    sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                    profileAlignerName, profileProfileFunctionName, similarityMatrix,
                    gapOpenPenalty, gapExtendPenalty, numberOfPartitions, numberOfDegrees);

                Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

                Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
                foreach (ISequence aligned in msa.AlignedSequencesA)
                {
                    Console.WriteLine(new string(aligned.Select(a => (char)a).ToArray()));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

                Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
                foreach (ISequence aligned in msa.AlignedSequencesB)
                {
                    Console.WriteLine(new string(aligned.Select(a => (char)a).ToArray()));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

                Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
                foreach (ISequence aligned in msa.AlignedSequencesC)
                {
                    Console.WriteLine(new string(aligned.Select(a => (char)a).ToArray()));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

                Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
                foreach (ISequence aligned in msa.AlignedSequences)
                {
                    Console.WriteLine(new string(aligned.Select(a => (char)a).ToArray()));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
            }
        }
    }
    finally
    {
        // Remove the temp file on failure as well, after both parsers are disposed.
        if (File.Exists(outputFilePath))
        {
            File.Delete(outputFilePath);
        }
    }
}
/// <summary>
/// Aligns every file found in the sub-directories of TestUtils\Fasta\RNA\k10,
/// prints the Q and TC score of each alignment against the parsed original, and
/// reports the mean scores after every 1000 datasets and once more at the end.
/// </summary>
public void TestMsaBenchMarkOnBralibase()
{
    var qScores = new List<float>();
    var tcScores = new List<float>();

    DirectoryInfo benchmarkRoot = new DirectoryInfo(@"TestUtils\Fasta\RNA\k10".TestDir());

    // Static aligner settings; these are shared by every aligner instance.
    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = false;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
    int gapOpenPenalty = -20;
    int gapExtendPenalty = -5;
    int kmerLength = 4;
    int numberOfDegrees = 2;
    int numberOfPartitions = 16;

    DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProductCached;

    // Visit the files of each sub-directory in turn; files directly under the
    // root directory are not visited.
    foreach (FileInfo datasetFile in benchmarkRoot.GetDirectories().SelectMany(dir => dir.GetFiles()))
    {
        String filePath = datasetFile.FullName;
        Console.WriteLine($"Loading: {filePath}");

        var rnaParser = new FastAParser() { Alphabet = AmbiguousRnaAlphabet.Instance };
        var orgSequences = rnaParser.Parse(filePath).ToList();
        var sequences = MsaUtils.UnAlign(orgSequences);

        Console.WriteLine("The number of sequences is: {0}", orgSequences.Count);

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
            sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
            profileAlignerName, profileProfileFunctionName, similarityMatrix,
            gapOpenPenalty, gapExtendPenalty, numberOfPartitions, numberOfDegrees);

        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

        float scoreQ = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
        float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
        Console.WriteLine("Alignment score Q is: {0}", scoreQ);
        Console.WriteLine("Alignment score TC is: {0}", scoreTC);

        qScores.Add(scoreQ);
        tcScores.Add(scoreTC);

        // Progress report after every 1000 processed datasets.
        if (qScores.Count % 1000 == 0)
        {
            Console.WriteLine(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
            Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(qScores.ToArray()));
            Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(tcScores.ToArray()));
        }
    }

    Console.WriteLine("number of datasets is: {0}", qScores.Count);
    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(qScores.ToArray()));
    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(tcScores.ToArray()));
}
/// <summary>
/// Validates a Pamsam alignment for the given hierarchical clustering method and
/// profile aligner, always using the Euclidean distance function and the
/// InnerProduct profile score function.
/// </summary>
/// <param name="nodeName">xml node name.</param>
/// <param name="expectedScoreNode">Expected score node</param>
/// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
/// <param name="profileName">SW/NW profiler</param>
/// <param name="isWeightedProduct">True if it is of the WeightedProduct type, otherwise false.</param>
private void ValidatePamsamAlignWithUpdateDistanceMethodTypes(
    string nodeName,
    string expectedScoreNode,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    ProfileAlignerNames profileName,
    bool isWeightedProduct)
{
    // These two arguments are fixed; only the clustering method and the
    // profile aligner vary between callers.
    DistanceFunctionTypes distanceFunction = DistanceFunctionTypes.EuclideanDistance;
    ProfileScoreFunctionNames scoreFunction = ProfileScoreFunctionNames.InnerProduct;

    ValidatePamsamAlign(
        nodeName,
        expectedScoreNode,
        hierarchicalClusteringMethodName,
        distanceFunction,
        profileName,
        scoreFunction,
        isWeightedProduct);

    string clusteringMethodText = hierarchicalClusteringMethodName.ToString();
    string logMessage = String.Format(
        null,
        @"PamsamBvtTest:: Pamsam alignment validation completed successfully with different hierarchical clustering method name {0}",
        clusteringMethodText);
    ApplicationLog.WriteLine(logMessage);
}