/// <summary>
/// Regression test: parses the DNA benchmark file <c>122_raw.afa</c>, strips the existing alignment
/// with <c>MsaUtils.UnAlign</c>, runs PAMSAM on the result and asserts that the aligner exposes a
/// non-null <c>AlignedSequences</c> collection.
/// </summary>
public void testBug3()
{
    // Test on DNA benchmark dataset
    ISequenceParser parser = new FastaParser();
    try
    {
        string filepath = @"TestUtils\122_raw.afa";
        MoleculeType mt = MoleculeType.DNA;

        IList<ISequence> orgSequences = parser.Parse(filepath);
        List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

        // Static aligner switches; they are set before the aligner is constructed.
        PAMSAMMultipleSequenceAligner.FasterVersion = false;
        PAMSAMMultipleSequenceAligner.UseWeights = false;
        PAMSAMMultipleSequenceAligner.UseStageB = false;
        PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

        int gapOpenPenalty = -13;
        int gapExtendPenalty = -5;
        int kmerLength = 2;

        int numberOfDegrees = 2;     // Environment.ProcessorCount;
        int numberOfPartitions = 16; // Environment.ProcessorCount * 2;

        DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
        UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
        ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
        ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.InnerProductFast;

        // Pick the similarity matrix that matches the molecule type.
        SimilarityMatrix similarityMatrix;
        switch (mt)
        {
            case MoleculeType.DNA:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
                break;
            case MoleculeType.RNA:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
                break;
            case MoleculeType.Protein:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
                break;
            default:
                throw new InvalidDataException("Invalid molecular type");
        }

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
            sequences, mt, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
            profileAlignerName, profileProfileFunctionName, similarityMatrix,
            gapOpenPenalty, gapExtendPenalty, numberOfPartitions, numberOfDegrees);

        Assert.IsNotNull(msa.AlignedSequences);
    }
    finally
    {
        // Dispose in finally so the parser is released even when parsing, alignment or the assertion throws.
        ((FastaParser)parser).Dispose();
    }
}
/// <summary>
/// Regression test: un-aligns the DNA benchmark file <c>122_raw.afa</c>, feeds it to PAMSAM and
/// asserts that the aligner exposes a non-null <c>AlignedSequences</c> collection.
/// </summary>
public void testBug2()
{
    // Test on DNA benchmark dataset
    const int openPenalty = -13;
    const int extendPenalty = -5;
    const int kmerSize = 2;
    const int degrees = 2;      // Environment.ProcessorCount;
    const int partitions = 16;  // Environment.ProcessorCount * 2;

    string benchmarkFile = @"TestUtils\122_raw.afa".TestDir();
    FastAParser fastaParser = new FastAParser();
    IList<ISequence> alignedInput = fastaParser.Parse(benchmarkFile).ToList();
    List<ISequence> rawInput = MsaUtils.UnAlign(alignedInput);

    // Static aligner switches; they are set before the aligner is constructed.
    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = false;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    SimilarityMatrix dnaMatrix =
        new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);

    PAMSAMMultipleSequenceAligner aligner = new PAMSAMMultipleSequenceAligner(
        rawInput,
        kmerSize,
        DistanceFunctionTypes.EuclideanDistance,
        UpdateDistanceMethodsTypes.Average,
        ProfileAlignerNames.NeedlemanWunschProfileAligner,
        ProfileScoreFunctionNames.InnerProductFast,
        dnaMatrix,
        openPenalty,
        extendPenalty,
        partitions,
        degrees);

    Assert.IsNotNull(aligner.AlignedSequences);
}
/// <summary>
/// Initializes a <c>NeedlemanWunschProfileAlignerParallel</c>.
/// All five arguments are passed unchanged to the base class constructor; this constructor adds no logic of its own.
/// </summary>
/// <param name="similarityMatrix">Similarity matrix forwarded to the base constructor.</param>
/// <param name="profileScoreFunctionName">Enum value selecting the profile score function.</param>
/// <param name="gapOpenPenalty">Gap open penalty; expected to be a negative integer (not checked here).</param>
/// <param name="gapExtensionPenalty">Gap extension penalty; expected to be a negative integer (not checked here).</param>
/// <param name="numberOfPartitions">Number of partitions; expected to be a positive integer (not checked here).</param>
public NeedlemanWunschProfileAlignerParallel(
    SimilarityMatrix similarityMatrix,
    ProfileScoreFunctionNames profileScoreFunctionName,
    int gapOpenPenalty,
    int gapExtensionPenalty,
    int numberOfPartitions)
    : base(similarityMatrix, profileScoreFunctionName, gapOpenPenalty, gapExtensionPenalty, numberOfPartitions)
{
}
/// <summary>
/// Initializes a <c>NeedlemanWunschProfileAlignerSerial</c>.
/// All five arguments are passed unchanged to the base class constructor; this constructor adds no logic of its own.
/// </summary>
/// <param name="similarityMatrix">Similarity matrix forwarded to the base constructor.</param>
/// <param name="profileScoreFunctionName">Enum value selecting the profile score function.</param>
/// <param name="gapOpenPenalty">Gap open penalty; expected to be a negative integer (not checked here).</param>
/// <param name="gapExtensionPenalty">Gap extension penalty; expected to be a negative integer (not checked here).</param>
/// <param name="numberOfPartitions">Number of partitions; expected to be a positive integer (not checked here).</param>
public NeedlemanWunschProfileAlignerSerial(
    SimilarityMatrix similarityMatrix,
    ProfileScoreFunctionNames profileScoreFunctionName,
    int gapOpenPenalty,
    int gapExtensionPenalty,
    int numberOfPartitions)
    : base(similarityMatrix, profileScoreFunctionName, gapOpenPenalty, gapExtensionPenalty, numberOfPartitions)
{
}
/// <summary>
/// Initializes a <c>SmithWatermanProfileAlignerSerial</c>.
/// All five arguments are passed unchanged to the base class constructor; this constructor adds no logic of its own.
/// </summary>
/// <param name="similarityMatrix">Similarity matrix forwarded to the base constructor.</param>
/// <param name="profileScoreFunctionName">Enum value selecting the profile score function.</param>
/// <param name="gapOpenPenalty">Gap open penalty; expected to be a negative integer (not checked here).</param>
/// <param name="gapExtensionPenalty">Gap extension penalty; expected to be a negative integer (not checked here).</param>
/// <param name="numberOfPartitions">Number of partitions; expected to be a positive integer (not checked here).</param>
public SmithWatermanProfileAlignerSerial(
    SimilarityMatrix similarityMatrix,
    ProfileScoreFunctionNames profileScoreFunctionName,
    int gapOpenPenalty,
    int gapExtensionPenalty,
    int numberOfPartitions)
    : base(similarityMatrix, profileScoreFunctionName, gapOpenPenalty, gapExtensionPenalty, numberOfPartitions)
{
}
/// <summary>
/// Initializes a <c>SmithWatermanProfileAlignerParallel</c>.
/// All five arguments are passed unchanged to the base class constructor; this constructor adds no logic of its own.
/// </summary>
/// <param name="similarityMatrix">Similarity matrix forwarded to the base constructor.</param>
/// <param name="profileScoreFunctionName">Enum value selecting the profile score function.</param>
/// <param name="gapOpenPenalty">Gap open penalty; expected to be a negative integer (not checked here).</param>
/// <param name="gapExtensionPenalty">Gap extension penalty; expected to be a negative integer (not checked here).</param>
/// <param name="numberOfPartitions">Number of partitions; expected to be a positive integer (not checked here).</param>
public SmithWatermanProfileAlignerParallel(
    SimilarityMatrix similarityMatrix,
    ProfileScoreFunctionNames profileScoreFunctionName,
    int gapOpenPenalty,
    int gapExtensionPenalty,
    int numberOfPartitions)
    : base(similarityMatrix, profileScoreFunctionName, gapOpenPenalty, gapExtensionPenalty, numberOfPartitions)
{
}
/// <summary>
/// Aligns three short DNA sequences with <c>MuscleMultipleSequenceAlignment</c> and compares the
/// aligned strings and the alignment score with hard-coded expected values.
/// </summary>
public void TestMuscleMultipleSequenceAlignment()
{
    // Map every symbol of the template sequence to its position and publish the map through Profiles.ItemSet.
    ISequence template = new Sequence(Alphabets.DNA, "ATGCSWRYKMBVHDN-");
    Dictionary<ISequenceItem, int> symbolPositions = new Dictionary<ISequenceItem, int>();
    int position = 0;
    while (position < template.Count)
    {
        symbolPositions.Add(template[position], position);
        ++position;
    }

    Profiles.ItemSet = symbolPositions;

    SimilarityMatrix dnaMatrix =
        new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrices.AmbiguousDna);

    const int openPenalty = -8;
    const int extendPenalty = -1;
    const int kmerSize = 3;

    List<ISequence> input = new List<ISequence>
    {
        new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT"),
        new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"),
        new Sequence(Alphabets.DNA, "GGGACAAAATCAG")
    };

    MuscleMultipleSequenceAlignment msa = new MuscleMultipleSequenceAlignment(
        input,
        MoleculeType.DNA,
        kmerSize,
        DistanceFunctionTypes.EuclieanDistance,
        UpdateDistanceMethodsTypes.Aaverage,
        ProfileAlignerNames.NeedlemanWunschProfileAligner,
        ProfileScoreFunctionNames.WeightedInnerProduct,
        dnaMatrix,
        openPenalty,
        extendPenalty);

    // Expected rows, in the same order as the input sequences.
    ISequence[] expected =
    {
        new Sequence(Alphabets.DNA, "GGGA---AAAATCAGATT"),
        new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG---"),
        new Sequence(Alphabets.DNA, "GGGA--CAAAATCAG---")
    };

    for (int row = 0; row < expected.Length; ++row)
    {
        Assert.AreEqual(expected[row].ToString(), msa.AlignedSequences[row].ToString());
    }

    Assert.AreEqual(46, msa.AlignmentScore);
}
// $TODO: Change the above namespace after PhaseOne changes
/// <summary>
/// Runs a PAMSAM multiple sequence alignment over the given sequences using a fixed parameter set:
/// ambiguous-DNA similarity matrix, gap open -4, gap extend -1, k-mer length 3, Euclidean distance,
/// average linkage, Needleman-Wunsch profile aligner and weighted inner product scoring.
/// </summary>
/// <param name="sequences">Sequences to align.</param>
/// <returns>The <c>AlignedSequences</c> collection of the aligner.</returns>
static IList<ISequence> DoMultipleSequenceAlignment(List<ISequence> sequences)
{
    // $TODO: Change the signature after PAMSAM PhaseOne is checked in
    // $TODO: Change this after PAMSAM PhaseOne is checked in
    SimilarityMatrix dnaMatrix =
        new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);

    const int openPenalty = -4;
    const int extendPenalty = -1;
    const int kmerSize = 3;

    // Partition count is twice the processor count; degree of parallelism equals the processor count.
    PAMSAMMultipleSequenceAligner aligner = new PAMSAMMultipleSequenceAligner(
        sequences,
        kmerSize,
        DistanceFunctionTypes.EuclideanDistance,
        UpdateDistanceMethodsTypes.Average,
        ProfileAlignerNames.NeedlemanWunschProfileAligner,
        ProfileScoreFunctionNames.WeightedInnerProduct,
        dnaMatrix,
        openPenalty,
        extendPenalty,
        Environment.ProcessorCount * 2,
        Environment.ProcessorCount);

    return aligner.AlignedSequences;
}
/// <summary>
/// Creates the aligner, validates the arguments, stores the settings and immediately runs the
/// alignment through <c>DoAlignment</c>.
/// </summary>
/// <param name="sequences">Input sequences; must be non-null and non-empty.</param>
/// <param name="kmerLength">K-mer length (documented as a positive integer; not validated here).</param>
/// <param name="distanceFunctionName">Enum value selecting the distance function.</param>
/// <param name="hierarchicalClusteringMethodName">Enum value selecting the cluster update method.</param>
/// <param name="profileAlignerMethodName">Enum value selecting the profile-profile aligner.</param>
/// <param name="profileFunctionName">Enum value selecting the profile-profile score function.</param>
/// <param name="similarityMatrix">Similarity matrix; also passed to <c>SetAlphabet</c>.</param>
/// <param name="gapOpenPenalty">Gap open penalty (documented as negative; not validated here).</param>
/// <param name="gapExtendPenalty">Gap extension penalty (documented as negative; not validated here).</param>
/// <param name="numberOfPartitions">Number of partitions in dynamic programming; must be greater than zero.</param>
/// <param name="degreeOfParallelism">Maximum degree of parallelism; must be greater than zero.</param>
/// <exception cref="ArgumentNullException"><paramref name="sequences"/> is null.</exception>
/// <exception cref="ArgumentException">
/// <paramref name="sequences"/> is empty, or <paramref name="degreeOfParallelism"/> or
/// <paramref name="numberOfPartitions"/> is not positive.
/// </exception>
public PAMSAMMultipleSequenceAligner(
    IList<ISequence> sequences,
    int kmerLength,
    DistanceFunctionTypes distanceFunctionName,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    ProfileAlignerNames profileAlignerMethodName,
    ProfileScoreFunctionNames profileFunctionName,
    SimilarityMatrix similarityMatrix,
    int gapOpenPenalty,
    int gapExtendPenalty,
    int numberOfPartitions,
    int degreeOfParallelism)
{
    // Chained assignment runs right to left: stage C, B, A, then the final score.
    AlignmentScore = AlignmentScoreA = AlignmentScoreB = AlignmentScoreC = float.MinValue;

    StartLog();

    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (sequences.Count == 0)
    {
        throw new ArgumentException("Empty input sequences");
    }

    // Set parallel extension option
    if (degreeOfParallelism <= 0)
    {
        throw new ArgumentException("Invalid parallel degree parameter");
    }

    this.degreeOfParallelism = degreeOfParallelism;
    ParallelOptions options = new ParallelOptions();
    options.MaxDegreeOfParallelism = degreeOfParallelism;
    ParallelOption = options;

    if (numberOfPartitions <= 0)
    {
        throw new ArgumentException("Invalid number of partition parameter");
    }

    this.numberOfPartitions = numberOfPartitions;

    // Assign the alphabet
    SetAlphabet(sequences, similarityMatrix, false);

    // Initialize parameters
    KmerLength = kmerLength;
    DistanceFunctionName = distanceFunctionName;
    HierarchicalClusteringMethodName = hierarchicalClusteringMethodName;
    ProfileAlignerName = profileAlignerMethodName;
    ProfileProfileFunctionName = profileFunctionName;
    SimilarityMatrix = similarityMatrix;
    GapOpenCost = gapOpenPenalty;
    GapExtensionCost = gapExtendPenalty;

    MsaUtils.SetProfileItemSets(this.alphabet);

    ReportLog("Start Aligning");

    // Work...
    DoAlignment(sequences);
}
/// <summary>
/// Runs PAMSAM with the given method selections and validates its stage 3 results:
/// every sequence in <c>AlignedSequencesC</c> must occur in the comma-joined expected sequence text,
/// and the text of <c>AlignmentScoreC</c> must occur in the expected score.
/// </summary>
/// <param name="nodeName">Xml node name passed to <c>Initialize</c>.</param>
/// <param name="expectedScoreNode">Expected score node passed to <c>Initialize</c>.</param>
/// <param name="hierarchicalClusteringMethodName">Hierarchical clustering method to use.</param>
/// <param name="distanceFunctionName">Distance function to use.</param>
/// <param name="profileAlignerName">Profile aligner to use.</param>
/// <param name="profileScoreName">Profile score function to use.</param>
private void ValidatePamsamAlignStage3(
    string nodeName,
    string expectedScoreNode,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    DistanceFunctionTypes distanceFunctionName,
    ProfileAlignerNames profileAlignerName,
    ProfileScoreFunctionNames profileScoreName)
{
    Initialize(nodeName, expectedScoreNode);

    // MSA aligned sequences.
    var msa = new PAMSAMMultipleSequenceAligner(
        lstSequences,
        kmerLength,
        distanceFunctionName,
        hierarchicalClusteringMethodName,
        profileAlignerName,
        profileScoreName,
        similarityMatrix,
        gapOpenPenalty,
        gapExtendPenalty,
        2,
        2);

    // Each expected sequence is rendered as text followed by a comma ("s1,s2,...,").
    string expectedSeqString = string.Concat(
        expectedSequences.Select(
            expected => new string(expected.Select(symbol => (char) symbol).ToArray()) + ","));

    foreach (ISequence aligned in msa.AlignedSequencesC)
    {
        string alignedText = new string(aligned.Select(symbol => (char) symbol).ToArray());
        Assert.IsTrue(expectedSeqString.Contains(alignedText));
    }

    string actualScore = msa.AlignmentScoreC.ToString((IFormatProvider) null);
    Assert.IsTrue(expectedScore.Contains(actualScore));

    ApplicationLog.WriteLine(String.Format(null, "PamsamBvtTest:: Pamsam stage3 alignment completed successfully with all default params"));
}
/// <summary>
/// Benchmark run over every file in the Balibase RV911 protein directory: each file is parsed,
/// un-aligned, re-aligned with PAMSAM (BLOSUM62) and scored against the original alignment with the
/// Q and TC scores. Per-file and average scores are written to the console; nothing is asserted.
/// </summary>
public void TestMsaBenchMark()
{
    string fileDirectory = @"TestUtils\FASTA\Protein\Balibase\RV911\";
    DirectoryInfo iD = new DirectoryInfo(fileDirectory);

    // Static aligner switches; they are set before any aligner is constructed.
    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = true;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    int gapOpenPenalty = -20;
    int gapExtendPenalty = -5;
    int kmerLength = 4;

    int numberOfDegrees = 2;     // Environment.ProcessorCount;
    int numberOfPartitions = 16; // Environment.ProcessorCount * 2;

    DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProductCached;

    SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

    List<float> allQ = new List<float>();
    List<float> allTC = new List<float>();

    foreach (FileInfo fi in iD.GetFiles())
    {
        string filePath = fi.FullName;
        Console.WriteLine(filePath);

        FastAParser parser = new FastAParser(filePath);
        try
        {
            parser.Alphabet = AmbiguousProteinAlphabet.Instance;
            IList<ISequence> orgSequences = parser.Parse().ToList();

            List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

            int numberOfSequences = orgSequences.Count;

            // Only the section headers are printed; the per-sequence dumps were disabled in the
            // original (loop bodies commented out), so the empty loops have been removed.
            Console.WriteLine("The number of sequences is: {0}", numberOfSequences);
            Console.WriteLine("Original unaligned sequences are:");
            Console.WriteLine("Original aligned sequences are:");

            PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
                sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                profileAlignerName, profileProfileFunctionName, similarityMatrix,
                gapOpenPenalty, gapExtendPenalty, numberOfPartitions, numberOfDegrees);

            Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
            Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
            Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
            Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

            float scoreQ = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
            float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
            allQ.Add(scoreQ);
            allTC.Add(scoreTC);

            Console.WriteLine("Alignment score Q is: {0}", scoreQ);
            Console.WriteLine("Alignment score TC is: {0}", scoreTC);
        }
        finally
        {
            // Dispose in finally so the parser is released even when a file fails to parse or align.
            parser.Dispose();
        }
    }

    Console.WriteLine("Number of datasets is: {0}", allQ.Count);
    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
}
/// <summary>
/// Benchmark run over every file in every sub-directory of the RNA <c>k10</c> test data directory:
/// each file is parsed, un-aligned, re-aligned with PAMSAM and scored against the original alignment
/// with the Q and TC scores. Per-file scores, a running average every 1000 datasets and the final
/// averages are written to the console; nothing is asserted.
/// </summary>
public void TestMsaBenchMarkOnBralibase()
{
    List<float> allQ = new List<float>();
    List<float> allTC = new List<float>();

    string fileDirectory = @"testData\FASTA\RNA\k10";
    DirectoryInfo iD = new DirectoryInfo(fileDirectory);

    // Static aligner switches; they are set before any aligner is constructed.
    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = false;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    MoleculeType mt = MoleculeType.RNA;

    int gapOpenPenalty = -20;
    int gapExtendPenalty = -5;
    int kmerLength = 4;

    int numberOfDegrees = 2;     // Environment.ProcessorCount;
    int numberOfPartitions = 16; // Environment.ProcessorCount * 2;

    DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProductCached;

    // Pick the similarity matrix that matches the molecule type.
    SimilarityMatrix similarityMatrix;
    switch (mt)
    {
        case MoleculeType.DNA:
            similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
            break;
        case MoleculeType.RNA:
            similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
            break;
        case MoleculeType.Protein:
            similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
            break;
        default:
            // A specific exception type instead of the bare System.Exception.
            throw new InvalidDataException("Invalid molecular type");
    }

    foreach (DirectoryInfo fi in iD.GetDirectories())
    {
        foreach (FileInfo fiii in fi.GetFiles())
        {
            string filePath = fiii.FullName;
            Console.WriteLine(filePath);

            ISequenceParser parser = new FastaParser();
            try
            {
                IList<ISequence> orgSequences = parser.Parse(filePath);

                List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

                int numberOfSequences = orgSequences.Count;

                Console.WriteLine("The number of sequences is: {0}", numberOfSequences);
                Console.WriteLine("Original unaligned sequences are:");

                PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
                    sequences, mt, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                    profileAlignerName, profileProfileFunctionName, similarityMatrix,
                    gapOpenPenalty, gapExtendPenalty, numberOfPartitions, numberOfDegrees);

                Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

                float scoreQ = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
                float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
                allQ.Add(scoreQ);
                allTC.Add(scoreTC);

                Console.WriteLine("Alignment score Q is: {0}", scoreQ);
                Console.WriteLine("Alignment score TC is: {0}", scoreTC);

                // Running averages every 1000 datasets.
                if (allQ.Count % 1000 == 0)
                {
                    Console.WriteLine(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
                    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
                    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
                }
            }
            finally
            {
                // One parser is created per file; release it even when parsing or alignment throws.
                ((FastaParser)parser).Dispose();
            }
        }
    }

    Console.WriteLine("number of datasets is: {0}", allQ.Count);
    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
}
/// <summary>
/// Runs PAMSAM on three hard-coded DNA data sets (3, 10 and 7 sequences) and writes the stage 1,
/// stage 3 and final scores and aligned sequences to the console. Nothing is asserted.
/// </summary>
public void TestMuscleMultipleSequenceAlignment()
{
    SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
    int gapOpenPenalty = -4;
    int gapExtendPenalty = -1;
    int kmerLength = 3;

    DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

    // All three scenarios share the same aligner settings; only the input differs.
    Func<List<ISequence>, PAMSAMMultipleSequenceAligner> align = input =>
        new PAMSAMMultipleSequenceAligner(
            input, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
            profileAlignerName, profileProfileFunctionName, similarityMatrix,
            gapOpenPenalty, gapExtendPenalty,
            Environment.ProcessorCount * 2, Environment.ProcessorCount);

    // Test case 1
    List<ISequence> sequences = new List<ISequence>
    {
        new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT"),
        new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"),
        new Sequence(Alphabets.DNA, "GGGACAAAATCAG")
    };
    DumpPamsamStages(align(sequences));

    // Test case 2
    Console.WriteLine("Example 2");
    sequences = new List<ISequence>
    {
        new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT"),
        new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"),
        new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"),
        new Sequence(Alphabets.DNA, "GGGAAATCG"),
        new Sequence(Alphabets.DNA, "GGGAATCAATCAG"),
        new Sequence(Alphabets.DNA, "GGGAATCTTATCAG"),
        new Sequence(Alphabets.DNA, "GGGACAAAATCAG"),
        new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT"),
        new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"),
        new Sequence(Alphabets.DNA, "GGGACAAAATCAG")
    };
    DumpPamsamStages(align(sequences));

    // Test case 3 (the original printed "Example 2" here as well, a copy-paste slip)
    Console.WriteLine("Example 3");
    sequences = new List<ISequence>
    {
        new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT"),
        new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"),
        new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"),
        new Sequence(Alphabets.DNA, "GGGAAATCG"),
        new Sequence(Alphabets.DNA, "GGGAATCAATCAG"),
        new Sequence(Alphabets.DNA, "GGGACAAAATCAG"),
        new Sequence(Alphabets.DNA, "GGGAATCTTATCAG")
    };
    DumpPamsamStages(align(sequences));
}

/// <summary>
/// Writes the stage 1, stage 3 and final results of a finished alignment to the console.
/// </summary>
private static void DumpPamsamStages(PAMSAMMultipleSequenceAligner msa)
{
    DumpPamsamStage("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA, msa.AlignedSequencesA);
    DumpPamsamStage("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC, msa.AlignedSequencesC);
    DumpPamsamStage("Aligned sequences final: {0}", msa.AlignmentScore, msa.AlignedSequences);
}

/// <summary>
/// Writes one header line containing the score, then one line per aligned sequence with every
/// sequence item cast to <c>char</c>.
/// </summary>
private static void DumpPamsamStage(string headerFormat, object score, IEnumerable<ISequence> alignedSequences)
{
    Console.WriteLine(headerFormat, score);
    foreach (ISequence aligned in alignedSequences)
    {
        Console.WriteLine(new string(aligned.Select(a => (char)a).ToArray()));
    }
}
/// <summary>
/// Runs PAMSAM with the given method selections and validates the final alignment: the text of
/// <c>AlignmentScore</c> must occur in the expected score and, when requested, every aligned
/// sequence must be contained in the expected sequences.
/// </summary>
/// <param name="nodeName">Xml node name passed to <c>Initialize</c> (and to <c>AddOneLineSequences</c>).</param>
/// <param name="moleculeType">Not used by this method; kept so existing callers keep compiling.</param>
/// <param name="expectedScoreNode">Expected score node passed to <c>Initialize</c>.</param>
/// <param name="hierarchicalClusteringMethodName">Hierarchical clustering method to use.</param>
/// <param name="distanceFunctionName">Distance function to use.</param>
/// <param name="profileAlignerName">Profile aligner to use.</param>
/// <param name="profileScoreName">Profile score function to use.</param>
/// <param name="kmrlength">K-mer length passed to the aligner.</param>
/// <param name="addOnelineSequences">When true, <c>AddOneLineSequences</c> is called before aligning.</param>
/// <param name="IsAlignForMoreSeq">When true, the aligned sequences are checked against the expected sequences.</param>
private void ValidatePamsamAlign(
    string nodeName,
    MoleculeType moleculeType,
    string expectedScoreNode,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    DistanceFunctionTypes distanceFunctionName,
    ProfileAlignerNames profileAlignerName,
    ProfileScoreFunctionNames profileScoreName,
    int kmrlength,
    bool addOnelineSequences,
    bool IsAlignForMoreSeq)
{
    Initialize(nodeName, expectedScoreNode);
    if (addOnelineSequences)
    {
        AddOneLineSequences(nodeName);
    }

    // MSA aligned sequences.
    var msa = new PAMSAMMultipleSequenceAligner(lstSequences, kmrlength, distanceFunctionName,
        hierarchicalClusteringMethodName, profileAlignerName, profileScoreName,
        similarityMatrix, gapOpenPenalty, gapExtendPenalty, 2, 2);

    // Validate the aligned sequences. The flag does not change inside the loop, so it is tested once.
    if (IsAlignForMoreSeq)
    {
        foreach (ISequence seq in msa.AlignedSequences)
        {
            // NOTE(review): Contains relies on the equality semantics of the sequence type - confirm
            // that this compares sequence content rather than object identity.
            Assert.IsTrue(expectedSequences.Contains(seq));
        }
    }

    // Validate the score.
    Assert.IsTrue(expectedScore.Contains(msa.AlignmentScore.ToString((IFormatProvider) null)));
}
/// <summary>
/// Creates the aligner, validates the arguments, stores the settings and immediately runs the
/// alignment through <c>Align</c>.
/// </summary>
/// <param name="sequences">Input sequences; must be non-null, non-empty and share one base alphabet.</param>
/// <param name="kmerLength">K-mer length (documented as a positive integer; not validated here).</param>
/// <param name="distanceFunctionName">Enum value selecting the distance function.</param>
/// <param name="hierarchicalClusteringMethodName">Enum value selecting the cluster update method.</param>
/// <param name="profileAlignerMethodName">Enum value selecting the profile-profile aligner.</param>
/// <param name="profileFunctionName">Enum value selecting the profile-profile score function.</param>
/// <param name="similarityMatrix">Similarity matrix; its name must be one accepted for the alphabet.</param>
/// <param name="gapOpenPenalty">Gap open penalty (documented as negative; not validated here).</param>
/// <param name="gapExtendPenalty">Gap extension penalty (documented as negative; not validated here).</param>
/// <param name="numberOfPartitions">Number of partitions in dynamic programming; must be greater than zero.</param>
/// <param name="degreeOfParallelism">Maximum degree of parallelism; must be greater than zero.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="sequences"/> or <paramref name="similarityMatrix"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="sequences"/> is empty, a numeric option is not positive, the sequences use
/// inconsistent alphabets, the alphabet is not DNA/RNA/protein, or the similarity matrix does not
/// match the alphabet.
/// </exception>
public PAMSAMMultipleSequenceAligner(
    IList<ISequence> sequences,
    int kmerLength,
    DistanceFunctionTypes distanceFunctionName,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    ProfileAlignerNames profileAlignerMethodName,
    ProfileScoreFunctionNames profileFunctionName,
    SimilarityMatrix similarityMatrix,
    int gapOpenPenalty,
    int gapExtendPenalty,
    int numberOfPartitions,
    int degreeOfParallelism)
{
    Performance.Start();

    if (null == sequences)
    {
        throw new ArgumentNullException("sequences");
    }

    if (sequences.Count == 0)
    {
        throw new ArgumentException("Empty input sequences");
    }

    // Set parallel extension option
    if (degreeOfParallelism <= 0)
    {
        throw new ArgumentException("Invalid parallel degree parameter");
    }

    PAMSAMMultipleSequenceAligner.parallelOption = new ParallelOptions { MaxDegreeOfParallelism = degreeOfParallelism };

    if (numberOfPartitions <= 0)
    {
        throw new ArgumentException("Invalid number of partition parameter");
    }

    _numberOfPartitions = numberOfPartitions;

    // Validate data type: every sequence must share the base alphabet of the first one.
    // The check runs sequentially on purpose: an exception thrown inside a Parallel.For body reaches
    // the caller wrapped in an AggregateException, which would hide the ArgumentException.
    _alphabet = sequences[0].Alphabet;
    for (int i = 1; i < sequences.Count; ++i)
    {
        if (!Alphabets.CheckIsFromSameBase(sequences[i].Alphabet, _alphabet))
        {
            throw new ArgumentException("Inconsistent sequence alphabet");
        }
    }

    // Fail with a clear argument error instead of a NullReferenceException on similarityMatrix.Name.
    if (similarityMatrix == null)
    {
        throw new ArgumentNullException("similarityMatrix");
    }

    // Similarity matrix names accepted for each alphabet family.
    List<String> similarityMatrixDNA = new List<String> { "AmbiguousDNA" };
    List<String> similarityMatrixRNA = new List<String> { "AmbiguousRNA" };
    List<String> similarityMatrixProtein = new List<String>
    {
        "BLOSUM45", "BLOSUM50", "BLOSUM62", "BLOSUM80", "BLOSUM90", "PAM250", "PAM30", "PAM70"
    };

    // The type tests keep their original order (DNA, protein, RNA).
    List<String> acceptedMatrixNames;
    if (_alphabet is DnaAlphabet)
    {
        acceptedMatrixNames = similarityMatrixDNA;
    }
    else if (_alphabet is ProteinAlphabet)
    {
        acceptedMatrixNames = similarityMatrixProtein;
    }
    else if (_alphabet is RnaAlphabet)
    {
        acceptedMatrixNames = similarityMatrixRNA;
    }
    else
    {
        throw new ArgumentException("Invalid alphabet");
    }

    if (!acceptedMatrixNames.Contains(similarityMatrix.Name))
    {
        throw new ArgumentException("Inconsistent similarity matrix");
    }

    // Initialize parameters
    _kmerLength = kmerLength;
    _distanceFunctionName = distanceFunctionName;
    _hierarchicalClusteringMethodName = hierarchicalClusteringMethodName;
    _profileAlignerName = profileAlignerMethodName;
    _profileProfileFunctionName = profileFunctionName;
    SimilarityMatrix = similarityMatrix;
    GapOpenCost = gapOpenPenalty;
    GapExtensionCost = gapExtendPenalty;

    MsaUtils.SetProfileItemSets(_alphabet);

    Performance.Snapshot("Start Aligning");

    // Work...
    Align(sequences);
}
/// <summary>
/// Runs the PAMSAM aligner on the BOX246 benchmark file (treated as protein data) and
/// prints the benchmark alignment, the alignment produced at each stage and the
/// Q / TC scores of each stage against the benchmark. Output only; nothing is asserted.
/// </summary>
public void TestMuscleMultipleSequenceAlignmentRunningTime()
{
    ISequenceParser parser = new FastaParser();
    try
    {
        // Benchmark dataset; the molecule type below selects the similarity matrix.
        string filepath = @"testdata\FASTA\RunningTime\BOX246.xml.afa";
        MoleculeType mt = MoleculeType.Protein;

        IList<ISequence> orgSequences = parser.Parse(filepath);

        // Strip the benchmark alignment so the aligner starts from raw sequences.
        List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

        int numberOfSequences = orgSequences.Count;

        Console.WriteLine("Original sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(sequences[i].ToString());
        }

        Console.WriteLine("Benchmark sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(orgSequences[i].ToString());
        }

        PAMSAMMultipleSequenceAligner.FasterVersion = true;
        PAMSAMMultipleSequenceAligner.UseWeights = false;
        PAMSAMMultipleSequenceAligner.UseStageB = false;
        PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

        int gapOpenPenalty = -13;
        int gapExtendPenalty = -5;
        int kmerLength = 2;

        int numberOfDegrees = 2; //Environment.ProcessorCount;
        int numberOfPartitions = 16; // Environment.ProcessorCount * 2;

        DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
        UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
        ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
        ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.InnerProductFast;

        SimilarityMatrix similarityMatrix = null;
        switch (mt)
        {
            case MoleculeType.DNA:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
                break;
            case MoleculeType.RNA:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
                break;
            case MoleculeType.Protein:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
                break;
            default:
                throw new InvalidDataException("Invalid molecular type");
        }

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner
            (sequences, mt, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
            profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
            numberOfPartitions, numberOfDegrees);

        Console.WriteLine("The number of partitions is: {0}", numberOfPartitions);
        Console.WriteLine("The number of degrees is: {0}", numberOfDegrees);
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

        Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
        for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesA[i].ToString());
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

        Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
        for (int i = 0; i < msa.AlignedSequencesB.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesB[i].ToString());
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

        Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
        for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesC[i].ToString());
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
        for (int i = 0; i < msa.AlignedSequences.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequences[i].ToString());
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
    }
    finally
    {
        // Release the parser only after the parsed sequences are no longer needed,
        // mirroring how the sibling test disposes its FastaParser at the very end.
        ((FastaParser)parser).Dispose();
    }
}
/// <summary>
/// Constructor for all the pairwise aligner (NeedlemanWunsch, SmithWaterman, Overlap).
/// Stores nothing itself: it forwards every argument to the five-argument constructor
/// and passes 1 as the additional last argument.
///
/// This constructor is for non-parallel version.
/// </summary>
/// <param name="similarityMatrix">similarity matrix forwarded to the five-argument constructor</param>
/// <param name="profileScoreFunctionName">enum: profile-profile score function forwarded unchanged</param>
/// <param name="gapOpenPenalty">gap open penalty forwarded unchanged</param>
/// <param name="gapExtensionPenalty">gap extension penalty forwarded unchanged</param>
public DynamicProgrammingProfileAlignerParallel(
    SimilarityMatrix similarityMatrix,
    ProfileScoreFunctionNames profileScoreFunctionName,
    int gapOpenPenalty,
    int gapExtensionPenalty)
    // NOTE(review): the trailing 1 is presumably the partition/core count - confirm against the target overload.
    : this(similarityMatrix, profileScoreFunctionName, gapOpenPenalty, gapExtensionPenalty, 1)
{
}
/// <summary>
/// Walks the SABmark protein benchmark directory (two directory levels, then files),
/// aligns every file with PAMSAM using Blosum62, and prints the Q and TC scores of each
/// alignment against the benchmark plus their running and final averages.
/// Output only; nothing is asserted.
/// </summary>
public void TestMsaBenchMarkOnSABmark()
{
    List<float> allQ = new List<float>();
    List<float> allTC = new List<float>();

    string fileDirectory = @"TestUtils\Fasta\Protein\SABmark".TestDir();
    DirectoryInfo iD = new DirectoryInfo(fileDirectory);

    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = true;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    int gapOpenPenalty = -13;
    int gapExtendPenalty = -5;
    int kmerLength = 3;

    int numberOfDegrees = 2; //Environment.ProcessorCount;
    int numberOfPartitions = 16; // Environment.ProcessorCount * 2;

    DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

    SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

    foreach (DirectoryInfo fi in iD.GetDirectories())
    {
        foreach (DirectoryInfo fii in fi.GetDirectories())
        {
            foreach (FileInfo fiii in fii.GetFiles())
            {
                String filePath = fiii.FullName;
                Console.WriteLine(filePath);

                FastAParser parser = new FastAParser();
                IList<ISequence> orgSequences = parser.Parse(filePath).ToList();

                // Strip the benchmark alignment so the aligner starts from raw sequences.
                List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

                int numberOfSequences = orgSequences.Count;
                Console.WriteLine("The number of sequences is: {0}", numberOfSequences);
                Console.WriteLine("Original unaligned sequences are:");

                PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner
                    (sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                    profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
                    numberOfPartitions, numberOfDegrees);

                Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

                float scoreQ = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
                float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
                allQ.Add(scoreQ);
                allTC.Add(scoreTC);

                Console.WriteLine("Alignment score Q is: {0}", scoreQ);
                Console.WriteLine("Alignment score TC is: {0}", scoreTC);

                // Periodic progress report for long benchmark runs.
                if (allQ.Count % 1000 == 0)
                {
                    Console.WriteLine(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
                    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
                    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
                }
            }
        }
    }

    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
}
/// <summary>
/// Aligns every file found one directory level below the RNA "k10" benchmark folder with
/// PAMSAM (AmbiguousRna matrix) and prints the Q and TC scores of each alignment against
/// the benchmark, plus running and final averages. Output only; nothing is asserted.
/// </summary>
public void TestMsaBenchMarkOnBralibase()
{
    List<float> qScores = new List<float>();
    List<float> tcScores = new List<float>();

    string fileDirectory = @"TestUtils\Fasta\RNA\k10".TestDir();
    DirectoryInfo benchmarkRoot = new DirectoryInfo(fileDirectory);

    // Static aligner switches used for this benchmark.
    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = false;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);

    int gapOpenPenalty = -20;
    int gapExtendPenalty = -5;
    int kmerLength = 4;
    int numberOfDegrees = 2;
    int numberOfPartitions = 16;

    var distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    var hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    var profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    var profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProductCached;

    // Visit the files directory by directory, in the order the file system reports them.
    foreach (FileInfo benchmarkFile in benchmarkRoot.GetDirectories().SelectMany(directory => directory.GetFiles()))
    {
        String filePath = benchmarkFile.FullName;
        Console.WriteLine($"Loading: {filePath}");

        FastAParser parser = new FastAParser() { Alphabet = AmbiguousRnaAlphabet.Instance };
        List<ISequence> orgSequences = parser.Parse(filePath).ToList();

        // Strip the benchmark alignment so the aligner starts from raw sequences.
        var sequences = MsaUtils.UnAlign(orgSequences);

        int numberOfSequences = orgSequences.Count;
        Console.WriteLine("The number of sequences is: {0}", numberOfSequences);

        var msa = new PAMSAMMultipleSequenceAligner(
            sequences,
            kmerLength,
            distanceFunctionName,
            hierarchicalClusteringMethodName,
            profileAlignerName,
            profileProfileFunctionName,
            similarityMatrix,
            gapOpenPenalty,
            gapExtendPenalty,
            numberOfPartitions,
            numberOfDegrees);

        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

        float scoreQ = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
        float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
        Console.WriteLine("Alignment score Q is: {0}", scoreQ);
        Console.WriteLine("Alignment score TC is: {0}", scoreTC);

        qScores.Add(scoreQ);
        tcScores.Add(scoreTC);

        // Periodic progress report for long benchmark runs.
        if (qScores.Count % 1000 == 0)
        {
            Console.WriteLine(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
            Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(qScores.ToArray()));
            Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(tcScores.ToArray()));
        }
    }

    Console.WriteLine("number of datasets is: {0}", qScores.Count);
    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(qScores.ToArray()));
    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(tcScores.ToArray()));
}
/// <summary>
/// Loads the BOX032Small benchmark alignment, un-aligns it, round-trips the raw sequences
/// through a temporary FASTA file, aligns them with PAMSAM and prints the alignment and
/// its Q / TC scores after every stage. Only the sequence count is asserted.
/// </summary>
public void TestMsaBenchMarkLargeDataset()
{
    // NOTE(review): the original comment called this a DNA dataset, but the Blosum62
    // matrix used below is a protein matrix - confirm which is intended.
    string filePathObj = @"TestUtils\BOX032Small.xml.afa".TestDir();
    var orgSequences = new FastAParser().Parse(filePathObj).ToList();

    var sequences = MsaUtils.UnAlign(orgSequences);

    int numberOfSequences = orgSequences.Count;
    Assert.AreEqual(numberOfSequences, sequences.Count);

    // Write the un-aligned sequences as 60-column FASTA and parse them back.
    string outputFilePath = Path.GetTempFileName();
    try
    {
        using (StreamWriter writer = new StreamWriter(outputFilePath, true))
        {
            foreach (ISequence sequence in sequences)
            {
                // write sequence
                writer.WriteLine(">" + sequence.ID);

                int lineStart = 0;
                while (lineStart < sequence.Count)
                {
                    var remaining = sequence.Count - lineStart;
                    int chunkLength = (int)Math.Min(60, remaining);
                    char[] chunk = sequence.Skip(lineStart).Take(chunkLength).Select(a => (char)a).ToArray();
                    writer.WriteLine(new String(chunk));
                    lineStart += 60;
                }

                writer.Flush();
            }
        }

        sequences = new FastAParser().Parse(outputFilePath).ToList();
    }
    finally
    {
        // Always remove the temporary file, even if writing or parsing failed.
        File.Delete(outputFilePath);
    }

    Console.WriteLine("Original sequences are:");
    foreach (var rawSequence in sequences)
    {
        Console.WriteLine(rawSequence);
    }

    Console.WriteLine("Benchmark sequences are:");
    foreach (var benchmarkSequence in orgSequences)
    {
        Console.WriteLine(benchmarkSequence);
    }

    // Begin alignment
    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = true;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    int gapOpenPenalty = -13;
    int gapExtendPenalty = -5;
    int kmerLength = 3;
    int numberOfDegrees = 2;
    int numberOfPartitions = 16;

    var similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
    var distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    var hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    var profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    var profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

    var msa = new PAMSAMMultipleSequenceAligner(
        sequences,
        kmerLength,
        distanceFunctionName,
        hierarchicalClusteringMethodName,
        profileAlignerName,
        profileProfileFunctionName,
        similarityMatrix,
        gapOpenPenalty,
        gapExtendPenalty,
        numberOfPartitions,
        numberOfDegrees);

    Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

    // Stage 1
    Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
    foreach (var aligned in msa.AlignedSequencesA)
    {
        Console.WriteLine(aligned);
    }
    Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
    Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

    // Stage 2
    Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
    foreach (var aligned in msa.AlignedSequencesB)
    {
        Console.WriteLine(aligned);
    }
    Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
    Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

    // Stage 3
    Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
    foreach (var aligned in msa.AlignedSequencesC)
    {
        Console.WriteLine(aligned);
    }
    Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
    Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

    // Final result
    Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
    foreach (var aligned in msa.AlignedSequences)
    {
        Console.WriteLine(aligned);
    }
    Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
    Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
}
/// <summary>
/// Validate Muscle multiple sequence alignment with static properties
/// of PamsamMultipleSequenceAligner. The static properties are set for the duration
/// of the alignment and restored afterwards, even when an assertion fails.
/// </summary>
/// <param name="nodeName">xml node name</param>
/// <param name="expectedScoreNode">expected score xml node</param>
/// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
/// <param name="distanceFunctionName">kmerdistancematrix method name.</param>
/// <param name="profileAlignerName">profile aligner name</param>
/// <param name="profileScoreName">Profile score function name.</param>
/// <param name="useweights">use sequence weights true\false</param>
/// <param name="fasterVersion">fasterversion true\false</param>
/// <param name="useStageB">stage2 computation true\false</param>
private void ValidatePamsamAlign(string nodeName, string expectedScoreNode,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    DistanceFunctionTypes distanceFunctionName,
    ProfileAlignerNames profileAlignerName,
    ProfileScoreFunctionNames profileScoreName,
    bool useweights, bool fasterVersion, bool useStageB)
{
    Initialize(nodeName, expectedScoreNode);

    // get old properties
    bool prevVersion = PAMSAMMultipleSequenceAligner.FasterVersion;
    bool prevUseWeights = PAMSAMMultipleSequenceAligner.UseWeights;
    bool prevUseStageB = PAMSAMMultipleSequenceAligner.UseStageB;

    try
    {
        // Set static properties
        PAMSAMMultipleSequenceAligner.FasterVersion = fasterVersion;
        PAMSAMMultipleSequenceAligner.UseWeights = useweights;
        PAMSAMMultipleSequenceAligner.UseStageB = useStageB;

        // MSA aligned sequences.
        int numberOfDegrees = 2;
        int numberOfPartitions = 2;

        // The constructor takes the partition count before the degree of parallelism.
        var msa = new PAMSAMMultipleSequenceAligner(lstSequences,
            kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
            profileAlignerName, profileScoreName, similarityMatrix, gapOpenPenalty,
            gapExtendPenalty, numberOfPartitions, numberOfDegrees);

        // Validate the aligned Sequence and score
        if (fasterVersion)
        {
            // The faster version is compared against the stage 1 expectations.
            InitializeStage1Variables(nodeName);
            Assert.AreEqual(stage1ExpectedSequences.Count, msa.AlignedSequences.Count);

            int index = 0;
            foreach (ISequence seq in msa.AlignedSequences)
            {
                string expected = new string(stage1ExpectedSequences[index].Select(a => (char) a).ToArray());
                string actual = new string(seq.Select(a => (char) a).ToArray());
                Assert.AreEqual(expected, actual);
                index++;
            }

            Assert.IsTrue(stage1ExpectedScore.Contains(msa.AlignmentScore.ToString((IFormatProvider) null)));
        }
        else
        {
            int index = 0;
            foreach (ISequence seq in msa.AlignedSequences)
            {
                string expected = new string(expectedSequences[index].Select(a => (char) a).ToArray());
                string actual = new string(seq.Select(a => (char) a).ToArray());
                Assert.AreEqual(expected, actual);
                index++;
            }

            Assert.AreEqual(expectedScore, msa.AlignmentScore.ToString((IFormatProvider) null));
        }
    }
    finally
    {
        // Reset it back
        PAMSAMMultipleSequenceAligner.FasterVersion = prevVersion;
        PAMSAMMultipleSequenceAligner.UseWeights = prevUseWeights;
        PAMSAMMultipleSequenceAligner.UseStageB = prevUseStageB;
    }

    ApplicationLog.WriteLine(
        String.Format(null,
            @"Validation of pamsam alignment completed successfully with static property fasterversion {0}, usestageb {1} and useweights {2}",
            fasterVersion, useStageB, useweights));
}
/// <summary>
/// Builds the stage 2 binary guide tree from the stage 1 aligned sequences, cuts it at
/// the given edge to obtain two profiles, creates a serial or parallel
/// NeedlemanWunsch profile aligner depending on the degree of parallelism,
/// runs Align() and validates the row and column size of the resulting profile matrix.
/// </summary>
/// <param name="nodeName">xml node name holding the expected row and column sizes</param>
/// <param name="degreeOfParallelism">if 1 it is serial Profiler else parallel profiler</param>
/// <param name="profileFunction">profile function score name</param>
/// <param name="edgeIndex">edge index to cut the tree</param>
private void ValidateProfileAlignerAlignWithProfileFunctionScore(
    string nodeName, int degreeOfParallelism, ProfileScoreFunctionNames profileFunction,
    int edgeIndex)
{
    // NOTE(review): initialization always uses the Muscle DNA node, not nodeName - confirm intended.
    Initialize(Constants.MuscleDnaSequenceNode, Constants.ExpectedScoreNode);
    InitializeStage2Variables(Constants.MuscleDnaSequenceNode);

    // Stage 1 alignment -> distance matrix -> clustering -> guide tree.
    List<ISequence> stage1Sequences = GetStage1AlignedSequence();
    IDistanceMatrix distanceMatrix = GetKimuraDistanceMatrix(stage1Sequences);
    IHierarchicalClustering clustering = GetHierarchicalClustering(distanceMatrix);
    BinaryGuideTree guideTree = GetBinaryTree(clustering);

    // Cut the tree at the requested edge to split the leaves into two groups.
    List<int>[] leafGroups = guideTree.SeparateSequencesByCuttingTree(edgeIndex);

    // Extract one profile per leaf group.
    List<int>[] removedPositions;
    IProfileAlignment[] profiles = ProfileAlignment.ProfileExtraction(
        stage2ExpectedSequences, leafGroups[0], leafGroups[1], out removedPositions);

    IProfileAligner aligner = null;
    if (degreeOfParallelism == 1)
    {
        aligner = new NeedlemanWunschProfileAlignerSerial(similarityMatrix,
            profileFunction, gapOpenPenalty, gapExtendPenalty, 2);
    }
    else if (Environment.ProcessorCount >= degreeOfParallelism)
    {
        aligner = new NeedlemanWunschProfileAlignerParallel(similarityMatrix,
            profileFunction, gapOpenPenalty, gapExtendPenalty, 2);
    }
    else
    {
        // Not enough processors for the requested parallel aligner; the test fails below.
        ApplicationLog.WriteLine(String.Format(null,
            @"PamsamBvtTest: NeedlemanWunschProfileAlignerParallel could not be instantiated as number of processor is {0} and degree of parallelism {1}",
            Environment.ProcessorCount.ToString((IFormatProvider) null),
            degreeOfParallelism));
    }

    if (aligner == null)
    {
        Assert.Fail("Profile Aligner is not instantiated");
        return;
    }

    // NOTE(review): the first profile is aligned with itself; the second extracted
    // profile is never used. Confirm whether profiles[1] was intended here.
    IProfileAlignment profileAlignment = aligner.Align(profiles[0], profiles[0]);

    // Validate profile alignment dimensions against the expected values from the xml node.
    string expectedRowSize = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.RowSize);
    string expectedColSize = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ColumnSize);

    string actualColSize = profileAlignment.ProfilesMatrix.ColumnSize.ToString((IFormatProvider) null);
    Assert.IsTrue(expectedColSize.Contains(actualColSize));

    string actualRowSize = profileAlignment.ProfilesMatrix.RowSize.ToString((IFormatProvider) null);
    Assert.IsTrue(expectedRowSize.Contains(actualRowSize));

    ApplicationLog.WriteLine(String.Format(null,
        @"PamsamBvtTest: {0} Align() method validation completed successfully with number of processor is {1} and degree of parallelism {2}",
        profileAligner,
        Environment.ProcessorCount.ToString((IFormatProvider) null),
        degreeOfParallelism));
}
/// <summary>
/// Performance run of the PAMSAM aligner: reads the reference and query file paths from
/// the XML configuration, un-aligns the reference sequences, times the aligner
/// construction with a stopwatch, records the change in GC-reported memory around it,
/// and prints the alignment score and every aligned sequence.
/// </summary>
public void PerformPAMSAMPerf()
{
    Stopwatch stopwatch = new Stopwatch();

    // Get input values from XML.
    string refPath = Utility._xmlUtil.GetTextValue(Constants.PamsamNode, Constants.RefFilePathNode);
    string queryPath = Utility._xmlUtil.GetTextValue(Constants.PamsamNode, Constants.QueryFilePathNode);

    // Input files reported in the test case header.
    List<string> lstInputFiles = new List<string> { refPath, queryPath };

    // Parse the query and reference sequence files.
    // The query sequences are parsed but not used afterwards.
    ISequenceParser parser = new FastaParser();
    IList<ISequence> refsequences = parser.Parse(queryPath);
    IList<ISequence> orgSequences = parser.Parse(refPath);

    // Execute UnAlign method to verify that it does not contains gap
    List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

    // Set static properties
    PAMSAMMultipleSequenceAligner.FasterVersion = true;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = false;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    // Set Alignment parameters.
    int gapOpenPenalty = -13;
    int gapExtendPenalty = -5;
    int kmerLength = 2;
    int numberOfDegrees = 2;
    int numberOfPartitions = 4;

    // Distance function, clustering method, profile aligner and profile score function.
    var distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    var hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    var profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    var profileProfileFunctionName = ProfileScoreFunctionNames.InnerProduct;

    // Create similarity matrix instance.
    var similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);

    // Reset stop watch and start timer.
    stopwatch.Reset();
    stopwatch.Start();
    long memoryStart = GC.GetTotalMemory(true);

    // Parallel Option will only get set if the PAMSAMMultipleSequenceAligner is getting called
    // To test separately distance matrix, binary tree etc..
    // Set the parallel option using below ctor.
    msa = new PAMSAMMultipleSequenceAligner(
        sequences,
        MoleculeType.DNA,
        kmerLength,
        distanceFunctionName,
        hierarchicalClusteringMethodName,
        profileAlignerName,
        profileProfileFunctionName,
        similarityMatrix,
        gapOpenPenalty,
        gapExtendPenalty,
        numberOfPartitions,
        numberOfDegrees);

    // Stop watchclock.
    stopwatch.Stop();
    long memoryEnd = GC.GetTotalMemory(true);
    long memoryDelta = memoryEnd - memoryStart;
    string memoryUsed = memoryDelta.ToString();

    // Display all aligned sequence, performance and memory optimization nos.
    DisplayTestCaseHeader(lstInputFiles, stopwatch, memoryUsed, "PAMSAM");

    Console.WriteLine(
        "PAMSAM SequenceAligner method, Alignment Score is : {0}",
        msa.AlignmentScore.ToString());

    int position = 0;
    foreach (ISequence alignedSequence in msa.AlignedSequences)
    {
        Console.WriteLine("PAMSAM Aligned Seq {0}:{1}", position, alignedSequence.ToString());
        position++;
    }
}
/// <summary>
/// Validate Muscle multiple sequence alignment for a given profile aligner and
/// profile score function, using average clustering and Euclidean distance.
/// The actual validation is delegated to ValidatePamsamAlign.
/// </summary>
/// <param name="nodeName">xml node name.</param>
/// <param name="moleculeType">molecule type of sequences</param>
/// <param name="expectedScoreNode">expected score xml node</param>
/// <param name="profileName">SW/NW profiler</param>
/// <param name="profileScoreFunctionName">Profile score function name</param>
private void ValidatePamsamAlignWithProfileScoreFunctionName(string nodeName,
    MoleculeType moleculeType, string expectedScoreNode,
    ProfileAlignerNames profileName,
    ProfileScoreFunctionNames profileScoreFunctionName)
{
    // Fixed choices for this validation; only the profiler and score function vary.
    UpdateDistanceMethodsTypes clusteringMethod = UpdateDistanceMethodsTypes.Average;
    DistanceFunctionTypes distanceFunction = DistanceFunctionTypes.EuclideanDistance;

    ValidatePamsamAlign(
        nodeName,
        moleculeType,
        expectedScoreNode,
        clusteringMethod,
        distanceFunction,
        profileName,
        profileScoreFunctionName,
        kmerLength,
        false,
        false);

    string completionMessage = String.Format(null,
        @"PamsamP1Test:: Pamsam alignment validation completed successfully for {0} moleculetype with different profile score function name {1}",
        moleculeType.ToString(),
        profileScoreFunctionName.ToString());
    ApplicationLog.WriteLine(completionMessage);
}
/// <summary>
/// Validate Muscle multiple sequence alignment with gap open cost and penalty.
/// The alignment score is always checked; the aligned sequences themselves are
/// compared with the expected ones only when IsAlignedLargeSeq is true.
/// </summary>
/// <param name="nodeName">xml node name</param>
/// <param name="moleculeType">molecule type</param>
/// <param name="expectedScoreNode">Expected score node</param>
/// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
/// <param name="distanceFunctionName">kmerdistancematrix method name.</param>
/// <param name="profileAlignerName">SW/NW profiler</param>
/// <param name="profileScoreName">Profile score function name.</param>
/// <param name="gpOpenPenalty">Gap open penalty</param>
/// <param name="gpExtendPenalty">Gap extended penalty</param>
/// <param name="IsAlignedLargeSeq">True for large sequence else false</param>
private void ValidatePamsamAlignWithGapCost(
    string nodeName, MoleculeType moleculeType, string expectedScoreNode,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    DistanceFunctionTypes distanceFunctionName,
    ProfileAlignerNames profileAlignerName,
    ProfileScoreFunctionNames profileScoreName,
    int gpOpenPenalty, int gpExtendPenalty, bool IsAlignedLargeSeq)
{
    Initialize(nodeName, expectedScoreNode);

    // MSA aligned sequences with specified gap costs.
    var msa = new PAMSAMMultipleSequenceAligner(lstSequences,
        kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
        profileAlignerName, profileScoreName, similarityMatrix, gpOpenPenalty,
        gpExtendPenalty, 2, 2);

    // Validate the aligned sequences; the flag is loop-invariant, so test it once.
    if (IsAlignedLargeSeq)
    {
        int index = 0;
        foreach (ISequence seq in msa.AlignedSequences)
        {
            string expected = new string(expectedSequences[index].Select(a => (char) a).ToArray());
            string actual = new string(seq.Select(a => (char) a).ToArray());

            // Expected value goes first so failure messages read correctly.
            Assert.AreEqual(expected, actual);
            index++;
        }
    }

    // Validate the score
    Assert.IsTrue(expectedScore.Contains(msa.AlignmentScore.ToString((IFormatProvider) null)));

    ApplicationLog.WriteLine(String.Format(null,
        "PamsamP1Test:: Pamsam alignment completed successfully with equal gap cost for {0} moleculetype with all default params",
        moleculeType.ToString()));
}
/// <summary>
/// Constructor for all the pairwise aligner (NeedlemanWunsch, SmithWaterman, Overlap).
/// Stores the supplied similarity matrix and gap penalties, selects the
/// profile-profile score function matching the given name, and stores the
/// number of cores as the number of partitions.
/// </summary>
/// <param name="similarityMatrix">similarity matrix to store</param>
/// <param name="profileScoreFunctionName">enum: name of the profile-profile score function to select</param>
/// <param name="gapOpenPenalty">gap open penalty to store</param>
/// <param name="gapExtensionPenalty">gap extension penalty to store</param>
/// <param name="numberOfCores">positive number of cores; stored as the number of partitions</param>
/// <exception cref="Exception">the profile score function name is not recognised</exception>
/// <exception cref="ArgumentException">numberOfCores is not positive</exception>
protected DynamicProgrammingProfileAlignerSerial(
    SimilarityMatrix similarityMatrix,
    ProfileScoreFunctionNames profileScoreFunctionName,
    int gapOpenPenalty,
    int gapExtensionPenalty,
    int numberOfCores)
{
    _similarityMatrix = similarityMatrix;
    _gapOpenPenalty = gapOpenPenalty;
    _gapExtensionPenalty = gapExtensionPenalty;

    // Resolve the score function for the requested name; assigned to the field below.
    ProfileScoreFunctionSelector scoreFunction;
    switch (profileScoreFunctionName)
    {
        case ProfileScoreFunctionNames.InnerProduct:
            scoreFunction = new ProfileScoreFunctionSelector(InnerProduct);
            break;
        case ProfileScoreFunctionNames.WeightedInnerProduct:
            scoreFunction = new ProfileScoreFunctionSelector(WeightedInnerProduct);
            break;
        case ProfileScoreFunctionNames.WeightedInnerProductShifted:
            scoreFunction = new ProfileScoreFunctionSelector(WeightedInnerProductShifted);
            break;
        case ProfileScoreFunctionNames.InnerProductFast:
            scoreFunction = new ProfileScoreFunctionSelector(InnerProductFast);
            break;
        case ProfileScoreFunctionNames.WeightedInnerProductFast:
            scoreFunction = new ProfileScoreFunctionSelector(WeightedInnerProductFast);
            break;
        case ProfileScoreFunctionNames.WeightedInnerProductShiftedFast:
            scoreFunction = new ProfileScoreFunctionSelector(WeightedInnerProductShiftedFast);
            break;
        case ProfileScoreFunctionNames.PearsonCorrelation:
            scoreFunction = new ProfileScoreFunctionSelector(PearsonCorrelation);
            break;
        case ProfileScoreFunctionNames.WeightedEuclideanDistance:
            scoreFunction = new ProfileScoreFunctionSelector(WeightedEuclideanDistance);
            break;
        case ProfileScoreFunctionNames.LogExponentialInnerProduct:
            scoreFunction = new ProfileScoreFunctionSelector(LogExponentialInnerProduct);
            break;
        case ProfileScoreFunctionNames.LogExponentialInnerProductShifted:
            scoreFunction = new ProfileScoreFunctionSelector(LogExponentialInnerProductShifted);
            break;
        case ProfileScoreFunctionNames.WeightedEuclideanDistanceFast:
            scoreFunction = new ProfileScoreFunctionSelector(WeightedEuclideanDistanceFast);
            break;
        case ProfileScoreFunctionNames.LogExponentialInnerProductFast:
            scoreFunction = new ProfileScoreFunctionSelector(LogExponentialInnerProductFast);
            break;
        case ProfileScoreFunctionNames.LogExponentialInnerProductShiftedFast:
            scoreFunction = new ProfileScoreFunctionSelector(LogExponentialInnerProductShiftedFast);
            break;
        case ProfileScoreFunctionNames.SymmetrizedEntropy:
            scoreFunction = new ProfileScoreFunctionSelector(SymmetrizedEntropy);
            break;
        case ProfileScoreFunctionNames.JensenShannonDivergence:
            scoreFunction = new ProfileScoreFunctionSelector(JensenShannonDivergence);
            break;
        case ProfileScoreFunctionNames.WeightedInnerProductCached:
            // The cached variant is the only one that also installs a caching function.
            scoreFunction = new ProfileScoreFunctionSelector(WeightedInnerProductCached);
            _cachingFunction = new CachingFunctionSelector(CachingWeightedInnerProduct);
            _doCaching = true;
            break;
        default:
            throw new Exception("Invalid profile function name");
    }

    _profileProfileScoreFunction = scoreFunction;

    if (numberOfCores <= 0)
    {
        throw new ArgumentException("Invalid number of cores parameter");
    }

    _numberOfPartitions = numberOfCores;
}
/// <summary>
/// Validates the arguments, configures the aligner and immediately runs the
/// alignment on <paramref name="sequences"/>.
/// </summary>
/// <param name="sequences">input sequences; must be non-null and non-empty</param>
/// <param name="kmerLength">positive integer of kmer length</param>
/// <param name="distanceFunctionName">enum: distance function name</param>
/// <param name="hierarchicalClusteringMethodName">enum: cluster update method</param>
/// <param name="profileAlignerMethodName">enum: profile-profile aligner name</param>
/// <param name="profileFunctionName">enum: profile-profile distance function</param>
/// <param name="similarityMatrix">similarity matrix</param>
/// <param name="gapOpenPenalty">negative gapOpenPenalty</param>
/// <param name="gapExtendPenalty">negative gapExtendPenalty</param>
/// <param name="numberOfPartitions">the number of partitions in dynamic programming; must be positive</param>
/// <param name="degreeOfParallelism">degree of parallelism option for parallel extension; must be positive</param>
/// <exception cref="ArgumentNullException"><paramref name="sequences"/> is null</exception>
/// <exception cref="ArgumentException">
/// <paramref name="sequences"/> is empty, or <paramref name="degreeOfParallelism"/> or
/// <paramref name="numberOfPartitions"/> is zero or negative
/// </exception>
public PAMSAMMultipleSequenceAligner(
    IList<ISequence> sequences,
    int kmerLength,
    DistanceFunctionTypes distanceFunctionName,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    ProfileAlignerNames profileAlignerMethodName,
    ProfileScoreFunctionNames profileFunctionName,
    SimilarityMatrix similarityMatrix,
    int gapOpenPenalty,
    int gapExtendPenalty,
    int numberOfPartitions,
    int degreeOfParallelism)
{
    Performance.Start();

    // Argument checks, in the order callers observe them.
    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (sequences.Count == 0)
    {
        throw new ArgumentException("Empty input sequences");
    }

    if (degreeOfParallelism < 1)
    {
        throw new ArgumentException("Invalid parallel degree parameter");
    }

    // Parallel extension option.
    ParallelOptions options = new ParallelOptions();
    options.MaxDegreeOfParallelism = degreeOfParallelism;
    parallelOption = options;

    if (numberOfPartitions < 1)
    {
        throw new ArgumentException("Invalid number of partition parameter");
    }

    _numberOfPartitions = numberOfPartitions;

    // The alphabet is assigned before the remaining parameters are stored.
    SetAlphabet(sequences, similarityMatrix, false);

    KmerLength = kmerLength;
    DistanceFunctionName = distanceFunctionName;
    HierarchicalClusteringMethodName = hierarchicalClusteringMethodName;
    ProfileAlignerName = profileAlignerMethodName;
    ProfileProfileFunctionName = profileFunctionName;
    SimilarityMatrix = similarityMatrix;
    GapOpenCost = gapOpenPenalty;
    GapExtensionCost = gapExtendPenalty;

    MsaUtils.SetProfileItemSets(_alphabet);

    Performance.Snapshot("Start Aligning");

    // Run the alignment as part of construction.
    DoAlignment(sequences);
}
/// <summary>
/// Runs a PAMSAM multiple sequence alignment with the given options and validates the result.
/// The aligned sequences are compared with the expected sequences only when
/// <paramref name="isWeightedProduct"/> is true; the alignment score is always checked.
/// </summary>
/// <param name="nodeName">xml node name</param>
/// <param name="expectedScoreNode">Expected score node</param>
/// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
/// <param name="distanceFunctionName">kmerdistancematrix method name.</param>
/// <param name="profileAlignerName">SW/NW profiler</param>
/// <param name="profileScoreName">Profile score function name.</param>
/// <param name="isWeightedProduct">True if it of the WeightedProduct type else false.</param>
private void ValidatePamsamAlign(string nodeName, string expectedScoreNode,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    DistanceFunctionTypes distanceFunctionName,
    ProfileAlignerNames profileAlignerName,
    ProfileScoreFunctionNames profileScoreName,
    bool isWeightedProduct)
{
    Initialize(nodeName, expectedScoreNode);

    // MSA aligned sequences.
    var msa = new PAMSAMMultipleSequenceAligner(lstSequences, kmerLength, distanceFunctionName,
        hierarchicalClusteringMethodName, profileAlignerName, profileScoreName,
        similarityMatrix, gapOpenPenalty, gapExtendPenalty, 2, 2);

    // A missing result is reported as an assertion failure in both modes.
    Assert.IsNotNull(msa.AlignedSequences);

    if (isWeightedProduct)
    {
        int index = 0;
        foreach (ISequence seq in msa.AlignedSequences)
        {
            // Expected value first, actual second, so a failure message labels them correctly.
            Assert.AreEqual(
                new string(expectedSequences[index].Select(a => (char) a).ToArray()),
                new string(seq.Select(a => (char) a).ToArray()));
            index++;
        }
    }

    // The expected score text must contain the computed alignment score.
    Assert.IsTrue(expectedScore.Contains(msa.AlignmentScore.ToString((IFormatProvider) null)));
}
/// <summary>
/// Benchmark run on the BOX032 dataset: un-aligns the benchmark alignment, round-trips the
/// raw sequences through a temporary FASTA file, aligns them with PAMSAM (Blosum62,
/// stage B enabled) and prints the score and Q/TC values of every stage to the console.
/// </summary>
public void TestMsaBenchMarkLargeDataset()
{
    ISequenceParser parser = new FastaParser();
    string filepath = @"testdata\FASTA\Protein\Balibase\RV913\BOX032.xml.afa";
    String outputFilePath = @"tempBOX032.xml.afa";

    try
    {
        IList<ISequence> orgSequences = parser.Parse(filepath);
        IList<ISequence> sequences = MsaUtils.UnAlign(orgSequences);
        int numberOfSequences = orgSequences.Count;

        // Overwrite rather than append, so leftovers from an aborted earlier run cannot
        // end up in the input. The using block closes the writer even if a write throws.
        using (StreamWriter writer = new StreamWriter(outputFilePath, false))
        {
            foreach (ISequence sequence in sequences)
            {
                writer.WriteLine(">" + sequence.ID);

                // Write the sequence in lines of at most 60 symbols.
                BasicDerivedSequence derivedSeq = new BasicDerivedSequence(sequence, false, false, 0, 0);
                for (int lineStart = 0; lineStart < sequence.Count; lineStart += 60)
                {
                    derivedSeq.RangeStart = lineStart;
                    derivedSeq.RangeLength = Math.Min(60, sequence.Count - lineStart);
                    writer.WriteLine(derivedSeq.ToString());
                }
            }
        }

        sequences.Clear();
        sequences = parser.Parse(outputFilePath);

        Console.WriteLine("Original sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(sequences[i].ToString());
        }

        Console.WriteLine("Benchmark sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(orgSequences[i].ToString());
        }

        PAMSAMMultipleSequenceAligner.FasterVersion = false;
        PAMSAMMultipleSequenceAligner.UseWeights = false;
        PAMSAMMultipleSequenceAligner.UseStageB = true;
        PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

        int gapOpenPenalty = -13;
        int gapExtendPenalty = -5;
        int kmerLength = 3;
        int numberOfDegrees = 2;
        int numberOfPartitions = 16;

        SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
        DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
        UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
        ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
        ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner
            (sequences, MoleculeType.Protein, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
            profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
            numberOfPartitions, numberOfDegrees);

        Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

        Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
        for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesA[i].ToString());
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

        Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
        for (int i = 0; i < msa.AlignedSequencesB.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesB[i].ToString());
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

        Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
        for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesC[i].ToString());
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
        for (int i = 0; i < msa.AlignedSequences.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequences[i].ToString());
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
    }
    finally
    {
        // Release the parser (when it is disposable) before deleting the file it last read.
        IDisposable disposableParser = parser as IDisposable;
        if (disposableParser != null)
        {
            disposableParser.Dispose();
        }

        // Remove the temporary file even when the test body failed.
        if (File.Exists(outputFilePath))
        {
            File.Delete(outputFilePath);
        }
    }
}
/// <summary>
/// Validates a PAMSAM alignment for the given profile aligner and profile score function,
/// using Average cluster updating and Euclidean distance, then writes a completion
/// entry to the application log.
/// </summary>
/// <param name="nodeName">xml node name.</param>
/// <param name="expectedScoreNode">Expected score node</param>
/// <param name="profileName">SW/NW profiler</param>
/// <param name="profileScoreFunctionName">Profile score function name</param>
/// <param name="isWeightedProduct">True if it of the WeightedProduct type else false.</param>
private void ValidatePamsamAlignWithProfileScoreFunctionName(string nodeName, string expectedScoreNode,
    ProfileAlignerNames profileName, ProfileScoreFunctionNames profileScoreFunctionName,
    bool isWeightedProduct)
{
    // Clustering method and distance function are fixed for this validation.
    UpdateDistanceMethodsTypes clusteringMethod = UpdateDistanceMethodsTypes.Average;
    DistanceFunctionTypes distanceFunction = DistanceFunctionTypes.EuclideanDistance;

    ValidatePamsamAlign(nodeName, expectedScoreNode, clusteringMethod, distanceFunction,
        profileName, profileScoreFunctionName, isWeightedProduct);

    string scoreFunctionText = profileScoreFunctionName.ToString();
    string logEntry = String.Format(null,
        @"PamsamBvtTest:: Pamsam alignment validation completed successfully with different profile score function name {0}",
        scoreFunctionText);
    ApplicationLog.WriteLine(logEntry);
}
/// <summary>
/// Running-time benchmark on the BOX246 dataset: un-aligns the benchmark alignment, aligns
/// the raw sequences with PAMSAM (faster version, Blosum62, stage B disabled) and prints the
/// score and Q/TC values of every stage to the console.
/// </summary>
public void TestMuscleMultipleSequenceAlignmentRunningTime()
{
    string filepath = @"TestUtils\FASTA\RunningTime\BOX246.xml.afa";

    // Renders a sequence's symbols as text for console output.
    Func<ISequence, string> toText = seq => new string(seq.Select(a => (char)a).ToArray());

    // The using block releases the parser even when the alignment or scoring throws.
    using (FastAParser parser = new FastAParser(filepath))
    {
        IList<ISequence> orgSequences = parser.Parse().ToList();
        List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);
        int numberOfSequences = orgSequences.Count;

        Console.WriteLine("Original sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(toText(sequences[i]));
        }

        Console.WriteLine("Benchmark sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(toText(orgSequences[i]));
        }

        PAMSAMMultipleSequenceAligner.FasterVersion = true;
        PAMSAMMultipleSequenceAligner.UseWeights = false;
        PAMSAMMultipleSequenceAligner.UseStageB = false;
        PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

        int gapOpenPenalty = -13;
        int gapExtendPenalty = -5;
        int kmerLength = 2;
        int numberOfDegrees = 2;
        int numberOfPartitions = 16;

        DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
        UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
        ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
        ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.InnerProductFast;

        SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner
            (sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
            profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
            numberOfPartitions, numberOfDegrees);

        Console.WriteLine("The number of partitions is: {0}", numberOfPartitions);
        Console.WriteLine("The number of degrees is: {0}", numberOfDegrees);
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

        Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
        for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
        {
            Console.WriteLine(toText(msa.AlignedSequencesA[i]));
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

        Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
        for (int i = 0; i < msa.AlignedSequencesB.Count; ++i)
        {
            Console.WriteLine(toText(msa.AlignedSequencesB[i]));
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

        Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
        for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
        {
            Console.WriteLine(toText(msa.AlignedSequencesC[i]));
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
        for (int i = 0; i < msa.AlignedSequences.Count; ++i)
        {
            Console.WriteLine(toText(msa.AlignedSequences[i]));
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
    }
}
/// <summary>
/// Benchmark run on the BOX032Small dataset: un-aligns the benchmark alignment, round-trips
/// the raw sequences through a temporary FASTA file, aligns them with PAMSAM (Blosum62,
/// stage B enabled) and prints the score and Q/TC values of every stage to the console.
/// </summary>
public void TestMsaBenchMarkLargeDataset()
{
    string filepath = @"\TestUtils\BOX032Small.xml.afa";
    string filePathObj = Directory.GetCurrentDirectory() + filepath;
    String outputFilePath = @"tempBOX032.xml.afa";

    // Renders a sequence's symbols as text for console output.
    Func<ISequence, string> toText = seq => new string(seq.Select(a => (char)a).ToArray());

    // The benchmark parser gets its own variable so it is disposed too; previously the
    // variable was reassigned and only the second parser was ever released.
    using (FastAParser benchmarkParser = new FastAParser(filePathObj))
    {
        IList<ISequence> orgSequences = benchmarkParser.Parse().ToList();
        IList<ISequence> sequences = MsaUtils.UnAlign(orgSequences);
        int numberOfSequences = orgSequences.Count;

        try
        {
            // Overwrite rather than append, so leftovers from an aborted earlier run
            // cannot end up in the input.
            using (StreamWriter writer = new StreamWriter(outputFilePath, false))
            {
                foreach (ISequence sequence in sequences)
                {
                    writer.WriteLine(">" + sequence.ID);

                    // Write the sequence in lines of at most 60 symbols.
                    for (int lineStart = 0; lineStart < sequence.Count; lineStart += 60)
                    {
                        writer.WriteLine(new String(sequence.Skip(lineStart).Take((int)Math.Min(60, sequence.Count - lineStart)).Select(a => (char)a).ToArray()));
                    }
                }
            }

            sequences.Clear();

            using (FastAParser parser = new FastAParser(outputFilePath))
            {
                sequences = parser.Parse().ToList();

                Console.WriteLine("Original sequences are:");
                for (int i = 0; i < numberOfSequences; ++i)
                {
                    Console.WriteLine(toText(sequences[i]));
                }

                Console.WriteLine("Benchmark sequences are:");
                for (int i = 0; i < numberOfSequences; ++i)
                {
                    Console.WriteLine(toText(orgSequences[i]));
                }

                PAMSAMMultipleSequenceAligner.FasterVersion = false;
                PAMSAMMultipleSequenceAligner.UseWeights = false;
                PAMSAMMultipleSequenceAligner.UseStageB = true;
                PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

                int gapOpenPenalty = -13;
                int gapExtendPenalty = -5;
                int kmerLength = 3;
                int numberOfDegrees = 2;
                int numberOfPartitions = 16;

                SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
                DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
                UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
                ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
                ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

                PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner
                    (sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                    profileAlignerName, profileProfileFunctionName, similarityMatrix, gapOpenPenalty, gapExtendPenalty,
                    numberOfPartitions, numberOfDegrees);

                Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

                Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
                for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
                {
                    Console.WriteLine(toText(msa.AlignedSequencesA[i]));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

                Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
                for (int i = 0; i < msa.AlignedSequencesB.Count; ++i)
                {
                    Console.WriteLine(toText(msa.AlignedSequencesB[i]));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

                Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
                for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
                {
                    Console.WriteLine(toText(msa.AlignedSequencesC[i]));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

                Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
                for (int i = 0; i < msa.AlignedSequences.Count; ++i)
                {
                    Console.WriteLine(toText(msa.AlignedSequences[i]));
                }
                Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
                Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
            }
        }
        finally
        {
            // Runs after the temp-file parser has been disposed; removes the file even
            // when the test body failed.
            if (File.Exists(outputFilePath))
            {
                File.Delete(outputFilePath);
            }
        }
    }
}
/// <summary>
/// Non-parallel convenience constructor: forwards every argument to the five-parameter
/// constructor and fixes the number of cores at 1.
/// </summary>
/// <param name="similarityMatrix">similarity matrix passed through unchanged</param>
/// <param name="profileScoreFunctionName">enum: profile-profile score function to bind</param>
/// <param name="gapOpenPenalty">gap open penalty</param>
/// <param name="gapExtensionPenalty">gap extension penalty</param>
protected DynamicProgrammingProfileAlignerSerial(
    SimilarityMatrix similarityMatrix,
    ProfileScoreFunctionNames profileScoreFunctionName,
    int gapOpenPenalty,
    int gapExtensionPenalty)
    : this(
        similarityMatrix,
        profileScoreFunctionName,
        gapOpenPenalty,
        gapExtensionPenalty,
        numberOfCores: 1)
{
}
/// <summary>
/// Runs a PAMSAM multiple sequence alignment and, when stage 2 output is available,
/// compares its aligned sequences and score with the expected stage 2 values.
/// </summary>
/// <param name="nodeName">xml node name</param>
/// <param name="expectedScoreNode">Expected score node</param>
/// <param name="hierarchicalClusteringMethodName">hierarchical clustering method name</param>
/// <param name="distanceFunctionName">kmerdistancematrix method name.</param>
/// <param name="profileAlignerName">SW/NW profiler</param>
/// <param name="profileScoreName">Profile score function name.</param>
private void ValidatePamsamAlignStage2(string nodeName, string expectedScoreNode,
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName,
    DistanceFunctionTypes distanceFunctionName,
    ProfileAlignerNames profileAlignerName,
    ProfileScoreFunctionNames profileScoreName)
{
    Initialize(nodeName, expectedScoreNode);
    InitializeStage2Variables(nodeName);

    // MSA aligned sequences.
    var msa = new PAMSAMMultipleSequenceAligner(lstSequences, kmerLength, distanceFunctionName,
        hierarchicalClusteringMethodName, profileAlignerName, profileScoreName,
        similarityMatrix, gapOpenPenalty, gapExtendPenalty, 2, 2);

    // Stage 2 output is validated only when the aligner produced it.
    var stage2Alignment = msa.AlignedSequencesB;
    if (stage2Alignment != null)
    {
        Assert.AreEqual(stage2ExpectedSequences.Count, stage2Alignment.Count);

        int position = 0;
        foreach (ISequence alignedSequence in stage2Alignment)
        {
            string expectedText = new string(stage2ExpectedSequences[position].Select(symbol => (char) symbol).ToArray());
            string actualText = new string(alignedSequence.Select(symbol => (char) symbol).ToArray());
            Assert.AreEqual(expectedText, actualText);
            position++;
        }

        string actualScore = msa.AlignmentScoreB.ToString((IFormatProvider) null);
        Assert.AreEqual(stage2ExpectedScore, actualScore);
    }

    ApplicationLog.WriteLine(String.Format(null,
        "PamsamBvtTest:: Pamsam stage2 alignment completed successfully with all default params"));
}