/// <summary>
/// Parses the aligned sequences in TestUtils\122_raw.afa, strips the alignment with
/// <c>MsaUtils.UnAlign</c>, runs <c>PAMSAMMultipleSequenceAligner</c> on the result and
/// asserts that <c>AlignedSequences</c> is not null.
/// </summary>
public void testBug3()
{
    // Test on DNA benchmark dataset
    FastaParser fastaParser = new FastaParser();
    try
    {
        ISequenceParser parser = fastaParser;
        string filepath = @"TestUtils\122_raw.afa";
        MoleculeType mt = MoleculeType.DNA;

        IList<ISequence> orgSequences = parser.Parse(filepath);
        List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

        // Static aligner switches; they stay set after this test returns.
        PAMSAMMultipleSequenceAligner.FasterVersion = false;
        PAMSAMMultipleSequenceAligner.UseWeights = false;
        PAMSAMMultipleSequenceAligner.UseStageB = false;
        PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

        int gapOpenPenalty = -13;
        int gapExtendPenalty = -5;
        int kmerLength = 2;

        int numberOfDegrees = 2;     // Environment.ProcessorCount;
        int numberOfPartitions = 16; // Environment.ProcessorCount * 2;

        DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
        UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
        ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
        ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.InnerProductFast;

        // Pick the similarity matrix that matches the molecule type. Every branch either
        // assigns or throws, so no placeholder initial value is needed.
        SimilarityMatrix similarityMatrix;
        switch (mt)
        {
            case MoleculeType.DNA:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
                break;
            case MoleculeType.RNA:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
                break;
            case MoleculeType.Protein:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
                break;
            default:
                throw new InvalidDataException("Invalid molecular type");
        }

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
            sequences, mt, kmerLength,
            distanceFunctionName, hierarchicalClusteringMethodName,
            profileAlignerName, profileProfileFunctionName, similarityMatrix,
            gapOpenPenalty, gapExtendPenalty,
            numberOfPartitions, numberOfDegrees);

        Assert.IsNotNull(msa.AlignedSequences);
    }
    finally
    {
        // Release the parser even when parsing, alignment or the assertion throws.
        fastaParser.Dispose();
    }
}
/// <summary>
/// Loads TestUtils\122_raw.afa (resolved through <c>TestDir()</c>), removes the alignment
/// with <c>MsaUtils.UnAlign</c>, aligns the sequences with
/// <c>PAMSAMMultipleSequenceAligner</c> using the AmbiguousDna similarity matrix, and
/// asserts that the aligner exposes a non-null <c>AlignedSequences</c>.
/// </summary>
public void testBug2()
{
    // Test on DNA benchmark dataset
    string benchmarkFile = @"TestUtils\122_raw.afa".TestDir();
    FastAParser fastaParser = new FastAParser();
    IList<ISequence> alignedInput = fastaParser.Parse(benchmarkFile).ToList();
    List<ISequence> unalignedInput = MsaUtils.UnAlign(alignedInput);

    // Static aligner configuration (shared by every aligner instance).
    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = false;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    // Scoring and parallelism settings.
    const int gapOpen = -13;
    const int gapExtend = -5;
    const int kmerSize = 2;
    const int degreeCount = 2;      // Environment.ProcessorCount;
    const int partitionCount = 16;  // Environment.ProcessorCount * 2;

    SimilarityMatrix dnaMatrix =
        new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);

    PAMSAMMultipleSequenceAligner aligner = new PAMSAMMultipleSequenceAligner(
        unalignedInput,
        kmerSize,
        DistanceFunctionTypes.EuclideanDistance,
        UpdateDistanceMethodsTypes.Average,
        ProfileAlignerNames.NeedlemanWunschProfileAligner,
        ProfileScoreFunctionNames.InnerProductFast,
        dnaMatrix,
        gapOpen,
        gapExtend,
        partitionCount,
        degreeCount);

    Assert.IsNotNull(aligner.AlignedSequences);
}
/// <summary>
/// Exercises <c>ProgressiveAligner</c> in three stages:
/// 1. three short DNA sequences, with the aligned output compared against fixed strings;
/// 2. ten short DNA sequences, with the guide tree, sequence weights and alignment
///    written to the console (no assertions);
/// 3. the sequences from testdata\FASTA\Protein\BB11001.tfa, with the guide tree and
///    alignment written to the console (no assertions).
/// </summary>
public void TestProgressiveAligner()
{
    MsaUtils.SetProfileItemSets(MoleculeType.DNA);

    SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
    int gapOpenPenalty = -8;
    int gapExtendPenalty = -1;
    int kmerLength = 4;

    PAMSAMMultipleSequenceAligner.parallelOption = new ParallelOptions { MaxDegreeOfParallelism = 2 };

    // ---- Stage 1: three sequences with a known expected alignment ----
    ISequence seqA = new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT");
    ISequence seqB = new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG");
    ISequence seqC = new Sequence(Alphabets.DNA, "GGGACAAAATCAG");

    List<ISequence> sequences = new List<ISequence>();
    sequences.Add(seqA);
    sequences.Add(seqB);
    sequences.Add(seqC);

    KmerDistanceMatrixGenerator kmerDistanceMatrixGenerator =
        new KmerDistanceMatrixGenerator(sequences, kmerLength, MoleculeType.DNA);
    kmerDistanceMatrixGenerator.GenerateDistanceMatrix(sequences);

    IHierarchicalClustering hierarchicalClustering =
        new HierarchicalClusteringParallel(kmerDistanceMatrixGenerator.DistanceMatrix);
    BinaryGuideTree tree = new BinaryGuideTree(hierarchicalClustering);

    IProgressiveAligner progressiveAligner = new ProgressiveAligner(
        ProfileAlignerNames.NeedlemanWunschProfileAligner, similarityMatrix, gapOpenPenalty, gapExtendPenalty);
    progressiveAligner.Align(sequences, tree);

    ISequence expectedSeqA = new Sequence(Alphabets.DNA, "GGGA---AAAATCAGATT");
    ISequence expectedSeqB = new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG---");
    ISequence expectedSeqC = new Sequence(Alphabets.DNA, "GGGA--CAAAATCAG---");

    Assert.AreEqual(expectedSeqA.ToString(), progressiveAligner.AlignedSequences[0].ToString());
    Assert.AreEqual(expectedSeqB.ToString(), progressiveAligner.AlignedSequences[1].ToString());
    Assert.AreEqual(expectedSeqC.ToString(), progressiveAligner.AlignedSequences[2].ToString());

    // ---- Stage 2: ten sequences, diagnostic output only ----
    sequences = new List<ISequence>();
    sequences.Add(new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT"));
    sequences.Add(new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"));
    sequences.Add(new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"));
    sequences.Add(new Sequence(Alphabets.DNA, "GGGAAATCG"));
    sequences.Add(new Sequence(Alphabets.DNA, "GGGAATCAATCAG"));
    sequences.Add(new Sequence(Alphabets.DNA, "GGGAATCTTATCAG"));
    sequences.Add(new Sequence(Alphabets.DNA, "GGGACAAAATCAG"));
    sequences.Add(new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT"));
    sequences.Add(new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"));
    sequences.Add(new Sequence(Alphabets.DNA, "GGGACAAAATCAG"));

    // The generator built for stage 1 is reused with the new sequence list.
    kmerDistanceMatrixGenerator.GenerateDistanceMatrix(sequences);

    hierarchicalClustering = new HierarchicalClusteringParallel(kmerDistanceMatrixGenerator.DistanceMatrix);
    tree = new BinaryGuideTree(hierarchicalClustering);

    for (int i = 0; i < tree.NumberOfNodes; ++i)
    {
        Console.WriteLine("Node {0} ID: {1}", i, tree.Nodes[i].ID);
    }
    for (int i = 0; i < tree.NumberOfEdges; ++i)
    {
        Console.WriteLine("Edge {0} ID: {1}, length: {2}", i, tree.Edges[i].ID, tree.Edges[i].Length);
    }

    SequenceWeighting sw = new SequenceWeighting(tree);
    for (int i = 0; i < sw.Weights.Length; ++i)
    {
        Console.WriteLine("weights {0} is {1}", i, sw.Weights[i]);
    }

    progressiveAligner = new ProgressiveAligner(
        ProfileAlignerNames.NeedlemanWunschProfileAligner, similarityMatrix, gapOpenPenalty, gapExtendPenalty);
    progressiveAligner.Align(sequences, tree);
    for (int i = 0; i < progressiveAligner.AlignedSequences.Count; ++i)
    {
        Console.WriteLine(progressiveAligner.AlignedSequences[i].ToString());
    }

    // ---- Stage 3: protein benchmark file, diagnostic output only ----
    MsaUtils.SetProfileItemSets(MoleculeType.Protein);

    FastaParser fastaParser = new FastaParser();
    try
    {
        ISequenceParser parser = fastaParser;
        string filepath = @"testdata\FASTA\Protein\BB11001.tfa";
        IList<ISequence> orgSequences = parser.Parse(filepath);

        sequences = MsaUtils.UnAlign(orgSequences);

        similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
        gapOpenPenalty = -13;
        gapExtendPenalty = -5;

        // NOTE(review): the generator is still built with MoleculeType.DNA although the
        // profile item sets were just switched to Protein and Blosum62 is used - confirm
        // whether MoleculeType.Protein was intended here.
        kmerDistanceMatrixGenerator = new KmerDistanceMatrixGenerator(sequences, kmerLength, MoleculeType.DNA);
        kmerDistanceMatrixGenerator.GenerateDistanceMatrix(sequences);

        hierarchicalClustering = new HierarchicalClusteringParallel(kmerDistanceMatrixGenerator.DistanceMatrix);
        tree = new BinaryGuideTree(hierarchicalClustering);

        // Nodes before NumberOfLeaves are skipped; only the remaining nodes have their
        // children printed.
        for (int i = tree.NumberOfLeaves; i < tree.Nodes.Count; ++i)
        {
            Console.WriteLine("Node {0}: leftchildren-{1}, rightChildren-{2}",
                i, tree.Nodes[i].LeftChildren.ID, tree.Nodes[i].RightChildren.ID);
        }

        progressiveAligner = new ProgressiveAligner(
            ProfileAlignerNames.NeedlemanWunschProfileAligner, similarityMatrix, gapOpenPenalty, gapExtendPenalty);
        progressiveAligner.Align(sequences, tree);
        for (int i = 0; i < progressiveAligner.AlignedSequences.Count; ++i)
        {
            Console.WriteLine(progressiveAligner.AlignedSequences[i].ToString());
        }
    }
    finally
    {
        // Release the parser whether or not the protein stage succeeded.
        fastaParser.Dispose();
    }
}
/// <summary>
/// Runs <c>PAMSAMMultipleSequenceAligner</c> (FasterVersion enabled) on the un-aligned
/// sequences from TestUtils\FASTA\RunningTime\BOX246.xml.afa and writes the input, the
/// per-stage alignments and their Q / TC scores against the original alignment to the
/// console. The method contains no assertions.
/// </summary>
public void TestMuscleMultipleSequenceAlignmentRunningTime()
{
    string filepath = @"TestUtils\FASTA\RunningTime\BOX246.xml.afa";

    // NOTE(review): the original comment called this a DNA benchmark, but the Blosum62
    // matrix is used below - confirm the dataset type.
    FastAParser parser = new FastAParser(filepath);
    try
    {
        IList<ISequence> orgSequences = parser.Parse().ToList();
        List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

        int numberOfSequences = orgSequences.Count;

        Console.WriteLine("Original sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(new string(sequences[i].Select(a => (char)a).ToArray()));
        }

        Console.WriteLine("Benchmark sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(new string(orgSequences[i].Select(a => (char)a).ToArray()));
        }

        // Static aligner switches; they stay set after this test returns.
        PAMSAMMultipleSequenceAligner.FasterVersion = true;
        PAMSAMMultipleSequenceAligner.UseWeights = false;
        PAMSAMMultipleSequenceAligner.UseStageB = false;
        PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

        int gapOpenPenalty = -13;
        int gapExtendPenalty = -5;
        int kmerLength = 2;

        int numberOfDegrees = 2;     // Environment.ProcessorCount;
        int numberOfPartitions = 16; // Environment.ProcessorCount * 2;

        DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
        UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
        ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
        ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.InnerProductFast;

        SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
            sequences, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
            profileAlignerName, profileProfileFunctionName, similarityMatrix,
            gapOpenPenalty, gapExtendPenalty, numberOfPartitions, numberOfDegrees);

        Console.WriteLine("The number of partitions is: {0}", numberOfPartitions);
        Console.WriteLine("The number of degrees is: {0}", numberOfDegrees);
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

        // Stage 1
        Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
        for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
        {
            Console.WriteLine(new string(msa.AlignedSequencesA[i].Select(a => (char)a).ToArray()));
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

        // Stage 2
        Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
        for (int i = 0; i < msa.AlignedSequencesB.Count; ++i)
        {
            Console.WriteLine(new string(msa.AlignedSequencesB[i].Select(a => (char)a).ToArray()));
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

        // Stage 3
        Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
        for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
        {
            Console.WriteLine(new string(msa.AlignedSequencesC[i].Select(a => (char)a).ToArray()));
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

        // Final result
        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
        for (int i = 0; i < msa.AlignedSequences.Count; ++i)
        {
            Console.WriteLine(new string(msa.AlignedSequences[i].Select(a => (char)a).ToArray()));
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
    }
    finally
    {
        // Release the parser even when parsing, alignment or scoring throws.
        parser.Dispose();
    }
}
/// <summary>
/// Walks every file in every sub-directory of testData\FASTA\RNA\k10, aligns its
/// un-aligned sequences with <c>PAMSAMMultipleSequenceAligner</c>, and writes the Q and
/// TC scores (against the original alignment) plus their running means to the console.
/// The method contains no assertions.
/// </summary>
/// <exception cref="InvalidDataException">The molecule type has no matching similarity matrix.</exception>
public void TestMsaBenchMarkOnBralibase()
{
    List<float> allQ = new List<float>();
    List<float> allTC = new List<float>();

    string fileDirectory = @"testData\FASTA\RNA\k10";
    DirectoryInfo iD = new DirectoryInfo(fileDirectory);

    // Static aligner switches; they stay set after this test returns.
    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = false;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    MoleculeType mt = MoleculeType.RNA;
    SimilarityMatrix similarityMatrix;

    int gapOpenPenalty = -20;
    int gapExtendPenalty = -5;
    int kmerLength = 4;

    int numberOfDegrees = 2;     // Environment.ProcessorCount;
    int numberOfPartitions = 16; // Environment.ProcessorCount * 2;

    DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProductCached;

    switch (mt)
    {
        case MoleculeType.DNA:
            similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
            break;
        case MoleculeType.RNA:
            similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
            break;
        case MoleculeType.Protein:
            similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
            break;
        default:
            // Specific exception type instead of bare Exception, matching testBug3.
            throw new InvalidDataException("Invalid molecular type");
    }

    foreach (DirectoryInfo subDirectory in iD.GetDirectories())
    {
        foreach (FileInfo file in subDirectory.GetFiles())
        {
            String filePath = file.FullName;
            Console.WriteLine(filePath);

            // One parser per file, released before moving on to the next file.
            FastaParser fastaParser = new FastaParser();
            try
            {
                ISequenceParser parser = fastaParser;
                IList<ISequence> orgSequences = parser.Parse(filePath);
                List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

                int numberOfSequences = orgSequences.Count;
                Console.WriteLine("The number of sequences is: {0}", numberOfSequences);
                // Header only: the sequences themselves are not printed.
                Console.WriteLine("Original unaligned sequences are:");

                PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
                    sequences, mt, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
                    profileAlignerName, profileProfileFunctionName, similarityMatrix,
                    gapOpenPenalty, gapExtendPenalty, numberOfPartitions, numberOfDegrees);

                Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

                float scoreQ = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
                float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
                allQ.Add(scoreQ);
                allTC.Add(scoreTC);

                Console.WriteLine("Alignment score Q is: {0}", scoreQ);
                Console.WriteLine("Alignment score TC is: {0}", scoreTC);

                // Progress report every 1000 datasets.
                if (allQ.Count % 1000 == 0)
                {
                    Console.WriteLine(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
                    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
                    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
                }
            }
            finally
            {
                fastaParser.Dispose();
            }
        }
    }

    Console.WriteLine("number of datasets is: {0}", allQ.Count);
    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
}
/// <summary>
/// Parses testdata\FASTA\Protein\Balibase\RV913\BOX032.xml.afa, writes the un-aligned
/// sequences to the temporary FASTA file tempBOX032.xml.afa (60 symbols per line),
/// parses that file back, aligns the result with <c>PAMSAMMultipleSequenceAligner</c>
/// (stage B enabled, Blosum62) and writes the per-stage alignments and their Q / TC
/// scores to the console. The temporary file is removed on exit. The method contains
/// no assertions.
/// </summary>
public void TestMsaBenchMarkLargeDataset()
{
    string filepath = @"testdata\FASTA\Protein\Balibase\RV913\BOX032.xml.afa";
    String outputFilePath = @"tempBOX032.xml.afa";

    FastaParser fastaParser = new FastaParser();
    try
    {
        ISequenceParser parser = fastaParser;

        IList<ISequence> orgSequences = parser.Parse(filepath);
        IList<ISequence> sequences = MsaUtils.UnAlign(orgSequences);
        int numberOfSequences = orgSequences.Count;

        // Overwrite (append: false) so a file left behind by an earlier failed run cannot
        // leak extra sequences into this run; 'using' closes the writer even on failure.
        using (StreamWriter writer = new StreamWriter(outputFilePath, false))
        {
            foreach (ISequence sequence in sequences)
            {
                writer.WriteLine(">" + sequence.ID);

                // Write the sequence in windows of at most 60 symbols per line.
                BasicDerivedSequence derivedSeq = new BasicDerivedSequence(sequence, false, false, 0, 0);
                for (int lineStart = 0; lineStart < sequence.Count; lineStart += 60)
                {
                    derivedSeq.RangeStart = lineStart;
                    derivedSeq.RangeLength = Math.Min(60, sequence.Count - lineStart);
                    writer.WriteLine(derivedSeq.ToString());
                }
                writer.Flush();
            }
        }

        // Replace the in-memory sequences with the ones read back from the temp file.
        sequences.Clear();
        sequences = parser.Parse(outputFilePath);

        Console.WriteLine("Original sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(sequences[i].ToString());
        }

        Console.WriteLine("Benchmark sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(orgSequences[i].ToString());
        }

        // Static aligner switches; they stay set after this test returns.
        PAMSAMMultipleSequenceAligner.FasterVersion = false;
        PAMSAMMultipleSequenceAligner.UseWeights = false;
        PAMSAMMultipleSequenceAligner.UseStageB = true;
        PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

        int gapOpenPenalty = -13;
        int gapExtendPenalty = -5;
        int kmerLength = 3;

        int numberOfDegrees = 2;     // Environment.ProcessorCount;
        int numberOfPartitions = 16; // Environment.ProcessorCount * 2;

        SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

        DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
        UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
        ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
        ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
            sequences, MoleculeType.Protein, kmerLength, distanceFunctionName, hierarchicalClusteringMethodName,
            profileAlignerName, profileProfileFunctionName, similarityMatrix,
            gapOpenPenalty, gapExtendPenalty, numberOfPartitions, numberOfDegrees);

        Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

        // Stage 1
        Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
        for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesA[i].ToString());
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

        // Stage 2
        Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
        for (int i = 0; i < msa.AlignedSequencesB.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesB[i].ToString());
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

        // Stage 3
        Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
        for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesC[i].ToString());
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

        // Final result
        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
        for (int i = 0; i < msa.AlignedSequences.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequences[i].ToString());
        }
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
    }
    finally
    {
        // Release the parser first, then remove the temporary file - also on failure,
        // so a broken run does not leave the file behind.
        fastaParser.Dispose();
        if (File.Exists(outputFilePath))
        {
            File.Delete(outputFilePath);
        }
    }
}
/// <summary>
/// Times a single <c>PAMSAMMultipleSequenceAligner</c> run on the reference file named
/// in the XML configuration, measures the change in <c>GC.GetTotalMemory</c> around it,
/// and prints the header, the alignment score and each aligned sequence to the console.
/// The aligner is stored in <c>msa</c>, which is declared outside this method.
/// </summary>
public void PerformPAMSAMPerf()
{
    Stopwatch timer = new Stopwatch();

    // Input file locations come from the XML configuration.
    string refPath = Utility._xmlUtil.GetTextValue(Constants.PamsamNode, Constants.RefFilePathNode);
    string queryPath = Utility._xmlUtil.GetTextValue(Constants.PamsamNode, Constants.QueryFilePathNode);

    // Files reported in the test-case header.
    List<string> inputFiles = new List<string> { refPath, queryPath };

    // Parse the query file, then the reference file. Only the reference sequences are
    // used further down; the query result is not read again.
    ISequenceParser fastaParser = new FastaParser();
    IList<ISequence> querySequences = fastaParser.Parse(queryPath);
    IList<ISequence> referenceSequences = fastaParser.Parse(refPath);

    // Strip alignment gaps from the reference sequences.
    List<ISequence> unaligned = MsaUtils.UnAlign(referenceSequences);

    // Static aligner switches.
    PAMSAMMultipleSequenceAligner.FasterVersion = true;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = false;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    // Alignment parameters.
    const int gapOpen = -13;
    const int gapExtend = -5;
    const int kmerSize = 2;
    const int degreeCount = 2;
    const int partitionCount = 4;

    SimilarityMatrix dnaMatrix =
        new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);

    // Start the measurement.
    timer.Reset();
    timer.Start();
    long bytesBefore = GC.GetTotalMemory(true);

    // The parallel option is only set when the aligner constructor runs, so the
    // constructor below is what configures it.
    msa = new PAMSAMMultipleSequenceAligner(
        unaligned,
        MoleculeType.DNA,
        kmerSize,
        DistanceFunctionTypes.EuclideanDistance,
        UpdateDistanceMethodsTypes.Average,
        ProfileAlignerNames.NeedlemanWunschProfileAligner,
        ProfileScoreFunctionNames.InnerProduct,
        dnaMatrix,
        gapOpen,
        gapExtend,
        partitionCount,
        degreeCount);

    // Stop the measurement.
    timer.Stop();
    long bytesAfter = GC.GetTotalMemory(true);
    string memoryUsed = (bytesAfter - bytesBefore).ToString();

    // Report timing, memory, score and the aligned sequences.
    DisplayTestCaseHeader(inputFiles, timer, memoryUsed, "PAMSAM");

    string scoreLine = string.Format(
        "PAMSAM SequenceAligner method, Alignment Score is : {0}",
        msa.AlignmentScore.ToString());
    Console.WriteLine(scoreLine);

    int position = 0;
    foreach (ISequence aligned in msa.AlignedSequences)
    {
        string sequenceLine = string.Format("PAMSAM Aligned Seq {0}:{1}", position, aligned.ToString());
        Console.WriteLine(sequenceLine);
        position++;
    }
}
/// <summary>
/// Exercises <c>HierarchicalClusteringParallel</c> on three inputs:
/// a hand-filled 4x4 symmetric distance matrix (node count and node IDs asserted),
/// a k-mer distance matrix over ten short DNA sequences (NeedReAlignment asserted on
/// every clustering node and every guide-tree node), and the k-mer distance matrix of
/// TestUtils\Fasta\RV11_BBS_all.afa (child IDs printed only).
/// </summary>
public void TestHierarchicalClusteringSerial()
{
    // ---- Synthetic matrix: distance(row, col) = row + col ----
    int dimension = 4;
    IDistanceMatrix syntheticMatrix = new SymmetricDistanceMatrix(dimension);
    for (int row = 0; row < syntheticMatrix.Dimension - 1; ++row)
    {
        for (int col = row + 1; col < syntheticMatrix.Dimension; ++col)
        {
            syntheticMatrix[row, col] = row + col;
            syntheticMatrix[col, row] = row + col;
        }
    }

    PAMSAMMultipleSequenceAligner.ParallelOption = new ParallelOptions { MaxDegreeOfParallelism = 2 };

    IHierarchicalClustering clustering = new HierarchicalClusteringParallel(syntheticMatrix);

    Assert.AreEqual(7, clustering.Nodes.Count);

    // Node IDs are expected to equal their position in the node list.
    for (int nodeIndex = 0; nodeIndex < dimension * 2 - 1; ++nodeIndex)
    {
        Assert.AreEqual(nodeIndex, clustering.Nodes[nodeIndex].ID);
    }

    // Print the children of every node from index 'dimension' onwards.
    for (int nodeIndex = dimension; nodeIndex < clustering.Nodes.Count; ++nodeIndex)
    {
        Console.WriteLine(clustering.Nodes[nodeIndex].LeftChildren.ID);
        Console.WriteLine(clustering.Nodes[nodeIndex].RightChildren.ID);
    }

    // ---- Ten short DNA sequences ----
    ISequence seqA = new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT");
    ISequence seqB = new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG");
    ISequence seqC = new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG");
    List<ISequence> sequences = new List<ISequence>
    {
        seqA,
        seqB,
        seqC,
        new Sequence(Alphabets.DNA, "GGGAAATCG"),
        new Sequence(Alphabets.DNA, "GGGAATCAATCAG"),
        new Sequence(Alphabets.DNA, "GGGAATCTTATCAG"),
        new Sequence(Alphabets.DNA, "GGGACAAAATCAG"),
        new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT"),
        new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG"),
        new Sequence(Alphabets.DNA, "GGGACAAAATCAG")
    };

    int kmerLength = 4;
    KmerDistanceMatrixGenerator matrixGenerator =
        new KmerDistanceMatrixGenerator(sequences, kmerLength, Alphabets.AmbiguousDNA);
    matrixGenerator.GenerateDistanceMatrix(sequences);

    // Dump the upper triangle of the k-mer distance matrix.
    for (int row = 0; row < matrixGenerator.DistanceMatrix.Dimension - 1; ++row)
    {
        for (int col = row + 1; col < matrixGenerator.DistanceMatrix.Dimension; ++col)
        {
            Console.WriteLine("{0}-{1}: {2}", row, col, matrixGenerator.DistanceMatrix[row, col]);
        }
    }

    clustering = new HierarchicalClusteringParallel(matrixGenerator.DistanceMatrix);
    for (int nodeIndex = 0; nodeIndex < clustering.Nodes.Count; ++nodeIndex)
    {
        Assert.AreEqual(true, clustering.Nodes[nodeIndex].NeedReAlignment);
    }

    BinaryGuideTree guideTree = new BinaryGuideTree(clustering);
    for (int nodeIndex = 0; nodeIndex < guideTree.Nodes.Count; ++nodeIndex)
    {
        Assert.AreEqual(true, guideTree.Nodes[nodeIndex].NeedReAlignment);
    }

    // Disabled expectations kept for reference (they were commented out in the original):
    //   Nodes[4] children (0, 1), Nodes[5] children (2, 4), Nodes[6] children (3, 5).

    // ---- Larger dataset: print the children of every non-leaf node ----
    string filepath = @"TestUtils\Fasta\RV11_BBS_all.afa".TestDir();
    FastAParser fastaParser = new FastAParser();
    IList<ISequence> alignedInput = fastaParser.Parse(filepath).ToList();

    sequences = MsaUtils.UnAlign(alignedInput);

    matrixGenerator = new KmerDistanceMatrixGenerator(sequences, kmerLength, Alphabets.AmbiguousDNA);
    matrixGenerator.GenerateDistanceMatrix(sequences);

    clustering = new HierarchicalClusteringParallel(matrixGenerator.DistanceMatrix);

    for (int nodeIndex = sequences.Count; nodeIndex < clustering.Nodes.Count; ++nodeIndex)
    {
        Console.WriteLine(
            "Node {0}: leftchildren-{1}, rightChildren-{2}",
            nodeIndex,
            clustering.Nodes[nodeIndex].LeftChildren.ID,
            clustering.Nodes[nodeIndex].RightChildren.ID);
    }
}
/// <summary>
/// Exercises <c>NeedlemanWunschProfileAlignerSerial</c>: first on one fixed pair of DNA
/// sequences, then pairwise on the un-aligned sequences from
/// TestUtils\RV11_BBS_allSmall.afa under the current directory. For every pair the
/// e-strings, the alignment score and the aligned sequences are written to the console.
/// The method contains no assertions.
/// </summary>
public void TestNeedlemanWunschProfileAligner()
{
    Console.WriteLine("Number of logical processors: {0}", Environment.ProcessorCount);

    // Map every template symbol - and the lower-case form of each letter - to its
    // position in the template, and install that map as the profile item set.
    ISequence templateSequence = new Sequence(Alphabets.AmbiguousDNA, "ATGCSWRYKMBVHDN-");
    Dictionary<byte, int> itemSet = new Dictionary<byte, int>();
    for (int i = 0; i < templateSequence.Count; ++i)
    {
        itemSet.Add(templateSequence[i], i);
        if (char.IsLetter((char)templateSequence[i]))
        {
            itemSet.Add((byte)char.ToLower((char)templateSequence[i]), i);
        }
    }
    Profiles.ItemSet = itemSet;

    SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
    int gapOpenPenalty = -3;
    int gapExtendPenalty = -1;

    IProfileAligner profileAligner = new NeedlemanWunschProfileAlignerSerial(
        similarityMatrix, ProfileScoreFunctionNames.WeightedInnerProduct,
        gapOpenPenalty, gapExtendPenalty, Environment.ProcessorCount);

    // ---- Case 1: a single fixed pair ----
    ISequence seqA = new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT");
    ISequence seqB = new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG");

    List<ISequence> sequences = new List<ISequence>();
    sequences.Add(seqA);
    sequences.Add(seqB);

    IProfileAlignment profileAlignmentA = ProfileAlignment.GenerateProfileAlignment(sequences[0]);
    IProfileAlignment profileAlignmentB = ProfileAlignment.GenerateProfileAlignment(sequences[1]);

    profileAligner.Align(profileAlignmentA, profileAlignmentB);

    List<int> eStringSubtree = profileAligner.GenerateEString(profileAligner.AlignedA);
    List<int> eStringSubtreeB = profileAligner.GenerateEString(profileAligner.AlignedB);

    List<ISequence> alignedSequences = new List<ISequence>();
    ISequence seq = profileAligner.GenerateSequenceFromEString(eStringSubtree, sequences[0]);
    alignedSequences.Add(seq);
    seq = profileAligner.GenerateSequenceFromEString(eStringSubtreeB, sequences[1]);
    alignedSequences.Add(seq);

    float profileScore = MsaUtils.MultipleAlignmentScoreFunction(alignedSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty);
    Console.WriteLine("alignment score is: {0}", profileScore);

    Console.WriteLine("the aligned sequences are:");
    for (int i = 0; i < alignedSequences.Count; ++i)
    {
        Console.WriteLine(new string(alignedSequences[i].Select(a => (char)a).ToArray()));
    }

    // ---- Case 3 (original numbering): pairwise alignment over a benchmark file ----
    string filepath = @"\TestUtils\RV11_BBS_allSmall.afa";
    string filePathObj = Directory.GetCurrentDirectory() + filepath;

    FastAParser parser = new FastAParser(filePathObj);
    try
    {
        IList<ISequence> orgSequences = parser.Parse().ToList();

        sequences = MsaUtils.UnAlign(orgSequences);

        int numberOfSequences = orgSequences.Count;

        Console.WriteLine("Original unaligned sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(">");
            Console.WriteLine(new string(sequences[i].Select(a => (char)a).ToArray()));
        }

        // NOTE(review): the outer loop starts at index 1, so the first sequence never
        // takes part in a pair - confirm whether that is intended.
        for (int i = 1; i < numberOfSequences - 1; ++i)
        {
            for (int j = i + 1; j < numberOfSequences; ++j)
            {
                profileAlignmentA = ProfileAlignment.GenerateProfileAlignment(sequences[i]);
                profileAlignmentB = ProfileAlignment.GenerateProfileAlignment(sequences[j]);

                // A fresh aligner is created for every pair, as in the original.
                profileAligner = new NeedlemanWunschProfileAlignerSerial(
                    similarityMatrix, ProfileScoreFunctionNames.WeightedInnerProduct,
                    gapOpenPenalty, gapExtendPenalty, Environment.ProcessorCount);
                profileAligner.Align(profileAlignmentA, profileAlignmentB);

                eStringSubtree = profileAligner.GenerateEString(profileAligner.AlignedA);
                eStringSubtreeB = profileAligner.GenerateEString(profileAligner.AlignedB);

                Console.WriteLine("Sequences lengths are: {0}-{1}", sequences[i].Count, sequences[j].Count);
                Console.WriteLine("estring 1:");
                for (int k = 0; k < eStringSubtree.Count; ++k)
                {
                    Console.Write("{0}\t", eStringSubtree[k]);
                }
                Console.WriteLine("\nestring 2:");
                for (int k = 0; k < eStringSubtreeB.Count; ++k)
                {
                    Console.Write("{0}\t", eStringSubtreeB[k]);
                }

                alignedSequences = new List<ISequence>();
                seq = profileAligner.GenerateSequenceFromEString(eStringSubtree, sequences[i]);
                alignedSequences.Add(seq);
                seq = profileAligner.GenerateSequenceFromEString(eStringSubtreeB, sequences[j]);
                alignedSequences.Add(seq);

                profileScore = MsaUtils.MultipleAlignmentScoreFunction(alignedSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty);

                Console.WriteLine("\nalignment score is: {0}", profileScore);
                Console.WriteLine("the aligned sequences are:");
                for (int k = 0; k < alignedSequences.Count; ++k)
                {
                    Console.WriteLine(new string(alignedSequences[k].Select(a => (char)a).ToArray()));
                }
            }
        }
    }
    finally
    {
        // Dispose exactly once, after all pairs are processed. The original called
        // Dispose inside the outer loop, i.e. once per outer iteration and never when
        // the loop body did not run.
        parser.Dispose();
    }
}
/// <summary>
/// Aligns every file found in <c>TestUtils\FASTA\Protein\Balibase\RV911\</c> with
/// <see cref="PAMSAMMultipleSequenceAligner"/> and prints, per file, the stage scores and the
/// Q / TC scores computed against the sequences as parsed from that file. The mean Q and TC
/// scores over all files are printed at the end. The method makes no assertions.
/// </summary>
public void TestMsaBenchMark()
{
    string fileDirectory = @"TestUtils\FASTA\Protein\Balibase\RV911\";
    DirectoryInfo iD = new DirectoryInfo(fileDirectory);

    // Static aligner configuration; these settings stay in effect after this method returns.
    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = true;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    int gapOpenPenalty = -20;
    int gapExtendPenalty = -5;
    int kmerLength = 4;
    int numberOfDegrees = 2;
    int numberOfPartitions = 16;

    DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProductCached;
    SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

    List<float> allQ = new List<float>();
    List<float> allTC = new List<float>();

    foreach (FileInfo fi in iD.GetFiles())
    {
        string filePath = fi.FullName;
        Console.WriteLine(filePath);

        FastAParser parser = new FastAParser(filePath);
        try
        {
            parser.Alphabet = AmbiguousProteinAlphabet.Instance;
            IList<ISequence> orgSequences = parser.Parse().ToList();
            List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

            // Only the section headers are printed; the sequences themselves are not dumped.
            Console.WriteLine("The number of sequences is: {0}", orgSequences.Count);
            Console.WriteLine("Original unaligned sequences are:");
            Console.WriteLine("Original aligned sequences are:");

            PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
                sequences,
                kmerLength,
                distanceFunctionName,
                hierarchicalClusteringMethodName,
                profileAlignerName,
                profileProfileFunctionName,
                similarityMatrix,
                gapOpenPenalty,
                gapExtendPenalty,
                numberOfPartitions,
                numberOfDegrees);

            Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
            Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
            Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
            Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

            float scoreQ = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
            float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
            allQ.Add(scoreQ);
            allTC.Add(scoreTC);

            Console.WriteLine("Alignment score Q is: {0}", scoreQ);
            Console.WriteLine("Alignment score TC is: {0}", scoreTC);
        }
        finally
        {
            // Release the parser even when parsing or alignment throws.
            parser.Dispose();
        }
    }

    Console.WriteLine("Number of datasets is: {0}", allQ.Count);
    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
}
/// <summary>
/// Parses <c>\TestUtils\BOX032Small.xml.afa</c> (relative to the current directory), writes the
/// un-aligned sequences to a temporary FASTA file, parses that file back, aligns the result with
/// <see cref="PAMSAMMultipleSequenceAligner"/> and prints the alignment of every stage together
/// with its Q and TC scores against the originally parsed sequences.
/// The method makes no assertions; both parsers and the temporary file are released even when
/// a step throws.
/// </summary>
public void TestMsaBenchMarkLargeDataset()
{
    string filepath = @"\TestUtils\BOX032Small.xml.afa";
    string filePathObj = Directory.GetCurrentDirectory() + filepath;
    string outputFilePath = @"tempBOX032.xml.afa";

    FastAParser parser = new FastAParser(filePathObj);
    FastAParser rewrittenParser = null;
    try
    {
        IList<ISequence> orgSequences = parser.Parse().ToList();
        IList<ISequence> sequences = MsaUtils.UnAlign(orgSequences);
        int numberOfSequences = orgSequences.Count;

        // Overwrite instead of append: the file name is fixed, so a leftover file from an
        // earlier aborted run would otherwise end up with duplicated records.
        using (StreamWriter writer = new StreamWriter(outputFilePath, false))
        {
            foreach (ISequence sequence in sequences)
            {
                writer.WriteLine(">" + sequence.ID);

                // Write the sequence in lines of at most 60 symbols.
                for (int lineStart = 0; lineStart < sequence.Count; lineStart += 60)
                {
                    int lineLength = (int)Math.Min(60, sequence.Count - lineStart);
                    writer.WriteLine(new string(sequence.Skip(lineStart).Take(lineLength).Select(a => (char)a).ToArray()));
                }

                writer.Flush();
            }
        }

        rewrittenParser = new FastAParser(outputFilePath);
        sequences = rewrittenParser.Parse().ToList();

        Console.WriteLine("Original sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(new string(sequences[i].Select(a => (char)a).ToArray()));
        }

        Console.WriteLine("Benchmark sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(new string(orgSequences[i].Select(a => (char)a).ToArray()));
        }

        // Static aligner configuration; these settings stay in effect after this method returns.
        PAMSAMMultipleSequenceAligner.FasterVersion = false;
        PAMSAMMultipleSequenceAligner.UseWeights = false;
        PAMSAMMultipleSequenceAligner.UseStageB = true;
        PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

        int gapOpenPenalty = -13;
        int gapExtendPenalty = -5;
        int kmerLength = 3;
        int numberOfDegrees = 2;
        int numberOfPartitions = 16;

        SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
        DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
        UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
        ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
        ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
            sequences,
            kmerLength,
            distanceFunctionName,
            hierarchicalClusteringMethodName,
            profileAlignerName,
            profileProfileFunctionName,
            similarityMatrix,
            gapOpenPenalty,
            gapExtendPenalty,
            numberOfPartitions,
            numberOfDegrees);

        Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

        Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
        for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
        {
            Console.WriteLine(new string(msa.AlignedSequencesA[i].Select(a => (char)a).ToArray()));
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

        Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
        for (int i = 0; i < msa.AlignedSequencesB.Count; ++i)
        {
            Console.WriteLine(new string(msa.AlignedSequencesB[i].Select(a => (char)a).ToArray()));
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

        Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
        for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
        {
            Console.WriteLine(new string(msa.AlignedSequencesC[i].Select(a => (char)a).ToArray()));
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
        for (int i = 0; i < msa.AlignedSequences.Count; ++i)
        {
            Console.WriteLine(new string(msa.AlignedSequences[i].Select(a => (char)a).ToArray()));
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
    }
    finally
    {
        // Dispose the parsers before deleting: the second one was opened on the temporary file.
        if (rewrittenParser != null)
        {
            rewrittenParser.Dispose();
        }

        parser.Dispose();

        if (File.Exists(outputFilePath))
        {
            File.Delete(outputFilePath);
        }
    }
}
/// <summary>
/// Parses <c>testdata\FASTA\RunningTime\BOX246.xml.afa</c>, un-aligns the sequences, aligns them
/// with <see cref="PAMSAMMultipleSequenceAligner"/> (<c>FasterVersion</c> enabled, stage B
/// disabled) and prints the result of every stage together with its Q and TC scores against
/// the originally parsed sequences. The method makes no assertions and measures no time itself.
/// </summary>
/// <exception cref="InvalidDataException">
/// The molecule type is not DNA, RNA or Protein (not reachable with the hard-coded Protein type).
/// </exception>
public void TestMuscleMultipleSequenceAlignmentRunningTime()
{
    ISequenceParser parser = new FastaParser();
    try
    {
        string filepath = @"testdata\FASTA\RunningTime\BOX246.xml.afa";
        MoleculeType mt = MoleculeType.Protein;

        IList<ISequence> orgSequences = parser.Parse(filepath);
        List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);
        int numberOfSequences = orgSequences.Count;

        Console.WriteLine("Original sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(sequences[i].ToString());
        }

        Console.WriteLine("Benchmark sequences are:");
        for (int i = 0; i < numberOfSequences; ++i)
        {
            Console.WriteLine(orgSequences[i].ToString());
        }

        // Static aligner configuration; these settings stay in effect after this method returns.
        PAMSAMMultipleSequenceAligner.FasterVersion = true;
        PAMSAMMultipleSequenceAligner.UseWeights = false;
        PAMSAMMultipleSequenceAligner.UseStageB = false;
        PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

        int gapOpenPenalty = -13;
        int gapExtendPenalty = -5;
        int kmerLength = 2;
        int numberOfDegrees = 2;
        int numberOfPartitions = 16;

        DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
        UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
        ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
        ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.InnerProductFast;

        // Pick the similarity matrix that matches the molecule type.
        SimilarityMatrix similarityMatrix;
        switch (mt)
        {
            case MoleculeType.DNA:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
                break;
            case MoleculeType.RNA:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
                break;
            case MoleculeType.Protein:
                similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
                break;
            default:
                throw new InvalidDataException("Invalid molecular type");
        }

        PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
            sequences,
            mt,
            kmerLength,
            distanceFunctionName,
            hierarchicalClusteringMethodName,
            profileAlignerName,
            profileProfileFunctionName,
            similarityMatrix,
            gapOpenPenalty,
            gapExtendPenalty,
            numberOfPartitions,
            numberOfDegrees);

        Console.WriteLine("The number of partitions is: {0}", numberOfPartitions);
        Console.WriteLine("The number of degrees is: {0}", numberOfDegrees);
        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

        Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
        for (int i = 0; i < msa.AlignedSequencesA.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesA[i].ToString());
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

        Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
        for (int i = 0; i < msa.AlignedSequencesB.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesB[i].ToString());
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

        Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
        for (int i = 0; i < msa.AlignedSequencesC.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequencesC[i].ToString());
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
        for (int i = 0; i < msa.AlignedSequences.Count; ++i)
        {
            Console.WriteLine(msa.AlignedSequences[i].ToString());
        }

        Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
        Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
    }
    finally
    {
        // Same disposal idiom as the sibling tests that use this parser type;
        // runs even when parsing or alignment throws.
        ((FastaParser)parser).Dispose();
    }
}
/// <summary>
/// Exercises <see cref="NeedlemanWunschProfileAlignerSerial"/> on a hand-written pair of DNA
/// sequences and then on pairs of sequences read from <c>testdata\FASTA\RV11_BBS_all.afa</c>,
/// printing the e-strings, the aligned sequences and the alignment score for each pair.
/// The method makes no assertions.
/// </summary>
public void TestNeedlemanWunschProfileAligner()
{
    Console.WriteLine("Number of logical processors: {0}", Environment.ProcessorCount);

    // Map every symbol of the template sequence to its position and publish the map to Profiles.
    ISequence template = new Sequence(Alphabets.DNA, "ATGCSWRYKMBVHDN-");
    var symbolIndex = new Dictionary<ISequenceItem, int>();
    for (int position = 0; position < template.Count; ++position)
    {
        symbolIndex.Add(template[position], position);
    }

    Profiles.ItemSet = symbolIndex;

    var similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
    int gapOpenPenalty = -3;
    int gapExtendPenalty = -1;

    // Case 1: two short hand-written sequences.
    IProfileAligner firstAligner = new NeedlemanWunschProfileAlignerSerial(
        similarityMatrix,
        ProfileScoreFunctionNames.WeightedInnerProduct,
        gapOpenPenalty,
        gapExtendPenalty,
        Environment.ProcessorCount);

    ISequence seqA = new Sequence(Alphabets.DNA, "GGGAAAAATCAGATT");
    ISequence seqB = new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG");
    List<ISequence> sequences = new List<ISequence> { seqA, seqB };

    IProfileAlignment firstProfileA = ProfileAlignment.GenerateProfileAlignment(sequences[0]);
    IProfileAlignment firstProfileB = ProfileAlignment.GenerateProfileAlignment(sequences[1]);
    firstAligner.Align(firstProfileA, firstProfileB);

    List<int> firstEStringA = firstAligner.GenerateEString(firstAligner.AlignedA);
    List<int> firstEStringB = firstAligner.GenerateEString(firstAligner.AlignedB);

    var firstAligned = new List<ISequence>();
    firstAligned.Add(firstAligner.GenerateSequenceFromEString(firstEStringA, sequences[0]));
    firstAligned.Add(firstAligner.GenerateSequenceFromEString(firstEStringB, sequences[1]));

    float firstScore = MsaUtils.MultipleAlignmentScoreFunction(firstAligned, similarityMatrix, gapOpenPenalty, gapExtendPenalty);
    Console.WriteLine("alignment score is: {0}", firstScore);
    Console.WriteLine("the aligned sequences are:");
    foreach (ISequence aligned in firstAligned)
    {
        Console.WriteLine(aligned.ToString());
    }

    // NOTE(review): these expected sequences are constructed but never compared with the
    // result - confirm whether assertions were intended here.
    ISequence expectedSeqA = new Sequence(Alphabets.DNA, "GGGAA---AAATCAGATT");
    ISequence expectedSeqB = new Sequence(Alphabets.DNA, "GGGAATCAAAATCAG---");

    // Test on case 3: 36 sequences
    ISequenceParser parser = new FastaParser();
    string filepath = @"testdata\FASTA\RV11_BBS_all.afa";
    IList<ISequence> orgSequences = parser.Parse(filepath);
    sequences = MsaUtils.UnAlign(orgSequences);
    int total = orgSequences.Count;

    Console.WriteLine("Original unaligned sequences are:");
    for (int index = 0; index < total; ++index)
    {
        Console.WriteLine(">");
        Console.WriteLine(sequences[index].ToString());
    }

    // NOTE(review): the outer loop starts at index 1, so sequence 0 is never paired - confirm intent.
    for (int first = 1; first < total - 1; ++first)
    {
        for (int second = first + 1; second < total; ++second)
        {
            IProfileAlignment profileA = ProfileAlignment.GenerateProfileAlignment(sequences[first]);
            IProfileAlignment profileB = ProfileAlignment.GenerateProfileAlignment(sequences[second]);

            // A fresh aligner is created for every pair.
            IProfileAligner pairAligner = new NeedlemanWunschProfileAlignerSerial(
                similarityMatrix,
                ProfileScoreFunctionNames.WeightedInnerProduct,
                gapOpenPenalty,
                gapExtendPenalty,
                Environment.ProcessorCount);
            pairAligner.Align(profileA, profileB);

            List<int> eStringA = pairAligner.GenerateEString(pairAligner.AlignedA);
            List<int> eStringB = pairAligner.GenerateEString(pairAligner.AlignedB);

            Console.WriteLine("Sequences lengths are: {0}-{1}", sequences[first].Count, sequences[second].Count);

            Console.WriteLine("estring 1:");
            foreach (int entry in eStringA)
            {
                Console.Write("{0}\t", entry);
            }

            Console.WriteLine("\nestring 2:");
            foreach (int entry in eStringB)
            {
                Console.Write("{0}\t", entry);
            }

            var pairAligned = new List<ISequence>();
            pairAligned.Add(pairAligner.GenerateSequenceFromEString(eStringA, sequences[first]));
            pairAligned.Add(pairAligner.GenerateSequenceFromEString(eStringB, sequences[second]));

            float pairScore = MsaUtils.MultipleAlignmentScoreFunction(pairAligned, similarityMatrix, gapOpenPenalty, gapExtendPenalty);
            Console.WriteLine("\nalignment score is: {0}", pairScore);
            Console.WriteLine("the aligned sequences are:");
            foreach (ISequence aligned in pairAligned)
            {
                Console.WriteLine(aligned.ToString());
            }
        }
    }
}
/// <summary>
/// Aligns every file located one directory level below <c>TestUtils\Fasta\RNA\k10</c> with
/// <see cref="PAMSAMMultipleSequenceAligner"/> and prints the Q and TC score of each alignment
/// against the sequences as parsed from the file. Running averages are printed after every
/// 1000 datasets and the overall averages at the end. The method makes no assertions.
/// </summary>
public void TestMsaBenchMarkOnBralibase()
{
    var allQ = new List<float>();
    var allTC = new List<float>();

    DirectoryInfo iD = new DirectoryInfo(@"TestUtils\Fasta\RNA\k10".TestDir());

    // Static aligner configuration; these settings stay in effect after this method returns.
    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = false;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    var similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
    int gapOpenPenalty = -20;
    int gapExtendPenalty = -5;
    int kmerLength = 4;
    int numberOfDegrees = 2;
    int numberOfPartitions = 16;

    var distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    var hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    var profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    var profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProductCached;

    // Visit the files of each sub-directory in turn; files directly in the root are not read.
    foreach (FileInfo datasetFile in iD.GetDirectories().SelectMany(subDirectory => subDirectory.GetFiles()))
    {
        string filePath = datasetFile.FullName;
        Console.WriteLine($"Loading: {filePath}");

        var rnaParser = new FastAParser { Alphabet = AmbiguousRnaAlphabet.Instance };
        var orgSequences = rnaParser.Parse(filePath).ToList();
        var sequences = MsaUtils.UnAlign(orgSequences);

        Console.WriteLine("The number of sequences is: {0}", orgSequences.Count);

        var msa = new PAMSAMMultipleSequenceAligner(
            sequences,
            kmerLength,
            distanceFunctionName,
            hierarchicalClusteringMethodName,
            profileAlignerName,
            profileProfileFunctionName,
            similarityMatrix,
            gapOpenPenalty,
            gapExtendPenalty,
            numberOfPartitions,
            numberOfDegrees);

        Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

        float scoreQ = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
        float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
        Console.WriteLine("Alignment score Q is: {0}", scoreQ);
        Console.WriteLine("Alignment score TC is: {0}", scoreTC);

        allQ.Add(scoreQ);
        allTC.Add(scoreTC);

        // Periodic progress report.
        bool atReportBoundary = allQ.Count % 1000 == 0;
        if (atReportBoundary)
        {
            Console.WriteLine(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
            Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
            Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
        }
    }

    Console.WriteLine("number of datasets is: {0}", allQ.Count);
    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
}
/// <summary>
/// Aligns every file located two directory levels below <c>TestUtils\Fasta\Protein\SABmark</c>
/// with <see cref="PAMSAMMultipleSequenceAligner"/> and prints the Q and TC score of each
/// alignment against the sequences as parsed from the file. Running averages are printed after
/// every 1000 datasets and the overall averages at the end. The method makes no assertions.
/// </summary>
public void TestMsaBenchMarkOnSABmark()
{
    List<float> allQ = new List<float>();
    List<float> allTC = new List<float>();

    string fileDirectory = @"TestUtils\Fasta\Protein\SABmark".TestDir();
    DirectoryInfo iD = new DirectoryInfo(fileDirectory);

    // Static aligner configuration; these settings stay in effect after this method returns.
    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = true;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    int gapOpenPenalty = -13;
    int gapExtendPenalty = -5;
    int kmerLength = 3;
    int numberOfDegrees = 2;
    int numberOfPartitions = 16;

    DistanceFunctionTypes distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    UpdateDistanceMethodsTypes hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    ProfileAlignerNames profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    ProfileScoreFunctionNames profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;
    SimilarityMatrix similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);

    foreach (DirectoryInfo fi in iD.GetDirectories())
    {
        foreach (DirectoryInfo fii in fi.GetDirectories())
        {
            foreach (FileInfo fiii in fii.GetFiles())
            {
                string filePath = fiii.FullName;
                Console.WriteLine(filePath);

                FastAParser parser = new FastAParser();
                IList<ISequence> orgSequences = parser.Parse(filePath).ToList();
                List<ISequence> sequences = MsaUtils.UnAlign(orgSequences);

                // Only the section header is printed; the sequences themselves are not dumped.
                Console.WriteLine("The number of sequences is: {0}", orgSequences.Count);
                Console.WriteLine("Original unaligned sequences are:");

                PAMSAMMultipleSequenceAligner msa = new PAMSAMMultipleSequenceAligner(
                    sequences,
                    kmerLength,
                    distanceFunctionName,
                    hierarchicalClusteringMethodName,
                    profileAlignerName,
                    profileProfileFunctionName,
                    similarityMatrix,
                    gapOpenPenalty,
                    gapExtendPenalty,
                    numberOfPartitions,
                    numberOfDegrees);

                Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);

                float scoreQ = MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences);
                float scoreTC = MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences);
                allQ.Add(scoreQ);
                allTC.Add(scoreTC);

                Console.WriteLine("Alignment score Q is: {0}", scoreQ);
                Console.WriteLine("Alignment score TC is: {0}", scoreTC);

                // Periodic progress report.
                if (allQ.Count % 1000 == 0)
                {
                    Console.WriteLine(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
                    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
                    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
                }
            }
        }
    }

    Console.WriteLine("average Q score is: {0}", MsaUtils.Mean(allQ.ToArray()));
    Console.WriteLine("average TC score is: {0}", MsaUtils.Mean(allTC.ToArray()));
}
/// <summary>
/// Parses <c>TestUtils\BOX032Small.xml.afa</c>, checks that un-aligning keeps the number of
/// sequences, round-trips the un-aligned sequences through a temporary FASTA file, aligns them
/// with <see cref="PAMSAMMultipleSequenceAligner"/> and prints the alignment of every stage
/// together with its Q and TC scores against the originally parsed sequences.
/// </summary>
public void TestMsaBenchMarkLargeDataset()
{
    string benchmarkPath = @"TestUtils\BOX032Small.xml.afa".TestDir();
    var orgSequences = new FastAParser().Parse(benchmarkPath).ToList();
    var sequences = MsaUtils.UnAlign(orgSequences);

    Assert.AreEqual(orgSequences.Count, sequences.Count);

    // Round-trip the un-aligned sequences through a temporary FASTA file.
    string tempPath = Path.GetTempFileName();
    try
    {
        using (var fastaWriter = new StreamWriter(tempPath, true))
        {
            foreach (ISequence record in sequences)
            {
                fastaWriter.WriteLine(">" + record.ID);

                // Emit the residues in lines of at most 60 symbols.
                for (int offset = 0; offset < record.Count; offset += 60)
                {
                    int chunkLength = (int)Math.Min(60, record.Count - offset);
                    char[] chunk = record.Skip(offset).Take(chunkLength).Select(a => (char)a).ToArray();
                    fastaWriter.WriteLine(new String(chunk));
                }

                fastaWriter.Flush();
            }
        }

        sequences = new FastAParser().Parse(tempPath).ToList();
    }
    finally
    {
        File.Delete(tempPath);
    }

    Console.WriteLine("Original sequences are:");
    foreach (var reparsed in sequences)
    {
        Console.WriteLine(reparsed);
    }

    Console.WriteLine("Benchmark sequences are:");
    foreach (var benchmark in orgSequences)
    {
        Console.WriteLine(benchmark);
    }

    // Static aligner configuration; these settings stay in effect after this method returns.
    PAMSAMMultipleSequenceAligner.FasterVersion = false;
    PAMSAMMultipleSequenceAligner.UseWeights = false;
    PAMSAMMultipleSequenceAligner.UseStageB = true;
    PAMSAMMultipleSequenceAligner.NumberOfCores = 2;

    int gapOpenPenalty = -13;
    int gapExtendPenalty = -5;
    int kmerLength = 3;
    int numberOfDegrees = 2;
    int numberOfPartitions = 16;

    var similarityMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum62);
    var distanceFunctionName = DistanceFunctionTypes.EuclideanDistance;
    var hierarchicalClusteringMethodName = UpdateDistanceMethodsTypes.Average;
    var profileAlignerName = ProfileAlignerNames.NeedlemanWunschProfileAligner;
    var profileProfileFunctionName = ProfileScoreFunctionNames.WeightedInnerProduct;

    var msa = new PAMSAMMultipleSequenceAligner(
        sequences,
        kmerLength,
        distanceFunctionName,
        hierarchicalClusteringMethodName,
        profileAlignerName,
        profileProfileFunctionName,
        similarityMatrix,
        gapOpenPenalty,
        gapExtendPenalty,
        numberOfPartitions,
        numberOfDegrees);

    Console.WriteLine("Benchmark SPS score is: {0}", MsaUtils.MultipleAlignmentScoreFunction(orgSequences, similarityMatrix, gapOpenPenalty, gapExtendPenalty));

    Console.WriteLine("Aligned sequences in stage 1: {0}", msa.AlignmentScoreA);
    foreach (var aligned in msa.AlignedSequencesA)
    {
        Console.WriteLine(aligned);
    }

    Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesA, orgSequences));
    Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesA, orgSequences));

    Console.WriteLine("Aligned sequences in stage 2: {0}", msa.AlignmentScoreB);
    foreach (var aligned in msa.AlignedSequencesB)
    {
        Console.WriteLine(aligned);
    }

    Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesB, orgSequences));
    Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesB, orgSequences));

    Console.WriteLine("Aligned sequences in stage 3: {0}", msa.AlignmentScoreC);
    foreach (var aligned in msa.AlignedSequencesC)
    {
        Console.WriteLine(aligned);
    }

    Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequencesC, orgSequences));
    Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequencesC, orgSequences));

    Console.WriteLine("Aligned sequences final: {0}", msa.AlignmentScore);
    foreach (var aligned in msa.AlignedSequences)
    {
        Console.WriteLine(aligned);
    }

    Console.WriteLine("Alignment score Q is: {0}", MsaUtils.CalculateAlignmentScoreQ(msa.AlignedSequences, orgSequences));
    Console.WriteLine("Alignment score TC is: {0}", MsaUtils.CalculateAlignmentScoreTC(msa.AlignedSequences, orgSequences));
}