/// <summary>
/// Builds the suffix tree for the supplied reference sequence. The builder is
/// obtained from <c>Factory.CreateNew</c> for that same sequence.
/// </summary>
/// <param name="referenceSequence">Sequence the suffix tree is built from.</param>
/// <returns>The suffix tree produced by the builder.</returns>
protected override ISuffixTree BuildSuffixTree(ISequence referenceSequence)
{
    ISuffixTreeBuilder builder = Factory.CreateNew(referenceSequence);
    return builder.BuildSuffixTree(referenceSequence);
}
/// <summary>
/// Creates the suffix array from an already built suffix tree by walking the
/// tree depth-first with an explicit stack and recording the leaf number of
/// every leaf in the order it is visited.
/// </summary>
/// <param name="suffixTree">The suffix tree to traverse.</param>
/// <returns>
/// An array of length <c>suffixTree.Text.Length</c> holding the visited leaf
/// numbers, or <c>null</c> when <paramref name="suffixTree"/> is <c>null</c>.
/// </returns>
public static int[] Create(ISuffixTree suffixTree)
{
    if (suffixTree == null)
    {
        return null;
    }

    var pending = new Stack<ISuffixNode>();
    pending.Push(suffixTree.Root);

    var suffixArray = new int[suffixTree.Text.Length];
    int next = 0;

    while (pending.Count > 0)
    {
        ISuffixNode current = pending.Pop();

        if (!current.IsLeaf)
        {
            // Push children in reverse so they are popped in their original
            // enumeration order.
            foreach (var child in current.Children.Reverse())
            {
                pending.Push(child.Value);
            }

            continue;
        }

        suffixArray[next] = current.LeafNumber;
        next++;
    }

    return suffixArray;
}
/// <summary>
/// Returns the text that <c>GetNodeSuffixImpl</c> accumulates for node
/// <paramref name="p"/> of tree <paramref name="t"/>.
/// </summary>
/// <param name="t">Suffix tree that owns the node.</param>
/// <param name="p">Node whose text is requested.</param>
/// <returns>The accumulated text as a string.</returns>
public static string GetNodeSuffix(this ISuffixTree t, ISuffixNode p)
{
    var buffer = new StringBuilder();
    GetNodeSuffixImpl(t, p, buffer);

    string suffix = buffer.ToString();
    return suffix;
}
/// <summary>
/// Creates the suffix array from an already built suffix tree. The tree is
/// traversed depth-first with an explicit stack, and each leaf's number is
/// appended to the result as the leaf is reached.
/// </summary>
/// <param name="suffixTree">The suffix tree to traverse.</param>
/// <returns>
/// An array sized to <c>suffixTree.Text.Length</c> containing the leaf numbers
/// in visiting order, or <c>null</c> when <paramref name="suffixTree"/> is
/// <c>null</c>.
/// </returns>
public static int[] Create(ISuffixTree suffixTree)
{
    if (suffixTree == null)
    {
        return null;
    }

    var toVisit = new Stack<ISuffixNode>();
    toVisit.Push(suffixTree.Root);

    int length = suffixTree.Text.Length;
    var result = new int[length];
    int writeIndex = 0;

    // The root was just pushed, so the stack is never empty on entry.
    do
    {
        ISuffixNode node = toVisit.Pop();

        if (node.IsLeaf)
        {
            result[writeIndex] = node.LeafNumber;
            writeIndex += 1;
        }
        else
        {
            // Reverse before pushing so children come off the stack in
            // their original enumeration order.
            foreach (var entry in node.Children.Reverse())
            {
                toVisit.Push(entry.Value);
            }
        }
    }
    while (toVisit.Count > 0);

    return result;
}
/// <summary>
/// Builds a suffix tree over a segmented sequence ("AAATTGGC" followed by
/// "ANANA") with <c>PersistenceThreshold</c> set to 0, searches it with the
/// query "AATTNANAGGC" and a minimum match length of 3, and expects three
/// matches.
/// </summary>
public void TestStreamingPersistentSegmentedSequence()
{
    // Reference: two segments combined into one segmented sequence.
    Sequence firstSegment = new Sequence(Alphabets.Protein, "AAATTGGC");
    SegmentedSequence reference = new SegmentedSequence(firstSegment);

    Sequence secondSegment = new Sequence(Alphabets.Protein, "ANANA");
    reference.Sequences.Add(secondSegment);

    using (SimpleSuffixTreeBuilder builder = new SimpleSuffixTreeBuilder())
    {
        builder.PersistenceThreshold = 0;
        ISuffixTree tree = builder.BuildSuffixTree(reference);

        Sequence query = new Sequence(Alphabets.Protein, "AATTNANAGGC");
        IList<MaxUniqueMatch> matches = builder.FindMatches(tree, query, 3);

        // Verify the count of MUMs found
        Assert.AreEqual(3, matches.Count);
    }
}
/// <summary>
/// Finds all the matches of the given sequence in the suffix tree,
/// irrespective of uniqueness in the reference or query sequence.
/// Sets the <c>_findMaximumMatch</c> flag and delegates the search to
/// <c>FindMatchWithOption</c>.
/// </summary>
/// <param name="suffixTree">Suffix tree to search.</param>
/// <param name="searchSequence">Query sequence.</param>
/// <param name="lengthOfMUM">Minimum length of a match.</param>
/// <returns>Matches found.</returns>
public IList<MaxUniqueMatch> FindMaximumMatches(
    ISuffixTree suffixTree,
    ISequence searchSequence,
    long lengthOfMUM)
{
    // Select "maximum match" mode before running the shared search routine.
    _findMaximumMatch = true;

    IList<MaxUniqueMatch> matches = FindMatchWithOption(suffixTree, searchSequence, lengthOfMUM);
    return matches;
}
/// <summary>
/// Generates the list of MUMs for each query sequence.
/// When <paramref name="performLIS"/> is true, the MUMs are sorted and then
/// reduced with Longest Increasing Subsequence (LIS); a query whose sorted
/// list is empty is stored with a <c>null</c> value. When it is false, the
/// MUMs are stored as returned by streaming.
/// Note: If the MaximumMatchEnabled property is true, MUMs are generated
/// irrespective of uniqueness in query and reference sequences; otherwise
/// MUMs are unique in the reference sequence only.
/// </summary>
/// <param name="referenceSequence">Reference sequence.</param>
/// <param name="querySequenceList">List of query sequences.</param>
/// <param name="performLIS">Whether sorting and LIS are run on the MUMs
/// before they are stored.</param>
/// <returns>MUMs keyed by query sequence. A query equal to the reference
/// sequence is skipped and gets no entry.</returns>
public override IDictionary<ISequence, IList<MaxUniqueMatch>> GetMUMs(
    ISequence referenceSequence,
    IList<ISequence> querySequenceList,
    bool performLIS)
{
    GetMUMsValidate(referenceSequence, querySequenceList);

    IDictionary<ISequence, IList<MaxUniqueMatch>> mumsByQuery =
        new Dictionary<ISequence, IList<MaxUniqueMatch>>();

    // Step 1: build the suffix tree once; the same tree is handed to every
    // query below.
    ISuffixTree suffixTree = BuildSuffixTree(referenceSequence);

    Parallel.ForEach(querySequenceList, query =>
    {
        // A query equal to the reference is not processed.
        if (query.Equals(referenceSequence))
        {
            return;
        }

        // Step 2: stream the query through the suffix tree.
        IList<MaxUniqueMatch> matches = Streaming(suffixTree, referenceSequence, query, LengthOfMUM);

        if (performLIS)
        {
            // Step 3(a): sort the MUMs.
            matches = SortMum(matches);

            // Step 3(b): LIS over the sorted MUMs, or null when none remain.
            if (matches.Count > 0)
            {
                matches = CollectLongestIncreasingSubsequence(matches);
            }
            else
            {
                matches = null;
            }
        }

        // The dictionary is shared by all parallel iterations.
        lock (mumsByQuery)
        {
            mumsByQuery.Add(query, matches);
        }
    });

    return mumsByQuery;
}
/// <summary>
/// Traverses the suffix tree with the query sequence and returns the list of
/// MUMs. The search is carried out by a builder obtained from
/// <c>Factory.CreateNew</c> for the reference sequence.
/// </summary>
/// <param name="suffixTree">Suffix tree to search.</param>
/// <param name="referenceSequence">Reference sequence, used to obtain the builder.</param>
/// <param name="sequence">Query sequence.</param>
/// <param name="lengthOfMUM">Minimum length of a MUM.</param>
/// <returns>List of MUMs.</returns>
protected override IList<MaxUniqueMatch> Streaming(
    ISuffixTree suffixTree,
    ISequence referenceSequence,
    ISequence sequence,
    long lengthOfMUM)
{
    ISuffixTreeBuilder matcher = Factory.CreateNew(referenceSequence);
    IList<MaxUniqueMatch> matches = matcher.FindMatches(suffixTree, sequence, lengthOfMUM);
    return matches;
}
/// <summary>
/// Appends to <paramref name="sb"/> the edge text of every node on the path
/// from the topmost ancestor down to <paramref name="p"/>. A node without a
/// parent contributes nothing.
/// </summary>
/// <param name="t">Suffix tree whose <c>Text</c> the edges index into.</param>
/// <param name="p">Node whose path text is collected.</param>
/// <param name="sb">Builder that receives the text.</param>
private static void GetNodeSuffixImpl(ISuffixTree t, ISuffixNode p, StringBuilder sb)
{
    if (p.Parent == null)
    {
        return;
    }

    // Ancestors first, so the pieces are appended in top-down order.
    GetNodeSuffixImpl(t, p.Parent, sb);

    // A leaf's edge runs to the last character of the text; an internal
    // node's edge stops at its recorded end index.
    int lastIndex = p.IsLeaf ? t.Text.Length - 1 : p.Edge.End;
    int labelLength = lastIndex - p.Edge.Start + 1;

    sb.Append(t.Text.Substring(p.Edge.Start, labelLength));
}
/// <summary>
/// Initializes a new instance of the NUCmer class from an existing suffix
/// tree. The tree is wrapped in an internal MUMmer instance, the reference
/// sequence is taken from that instance, and defaults are applied through
/// <c>SetDefaults</c>.
/// </summary>
/// <param name="suffixTree">Suffix tree to use; must not be null.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="suffixTree"/> is null.
/// </exception>
public NUCmer(ISuffixTree suffixTree)
{
    if (null == suffixTree)
    {
        throw new ArgumentNullException("suffixTree");
    }

    // MUMmer built over the supplied tree; it also provides the reference sequence.
    MUMmer.MUMmer mummer = new MUMmer.MUMmer(suffixTree);
    _internalMummer = mummer;
    _internalReferenceSequence = _internalMummer.ReferenceSequence;

    SetDefaults();
}
/// <summary>
/// Initializes a new instance of the MUMmer class with the specified suffix
/// tree, which allows a custom suffix tree to be used. The reference sequence
/// is taken from the tree's <c>Sequence</c>.
/// </summary>
/// <param name="suffixTree">Suffix tree; must not be null.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="suffixTree"/> is null.
/// </exception>
public MUMmer(ISuffixTree suffixTree)
{
    if (null == suffixTree)
    {
        throw new ArgumentNullException("suffixTree");
    }

    this.suffixTree = suffixTree;
    ReferenceSequence = this.suffixTree.Sequence;

    // Defaults: minimum match length of 20, ambiguity allowed.
    LengthOfMUM = 20;
    NoAmbiguity = false;

    Name = Properties.Resource.MUMmerName;
    Description = Properties.Resource.MUMmerDescription;
}
/// <summary>
/// Creates a <c>SuffixTreeDiagnostic</c> for the given suffix tree and runs
/// its <c>build</c> step starting at the tree's root.
/// </summary>
/// <param name="suffixTree">Suffix tree to diagnose; must not be null.</param>
/// <returns>The populated diagnostic object.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="suffixTree"/> is null.
/// </exception>
public static SuffixTreeDiagnostic create(ISuffixTree suffixTree)
{
    if (suffixTree == null)
    {
        // The exception must name the actual parameter ("suffixTree"); the
        // previous value "tree" matched no parameter of this method.
        throw new ArgumentNullException("suffixTree");
    }

    var diag = new SuffixTreeDiagnostic() { Tree = suffixTree };
    diag.build(suffixTree.Root);
    return diag;
}
/// <summary>
/// Runs <c>SuffixTreeDiagnostics</c> over the given tree and writes two
/// sections to the debug output: the suffix-link count followed by the
/// content/suffix-link display, then the leaf (suffix) count followed by the
/// content/suffix display.
/// </summary>
/// <param name="t">Suffix tree to diagnose.</param>
private static void Diagnose(ISuffixTree t)
{
    Debug.WriteLine("");

    var report = new SuffixTreeDiagnostics(t);
    report.Run();

    // Section 1: suffix links.
    Debug.WriteLine("");
    Debug.WriteLine("suffix links count: {0} ", report.InternalCount);
    Debug.WriteLine("----------------------- ");
    var suffixLinkView = SuffixTreeDiagnostics.DisplayInfo.DisplayContent
                         | SuffixTreeDiagnostics.DisplayInfo.DisplaySuffixLinks;
    report.Display(suffixLinkView);

    // Section 2: suffixes (leaves).
    Debug.WriteLine("");
    Debug.WriteLine("suffix count (leaves): {0}", report.SuffixesCount);
    Debug.WriteLine("-------------------------- ");
    var suffixView = SuffixTreeDiagnostics.DisplayInfo.DisplayContent
                     | SuffixTreeDiagnostics.DisplayInfo.DisplaySuffixes;
    report.Display(suffixView);
}
/// <summary>
/// Builds a suffix tree over "AGTATGCCCCCCCCCCTGCCG" with the default builder
/// settings, searches it with the query "CCCCCCCCTATG" and a minimum match
/// length of 3, and expects two matches.
/// </summary>
public void TestStreamingInMemorySimpleSequence()
{
    Sequence reference = new Sequence(Alphabets.Protein, "AGTATGCCCCCCCCCCTGCCG");

    using (SimpleSuffixTreeBuilder builder = new SimpleSuffixTreeBuilder())
    {
        ISuffixTree tree = builder.BuildSuffixTree(reference);

        Sequence query = new Sequence(Alphabets.Protein, "CCCCCCCCTATG");
        IList<MaxUniqueMatch> matches = builder.FindMatches(tree, query, 3);

        // Verify the count of MUMs found
        Assert.AreEqual(2, matches.Count);
    }
}
/// <summary>
/// Initializes a new instance of the MUMmer class with the specified
/// reference sequence and builds a <c>MultiWaySuffixTree</c> for it.
/// </summary>
/// <param name="referenceSequence">Reference sequence; must not be null.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="referenceSequence"/> is null.
/// </exception>
public MUMmer(ISequence referenceSequence)
{
    if (null == referenceSequence)
    {
        throw new ArgumentNullException("referenceSequence");
    }

    ReferenceSequence = referenceSequence;

    // Build the suffix tree for the reference sequence.
    suffixTree = new MultiWaySuffixTree(referenceSequence);

    // Defaults: minimum match length of 20, ambiguity allowed.
    LengthOfMUM = 20;
    NoAmbiguity = false;

    Name = Properties.Resource.MUMmerName;
    Description = Properties.Resource.MUMmerDescription;
}
/// <summary>
/// Builds a suffix tree over "BANANA" with <c>PersistenceThreshold</c> set to
/// 0, runs <c>FindMaximumMatches</c> with the query "ANA" and a minimum match
/// length of 3, and expects exactly one match.
/// </summary>
public void TestFindMaximumMatchPersistentInSequence()
{
    Sequence reference = new Sequence(Alphabets.Protein, "BANANA");

    using (SimpleSuffixTreeBuilder builder = new SimpleSuffixTreeBuilder())
    {
        builder.PersistenceThreshold = 0;
        ISuffixTree tree = builder.BuildSuffixTree(reference);

        Sequence query = new Sequence(Alphabets.Protein, "ANA");
        IList<MaxUniqueMatch> matches = builder.FindMaximumMatches(tree, query, 3);

        // Verify the count of MUMs found
        Assert.AreEqual(1, matches.Count);
    }
}
/// <summary>
/// Builds a Kurtz suffix tree over "BANANA", runs <c>FindMaximumMatches</c>
/// with the query "ANA" and a minimum match length of 3, logs the start and
/// end times of the search, and expects exactly one match.
/// </summary>
public void TestFindMaximumMatchInSequence()
{
    string referenceText = "BANANA";
    Sequence reference = new Sequence(Alphabets.Protein, referenceText);
    ApplicationLog.WriteLine("Begin SuffixTree Test for string '{0}'", referenceText);

    ISuffixTreeBuilder builder = new KurtzSuffixTreeBuilder();
    ISuffixTree tree = builder.BuildSuffixTree(reference);

    string queryText = "ANA";
    Sequence query = new Sequence(Alphabets.Protein, queryText);
    ApplicationLog.WriteLine("Query string : {0}. Minimum Length of MUM : 3.", queryText);

    // The search itself is bracketed by the two timing log entries.
    ApplicationLog.WriteTime("Start Time.", DateTime.Now.ToString());
    IList<MaxUniqueMatch> matches = builder.FindMaximumMatches(tree, query, 3);
    ApplicationLog.WriteTime("End Time.", DateTime.Now.ToString());

    // Verify the count of MUMs found
    Assert.AreEqual(1, matches.Count);
}
/// <summary>
/// Main execute method defining the step-by-step algorithm: validate the
/// input, configure the consensus resolver, build the suffix tree over the
/// (possibly concatenated) reference, then for each query sequence stream it
/// through the tree, cluster the MUMs, process the clusters into delta
/// alignments and convert those into scored pairwise alignments.
/// </summary>
/// <param name="referenceSequenceList">List of reference sequences.</param>
/// <param name="querySequenceList">List of query sequences.</param>
/// <returns>
/// One sequence alignment per query sequence. A query equal to the reference
/// sequence is skipped. A query that yields no MUMs gets an alignment with no
/// aligned sequences added.
/// </returns>
private IList<IPairwiseSequenceAlignment> Alignment(
    IList<ISequence> referenceSequenceList,
    IList<ISequence> querySequenceList)
{
    // Validate first. Previously the reference list was dereferenced (and
    // ConsensusResolver mutated) before validation ran, so a null list
    // surfaced as a NullReferenceException instead of whatever Validate reports.
    Validate(referenceSequenceList, querySequenceList);

    // Make the consensus resolver use the alphabet of the first reference sequence.
    if (referenceSequenceList.Count > 0)
    {
        if (ConsensusResolver == null)
        {
            ConsensusResolver = new SimpleConsensusResolver(referenceSequenceList[0].Alphabet);
        }
        else
        {
            ConsensusResolver.SequenceAlphabet = referenceSequenceList[0].Alphabet;
        }
    }

    IList<IPairwiseSequenceAlignment> results = new List<IPairwiseSequenceAlignment>();

    // Step 1: concatenate all reference sequences into one when there are several.
    ISequence referenceSequence;
    if (referenceSequenceList.Count > 1)
    {
        referenceSequence = ConcatSequence(referenceSequenceList);
    }
    else
    {
        referenceSequence = referenceSequenceList[0];
    }

    _referenceSequence = referenceSequence;

    // Step 2: build the suffix tree from the reference sequence.
    _suffixTree = BuildSuffixTree(_referenceSequence);

    foreach (ISequence sequence in querySequenceList)
    {
        if (sequence.Equals(referenceSequence))
        {
            continue;
        }

        IPairwiseSequenceAlignment sequenceAlignment =
            new PairwiseSequenceAlignment(referenceSequence, sequence);

        // Step 3: stream the query sequence through the suffix tree.
        _mumList = Streaming(_suffixTree, referenceSequence, sequence, LengthOfMUM);

        if (_mumList.Count > 0)
        {
            // Step 4: group the MUMs into clusters.
            _clusterList = GetClusters(_mumList);

            // Step 5: process the clusters into delta alignments.
            IList<DeltaAlignment> deltaAlignments = ProcessCluster(
                referenceSequenceList,
                _clusterList);

            // Step 6: convert delta alignments to sequence alignments.
            IList<PairwiseAlignedSequence> alignments = ConvertDeltaToAlignment(deltaAlignments);

            foreach (PairwiseAlignedSequence align in alignments)
            {
                // Score the alignment, then derive its consensus.
                align.Score = CalculateScore(align.FirstSequence, align.SecondSequence);
                align.Consensus = MakeConsensus(align.FirstSequence, align.SecondSequence);

                sequenceAlignment.PairwiseAlignedSequences.Add(align);
            }
        }

        results.Add(sequenceAlignment);
    }

    return results;
}
/// <summary>
/// Initializes a new instance of the SuffixTreeDiagnostics class and stores
/// the given tree in the <c>Tree</c> property.
/// </summary>
/// <param name="tree">Suffix tree to run diagnostics on.</param>
public SuffixTreeDiagnostics(ISuffixTree tree)
{
    Tree = tree;
}
/// <summary>
/// Creates a <c>SuffixTreeDiagnostic</c> for the given suffix tree and runs
/// its <c>build</c> step starting at the tree's root.
/// </summary>
/// <param name="suffixTree">Suffix tree to diagnose; must not be null.</param>
/// <returns>The populated diagnostic object.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="suffixTree"/> is null.
/// </exception>
public static SuffixTreeDiagnostic create(ISuffixTree suffixTree)
{
    // Guard first. The exception names the actual parameter ("suffixTree");
    // the previous value "tree" matched no parameter of this method.
    if (suffixTree == null)
    {
        throw new ArgumentNullException("suffixTree");
    }

    var diag = new SuffixTreeDiagnostic() { Tree = suffixTree };
    diag.build(suffixTree.Root);
    return diag;
}
/// <summary>
/// Main execute method defining the step-by-step algorithm for the
/// "accumulated MUMs" mode. MUMs are first collected for every query
/// sequence and kept in <c>_mums</c> (raw) and <c>_finalMums</c> (after
/// sorting and LIS). In a second pass, alignments are produced by running
/// the pairwise algorithm on the gaps.
/// </summary>
/// <param name="referenceSequence">Reference sequence.</param>
/// <param name="querySequenceList">List of query sequences.</param>
/// <returns>A list of sequence alignments.</returns>
private IList<IPairwiseSequenceAlignment> AlignmentWithAccumulatedMUMs(
    ISequence referenceSequence,
    IList<ISequence> querySequenceList)
{
    // Pass 1: collect the MUMs for every query.
    _mums = new Dictionary<ISequence, IList<MaxUniqueMatch>>();
    _finalMums = new Dictionary<ISequence, IList<MaxUniqueMatch>>();

    if (Validate(referenceSequence, querySequenceList))
    {
        // Step 1: build the suffix tree from the reference sequence.
        ISuffixTree suffixTree = BuildSuffixTree(referenceSequence);

        foreach (ISequence query in querySequenceList)
        {
            if (query.Equals(referenceSequence))
            {
                continue;
            }

            // Step 2: stream the query through the suffix tree and keep the raw MUMs.
            IList<MaxUniqueMatch> rawMums = Streaming(suffixTree, referenceSequence, query, LengthOfMUM);
            _mums.Add(query, rawMums);

            // Step 3(a): sort the MUMs.
            IList<MaxUniqueMatch> sortedMums = SortMum(rawMums);

            // Step 3(b): LIS over the sorted MUMs, or null when the sorted list is empty.
            IList<MaxUniqueMatch> reducedMums = sortedMums.Count > 0
                ? CollectLongestIncreasingSubsequence(sortedMums)
                : null;

            _finalMums.Add(query, reducedMums);
        }
    }

    // Pass 2: turn the accumulated MUMs into alignments.
    IList<IPairwiseSequenceAlignment> results = new List<IPairwiseSequenceAlignment>();

    if (MUMs == null || FinalMUMs == null)
    {
        return results;
    }

    _referenceSequence = referenceSequence;

    foreach (var entry in FinalMUMs)
    {
        var query = entry.Key;
        _mumList = MUMs[query];
        _finalMumList = entry.Value;

        IPairwiseSequenceAlignment alignment = new PairwiseSequenceAlignment(referenceSequence, query);

        if (_mumList.Count == 0)
        {
            // No MUMs for this query: run the plain pairwise algorithm instead.
            IList<IPairwiseSequenceAlignment> pairwiseResults = RunPairWise(referenceSequence, query);
            foreach (IPairwiseSequenceAlignment pairwiseAlignment in pairwiseResults)
            {
                results.Add(pairwiseAlignment);
            }

            continue;
        }

        if (_finalMumList.Count > 0)
        {
            // Step 4: process the gaps in each sequence with pairwise alignment.
            alignment.PairwiseAlignedSequences.Add(ProcessGaps(referenceSequence, query));
        }

        results.Add(alignment);
    }

    return results;
}
/// <summary>
/// Initializes a new instance of the TextMatcher class and stores the given
/// suffix tree in the <c>Tree</c> property.
/// </summary>
/// <param name="suffixTree">Suffix tree used by this matcher.</param>
public TextMatcher(ISuffixTree suffixTree)
{
    Tree = suffixTree;
}
/// <summary>
/// Traverses the suffix tree using the query sequence and returns the list
/// of matches. Declared abstract: each derived class supplies the traversal.
/// </summary>
/// <param name="suffixTree">Suffix tree</param>
/// <param name="referenceSequence">Reference sequence</param>
/// <param name="sequence">Query sequence</param>
/// <param name="lengthOfMUM">Minimum length of MUM</param>
/// <returns>List of matches</returns>
protected abstract IList <MaxUniqueMatch> Streaming( ISuffixTree suffixTree, ISequence referenceSequence, ISequence sequence, long lengthOfMUM);
/// <summary>
/// Main execute method defining the step-by-step algorithm for the
/// "non-accumulated" mode: MUMs are not stored per query. Each query's MUMs
/// and gaps are processed straight away to produce its alignment.
/// </summary>
/// <param name="referenceSequence">Reference sequence.</param>
/// <param name="querySequenceList">List of query sequences.</param>
/// <returns>
/// A list of sequence alignments. It is empty when validation returns false,
/// and <c>null</c> when validation succeeds but either argument is null.
/// </returns>
private IList<IPairwiseSequenceAlignment> AlignmentWithoutAccumulatedMUMs(
    ISequence referenceSequence,
    IList<ISequence> querySequenceList)
{
    IList<IPairwiseSequenceAlignment> results = new List<IPairwiseSequenceAlignment>();

    if (!Validate(referenceSequence, querySequenceList))
    {
        return results;
    }

    // Safety check so that null inputs are handled.
    // NOTE(review): this runs only after Validate has already received the
    // arguments — confirm whether it can ever trigger.
    if (referenceSequence == null || querySequenceList == null)
    {
        return null;
    }

    _referenceSequence = referenceSequence;

    // Step 1: build the suffix tree from the reference sequence.
    _suffixTree = BuildSuffixTree(_referenceSequence);

    foreach (ISequence query in querySequenceList)
    {
        if (query.Equals(referenceSequence))
        {
            continue;
        }

        IPairwiseSequenceAlignment alignment = new PairwiseSequenceAlignment(referenceSequence, query);

        // Step 2: stream the query through the suffix tree.
        _mumList = Streaming(_suffixTree, referenceSequence, query, LengthOfMUM);

        // Step 3(a): sort the MUMs.
        _sortedMumList = SortMum(_mumList);

        if (_sortedMumList.Count == 0)
        {
            // No MUMs for this query: run the plain pairwise algorithm instead.
            IList<IPairwiseSequenceAlignment> pairwiseResults = RunPairWise(referenceSequence, query);
            foreach (IPairwiseSequenceAlignment pairwiseAlignment in pairwiseResults)
            {
                results.Add(pairwiseAlignment);
            }

            continue;
        }

        // Step 3(b): LIS using the greedy cover algorithm.
        _finalMumList = CollectLongestIncreasingSubsequence(_sortedMumList);

        if (_finalMumList.Count > 0)
        {
            // Step 4: process the gaps in each sequence with pairwise alignment.
            alignment.PairwiseAlignedSequences.Add(ProcessGaps(referenceSequence, query));
        }

        results.Add(alignment);
    }

    return results;
}
/// <summary>
/// Finds the matches of a sequence in the suffix tree. The query positions
/// are split into chunks that are searched by separate tasks. The per-chunk
/// results are then merged in chunk order and numbered.
/// </summary>
/// <param name="suffixTree">Suffix tree to search; must implement
/// <c>IMultiWaySuffixTree</c>.</param>
/// <param name="searchSequence">Query sequence to find matches for.</param>
/// <param name="lengthOfMUM">Minimum length of a match.</param>
/// <returns>Matches found, with their order fields set to their 1-based
/// position in the merged list.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="suffixTree"/> or
/// <paramref name="searchSequence"/> is null, and also when
/// <paramref name="suffixTree"/> is not an <c>IMultiWaySuffixTree</c>.
/// </exception>
private IList<MaxUniqueMatch> FindMatchWithOption(
    ISuffixTree suffixTree,
    ISequence searchSequence,
    long lengthOfMUM)
{
    if (suffixTree == null)
    {
        throw new ArgumentNullException("suffixTree");
    }

    if (searchSequence == null)
    {
        throw new ArgumentNullException("searchSequence");
    }

    IMultiWaySuffixTree multiWayTree = suffixTree as IMultiWaySuffixTree;
    if (multiWayTree == null)
    {
        throw new ArgumentNullException("suffixTree");
    }

    ValidateSequence(suffixTree.Sequence, searchSequence);

    // Initialize the shared search state.
    _minimumLengthOfMUM = lengthOfMUM;
    _suffixTree = multiWayTree;
    InitializeReferenceSequence(suffixTree.Sequence);
    InitializeQuerySequence(searchSequence);

    // Chunk size: query start positions divided by the processor count, at least 1.
    int interval = (int)(_querySequence.Count - (_minimumLengthOfMUM - 1)) / Environment.ProcessorCount;
    if (interval < 1)
    {
        interval = 1;
    }

    // Start one task per chunk of query start positions.
    var chunkTasks = new List<Task<List<MaxUniqueMatch>>>();
    for (int start = 0; start < _querySequence.Count - (_minimumLengthOfMUM - 1); start += interval)
    {
        // Copy the loop variable so each lambda captures its own start index.
        int chunkStart = start;

        // NOTE(review): with a one-parameter lambda the second argument appears
        // to bind to StartNew's `state` parameter rather than to creation
        // options — confirm intent.
        chunkTasks.Add(
            Task.Factory.StartNew<List<MaxUniqueMatch>>(
                o => FindMUMs(chunkStart, interval),
                TaskCreationOptions.None));
    }

    // Merge the chunk results in chunk order.
    var merged = new List<MaxUniqueMatch>();
    foreach (Task<List<MaxUniqueMatch>> chunkTask in chunkTasks)
    {
        List<MaxUniqueMatch> chunk = chunkTask.Result;

        if (merged.Count == 0)
        {
            merged.AddRange(chunk);
            continue;
        }

        if (chunk.Count == 0)
        {
            continue;
        }

        // Drop the chunk's first match when its query span lies entirely
        // within the span of the last match already merged.
        MaxUniqueMatch previous = merged[merged.Count - 1];
        MaxUniqueMatch current = chunk[0];

        var previousStart = previous.SecondSequenceStart;
        var previousEnd = previous.SecondSequenceStart + previous.Length;
        var currentStart = current.SecondSequenceStart;
        var currentEnd = current.SecondSequenceStart + current.Length;

        bool startsInsidePrevious = currentStart >= previousStart && currentStart <= previousEnd;
        bool endsInsidePrevious = currentEnd >= previousStart && currentEnd <= previousEnd;

        if (startsInsidePrevious && endsInsidePrevious)
        {
            chunk.RemoveAt(0);
        }

        if (chunk.Count > 0)
        {
            merged.AddRange(chunk);
        }
    }

    // Number the matches by their position in the merged list (1-based).
    for (int position = 0; position < merged.Count; position++)
    {
        int order = position + 1;
        merged[position].FirstSequenceMumOrder = order;
        merged[position].SecondSequenceMumOrder = order;
    }

    return merged;
}