/// <summary>
/// Builds a suffix tree for the given sequence.
/// Suffix start positions are grouped by the symbol they begin with, one
/// branch is built per group on its own task, and the resulting branches
/// are merged under the root of a newly created tree.
/// </summary>
/// <param name="sequence">Sequence to build the suffix tree for.</param>
/// <returns>The merged suffix tree (also stored in the builder).</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="sequence"/> is null.
/// </exception>
public ISuffixTree BuildSuffixTree(ISequence sequence)
{
    if (sequence == null)
    {
        throw new ArgumentNullException("sequence");
    }

    // Initialize
    ValidateReferenceSequence(sequence);
    InitializeReferenceSequence(sequence);

    // Walk the reference once and bucket every suffix start index by the
    // symbol found at that position.
    Dictionary<byte, IList<int>> treeTasks = new Dictionary<byte, IList<int>>();
    for (int index = 0; index < ReferenceLength; index++)
    {
        // Read the symbol once; it is needed for both the lookup and the insert.
        byte symbol = GetReferenceSymbol(index);

        IList<int> startIndices;
        if (!treeTasks.TryGetValue(symbol, out startIndices))
        {
            startIndices = new List<int>();
            treeTasks.Add(symbol, startIndices);
        }

        startIndices.Add(index);
    }

    _distinctSymbolCount = treeTasks.Count;

    if (EdgeStorage == null)
    {
        EdgeStorage = new FileSuffixEdgeStorage();
    }

    // One task per distinct leading symbol. The parameterless delegate makes
    // TaskCreationOptions.None bind as creation options rather than being
    // boxed and passed as an unused state object.
    Task<IMultiWaySuffixTree>[] tasks = treeTasks.Values
        .Select(indices => Task<IMultiWaySuffixTree>.Factory.StartNew(
            () => AppendSuffix(indices),
            TaskCreationOptions.None))
        .ToArray();

    // Wait for all the branches to be built
    Task.WaitAll(tasks);

    _suffixTree = CreateSuffixTree();

    // Merge the branches into the final tree
    foreach (Task<IMultiWaySuffixTree> task in tasks)
    {
        _suffixTree.Merge(task.Result);
    }

    return _suffixTree;
}
/// <summary>
/// Creates an empty suffix tree, choosing between the in-memory and the
/// persistent implementation by comparing an estimate of the memory the
/// tree needs (2 * reference length * estimated edge size) with the value
/// reported by GetPhysicalMemory().
/// </summary>
/// <returns>
/// A MultiWaySuffixTree when the estimate is below the reported memory,
/// otherwise a PersistentMultiWaySuffixTree backed by EdgeStorage.
/// </returns>
private IMultiWaySuffixTree CreateSuffixTree()
{
    // Estimated per-edge overhead for the object instance and for the
    // children array; the figures differ between 64-bit and 32-bit processes.
    int instanceSize;
    int arraySize;
    if (Environment.Is64BitProcess)
    {
        instanceSize = 16;
        arraySize = 32;
    }
    else
    {
        instanceSize = 12;
        arraySize = 16;
    }

    // Two integers (StartIndex & EndIndex) plus the overheads above
    long edgeSize = (2 * sizeof(int)) + instanceSize + arraySize;
    long requiredCapacity = edgeSize * 2 * ReferenceLength;
    double memoryCapacity = GetPhysicalMemory();

    if (requiredCapacity < memoryCapacity)
    {
        return new MultiWaySuffixTree(_referenceSequences, _distinctSymbolCount);
    }

    // Persistent edges additionally carry one long (Key)
    edgeSize += sizeof(long);

    // -1 means no threshold has been chosen yet: derive one from the
    // reported memory, the edge size and the number of distinct symbols.
    if (PersistenceThreshold == -1)
    {
        PersistenceThreshold = (int)(memoryCapacity / (edgeSize * _distinctSymbolCount));
    }

    return new PersistentMultiWaySuffixTree(
        _referenceSequences,
        _distinctSymbolCount,
        PersistenceThreshold,
        EdgeStorage);
}
/// <summary>
/// Builds a tree containing the suffixes that start at the given positions.
/// For each suffix the walk down the tree ends in one of two ways:
/// 1. a leaf edge has been inserted for the suffix, or
/// 2. the end of the reference has been reached.
/// </summary>
/// <param name="startIndices">List of index of the first character of suffix</param>
/// <returns>Suffix tree</returns>
private IMultiWaySuffixTree AppendSuffix(IList<int> startIndices)
{
    IMultiWaySuffixTree branch = CreateSuffixTree();

    foreach (int suffixStart in startIndices)
    {
        int position = suffixStart;
        IEdge parent = branch.Root;
        bool inserted = false;

        do
        {
            IEdge matched = branch.Find(parent, GetReferenceSymbol(position));
            if (matched == null)
            {
                // No edge to follow from here: the rest of the suffix
                // becomes a new leaf under the current parent.
                branch.Insert(parent, position, ReferenceLength - 1);
                break;
            }

            // Step past the symbol that was used for the Find lookup.
            position++;

            // Compare the remaining symbols on the edge with the suffix.
            int edgePosition = matched.StartIndex + 1;
            while (!inserted && edgePosition <= matched.EndIndex)
            {
                if (GetReferenceSymbol(position) == GetReferenceSymbol(edgePosition))
                {
                    position++;
                    edgePosition++;
                }
                else
                {
                    // Mismatch inside the edge: split just before it and
                    // add the leaf edge for the rest of the suffix.
                    IEdge splitEdge = branch.Split(matched, edgePosition - 1);
                    branch.Insert(splitEdge, position, ReferenceLength - 1);
                    inserted = true;
                }
            }

            // Continue the walk from the edge that was just traversed.
            parent = matched;
        }
        while (!inserted && position < ReferenceLength);
    }

    return branch;
}
/// <summary>
/// Builds a suffix tree for "BANANA" with the builder's default settings
/// and verifies the edge count and the sequence stored in the tree.
/// </summary>
public void TestInMemorySimpleSequence()
{
    string sequenceString = "BANANA";
    Sequence sequence = new Sequence(Alphabets.Protein, sequenceString);

    using (SimpleSuffixTreeBuilder simpleSuffixTreeBuilder = new SimpleSuffixTreeBuilder())
    {
        IMultiWaySuffixTree inMemorySuffixTree =
            simpleSuffixTreeBuilder.BuildSuffixTree(sequence) as IMultiWaySuffixTree;

        // Fail with an assertion, not a NullReferenceException, if the
        // builder returns a tree that is not multi-way.
        Assert.IsNotNull(inMemorySuffixTree);

        // Verify the edges in Suffix Tree
        Assert.AreEqual(7, inMemorySuffixTree.Count);

        // Verify the sequence in Suffix Tree (expected value first)
        Assert.AreEqual(sequenceString, inMemorySuffixTree.Sequence.ToString());
    }
}
/// <summary>
/// Builds a suffix tree for "BANANA" with PersistenceThreshold set to 0
/// and verifies the edge count and the sequence stored in the tree.
/// </summary>
public void TestPersistentSimpleSequence()
{
    string sequenceString = "BANANA";
    Sequence sequence = new Sequence(Alphabets.Protein, sequenceString);

    using (SimpleSuffixTreeBuilder simpleSuffixTreeBuilder = new SimpleSuffixTreeBuilder())
    {
        simpleSuffixTreeBuilder.PersistenceThreshold = 0;

        IMultiWaySuffixTree persistentSuffixTree =
            simpleSuffixTreeBuilder.BuildSuffixTree(sequence) as IMultiWaySuffixTree;

        // Fail with an assertion, not a NullReferenceException, if the
        // builder returns a tree that is not multi-way.
        Assert.IsNotNull(persistentSuffixTree);

        // Verify the edges in Suffix Tree
        Assert.AreEqual(7, persistentSuffixTree.Count);

        // Verify the sequence in Suffix Tree (expected value first)
        Assert.AreEqual(sequenceString, persistentSuffixTree.Sequence.ToString());
    }
}
/// <summary>
/// Merge the given branch at the root of Suffix Tree.
/// Assumption:
///  The root node of the given branch contains only one edge, which is the
///  branch to be merged. Only the first child of the branch root is attached.
/// </summary>
/// <param name="branch">Branch to be merged.</param>
/// <returns>
/// True when the branch was attached to the root; false when the root of
/// the branch has no children, i.e. there is nothing to merge.
/// </returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="branch"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// Thrown when the root of <paramref name="branch"/> is not a MultiWaySuffixEdge.
/// </exception>
/// <exception cref="InvalidOperationException">
/// Thrown when the root already holds the maximum number of children.
/// </exception>
public bool Merge(IMultiWaySuffixTree branch)
{
    if (branch == null)
    {
        throw new ArgumentNullException("branch");
    }

    MultiWaySuffixEdge mwBranchEdge = branch.Root as MultiWaySuffixEdge;
    if (mwBranchEdge == null)
    {
        throw new ArgumentException(
            "The root of the branch must be a MultiWaySuffixEdge.",
            "branch");
    }

    var branchChildren = mwBranchEdge.GetChildren();
    if (branchChildren == null || branchChildren.Length == 0)
    {
        // Nothing to merge
        return false;
    }

    MultiWaySuffixEdge mwRoot = Root as MultiWaySuffixEdge;
    var rootChildren = mwRoot.GetChildren();

    if (rootChildren != null && rootChildren.Length >= _maximumChildrenCount)
    {
        // No more children edge can be added.
        throw new InvalidOperationException(string.Format(
            CultureInfo.CurrentCulture,
            "Cannot add more than {0} child nodes to edge.",
            _maximumChildrenCount));
    }

    mwRoot.AddChild(branchChildren[0]);

    // - the original root edge of branch
    Count += branch.Count - 1;
    return true;
}
/// <summary>
/// Find the matches of sequence in suffix tree.
/// The query positions are divided into intervals (roughly one per
/// processor), each interval is searched on its own task via FindMUMs,
/// and the per-task results are concatenated in interval order.
/// </summary>
/// <param name="suffixTree">Suffix tree to search on</param>
/// <param name="searchSequence">Query sequence to find matches</param>
/// <param name="lengthOfMUM">Minimum length of the match</param>
/// <returns>Matches found, numbered from 1 in the order they were merged</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="suffixTree"/> or
/// <paramref name="searchSequence"/> is null, or when
/// <paramref name="suffixTree"/> is not an IMultiWaySuffixTree.
/// </exception>
private IList<MaxUniqueMatch> FindMatchWithOption(
    ISuffixTree suffixTree,
    ISequence searchSequence,
    long lengthOfMUM)
{
    if (suffixTree == null)
    {
        throw new ArgumentNullException("suffixTree");
    }

    if (searchSequence == null)
    {
        throw new ArgumentNullException("searchSequence");
    }

    IMultiWaySuffixTree mwSuffixTree = suffixTree as IMultiWaySuffixTree;
    if (mwSuffixTree == null)
    {
        // NOTE(review): the argument is of the wrong type rather than null here;
        // the exception type is kept so existing catch blocks keep working.
        throw new ArgumentNullException("suffixTree");
    }

    ValidateSequence(suffixTree.Sequence, searchSequence);

    // Initialize
    _minimumLengthOfMUM = lengthOfMUM;
    _suffixTree = mwSuffixTree;
    InitializeReferenceSequence(suffixTree.Sequence);
    InitializeQuerySequence(searchSequence);

    // Exclusive upper bound of the query start positions handed to FindMUMs.
    long searchLimit = _querySequence.Count - (_minimumLengthOfMUM - 1);

    int interval = ((int)searchLimit) / Environment.ProcessorCount;
    if (interval < 1)
    {
        interval = 1;
    }

    List<Task<List<MaxUniqueMatch>>> tasks = new List<Task<List<MaxUniqueMatch>>>();
    for (int index = 0; index < searchLimit; index += interval)
    {
        // Copy the loop variable so each task captures its own start index.
        int taskIndex = index;

        // The parameterless delegate makes TaskCreationOptions.None bind as
        // creation options rather than as an unused state object.
        tasks.Add(Task.Factory.StartNew<List<MaxUniqueMatch>>(
            () => FindMUMs(taskIndex, interval),
            TaskCreationOptions.None));
    }

    List<MaxUniqueMatch> mergedList = new List<MaxUniqueMatch>();
    foreach (Task<List<MaxUniqueMatch>> task in tasks)
    {
        // Blocks until this interval has been searched.
        List<MaxUniqueMatch> local = task.Result;
        if (local.Count == 0)
        {
            continue;
        }

        if (mergedList.Count > 0)
        {
            // Check if there is overlap: drop the first MUM of this interval
            // when both its start and its end (in SecondSequence coordinates)
            // lie within the last MUM already merged.
            MaxUniqueMatch previous = mergedList[mergedList.Count - 1];
            MaxUniqueMatch current = local[0];

            long previousStart = previous.SecondSequenceStart;
            long previousEnd = previous.SecondSequenceStart + previous.Length;
            long currentStart = current.SecondSequenceStart;
            long currentEnd = current.SecondSequenceStart + current.Length;

            bool startsInside = currentStart >= previousStart && currentStart <= previousEnd;
            bool endsInside = currentEnd >= previousStart && currentEnd <= previousEnd;

            if (startsInside && endsInside)
            {
                local.RemoveAt(0);
            }
        }

        mergedList.AddRange(local);
    }

    // Order the mum list with query sequence order
    for (int index = 0; index < mergedList.Count; index++)
    {
        mergedList[index].FirstSequenceMumOrder = index + 1;
        mergedList[index].SecondSequenceMumOrder = index + 1;
    }

    return mergedList;
}