/// <summary>
/// Traverses the suffix tree depth-first (iteratively, using an explicit stack) starting from the
/// specified edge and appends one start index to the startIndexes list for every leaf reached.
/// </summary>
/// <param name="current">Edge to start traversing from.</param>
/// <param name="length">Length of the edge for which the startIndexes are needed.</param>
/// <param name="startIndexes">List containing the start indexes.</param>
private static void DepthFirstIterativeTraversal(
    MultiWaySuffixEdge current,
    long length,
    List<long> startIndexes)
{
    // A leaf has nothing beneath it: report it directly.
    if (current.IsLeaf)
    {
        startIndexes.Add(current.StartIndex - length);
        return;
    }

    // Each entry holds: the intermediate edge to come back to, the index of the next child of
    // that edge to visit, and the length accumulated before descending into that edge.
    // The child index is kept as an int; narrowing it to a byte would wrap around (or throw in
    // a checked context) for an edge with more than 255 children and resume at the wrong child.
    var stack = new Stack<Tuple<MultiWaySuffixEdge, int, long>>();
    int childIndex = 0;

    while (true)
    {
        bool intermediateEdgeFound = false;
        int count = current.Children.Length;

        // Number of symbols on the current edge, derived from where its first child starts.
        long currentEdgeLength = current.Children[0].StartIndex - current.StartIndex;

        for (; childIndex < count; childIndex++)
        {
            MultiWaySuffixEdge child = current.Children[childIndex];
            if (child.IsLeaf)
            {
                startIndexes.Add(child.StartIndex - (length + currentEdgeLength));
                continue;
            }

            // Remember where to resume in the current edge, then descend into the child.
            stack.Push(new Tuple<MultiWaySuffixEdge, int, long>(current, childIndex + 1, length));
            current = child;
            childIndex = 0;
            length = currentEdgeLength + length;
            intermediateEdgeFound = true;
            break;
        }

        if (intermediateEdgeFound)
        {
            continue;
        }

        // All children of the current edge are processed: go back up, or finish.
        if (stack.Count == 0)
        {
            return;
        }

        Tuple<MultiWaySuffixEdge, int, long> item = stack.Pop();
        current = item.Item1;
        childIndex = item.Item2;
        length = item.Item3;
    }
}
/// <summary>
/// Initializes a new instance of the MultiWaySuffixTree class with the specified sequence.
/// The sequence symbols are converted through the alphabet's symbol value map, the first
/// occurrence of every distinct converted symbol is recorded, and then the suffix tree and its
/// suffix links are built.
/// </summary>
/// <param name="sequence">Sequence to build the suffix tree.</param>
/// <exception cref="ArgumentNullException">Thrown when sequence is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown when sequence contains no symbols.</exception>
public MultiWaySuffixTree(ISequence sequence)
{
    if (sequence == null)
    {
        throw new ArgumentNullException("sequence");
    }

    if (sequence.Count == 0)
    {
        throw new ArgumentOutOfRangeException("sequence", Resource.EmptySequence);
    }

    byte[] aliasMap = sequence.Alphabet.GetSymbolValueMap();
    var uniqueSymbols = new HashSet<byte>();
    this.uniqueSymbolsInReference = uniqueSymbols;
    this.uniqueSymbolsStartIndexes = new long[byte.MaxValue + 1];

    long sequenceLength = sequence.Count;
    var convertedValues = new byte[sequenceLength];

    // The counter is a long (start indexes are stored as long) so that it cannot overflow
    // on sequences longer than int.MaxValue.
    for (long index = 0; index < sequenceLength; index++)
    {
        byte symbol = aliasMap[sequence[index]];

        // HashSet.Add returns true only the first time a symbol is seen.
        if (uniqueSymbols.Add(symbol))
        {
            this.uniqueSymbolsStartIndexes[symbol] = index;
        }

        convertedValues[index] = symbol;
    }

    this.Sequence = sequence;
    this.referenceSequence = new Sequence(sequence.Alphabet, convertedValues, false);
    this.symbolsCount = sequence.Count;
    this.Name = Resource.MultiWaySuffixTreeName;
    this.MinLengthOfMatch = 20;
    this.NoAmbiguity = false;

    // Create root edge.
    this.rootEdge = new MultiWaySuffixEdge();
    this.edgesCount++;

    // Follow the alphabet-to-base-alphabet map until an alphabet with no further mapping is reached.
    this.supportedBaseAlphabet = sequence.Alphabet;
    IAlphabet alphabet;
    while (Alphabets.AlphabetToBaseAlphabetMap.TryGetValue(this.supportedBaseAlphabet, out alphabet))
    {
        this.supportedBaseAlphabet = alphabet;
    }

    // Build the suffix tree.
    this.BuildSuffixTree();

    // Update tree with suffixLinks.
    this.UpdateSuffixLinks();
}
/// <summary>
/// Finds the edge that the suffix link of one child of the given parent edge should point to,
/// and stores it in the first slot of that child's SuffixLink array. The search starts at the
/// edge referenced by the parent's own suffix link and walks down following the child's symbols.
/// </summary>
/// <param name="parenetEdge">Parent edge.</param>
/// <param name="childIndex">Index of the child to update.</param>
private void UpdateSuffixLinkForEdge(MultiWaySuffixEdge parenetEdge, int childIndex)
{
    MultiWaySuffixEdge target = parenetEdge.Children[childIndex];

    // Position in the reference of the next symbol to follow, and how many symbols of the
    // target edge are still to be consumed (the edge ends just before its first child starts).
    long position = target.StartIndex;
    long remaining = target.Children[0].StartIndex - position;

    MultiWaySuffixEdge linkOfParent = parenetEdge.SuffixLink[0];
    int candidateCount = linkOfParent.Children.Length;
    byte wanted = this.referenceSequence[position];

    for (int i = 0; i < candidateCount; i++)
    {
        MultiWaySuffixEdge walker = linkOfParent.Children[i];

        // Suffix links point to intermediate edges only, and the walk has to begin on the
        // child whose first symbol matches.
        if (walker.IsLeaf || this.referenceSequence[walker.StartIndex] != wanted)
        {
            continue;
        }

        for (;;)
        {
            long walkerLength = walker.Children[0].StartIndex - walker.StartIndex;
            if (remaining == walkerLength)
            {
                // Exactly consumed the target's symbols: this is the link destination.
                target.SuffixLink[0] = walker;
                return;
            }

            // Skip over this edge and continue with the child that starts with the next symbol.
            remaining -= walkerLength;
            position += walkerLength;

            MultiWaySuffixEdge[] next = walker.Children;
            long nextCount = next.Length;
            wanted = this.referenceSequence[position];

            for (int j = 0; j < nextCount; j++)
            {
                if (this.referenceSequence[next[j].StartIndex] == wanted)
                {
                    walker = next[j];
                    break;
                }
            }
        }
    }
}
/// <summary>
/// Gets the intermediate edges present in the path of the match between specified edges for the
/// next query index to match. The walk starts at the edge referenced by fromEdge's suffix link and
/// descends along the query symbols; every intermediate edge passed on the way is pushed on the
/// returned stack together with the match length reached at that edge, provided that length is at
/// least minLengthOfMatch.
/// </summary>
/// <param name="fromEdge">Edge from the which to search from.</param>
/// <param name="toedge">Edge where to stop the search.</param>
/// <param name="matchLengthOfFromEdge">Matching symbols count of the fromEdge.</param>
/// <param name="lengthToSearch">Length to search.</param>
/// <param name="nextQueryIndex">Next query index.</param>
/// <param name="convertedSearchSeq">Converted search sequence.</param>
/// <param name="minLengthOfMatch">Minimum length of match required.</param>
/// <returns>
/// Returns the intermediate edges found between the fromEdge to toEdge; an empty stack when either
/// edge is a leaf or nothing qualifies.
/// </returns>
private Stack<EdgesFound> GetIntermediateEdges(
    MultiWaySuffixEdge fromEdge,
    MultiWaySuffixEdge toedge,
    long matchLengthOfFromEdge,
    long lengthToSearch,
    long nextQueryIndex,
    ISequence convertedSearchSeq,
    long minLengthOfMatch)
{
    var edgesFoundForNextQueryIndex = new Stack<EdgesFound>();
    var edge = new MultiWaySuffixEdge();
    long edgeStartIndex = 0;

    // The suffix links of both edges are read below; when either edge is a leaf
    // the method returns the empty stack without touching them.
    if (toedge.IsLeaf || fromEdge.IsLeaf)
    {
        return edgesFoundForNextQueryIndex;
    }

    // One symbol less is matched when moving on to the next query index.
    matchLengthOfFromEdge--;
    MultiWaySuffixEdge previousIntermediateEdge = fromEdge.SuffixLink[0];

    // The walk stops when it reaches an edge starting at the same index as toedge's suffix link.
    long childIndexToStop = toedge.SuffixLink[0].StartIndex;
    long searchIndex = nextQueryIndex + matchLengthOfFromEdge;
    int childCount = previousIntermediateEdge.Children.Length;

    // if the previousIntermediateEdge is rootEdge (StartIndex of -1).
    if (previousIntermediateEdge.StartIndex == -1 && lengthToSearch > 0)
    {
        lengthToSearch--;
    }

    if (lengthToSearch > 0)
    {
        // Pick the child of previousIntermediateEdge whose first symbol equals the query symbol.
        // A child starting at or beyond symbolsCount is compared as the terminating symbol.
        // NOTE(review): when no child matches, 'edge' is left holding the last child examined.
        byte querySymbol = convertedSearchSeq[searchIndex];
        for (int index = 0; index < childCount; index++)
        {
            edge = previousIntermediateEdge.Children[index];
            edgeStartIndex = edge.StartIndex;
            byte refSymbol = edgeStartIndex < this.symbolsCount
                                 ? this.referenceSequence[edgeStartIndex]
                                 : TerminatingSymbol;
            if (refSymbol == querySymbol)
            {
                break;
            }
        }

        // When lengthOfMatchInEdge >0 there will be an edge from the previousIntermediateEdge.
        while (!edge.IsLeaf && edge.StartIndex != childIndexToStop)
        {
            // The edge ends one symbol before its first child starts.
            long edgeEndIndex = edge.Children[0].StartIndex - 1;

            // Number of symbols on the current edge.
            long edgeSymbolCount = edgeEndIndex - edgeStartIndex + 1;
            if (lengthToSearch == edgeSymbolCount)
            {
                // The remaining length ends exactly at the end of this edge: stop here.
                // This last edge is not pushed on the result stack.
                searchIndex += lengthToSearch;
                lengthToSearch = 0;
                previousIntermediateEdge = edge;
                break;
            }

            if (lengthToSearch > edgeSymbolCount)
            {
                // Consume the whole edge and move on to the child that starts with the next query symbol.
                lengthToSearch -= edgeSymbolCount;
                searchIndex += edgeSymbolCount;
                matchLengthOfFromEdge += edgeSymbolCount;
                long edgeChildCount = edge.Children.Length;
                querySymbol = convertedSearchSeq[searchIndex];
                for (int edgeChildIndex = 0; edgeChildIndex < edgeChildCount; edgeChildIndex++)
                {
                    // NOTE(review): unlike the first lookup above, the child's StartIndex is not
                    // compared with symbolsCount before indexing referenceSequence - confirm that
                    // a matching child is always found before a child starting at symbolsCount.
                    if (this.referenceSequence[edge.Children[edgeChildIndex].StartIndex] == querySymbol)
                    {
                        // get the child of edge and continue searching.
                        previousIntermediateEdge = edge;

                        // Record the edge just passed only when the match is long enough.
                        if (matchLengthOfFromEdge >= minLengthOfMatch)
                        {
                            edgesFoundForNextQueryIndex.Push(
                                new EdgesFound
                                    {
                                        Edge = previousIntermediateEdge,
                                        LengthOfMatch = matchLengthOfFromEdge
                                    });
                        }

                        edge = edge.Children[edgeChildIndex];
                        edgeStartIndex = edge.StartIndex;
                        break;
                    }
                }
            }
            else
            {
                // The remaining length ends inside this edge: nothing more to collect.
                break;
            }
        }
    }

    return edgesFoundForNextQueryIndex;
}
/// <summary>
/// Builds the suffix tree. One sub-tree is built per unique symbol of the reference (the work for
/// the different symbols is dispatched through Parallel.ForEach); afterwards the first child of
/// every per-symbol root, plus one extra edge for the terminating symbol, become the children of
/// rootEdge.
/// </summary>
private void BuildSuffixTree()
{
    // One temporary root per symbol value; the array is indexed directly by the symbol.
    int arraySize = this.uniqueSymbolsInReference.Max() + 1;
    var parentRootForSymbol = new MultiWaySuffixEdge[arraySize];

    // Each invocation reads and writes only parentRootForSymbol[symbol] for its own symbol;
    // the shared edgesCount is updated through Interlocked.Increment.
    Parallel.ForEach(
        this.uniqueSymbolsInReference,
        symbol =>
            {
                var edge = new MultiWaySuffixEdge();

                // Insert every suffix that starts with 'symbol', beginning at its first occurrence.
                for (long index = this.uniqueSymbolsStartIndexes[symbol]; index < this.symbolsCount; index++)
                {
                    byte symbolAtIndex = this.referenceSequence[index];
                    if (symbol != symbolAtIndex)
                    {
                        continue;
                    }

                    MultiWaySuffixEdge parent = parentRootForSymbol[symbol];
                    long startIndex = index;

                    // 'parent' is a copy (MultiWaySuffixEdge is a value type); these two locals remember
                    // the array and slot the copy came from so that a modified copy can be stored back.
                    // A null array means the copy came from parentRootForSymbol.
                    MultiWaySuffixEdge[] arrayConainingParent = null;
                    int indexOfArrayContainingParent = -1;
                    bool continueInsert = true;

                    // Stays true while every symbol matched so far equals the first symbol of the suffix.
                    bool duplicatedConsecutiveSymbolsFound = true;
                    do
                    {
                        // Positions at or beyond symbolsCount are treated as the terminating symbol.
                        byte symbolAtStartIndex = TerminatingSymbol;
                        if (startIndex < this.symbolsCount)
                        {
                            symbolAtStartIndex = this.referenceSequence[startIndex];
                        }

                        int indexOfEdgeFound = -1;
                        int childCount = 0;
                        if (!parent.IsLeaf)
                        {
                            // Find the child edge whose first symbol matches the current symbol.
                            childCount = parent.Children.Length;
                            for (int i = 0; i < childCount; i++)
                            {
                                MultiWaySuffixEdge childEdge = parent.Children[i];

                                // Children starting at symbolsCount (terminating symbol) are never matched here.
                                if (childEdge.StartIndex < this.symbolsCount)
                                {
                                    byte edgeSymbol = this.referenceSequence[childEdge.StartIndex];
                                    if (edgeSymbol == symbolAtStartIndex)
                                    {
                                        edge = childEdge;
                                        indexOfEdgeFound = i;
                                        startIndex++;
                                        if (edgeSymbol != symbolAtIndex)
                                        {
                                            duplicatedConsecutiveSymbolsFound = false;
                                        }

                                        break;
                                    }
                                }
                            }
                        }

                        MultiWaySuffixEdge newEdge;
                        if (indexOfEdgeFound == -1)
                        {
                            // No child starts with the current symbol: append a new leaf child to parent.
                            newEdge = new MultiWaySuffixEdge(startIndex);
                            Array.Resize(ref parent.Children, childCount + 1);
                            parent.Children[childCount] = newEdge;
                            parent.SuffixLink = new MultiWaySuffixEdge[1];
                            Interlocked.Increment(ref this.edgesCount);

                            // Assign back modified edge (the local 'parent' is only a copy).
                            if (arrayConainingParent == null)
                            {
                                parentRootForSymbol[symbol] = parent;
                            }
                            else
                            {
                                arrayConainingParent[indexOfArrayContainingParent] = parent;
                            }

                            continueInsert = false;
                            break;
                        }

                        // A leaf edge extends up to symbolsCount (the terminating symbol position).
                        long edgeEndIndex = this.symbolsCount;
                        if (!edge.IsLeaf)
                        {
                            // An intermediate edge ends one symbol before its first child starts.
                            edgeEndIndex = edge.Children[0].StartIndex - 1;
                        }

                        // Do not enter if only one symbol is there in the edge.
                        if (edge.StartIndex < edgeEndIndex)
                        {
                            long duplicatedConsicutiveSymbolsCount = 0;

                            // Compare the rest of the edge (counter) with the suffix being inserted (startIndex).
                            for (long counter = edge.StartIndex + 1; counter <= edgeEndIndex; counter++, startIndex++)
                            {
                                symbolAtStartIndex = TerminatingSymbol;
                                if (startIndex < this.symbolsCount)
                                {
                                    symbolAtStartIndex = this.referenceSequence[startIndex];
                                }

                                byte symbolAtCounter = TerminatingSymbol;
                                if (counter < this.symbolsCount)
                                {
                                    symbolAtCounter = this.referenceSequence[counter];
                                }

                                if (symbolAtStartIndex != symbolAtCounter)
                                {
                                    // Split the edge at the first mismatch.
                                    // Create the new edge for the remainder of the old edge.
                                    // Copy the children of old edge to new edge.
                                    newEdge = new MultiWaySuffixEdge(counter)
                                                  {
                                                      Children = edge.Children,
                                                      SuffixLink = edge.SuffixLink
                                                  };
                                    edge.Children = new MultiWaySuffixEdge[2]; // for split edge and leaf edge.

                                    // As this is an internal node allocate the array here itself to avoid updating
                                    // the parent array with the new address of the edge (as MultiWaySuffixEdge is a value type).
                                    edge.SuffixLink = new MultiWaySuffixEdge[1];
                                    edge.SuffixLink[0].StartIndex = -1;

                                    // Create leaf edge for the remainder of the suffix being inserted.
                                    var leafEdge = new MultiWaySuffixEdge(startIndex);

                                    // The whole matched prefix was a run of the suffix's first symbol: wrap newEdge
                                    // in one extra intermediate edge per additional repeated symbol, each sharing the
                                    // same leaf, and advance 'index' past the run.
                                    // NOTE(review): presumably this inserts the suffixes that start inside the run
                                    // in one go instead of separately - confirm.
                                    if (duplicatedConsecutiveSymbolsFound && duplicatedConsicutiveSymbolsCount > 1)
                                    {
                                        for (int duplicatedIndex = 1; duplicatedIndex < duplicatedConsicutiveSymbolsCount; duplicatedIndex++)
                                        {
                                            var duplicateSymbolEdge = new MultiWaySuffixEdge(newEdge.StartIndex - 1);
                                            duplicateSymbolEdge.Children = new MultiWaySuffixEdge[2];
                                            duplicateSymbolEdge.SuffixLink = new MultiWaySuffixEdge[1];
                                            duplicateSymbolEdge.SuffixLink[0].StartIndex = -1;
                                            duplicateSymbolEdge.Children[0] = newEdge;
                                            duplicateSymbolEdge.Children[1] = leafEdge;

                                            // we are adding two edges here - duplicate symbol edge and leaf edge.
                                            // leaf edge will be duplicated.
                                            Interlocked.Increment(ref this.edgesCount);
                                            Interlocked.Increment(ref this.edgesCount);
                                            newEdge = duplicateSymbolEdge;
                                        }

                                        index += duplicatedConsicutiveSymbolsCount - 1;
                                    }

                                    // Update the old edge
                                    // Set new edge as child edge to old edge
                                    edge.Children[0] = newEdge;

                                    // Add the leaf edge.
                                    edge.Children[1] = leafEdge;

                                    // Two edges were added: the split remainder and the leaf.
                                    Interlocked.Increment(ref this.edgesCount);
                                    Interlocked.Increment(ref this.edgesCount);

                                    // assign back edge that got modified.
                                    parent.Children[indexOfEdgeFound] = edge;
                                    continueInsert = false;
                                    duplicatedConsecutiveSymbolsFound = false;
                                    break;
                                }

                                // Still inside a run of the suffix's first symbol: count it, and stop
                                // tracking as soon as a different symbol is seen.
                                if (duplicatedConsecutiveSymbolsFound)
                                {
                                    duplicatedConsicutiveSymbolsCount++;
                                    if (symbolAtIndex != symbolAtStartIndex)
                                    {
                                        duplicatedConsecutiveSymbolsFound = false;
                                    }
                                }
                            }
                        }

                        // The whole edge matched: descend into it and keep inserting from there.
                        if (continueInsert)
                        {
                            arrayConainingParent = parent.Children;
                            indexOfArrayContainingParent = indexOfEdgeFound;
                            parent = edge;
                        }
                    }
                    while ((startIndex <= this.symbolsCount) && continueInsert);
                }
            });

    // The root edge is marked with a StartIndex of -1.
    this.rootEdge.StartIndex = -1;

    // One root child per unique symbol plus one for the terminating symbol.
    int rootChildrenCount = this.uniqueSymbolsInReference.Count + 1;
    Array.Resize(ref this.rootEdge.Children, rootChildrenCount);
    int rootChildIndex = 0;

    // Add all symbol root's child to the rootEdge.
    foreach (byte symbol in this.uniqueSymbolsInReference)
    {
        this.rootEdge.Children[rootChildIndex] = parentRootForSymbol[symbol].Children[0];
        rootChildIndex++;
    }

    // Add edge for $ (it starts at symbolsCount, one past the last symbol).
    this.rootEdge.Children[rootChildrenCount - 1] = new MultiWaySuffixEdge(this.symbolsCount);
    this.edgesCount++;
}
/// <summary>
/// Validates the properties of a default-constructed MultiWaySuffixEdge:
/// it reports itself as a leaf, has no children, and starts at index 0.
/// </summary>
public void ValidateEdgesForALeaf()
{
    MultiWaySuffixEdge leafEdge = new MultiWaySuffixEdge();

    // Dedicated assertions (and expected-before-actual ordering) keep failure messages accurate.
    Assert.IsTrue(leafEdge.IsLeaf);
    ApplicationLog.WriteLine("MUMmer BVT : Successfully Validated Is leaf property.");

    Assert.IsNull(leafEdge.Children);
    ApplicationLog.WriteLine("MUMmer BVT : Successfully Validated Children property for a Leaf. ");

    Assert.AreEqual(0L, leafEdge.StartIndex);
    ApplicationLog.WriteLine("MUMmer BVT : Successfully Validated start index of a Leaf.");
}