/// <summary>
/// Constructor that takes relative text positions.
/// </summary>
/// <param name="node">Node this edge ends in; must not be null</param>
/// <param name="start">Start position of the edge; must not be negative</param>
/// <param name="end">
/// End position of the edge. Any negative value means "infinite" and is stored as -1.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="node"/> is null</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="start"/> is negative, or lies after a non-negative <paramref name="end"/>
/// </exception>
public Edge(Node node, int start, int end = -1)
{
    if (node == null)
    {
        throw new ArgumentNullException("node");
    }

    if (start < 0)
    {
        throw new ArgumentOutOfRangeException("start", "start cannot be negative");
    }

    // A negative end stands for infinity, so only a non-negative end can be exceeded.
    // This is tested explicitly instead of via a (uint) cast of end: the cast
    // throws OverflowException for negative values when overflow checking is on.
    if (end >= 0 && start > end)
    {
        throw new ArgumentOutOfRangeException("start", "cannot start the string after its end");
    }

    this.Start = start;
    // infinity is always stored as -1, whatever negative value was passed
    this.End = end < 0 ? -1 : end;
    this.EndNode = node;
}
/// <summary>
/// Prepares a suffix tree for a given string of text.
/// Only the text is stored and the root node created here.
/// </summary>
/// <param name="text">text from which the tree is constructed</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="text"/> is null, empty, or consists only of whitespace
/// </exception>
public SuffixTree(string text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        // The first constructor argument is the parameter NAME, not its value;
        // passing the (null/blank) value would produce an exception that does
        // not say which argument was rejected.
        throw new ArgumentNullException("text", "text cannot be null, empty or whitespace");
    }

    Text = text;

    // the root doubles as the initial active node
    _activeNode = new Node(_currentNodeNumber);
    RootNode = _activeNode;
}
/// <summary>
/// Creates the actual suffix tree.
/// Iterates over the start positions <c>i</c> of the suffixes of <c>Text</c>. An
/// iteration either moves the active point (active node / active length) without
/// advancing <c>i</c>, or adds a leaf (directly or under a freshly split branch
/// node) and then advances <c>i</c>.
/// </summary>
public void Create()
{
    // Several loop iterations can belong to one and the same route through the
    // tree. While we are still on that route we must not follow a suffix pointer,
    // so this flag is only set right after i has been advanced.
    bool followSuffixNode = false;
    // i is advanced inside the body, only on the branches that finish a suffix
    for (int i=0; i < Text.Length;)
    {
        // make sure the lower bound (_minDistance) remains within its boundaries
        ValidateAndUpdateMinDistance(i);
        // nodeEdge is a (node, edge) pair, or null when nothing was found
        var nodeEdge = _activeNode.FindNextRoute(i + _activeLength, followSuffixNode);
        // the current position is at or past the end of the text and there is
        // no route to follow: we are done
        if (i + _activeLength >= Text.Length && nodeEdge == null)
        {
            break;
        }
        // we could not find anything: add a new leaf below the active node
        // and move on to the next suffix
        if (nodeEdge == null)
        {
            _activeNode.AddNode(++_currentNodeNumber, i + _activeLength);
            _lastBranchIndex = i + _activeLength;
            i++;
            followSuffixNode = true;
            continue;
        }
        var node = nodeEdge.Item1;
        var edge = nodeEdge.Item2;
        if (edge == null)
        {
            // no edge in the pair: we found a suffix node; it becomes the
            // active node and the active length shrinks by one
            _activeNode = node;
            _activeLength--;
            followSuffixNode = false;
            continue;
        }
        else if(node != null)
        {
            // both node and edge are present: we found a new active node;
            // the active length grows by one
            _activeNode = node;
            _activeLength++;
            followSuffixNode = false;
            continue;
        }
        // Only an edge was returned: walk it and see where the current suffix
        // diverges. The call may update _activeLength, _minDistance and
        // _activeNode through the ref arguments, and may return a different edge.
        var edgePosTuple = edge.WalkTheEdge(i, ref _activeLength, ref _minDistance, ref _activeNode);
        edge = edgePosTuple.Item1;
        int j = edgePosTuple.Item2;
        // the walk consumed the whole edge: continue from the node at its end
        if (j == edge.Route.Length)
        {
            _activeNode = edge.EndNode;
            _activeLength += edge.Route.Length;
            followSuffixNode = false;
            continue;
        }
        // the walk stopped inside the edge after j characters; remember j as the
        // new lower bound and compute the text position for the new leaf
        _minDistance = j;
        _lastBranchIndex = i + j + _activeLength;
        // that position lies at or past the end of the text: nothing to insert,
        // just move on to the next suffix
        if (_lastBranchIndex >= Text.Length)
        {
            i++;
            followSuffixNode = true;
            continue;
        }
        // insert a new branch node by splitting the edge after its first j characters
        var newBranchNode = edge.Split(edge.Start + j - 1, ++_currentNodeNumber);
        // if edge.Route is a single character at this point, the new branch
        // node's suffix pointer is set to the current active node
        // NOTE(review): the original comment here was cut off mid-sentence - confirm intent
        if (edge.Route.Length == 1)
        {
            newBranchNode.SuffixPointer = _activeNode;
        }
        // Link the previously created branch node to the new one. The second
        // check (SuffixPointer == null) is there because of the suffix pointer
        // special case handled just above.
        if (null != _lastBranchNode && _lastBranchNode.SuffixPointer == null)
        {
            _lastBranchNode.SuffixPointer = newBranchNode;
        }
        // hang the new leaf off the branch node and move on to the next suffix
        newBranchNode.AddNode(++_currentNodeNumber, _lastBranchIndex);
        _lastBranchNode = newBranchNode;
        i++;
        followSuffixNode = true;
    }
}
/// <summary>
/// Creates a new node and attaches it to this node through a new edge.
/// The edge is registered in <c>Edges</c> under the first character of its route.
/// </summary>
/// <param name="label">Label of the node being created</param>
/// <param name="start">Start position of the connecting edge in the text</param>
/// <param name="end">End position of the connecting edge in the text (-1 by default)</param>
internal void AddNode(uint label, int start, int end = -1)
{
    // the edge owns the freshly created node as its end node
    var edge = new Edge(new Node(label), start, end);

    Edges.Add(edge.Route[0], edge);
}
/// <summary>
/// Cuts this edge in two. This edge keeps the part up to and including
/// <paramref name="end"/> and now ends in a newly created node; the remainder
/// becomes a new edge leading from that node to the original end node.
/// </summary>
/// <param name="end">Index at which this (shortened) edge will end</param>
/// <param name="currentNodeNumber">Label for the node created at the split point</param>
/// <returns>The node created at the split point</returns>
internal Node Split(int end, uint currentNodeNumber)
{
    // the tail starts right after the cut and inherits the old end position and end node
    var tail = new Edge(EndNode, end + 1, End);
    var splitNode = new Node(currentNodeNumber);

    // shorten this edge so that it stops at the split node
    End = end;
    EndNode = splitNode;

    // the tail is registered under the first character of its route
    splitNode.Edges.Add(tail.Route[0], tail);
    return splitNode;
}
/// <summary>
/// Keeps comparing the original text, starting at position
/// <paramref name="i"/> + <paramref name="activeLength"/>, with what is in this edge.
/// If <paramref name="minDistance"/> covers the whole edge, no comparison is done
/// here: the edge is skipped and the walk continues on an edge of its end node.
/// </summary>
/// <param name="i">Index of comparison start in the original text</param>
/// <param name="activeLength">
/// Offset added to <paramref name="i"/>; grows by the route length of every skipped edge
/// </param>
/// <param name="minDistance">
/// How many characters are guaranteed equal; shrinks by the route length of every skipped edge
/// </param>
/// <param name="activeNode">Set to the end node of every skipped edge</param>
/// <returns>(edge, index) - the edge, and the character in it, where the walk ended</returns>
internal Tuple<Edge, int> WalkTheEdge(int i, ref int activeLength, ref int minDistance, ref Node activeNode)
{
    int routeLength = Route.Length;

    // part of this edge still has to be compared: do it here and report the result
    if (minDistance < routeLength)
    {
        int stoppedAt = Walk(SuffixTree.Text, i + activeLength, minDistance);
        return new Tuple<Edge, int>(this, stoppedAt);
    }

    // we know we do not need any comparisons on this edge:
    // hop over it and carry on from its end node
    var nextEdge = EndNode.FindEdgeByChar(i + routeLength);
    activeLength += routeLength;
    minDistance -= routeLength;
    activeNode = EndNode;

    return nextEdge.WalkTheEdge(i, ref activeLength, ref minDistance, ref activeNode);
}