/// <summary>
/// Traverse the suffix tree, following the longest path from the root that matches a prefix of words[wordNum].
/// This allows the caller to skip over these duplicate characters, and process only the remaining part of the coming word.
/// </summary>
/// <param name="active">The current active suffix; its origin node and indexes are updated as edges are traversed.</param>
/// <param name="endIndex">The number of characters skipped; advanced by the length of every fully matched edge.</param>
/// <param name="wordNum">The index of the current word being processed</param>
/// <seealso href="http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides08.pdf">
/// The first 10 slides of this slideshow by Pekka Kilpeläinen
/// have useful tips on creating a generalized suffix tree.
/// </seealso>
/// <remarks>
/// TODO: Note: The following method is WORK IN PROGRESS, and does not yet work.
/// An edge counts as covered only when every one of its characters matches the word. If the word
/// ends (or differs) part-way along an edge, the edge is reported as not covered, the active suffix
/// is reset to the edge's parent node, and <paramref name="endIndex"/> is not advanced for that edge.
/// </remarks>
private void skipDuplicateInitialSubstring(ref GstSuffix active, ref int endIndex, int wordNum)
{
    string word = wordDict[wordNum];
    GstNode curNode = root;
    GstEdge nextEdge = null;

    // Traverse matching edges
    while (
        (endIndex < word.Length) &&
        ((nextEdge = curNode.GetChildEdge(GetWordChar(wordNum, endIndex))) != null)
        )
    {
        int strLen = nextEdge.Span(0) + 1;

        // edgeStr = String in next edge
        string edgeStr = nextEdge.GetText();

        // wordStr = next segment of upcoming word that corresponds to edgeStr
        string wordStr = word.Substring(endIndex, Math.Min(strLen, word.Length - endIndex));

        // Never compare past the end of either string: wordStr is shorter than strLen
        // whenever the word runs out before the edge does, and indexing it up to strLen
        // would throw instead of reporting "not covered".
        int comparableLen = Math.Min(strLen, Math.Min(edgeStr.Length, wordStr.Length));

        // Traverse matching characters within edge
        int numCharsMatched = 0;
        while (numCharsMatched < comparableLen && edgeStr[numCharsMatched] == wordStr[numCharsMatched])
        {
            numCharsMatched++;
        }

        // Anything short of a full-edge match (differing character or exhausted text) is a mismatch.
        bool foundMismatch = numCharsMatched < strLen;

        if (foundMismatch)
        {
            GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                " skipDuplicateInitialSubstring: Word #{0:d} does not cover existing edge #{1:d}",
                wordNum, nextEdge.Id));
            active.OriginNode = nextEdge.ParentNode;
            active.EndIndex = active.BeginIndex;
            break;
        }

        // The whole edge is shared: record this word's index range on the existing edge.
        nextEdge.SetBeginIndex(wordNum, endIndex);
        nextEdge.SetEndIndex(wordNum, endIndex + strLen - 1);
        GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
            " skipDuplicateInitialSubstring: Word #{0:d} covers existing edge #{1:d} ({2:s})",
            wordNum, nextEdge.Id, nextEdge.ToString(wordNum)));
        active.OriginNode = nextEdge.ChildNode;
        active.BeginIndex += numCharsMatched;
        active.EndIndex = active.BeginIndex;

        endIndex += numCharsMatched;

        // Set up next iteration of loop
        curNode = nextEdge.ChildNode;
    }
}
/// <summary>
/// Moves the active suffix on to its successor and canonicalizes it.
/// When the origin node is the root, the begin index is incremented; otherwise the origin
/// node is replaced by its suffix node. A verbose log line is written only if the origin
/// node id, begin index or end index ended up different from their values on entry.
/// </summary>
/// <param name="active">The active suffix to advance (modified in place).</param>
/// <param name="wordNum">Index of the word being processed; not read by this method.</param>
private void incrSuffix(ref GstSuffix active, int wordNum)
{
    // Snapshot the suffix so that a change can be detected and reported afterwards.
    int prevNodeId = active.OriginNode.Id;
    int prevBegin = active.BeginIndex;
    int prevEnd = active.EndIndex;

    if (!active.OriginNode.IsRoot())
    {
        active.OriginNode = active.OriginNode.SuffixNode;
    }
    else
    {
        active.BeginIndex++;
    }
    active.Canonicalize();

    bool isUnchanged =
        prevNodeId == active.OriginNode.Id
        && prevBegin == active.BeginIndex
        && prevEnd == active.EndIndex;
    if (isUnchanged)
    {
        return;
    }

    GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
        " incrSuffix: Active suffix changed from {0:s} to {1:s}",
        GstSuffix.ToSuffixString(prevNodeId, prevBegin, prevEnd),
        GstSuffix.ToSuffixString(active.OriginNode.Id, active.BeginIndex, active.EndIndex)));
}
/// <summary>
/// Rule #1 (Ukkonen's first group of t_i-transitions): look for an edge out of the active
/// suffix's origin node that already continues with the character at <paramref name="endIndex"/>.
/// </summary>
/// <param name="active">The active suffix being extended.</param>
/// <param name="parentNode">
/// This is a member of active. It is kept separate for clarity. It is overwritten with the node
/// returned by the edge split when an implicit suffix has to be split.
/// </param>
/// <param name="endIndex">Index, within the current word, of the character being added.</param>
/// <param name="wordNum">Index of the word being processed.</param>
/// <returns>
/// <c>ExtensionResult.Done</c> when the character is already present at the active point;
/// otherwise <c>ExtensionResult.NotDone</c>.
/// </returns>
private ExtensionResult extendSuffixByRuleOne(
    ref GstSuffix active, ref GstNode parentNode, int endIndex, int wordNum)
{
    if (active.IsExplicit)
    {
        // Explicit suffix: the character is present iff a set edge starts with it.
        GstEdge matchingEdge = active.OriginNode.GetChildEdge(GetWordChar(wordNum, endIndex));
        bool alreadyPresent = (matchingEdge != null) && matchingEdge.IsSet();
        return alreadyPresent ? ExtensionResult.Done : ExtensionResult.NotDone;
    }

    // Implicit suffix: the active point lies inside the edge selected by the suffix's first character.
    GstEdge activeEdge = active.OriginNode.GetChildEdge(GetWordChar(wordNum, active.BeginIndex));
    int activeSpan = active.EndIndex - active.BeginIndex;
    if (activeEdge == null)
    {
        return ExtensionResult.NotDone;
    }

    // Compare the character following the active point on the edge with the character to add.
    int extantWordNum = activeEdge.GetExtantWordNum();
    int followingIndex = activeEdge.GetBeginIndex(extantWordNum) + activeSpan + 1;
    if (GetWordChar(extantWordNum, followingIndex) == GetWordChar(wordNum, endIndex))
    {
        return ExtensionResult.Done;
    }

    GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
        " Rule #1: About to split edge E{0:d} (\"{1:s}\") at suffix {2:s}",
        activeEdge.Id, activeEdge.GetText(), active.ToString()));
    parentNode = activeEdge.Split(active);
    return ExtensionResult.NotDone;
}
/// <summary>
/// Adds a word to the suffix tree: registers it under the next word number, skips any
/// initial substring reported as already present (for every word after the first), and
/// then calls extendSuffixes() once for each remaining character index.
/// </summary>
/// <param name="word">The word to add.</param>
/// <param name="doConsoleVerbose">
/// When true, a completion message is logged and also written to the console together
/// with the tree's string representation.
/// </param>
/// <returns>False if <paramref name="word"/> is null or empty (nothing is added); otherwise true.</returns>
private bool AddWord(string word, bool doConsoleVerbose = false)
{
    if (string.IsNullOrEmpty(word))
    {
        return false;
    }

    GstUtil.WriteLine(GstVerbosityLevel.Verbose, new String('-', 40));

    // Claim the next word number and store the word under it.
    int wordNum = wordCount;
    wordCount++;
    wordDict[wordNum] = word;
    GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
        "Adding word #{0:d} (\"{1:s}\") to the suffix tree",
        wordNum, wordDict[wordNum]));

    GstSuffix activeSuffix = new GstSuffix(this, root, wordNum, 0, GSuffixTree.InfiniteIndex);
    GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
        "Created active (longest proper) suffix pointer: {0:s}",
        activeSuffix.ToString()));

    int charIndex = 0;
    if (wordNum > 0)
    {
        // Only words after the first can share an initial substring with the existing tree.
        skipDuplicateInitialSubstring(ref activeSuffix, ref charIndex, wordNum);
        if (charIndex > 0)
        {
            GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                "The first {0:d} letter(s) of word #{1:d} are already in the suffix tree",
                charIndex, wordNum));
        }
    }

    while (charIndex < wordDict[wordNum].Length)
    {
        GstUtil.WriteLine(GstVerbosityLevel.Verbose, this.ToString());
        GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
            "Calling extendSuffixes() for word #{0:d}, with endIndex = {1:d} ('{2:c}') and active suffix = {3:s}",
            wordNum, charIndex, GetWordChar(wordNum, charIndex), activeSuffix.ToString()));
        extendSuffixes(ref activeSuffix, charIndex, wordNum);
        charIndex++;
    }

    if (!doConsoleVerbose)
    {
        return true;
    }

    string doneMessage = String.Format(
        "Done adding word #{0:d} (\"{1:s}\") to the suffix tree", wordNum, wordDict[wordNum]);
    GstUtil.WriteLine(GstVerbosityLevel.Verbose, doneMessage);
    Console.WriteLine(doneMessage);
    Console.WriteLine(this.ToString());
    return true;
}
/// <summary>
/// Rule #2 (Ukkonen's second group of t_i-transitions):
/// Create a new edge below <paramref name="parentNode"/> that runs from
/// <paramref name="endIndex"/> to the last character of the word, add it to the tree,
/// and link the previously visited parent node to this one via setSuffixLink().
/// </summary>
/// <param name="active">The active suffix; not read by this method.</param>
/// <param name="parentNode">This is a member of active. It is kept separate for clarity.</param>
/// <param name="prevParentNode">
/// The parent node handled by the previous extension; set to <paramref name="parentNode"/> on return.
/// </param>
/// <param name="endIndex">Index, within the current word, at which the new edge begins.</param>
/// <param name="wordNum">Index of the word being processed.</param>
private void extendSuffixByRuleTwo(
    ref GstSuffix active, GstNode parentNode, ref GstNode prevParentNode, int endIndex, int wordNum)
{
    // The new edge covers everything from endIndex up to the word's final character.
    int lastCharIndex = GetWord(wordNum).Length - 1;
    GstEdge leafEdge = new GstEdge(this, parentNode, wordNum, endIndex, lastCharIndex);
    leafEdge.Add();

    string message = String.Format(
        " Rule #2: New edge E{0:d} (\"{1:s}\") connects N{2:d} (old parent) to N{3:d} (new child)",
        leafEdge.Id,
        leafEdge.GetText(),
        leafEdge.ParentNode.Id,
        leafEdge.ChildNode.Id);
    GstUtil.WriteLine(GstVerbosityLevel.Verbose, message);

    setSuffixLink(prevParentNode, parentNode);
    prevParentNode = parentNode;
}
/// <summary>
/// Splits this edge at the point addressed by the suffix <paramref name="s"/>.
/// A new edge is created from the suffix's origin node and given this edge's begin indexes,
/// with end indexes set to begin + span. This edge's begin indexes are then advanced by
/// span + 1, it is re-parented under the new edge's child node, and the new edge is added.
/// </summary>
/// <param name="s">The suffix whose origin node, word number and span define the split.</param>
/// <returns>The child node of the newly created edge.</returns>
public GstNode Split(GstSuffix s)
{
    int splitWordNum = s.WordNum;

    // Create the new (leading) edge. If this edge has no entry for the suffix's word,
    // the suffix's own index range is used for that word.
    GstEdge headEdge;
    if (!this.HasWordNum(splitWordNum))
    {
        headEdge = new GstEdge(tree, s.OriginNode, splitWordNum, s.BeginIndex, s.EndIndex);
    }
    else
    {
        int headBegin = GetBeginIndex(splitWordNum);
        headEdge = new GstEdge(tree, s.OriginNode, splitWordNum, headBegin, headBegin + s.Span);
    }

    // Copy every word's range onto the new edge, truncated to the suffix's span.
    foreach (int beginKey in beginIndexes.Keys)
    {
        headEdge.SetBeginIndex(beginKey, beginIndexes[beginKey]);
    }
    foreach (int endKey in endIndexes.Keys)
    {
        headEdge.SetEndIndex(endKey, beginIndexes[endKey] + s.Span);
    }
    headEdge.ChildNode.SuffixNode = s.OriginNode;

    // Remember the first character before the begin indexes move; it is needed for the switch below.
    char firstCharBeforeSplit = GetFirstChar();

    // Modify old edge. The keys are copied to an array first, so the loop does not
    // enumerate beginIndexes while IncBeginIndex is being called.
    int tailShift = s.Span + 1;
    foreach (int shiftedKey in beginIndexes.Keys.ToArray())
    {
        IncBeginIndex(shiftedKey, tailShift);
    }

    // Perform switch
    MoveFromTo(ParentNode, firstCharBeforeSplit, headEdge.ChildNode, GetFirstChar());
    ParentNode = headEdge.ChildNode;
    headEdge.Add();

    string message = String.Format(
        " Split E{0:d} into E{1:d} + E{0:d} = \"{2:s}\" + \"{3:s}\"",
        Id, headEdge.Id, headEdge.GetText(), this.GetText());
    GstUtil.WriteLine(GstVerbosityLevel.Normal, message);

    return headEdge.ChildNode;
}
/// <summary>
/// Extends the tree with the character at <paramref name="endIndex"/> of the current word.
/// Starting at the active suffix, rule one is tried; while it does not report
/// <c>ExtensionResult.Done</c>, rule two is applied and the active suffix is advanced with
/// incrSuffix(). Afterwards the last pending suffix link is set, the active suffix's end
/// index is incremented, and the suffix is canonicalized.
/// </summary>
/// <param name="active">The active suffix (modified in place).</param>
/// <param name="endIndex">Index, within the current word, of the character being added.</param>
/// <param name="wordNum">Index of the word being processed.</param>
private void extendSuffixes(ref GstSuffix active, int endIndex, int wordNum)
{
    GstNode lastParentNode = null;
    GstNode parentNode = active.OriginNode;

    while (extendSuffixByRuleOne(ref active, ref parentNode, endIndex, wordNum) != ExtensionResult.Done)
    {
        extendSuffixByRuleTwo(ref active, parentNode, ref lastParentNode, endIndex, wordNum);
        Debug.Assert(ValidateConsistentEdgeText(this, true));

        // Advance the active suffix and re-read its origin node for the next attempt.
        incrSuffix(ref active, wordNum);
        parentNode = active.OriginNode;
    }

    setSuffixLink(lastParentNode, parentNode);
    active.EndIndex++;
    active.Canonicalize();
}