/// <summary>
/// Traverse the suffix tree, following the longest path from the root that matches a prefix of
/// the word numbered <paramref name="wordNum"/>. This allows the caller to skip over these
/// duplicate characters, and process only the remaining part of the upcoming word.
/// </summary>
/// <param name="active">The current active suffix; its origin node and indices are updated as edges are matched</param>
/// <param name="endIndex">The number of characters skipped; advanced by the length of each fully matched edge</param>
/// <param name="wordNum">The index of the current word being processed</param>
/// <seealso cref="http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides08.pdf">
/// The first 10 slides of this slideshow by Pekka Kilpeläinen
/// have useful tips on creating a generalized suffix tree.
/// </seealso>
/// <remarks>
/// TODO: Note: The following method is WORK IN PROGRESS, and does not yet work.
/// An edge that is longer than the remaining part of the word is treated like a mismatch
/// (the word does not cover the edge) instead of indexing past the end of the word.
/// </remarks>
private void skipDuplicateInitialSubstring(ref GstSuffix active, ref int endIndex, int wordNum)
{
    GstNode curNode = root;
    GstEdge nextEdge = null;

    // Traverse matching edges: stop when the word is exhausted or no child edge
    // starts with the next character of the word.
    while ((endIndex < wordDict[wordNum].Length)
           && ((nextEdge = curNode.GetChildEdge(GetWordChar(wordNum, endIndex))) != null))
    {
        // NOTE(review): Span is called with a literal 0 rather than wordNum -- confirm intent.
        int strLen = nextEdge.Span(0) + 1;

        // edgeStr = String in next edge
        string edgeStr = nextEdge.GetText();

        // wordStr = next segment of upcoming word that corresponds to edgeStr.
        // It is shorter than strLen when the word ends before the edge does.
        string wordStr = wordDict[wordNum].Substring(
            endIndex, Math.Min(strLen, wordDict[wordNum].Length - endIndex));

        bool foundMismatch = false;
        int numCharsMatched = 0;

        // Traverse matching characters within edge
        for (int i = 0; i < strLen; i++)
        {
            // Running out of word characters means the word cannot cover this edge.
            if (i < wordStr.Length && edgeStr[i] == wordStr[i])
            {
                numCharsMatched++;
            }
            else
            {
                foundMismatch = true;
                break;
            }
        }

        if (foundMismatch)
        {
            GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                " skipDuplicateInitialSubstring: Word #{0:d} does not cover existing edge #{1:d}",
                wordNum, nextEdge.Id));
            active.OriginNode = nextEdge.ParentNode;
            active.EndIndex = active.BeginIndex;
            break;
        }

        // The word covers the whole edge: record the word's index range on the edge.
        nextEdge.SetBeginIndex(wordNum, endIndex);
        nextEdge.SetEndIndex(wordNum, endIndex + strLen - 1);
        GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
            " skipDuplicateInitialSubstring: Word #{0:d} covers existing edge #{1:d} ({2:s})",
            wordNum, nextEdge.Id, nextEdge.ToString(wordNum)));
        active.OriginNode = nextEdge.ChildNode;
        active.BeginIndex += numCharsMatched;
        active.EndIndex = active.BeginIndex;

        endIndex += numCharsMatched;

        // Set up next iteration of loop
        curNode = nextEdge.ChildNode;
    }
}
/// <summary>
/// Rule #1 (Ukkonen's first group of t_i-transitions): Try to find matching edge for the parent node.
/// </summary>
/// <param name="active">The active suffix whose origin node is searched for a matching edge</param>
/// <param name="parentNode">
/// This is a member of active. It is kept separate for clarity.
/// It is overwritten with the node returned by the edge split when an implicit suffix forces a split.
/// </param>
/// <param name="endIndex">Index, within the current word, of the character being added</param>
/// <param name="wordNum">The index of the current word</param>
/// <returns>
/// <c>ExtensionResult.Done</c> when the character is already present in the tree;
/// otherwise <c>ExtensionResult.NotDone</c>.
/// </returns>
private ExtensionResult extendSuffixByRuleOne(
    ref GstSuffix active, ref GstNode parentNode, int endIndex, int wordNum)
{
    if (active.IsExplicit)
    {
        // Explicit suffix: the character is present only if a set edge leaves the origin node with it.
        GstEdge matchingEdge = active.OriginNode.GetChildEdge(GetWordChar(wordNum, endIndex));
        bool alreadyPresent = (matchingEdge != null) && matchingEdge.IsSet();
        return alreadyPresent ? ExtensionResult.Done : ExtensionResult.NotDone;
    }

    // Implicit suffix: the suffix ends part-way along the edge selected by its first character.
    GstEdge activeEdge = active.OriginNode.GetChildEdge(GetWordChar(wordNum, active.BeginIndex));
    int activeSpan = active.EndIndex - active.BeginIndex;
    if (activeEdge == null)
    {
        return ExtensionResult.NotDone;
    }

    // Compare the edge character just past the active span with the character being added.
    int extantWordNum = activeEdge.GetExtantWordNum();
    char charOnEdge = GetWordChar(extantWordNum, activeEdge.GetBeginIndex(extantWordNum) + activeSpan + 1);
    char charToAdd = GetWordChar(wordNum, endIndex);
    if (charOnEdge == charToAdd)
    {
        return ExtensionResult.Done;
    }

    // The characters differ, so the edge is split at the active suffix.
    string splitNotice = String.Format(
        " Rule #1: About to split edge E{0:d} (\"{1:s}\") at suffix {2:s}",
        activeEdge.Id, activeEdge.GetText(), active.ToString());
    GstUtil.WriteLine(GstVerbosityLevel.Verbose, splitNotice);
    parentNode = activeEdge.Split(active);

    return ExtensionResult.NotDone;
}
/// <summary>
/// Looks up the child edge stored in <c>childEdges</c> under the given character.
/// </summary>
/// <param name="c">The character used as the lookup key</param>
/// <returns>The edge stored under <paramref name="c"/>, or null when there is none</returns>
public GstEdge GetChildEdge(char c)
{
    GstEdge found;
    if (!childEdges.TryGetValue(c, out found))
    {
        return null;
    }
    return found;
}
/// <summary>
/// Detaches this edge from one parent node and attaches it to another.
/// </summary>
/// <param name="oldParentNode">The node this edge is currently registered under</param>
/// <param name="oldFirstChar">The character this edge is currently registered under in <paramref name="oldParentNode"/></param>
/// <param name="newParentNode">The node to register this edge under</param>
/// <param name="newFirstChar">The character to register this edge under in <paramref name="newParentNode"/></param>
/// <exception cref="ArgumentException">
/// Thrown when the edge found at (<paramref name="oldParentNode"/>, <paramref name="oldFirstChar"/>) is not this edge.
/// </exception>
public void MoveFromTo(GstNode oldParentNode, char oldFirstChar, GstNode newParentNode, char newFirstChar)
{
    // Verify the caller's view of where this edge lives before changing anything.
    GstEdge self = oldParentNode.GetChildEdge(oldFirstChar);
    if (self != this)
    {
        throw new ArgumentException(
            "Error: MoveFromTo called with incorrect parent node and/or first char arguments");
    }

    oldParentNode.RemoveChildEdge(oldFirstChar);
    newParentNode.AddChildEdge(newFirstChar, this);
}
/// <summary>
/// Builds a text table with one row per (edge, word) pair, for the edges produced by
/// <c>DepthTaggedEdges()</c>. Each row is terminated by a newline.
/// </summary>
/// <param name="wordNum">
/// The word whose index ranges are listed, or <c>GSuffixTree.NoWordNum</c> to list every word in <c>wordDict</c>.
/// </param>
/// <param name="doAddIds">When true, each row (and the banner) is prefixed with an edge Id column</param>
/// <param name="doAddTree">When true, each row ends with a "*" indented according to the edge's depth</param>
/// <returns>The formatted table</returns>
private string toStringEdgeTable(int wordNum, bool doAddIds = true, bool doAddTree = true)
{
    StringBuilder sb = new StringBuilder();

    // Longest word length drives both the banner and the width of the last column.
    int maxWordLength = wordDict.Values.Select(w => w.Length).Max();
    string edgesBanner = toStringEdgeBanner(maxWordLength);
    string addIdsSpacer = " Id ";
    int[] wordNums = (wordNum == GSuffixTree.NoWordNum)
        ? wordDict.Keys.ToArray()
        : new int[] { wordNum };

    // The row format does not depend on the edge, so build it once.
    string formatStr = " {0,-11:d}{1,-11:d}{2,-11:s}{3,-11:d}{4,-11:d}{5,-9:d}{6,-"
        + Math.Max(7, 1 + maxWordLength).ToString() + ":s}";

    if (doAddIds)
    {
        sb.Append(addIdsSpacer);
    }
    sb.AppendLine(edgesBanner);

    foreach (DepthTaggedGstEdge dtEdge in DepthTaggedEdges())
    {
        GstEdge edge = dtEdge.Edge;
        foreach (int rowWordNum in wordNums)
        {
            if (!edge.HasWordNum(rowWordNum))
            {
                continue;
            }

            if (doAddIds)
            {
                sb.Append(String.Format(" {0,-4:d}", edge.Id));
            }

            int beginIndex = edge.GetBeginIndex(rowWordNum);
            int endIndex = edge.GetEndIndex(rowWordNum);
            sb.Append(String.Format(formatStr,
                edge.ParentNode.Id,
                edge.ChildNode.Id,
                (edge.ChildNode.SuffixNode == null ? "null" : edge.ChildNode.SuffixNode.Id.ToString()),
                rowWordNum,
                beginIndex,
                endIndex,
                // Indent the edge text by its begin index.
                (new String(' ', beginIndex)) + GetRangeString(rowWordNum, beginIndex, endIndex)));

            if (doAddTree)
            {
                sb.Append(new String(' ', 2 * dtEdge.Depth - 1) + "*");
            }

            // Always end the row, even when the tree column is omitted.
            sb.AppendLine();
        }
    }

    return sb.ToString();
}
/// <summary>
/// Rule #2 (Ukkonen's second group of t_i-transitions):
/// Create a new edge and add it to the tree at the parent's position.
/// Part of this is adding the new edge to the tree,
/// and setting a suffix link between the previously visited parent node and this one.
/// </summary>
/// <param name="active">The current active suffix (not read by this method)</param>
/// <param name="parentNode">This is a member of active. It is kept separate for clarity.</param>
/// <param name="prevParentNode">The parent node from the previous extension; set to <paramref name="parentNode"/> on exit</param>
/// <param name="endIndex">Begin index of the new edge within the current word</param>
/// <param name="wordNum">The index of the current word</param>
private void extendSuffixByRuleTwo(
    ref GstSuffix active, GstNode parentNode, ref GstNode prevParentNode, int endIndex, int wordNum)
{
    // The new edge runs from endIndex to the last character of the word.
    int lastCharIndex = GetWord(wordNum).Length - 1;
    GstEdge addedEdge = new GstEdge(this, parentNode, wordNum, endIndex, lastCharIndex);
    addedEdge.Add();

    string notice = String.Format(
        " Rule #2: New edge E{0:d} (\"{1:s}\") connects N{2:d} (old parent) to N{3:d} (new child)",
        addedEdge.Id,
        addedEdge.GetText(),
        addedEdge.ParentNode.Id,
        addedEdge.ChildNode.Id);
    GstUtil.WriteLine(GstVerbosityLevel.Verbose, notice);

    // Link the previously visited parent to this one, then remember this one for the next call.
    setSuffixLink(prevParentNode, parentNode);
    prevParentNode = parentNode;
}
/// <summary>
/// Splits this edge at the position described by the suffix <paramref name="s"/>.
/// A new edge is created from <c>s.OriginNode</c>, this edge's begin indices are advanced
/// past the split point, and this edge is re-parented under the new edge's child node.
/// </summary>
/// <param name="s">The suffix whose origin node, word number and span determine the split</param>
/// <returns>The child node of the newly created edge (the node now between the two edges)</returns>
public GstNode Split(GstSuffix s)
{
    // Create the new edge. When this edge already has indices for the suffix's word,
    // the new edge takes its range from this edge's begin index; otherwise it takes
    // the suffix's own begin/end indices.
    int wordNum = s.WordNum;
    GstEdge newEdge;
    if (this.HasWordNum(s.WordNum))
    {
        newEdge = new GstEdge(tree, s.OriginNode, wordNum, GetBeginIndex(wordNum), GetBeginIndex(wordNum) + s.Span);
    }
    else
    {
        newEdge = new GstEdge(tree, s.OriginNode, wordNum, s.BeginIndex, s.EndIndex);
    }

    // Copy this edge's begin index for every word onto the new edge ...
    foreach (int n in beginIndexes.Keys)
    {
        newEdge.SetBeginIndex(n, beginIndexes[n]);
    }
    // ... and end the new edge s.Span positions after each begin index.
    // NOTE(review): indexes beginIndexes with keys taken from endIndexes; assumes both
    // dictionaries hold the same word numbers -- confirm.
    foreach (int n in endIndexes.Keys)
    {
        newEdge.SetEndIndex(n, beginIndexes[n] + s.Span);
    }
    newEdge.ChildNode.SuffixNode = s.OriginNode;

    // Capture the first character before the begin indices change; MoveFromTo needs it
    // to find this edge under its current parent.
    char oldFirstChar = GetFirstChar();

    // Modify old edge: advance every begin index past the part now held by the new edge.
    // The keys are copied to an array first, presumably because IncBeginIndex modifies
    // beginIndexes while iterating -- confirm.
    int [] wordNums = beginIndexes.Keys.ToArray();
    foreach (int n in wordNums)
    {
        IncBeginIndex(n, s.Span + 1);
    }

    // Perform switch: detach this edge from its old parent and attach it below the new
    // edge's child node, keyed by its first character after the shift.
    MoveFromTo(ParentNode, oldFirstChar, newEdge.ChildNode, GetFirstChar());
    ParentNode = newEdge.ChildNode;
    newEdge.Add();
    GstUtil.WriteLine(GstVerbosityLevel.Normal, String.Format(
        " Split E{0:d} into E{1:d} + E{0:d} = \"{2:s}\" + \"{3:s}\"",
        Id, newEdge.Id, newEdge.GetText(), this.GetText()
        ));

    return(newEdge.ChildNode);
}
/// <summary>
/// Lazily enumerates every edge reachable from the root, using an explicit stack.
/// </summary>
/// <returns>Each edge once; an edge's children are pushed onto the stack before the edge itself is yielded</returns>
public IEnumerable<GstEdge> Edges()
{
    var pending = new Stack<GstEdge>();

    // Seed the stack with the edges leaving the root.
    foreach (GstEdge rootEdge in root.ChildEdges())
    {
        pending.Push(rootEdge);
    }

    while (pending.Count != 0)
    {
        GstEdge current = pending.Pop();

        // Queue the edges below the current one before handing it to the caller.
        foreach (GstEdge below in current.ChildNode.ChildEdges())
        {
            pending.Push(below);
        }

        yield return current;
    }
}
/// <summary>
/// For an implicit suffix, moves the origin node down the tree while the edge leaving the
/// origin node is no longer than the suffix's span, advancing the begin index past each
/// edge that is consumed. Explicit suffixes are left unchanged.
/// </summary>
/// <remarks>
/// Constraint: Implicit suffixes must have BeginIndex &lt; words[wordNum].Length
/// </remarks>
public void Canonicalize()
{
    if (!IsImplicit)
    {
        return;
    }

    bool haveValuesChanged = false;

    // The trace is accumulated here and only written out if the suffix actually changed.
    StringBuilder sb = new StringBuilder();
    sb.AppendLine(" Canonicalize: Entering");

    int origNodeId = this.OriginNode.Id;
    int begin = this.beginIndex;
    int end = this.endIndex;

    // NOTE(review): edge is dereferenced without a null check; this relies on an implicit
    // suffix always having a child edge for its first character -- confirm.
    GstEdge edge = OriginNode.GetChildEdge(tree.GetWordChar(WordNum, BeginIndex));
    while (edge.Span() <= Span)
    {
        sb.Append(String.Format(
            " Canonicalize: Active suffix changed from {0:s}",
            ToSuffixString(origNodeId, begin, end)));

        // Consume the whole edge: step past its characters and hop to its child node.
        this.beginIndex += edge.Span() + 1;
        this.OriginNode = edge.ChildNode;
        haveValuesChanged = true;

        sb.AppendLine(String.Format(" to {0:s}",
            ToSuffixString(OriginNode.Id, beginIndex, endIndex)));

        if (Span >= 0)
        {
            // Look up the next edge using this suffix's own word (not word 0), matching
            // the initial lookup above.
            edge = edge.ChildNode.GetChildEdge(tree.GetWordChar(WordNum, BeginIndex));
        }
    }
    sb.AppendLine(" Canonicalize: Exiting");

    if (haveValuesChanged)
    {
        GstUtil.Write(GstVerbosityLevel.Verbose, sb.ToString());
    }
}
/// <summary>
/// Pairs an edge with a depth value.
/// </summary>
/// <param name="e">The edge to store in <c>Edge</c></param>
/// <param name="d">The depth to store in <c>Depth</c></param>
public DepthTaggedGstEdge(GstEdge e, int d)
{
    this.Edge = e;
    this.Depth = d;
}
/// <summary>
/// Walks the tree from the root, accumulating for each word the text along the path to every
/// leaf edge, and checks that each accumulated string equals the suffix of the same length
/// of the corresponding word. Progress is written to the console.
/// </summary>
/// <param name="tree">The suffix tree to validate</param>
/// <param name="failedLeafNodeIds">Receives the Ids of the leaf nodes whose path text did not match</param>
/// <returns>True when no leaf failed the comparison</returns>
private static bool validateSuffixStrings(
    GSuffixTree tree, out List<int> failedLeafNodeIds)
{
    var edgeStringDicts = new Stack<EdgeStringDict>();

    // Step 1: Populate edgeStringDicts with data from child edges of the root node.
    //         Track any leaves that are immediate children of the root node.
    var leafEdgeStringDicts = new List<EdgeStringDict>();
    foreach (GstEdge edge in tree.Root.ChildEdges())
    {
        var edgeStringDict = new EdgeStringDict(edge, new Dictionary<int, string>());
        foreach (int wordNum in edge.WordNums())
        {
            edgeStringDict.Item2[wordNum] = edge.GetText();
        }

        // Push each root edge exactly once, regardless of how many words it carries,
        // so its subtree is not walked repeatedly in Step 2.
        edgeStringDicts.Push(edgeStringDict);

        if (!edge.ChildNode.HasChildEdges())
        {
            Console.WriteLine(String.Format(
                "SuffixTreeTest: Found a leaf edge adjacent to the root: E{0:d}",
                edge.Id));
            leafEdgeStringDicts.Add(edgeStringDict);
        }
    }

    // Step 2: Walk the tree, adding the remaining edges. Keep track of leaf edges.
    //         Also keep a running record of accumulated text for each edge.
    while (edgeStringDicts.Count > 0)
    {
        EdgeStringDict edgeStringDict = edgeStringDicts.Pop();
        foreach (GstEdge childEdge in edgeStringDict.Item1.ChildNode.ChildEdges())
        {
            EdgeStringDict newEdgeStringDict = new EdgeStringDict(childEdge, new Dictionary<int, string>());
            foreach (int wordNum in childEdge.WordNums())
            {
                // Path text so far (from the parent edge) plus this edge's own text.
                newEdgeStringDict.Item2[wordNum] = edgeStringDict.Item2[wordNum] + childEdge.GetText();
            }
            edgeStringDicts.Push(newEdgeStringDict);

            if (!childEdge.ChildNode.HasChildEdges())
            {
                // The Id is formatted with ":d", as in Step 1; ":s" is not a valid
                // specifier for an integer and throws FormatException.
                Console.WriteLine(String.Format(
                    "SuffixTreeTest: Found a leaf not adjacent to the root: E{0:d}",
                    newEdgeStringDict.Item1.Id));
                leafEdgeStringDicts.Add(newEdgeStringDict);
            }
        }
    }

    // Step 3: Inspect the leaf edge content (i.e., strings). Keep track of failed leaf nodes.
    const string comparisonFormat =
        "SuffixTreeTest: Leaf edge #{0:d}, word#{1:d}. Comparing \"{2:s}\" with \"{3:s}\"";
    failedLeafNodeIds = new List<int>();
    foreach (var leafEdgeStringDict in leafEdgeStringDicts)
    {
        GstEdge edge = leafEdgeStringDict.Item1;
        foreach (int wordNum in leafEdgeStringDict.Item2.Keys)
        {
            // The accumulated path text should equal the suffix of the word with the same length.
            string pathStr = leafEdgeStringDict.Item2[wordNum];
            int len = pathStr.Length;
            string textStr = tree.GetRangeString(
                wordNum,
                tree.GetWord(wordNum).Length - len,
                tree.GetWord(wordNum).Length - 1);

            Console.WriteLine(comparisonFormat, edge.Id, wordNum, pathStr, textStr);

            if (pathStr != textStr)
            {
                // Record the leaf once and move on to the next leaf.
                failedLeafNodeIds.Add(leafEdgeStringDict.Item1.ChildNode.Id);
                break;
            }
        }
    }

    return failedLeafNodeIds.Count == 0;
}
/// <summary>
/// Stores the given edge in <c>childEdges</c> under the given character.
/// </summary>
/// <param name="c">The character used as the key</param>
/// <param name="edge">The edge to store</param>
/// <remarks>
/// NOTE(review): if <c>childEdges</c> is a Dictionary, Add throws when the key is already present -- confirm.
/// </remarks>
public void AddChildEdge(char c, GstEdge edge)
{
    this.childEdges.Add(c, edge);
}