/// <summary>
            ///     Traverse the suffix tree, following the longest path from the root that matches a prefix of
            ///     the word with index <paramref name="wordNum"/>.
            ///     This allows the caller to skip over these duplicate characters, and process only the
            ///     remaining part of the upcoming word.
            /// </summary>
            /// <param name="active">
            ///     The current active suffix.  Its origin node and begin/end indexes are updated as edges are matched.
            /// </param>
            /// <param name="endIndex">
            ///     Index of the next character of the word to examine.  It is advanced by the number of
            ///     characters matched on each fully covered edge.
            /// </param>
            /// <param name="wordNum">The index of the current word being processed</param>
            /// <seealso href="http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides08.pdf">
            ///     The first 10 slides of this slideshow by Pekka Kilpeläinen
            ///     have useful tips on creating a generalized suffix tree.
            /// </seealso>
            /// <remarks>
            ///     TODO: Note: The following method is WORK IN PROGRESS, and does not yet work.
            /// </remarks>
            private void skipDuplicateInitialSubstring(ref GstSuffix active, ref int endIndex, int wordNum)
            {
                GstNode curNode  = root;
                GstEdge nextEdge = null;

                // Traverse matching edges
                while (
                    (endIndex < wordDict[wordNum].Length) &&
                    ((nextEdge = curNode.GetChildEdge(GetWordChar(wordNum, endIndex))) != null)
                    )
                {
                    int strLen = nextEdge.Span(0) + 1;
                    // edgeStr = String in next edge
                    string edgeStr = nextEdge.GetText();
                    // wordStr = next segment of upcoming word that corresponds to edgeStr.
                    // It is shorter than strLen when the word ends part-way along the edge.
                    string wordStr = wordDict[wordNum].Substring(endIndex, Math.Min(strLen, wordDict[wordNum].Length - endIndex));

                    // A word that runs out before the edge does cannot cover the edge, so it is
                    // treated as a mismatch instead of indexing past the end of wordStr.
                    bool foundMismatch   = wordStr.Length < strLen;
                    int  numCharsMatched = 0;
                    // Traverse matching characters within edge
                    for (int i = 0; i < wordStr.Length; i++)
                    {
                        if (edgeStr[i] != wordStr[i])
                        {
                            foundMismatch = true;
                            break;
                        }
                        numCharsMatched++;
                    }

                    if (foundMismatch)
                    {
                        GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                                              "  skipDuplicateInitialSubstring: Word #{0:d} does not cover existing edge #{1:d}",
                                              wordNum, nextEdge.Id));
                        active.OriginNode = nextEdge.ParentNode;
                        active.EndIndex   = active.BeginIndex;
                        // endIndex is deliberately left untouched for a partially matched edge.
                        break;
                    }

                    // The word covers the whole edge: record this word's index range on the edge
                    // and move the active suffix to the edge's child node.
                    nextEdge.SetBeginIndex(wordNum, endIndex);
                    nextEdge.SetEndIndex(wordNum, endIndex + strLen - 1);
                    GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                                          "  skipDuplicateInitialSubstring: Word #{0:d} covers existing edge #{1:d} ({2:s})",
                                          wordNum, nextEdge.Id, nextEdge.ToString(wordNum)));
                    active.OriginNode  = nextEdge.ChildNode;
                    active.BeginIndex += numCharsMatched;
                    active.EndIndex    = active.BeginIndex;
                    endIndex          += numCharsMatched;

                    // Set up next iteration of loop
                    curNode = nextEdge.ChildNode;
                }
            }
 /// <summary>
 ///     Rule #1 (Ukkonen's first group of t_i-transitions): Try to find matching edge for the parent node.
 ///     Checks whether the character at <paramref name="endIndex"/> of word <paramref name="wordNum"/>
 ///     already follows the active suffix in the tree.
 /// </summary>
 /// <param name="active">The active suffix being extended.</param>
 /// <param name="parentNode">
 ///     This is a member of active.  It is kept separate for clarity.
 ///     When an implicit suffix has to be split, it receives the node returned by the edge split.
 /// </param>
 /// <param name="endIndex">Index, within the word, of the character being added.</param>
 /// <param name="wordNum">Index of the word being processed.</param>
 /// <returns>
 ///     <c>ExtensionResult.Done</c> when the character is already present;
 ///     otherwise <c>ExtensionResult.NotDone</c>.
 /// </returns>
 private ExtensionResult extendSuffixByRuleOne(
     ref GstSuffix active, ref GstNode parentNode, int endIndex, int wordNum)
 {
     if (!active.IsExplicit)
     {
         // Implicit suffix: the active point lies part-way along an edge.
         GstEdge activeEdge = active.OriginNode.GetChildEdge(GetWordChar(wordNum, active.BeginIndex));
         int     activeSpan = active.EndIndex - active.BeginIndex;
         if (activeEdge == null)
         {
             return ExtensionResult.NotDone;
         }

         // Compare the character just past the active point on the edge with the new character.
         int  extantWordNum = activeEdge.GetExtantWordNum();
         char charOnEdge    = GetWordChar(extantWordNum, activeEdge.GetBeginIndex(extantWordNum) + activeSpan + 1);
         if (charOnEdge == GetWordChar(wordNum, endIndex))
         {
             return ExtensionResult.Done;
         }

         string splitMessage = String.Format(
             "  Rule #1: About to split edge E{0:d} (\"{1:s}\") at suffix {2:s}",
             activeEdge.Id, activeEdge.GetText(), active.ToString());
         GstUtil.WriteLine(GstVerbosityLevel.Verbose, splitMessage);
         parentNode = activeEdge.Split(active);
         return ExtensionResult.NotDone;
     }

     // Explicit suffix: the active point is a node, so look for a child edge starting with the new character.
     GstEdge matchingEdge = active.OriginNode.GetChildEdge(GetWordChar(wordNum, endIndex));
     if (matchingEdge != null && matchingEdge.IsSet())
     {
         return ExtensionResult.Done;
     }
     return ExtensionResult.NotDone;
 }
            /// <summary>
            ///     Looks up the child edge registered under the character <paramref name="c"/>.
            /// </summary>
            /// <param name="c">The key character to look up in <c>childEdges</c>.</param>
            /// <returns>
            ///     The edge stored under <paramref name="c"/>, or <c>null</c> when the lookup fails.
            /// </returns>
            public GstEdge GetChildEdge(char c)
            {
                GstEdge found;

                return(childEdges.TryGetValue(c, out found) ? found : null);
            }
// Example #4
// 0
            /// <summary>
            ///     Re-registers this edge: removes it from <paramref name="oldParentNode"/> (where it is keyed by
            ///     <paramref name="oldFirstChar"/>) and adds it to <paramref name="newParentNode"/> under
            ///     <paramref name="newFirstChar"/>.
            /// </summary>
            /// <param name="oldParentNode">Node that currently holds this edge as a child edge.</param>
            /// <param name="oldFirstChar">Character under which this edge is currently registered.</param>
            /// <param name="newParentNode">Node that should hold this edge afterwards.</param>
            /// <param name="newFirstChar">Character under which this edge is registered on the new parent.</param>
            /// <exception cref="ArgumentException">
            ///     Thrown when the edge found under <paramref name="oldFirstChar"/> on
            ///     <paramref name="oldParentNode"/> is not this edge.
            /// </exception>
            public void MoveFromTo(GstNode oldParentNode, char oldFirstChar, GstNode newParentNode, char newFirstChar)
            {
                // Sanity check before touching either node: the old registration must point at this edge.
                // NOTE(review): the message says "MoveTo" although the method is named MoveFromTo.
                if (oldParentNode.GetChildEdge(oldFirstChar) != this)
                {
                    throw new ArgumentException("Error: MoveTo called with incorrect parent node and/or first char arguments");
                }

                oldParentNode.RemoveChildEdge(oldFirstChar);
                newParentNode.AddChildEdge(newFirstChar, this);
            }
            /// <summary>
            ///     Builds a text table of the tree's edges: a banner line followed by one row per
            ///     (edge, word) pair for which the edge has an entry for that word.
            /// </summary>
            /// <param name="wordNum">
            ///     The word whose rows are listed, or <c>GSuffixTree.NoWordNum</c> to list rows for every word in <c>wordDict</c>.
            /// </param>
            /// <param name="doAddIds">When true, each row (and the banner) is prefixed with an edge-id column.</param>
            /// <param name="doAddTree">
            ///     When true, each row is followed by a '*' indented in proportion to the edge's depth.
            /// </param>
            /// <returns>The formatted table; every row ends with a line terminator.</returns>
            private string toStringEdgeTable(int wordNum,
                                             bool doAddIds  = true,
                                             bool doAddTree = true)
            {
                StringBuilder sb           = new StringBuilder();
                int           maxWordLen   = wordDict.Values.Select(w => w.Length).Max();
                string        edgesBanner  = toStringEdgeBanner(maxWordLen);
                string        addIdsSpacer = "  Id  ";

                int[] wordNums = (wordNum == GSuffixTree.NoWordNum)
                    ? wordDict.Keys.ToArray()
                    : new int[] { wordNum };

                // The row layout does not depend on the edge, so it is built once rather than per edge.
                // The last column is wide enough for the longest word plus one character (minimum 7).
                string formatStr = "  {0,-11:d}{1,-11:d}{2,-11:s}{3,-11:d}{4,-11:d}{5,-9:d}{6,-"
                                   + Math.Max(7, 1 + maxWordLen).ToString()
                                   + ":s}";

                if (doAddIds)
                {
                    sb.Append(addIdsSpacer);
                }
                sb.AppendLine(edgesBanner);

                foreach (DepthTaggedGstEdge dtEdge in DepthTaggedEdges())
                {
                    GstEdge edge = dtEdge.Edge;
                    for (int i = 0; i < wordNums.Length; i++)
                    {
                        int curWordNum = wordNums[i];
                        if (!edge.HasWordNum(curWordNum))
                        {
                            continue;
                        }
                        if (doAddIds)
                        {
                            sb.Append(String.Format("  {0,-4:d}", edge.Id));
                        }

                        int beginIndex = edge.GetBeginIndex(curWordNum);
                        int endIndex   = edge.GetEndIndex(curWordNum);
                        sb.Append(String.Format(formatStr,
                                                edge.ParentNode.Id, edge.ChildNode.Id,
                                                (edge.ChildNode.SuffixNode == null ? "null" : edge.ChildNode.SuffixNode.Id.ToString()),
                                                curWordNum,
                                                beginIndex, endIndex,
                                                // Indent the edge text so it lines up with its position in the word.
                                                (new String(' ', beginIndex)) + GetRangeString(curWordNum, beginIndex, endIndex)
                                                ));
                        if (doAddTree)
                        {
                            string depthStr = new String(' ', 2 * dtEdge.Depth - 1) + "*";
                            sb.AppendLine(depthStr);
                        }
                        else
                        {
                            // Terminate the row even without the tree column; otherwise all rows run together.
                            sb.AppendLine();
                        }
                    }
                }
                return(sb.ToString());
            }
            /// <summary>
            ///     Rule #2 (Ukkonen's second group of t_i-transitions):
            ///         Create a new edge and add it to the tree at the parent's position.
            ///         The new edge covers the current word from <paramref name="endIndex"/> to its last character.
            ///         Afterwards <c>setSuffixLink</c> is called with the previously visited parent node and
            ///         <paramref name="parentNode"/>, and <paramref name="parentNode"/> becomes the new
            ///         previously visited parent.
            /// </summary>
            /// <param name="active">The active suffix (not read by this method).</param>
            /// <param name="parentNode">This is a member of active.  It is kept separate for clarity.</param>
            /// <param name="prevParentNode">The last parent node visited; set to <paramref name="parentNode"/> on return.</param>
            /// <param name="endIndex">Index, within the word, of the first character of the new edge.</param>
            /// <param name="wordNum">Index of the word being processed.</param>
            private void extendSuffixByRuleTwo(
                ref GstSuffix active, GstNode parentNode, ref GstNode prevParentNode, int endIndex, int wordNum)
            {
                int     lastCharIndex = GetWord(wordNum).Length - 1;
                GstEdge addedEdge     = new GstEdge(this, parentNode, wordNum, endIndex, lastCharIndex);
                addedEdge.Add();

                string logMessage = String.Format(
                    "  Rule #2: New edge E{0:d} (\"{1:s}\") connects N{2:d} (old parent) to N{3:d} (new child)",
                    addedEdge.Id,
                    addedEdge.GetText(),
                    addedEdge.ParentNode.Id,
                    addedEdge.ChildNode.Id);
                GstUtil.WriteLine(GstVerbosityLevel.Verbose, logMessage);

                // Link from the previously visited parent, then remember this one for the next extension.
                setSuffixLink(prevParentNode, parentNode);
                prevParentNode = parentNode;
            }
// Example #7
// 0
            /// <summary>
            ///     Splits this edge in two at the position described by the suffix <paramref name="s"/>.
            ///     A new edge is created from <c>s.OriginNode</c> and takes over the leading part of this edge;
            ///     this edge keeps the remainder and is re-registered under the new edge's child node.
            /// </summary>
            /// <param name="s">The suffix whose origin node, word number and span determine the split point.</param>
            /// <returns>The child node of the newly created edge, which becomes this edge's parent node.</returns>
            public GstNode Split(GstSuffix s)
            {
                // Create new edge
                int     wordNum = s.WordNum;
                GstEdge newEdge;

                // When this edge already has an entry for the suffix's word, the new edge takes its
                // range from this edge's begin index; otherwise it takes the suffix's own indexes.
                if (this.HasWordNum(s.WordNum))
                {
                    newEdge = new GstEdge(tree, s.OriginNode, wordNum, GetBeginIndex(wordNum), GetBeginIndex(wordNum) + s.Span);
                }
                else
                {
                    newEdge = new GstEdge(tree, s.OriginNode, wordNum, s.BeginIndex, s.EndIndex);
                }
                // Copy every word's begin index from this edge onto the new edge.
                foreach (int n in beginIndexes.Keys)
                {
                    newEdge.SetBeginIndex(n, beginIndexes[n]);
                }
                // The new edge ends s.Span characters after its begin index, for every word.
                // NOTE(review): indexes beginIndexes with keys of endIndexes -- assumes both hold the same keys; confirm.
                foreach (int n in endIndexes.Keys)
                {
                    newEdge.SetEndIndex(n, beginIndexes[n] + s.Span);
                }
                newEdge.ChildNode.SuffixNode = s.OriginNode;

                // Remember the first character before the begin indexes are advanced below;
                // it is passed to MoveFromTo as the character this edge was registered under.
                char oldFirstChar = GetFirstChar();

                // Modify old edge
                // The keys are snapshotted into an array before the loop that calls IncBeginIndex.
                int [] wordNums = beginIndexes.Keys.ToArray();
                foreach (int n in wordNums)
                {
                    IncBeginIndex(n, s.Span + 1);
                }

                // Perform switch
                // GetFirstChar() is called again here, after the begin indexes were advanced.
                MoveFromTo(ParentNode, oldFirstChar, newEdge.ChildNode, GetFirstChar());
                ParentNode = newEdge.ChildNode;
                newEdge.Add();
                GstUtil.WriteLine(GstVerbosityLevel.Normal, String.Format(
                                      "  Split E{0:d} into E{1:d} + E{0:d} = \"{2:s}\" + \"{3:s}\"",
                                      Id, newEdge.Id,
                                      newEdge.GetText(),
                                      this.GetText()
                                      ));

                return(newEdge.ChildNode);
            }
            /// <summary>
            ///     Lazily enumerates every edge reachable from the root, using an explicit stack
            ///     instead of recursion.
            /// </summary>
            /// <returns>
            ///     Each edge of the tree once.  An edge's child edges are pushed onto the stack
            ///     before the edge itself is yielded.
            /// </returns>
            public IEnumerable <GstEdge> Edges()
            {
                var pending = new Stack <GstEdge>();

                // Seed the traversal with the edges leaving the root.
                foreach (GstEdge rootEdge in root.ChildEdges())
                {
                    pending.Push(rootEdge);
                }

                while (pending.Count != 0)
                {
                    GstEdge current = pending.Pop();

                    // Queue the descendants first, then hand the current edge to the caller.
                    foreach (GstEdge descendant in current.ChildNode.ChildEdges())
                    {
                        pending.Push(descendant);
                    }
                    yield return(current);
                }
            }
            /// <summary>
            ///     Moves an implicit suffix down the tree: while the edge leaving the origin node is no longer
            ///     than the suffix's span, the origin node advances to that edge's child node and the
            ///     begin index advances past the edge.  Explicit suffixes are left untouched.
            /// </summary>
            /// <remarks>
            ///     Constraint: Implicit suffixes must have BeginIndex &lt; words[wordNum].Length
            /// </remarks>
            public void Canonicalize()
            {
                if (IsImplicit)
                {
                    bool          haveValuesChanged = false;
                    StringBuilder sb = new StringBuilder();
                    sb.AppendLine("  Canonicalize: Entering");

                    // Values at entry; every "changed from" log line reports these.
                    int origNodeId, begin, end;
                    origNodeId = this.OriginNode.Id;
                    begin      = this.beginIndex;
                    end        = this.endIndex;

                    GstEdge edge = OriginNode.GetChildEdge(tree.GetWordChar(WordNum, BeginIndex));
                    while (edge.Span() <= Span)
                    {
                        sb.Append(String.Format(
                                      "    Canonicalize: Active suffix changed from {0:s}",
                                      ToSuffixString(origNodeId, begin, end)));
                        this.beginIndex  += edge.Span() + 1;
                        this.OriginNode   = edge.ChildNode;
                        haveValuesChanged = true;
                        sb.AppendLine(String.Format(" to {0:s}",
                                                    ToSuffixString(OriginNode.Id, beginIndex, endIndex)));
                        if (Span >= 0)
                        {
                            // Look up the next edge using this suffix's own word, matching the initial
                            // lookup above (previously hard-coded to word 0).
                            edge = edge.ChildNode.GetChildEdge(tree.GetWordChar(WordNum, BeginIndex));
                        }
                    }
                    sb.AppendLine("  Canonicalize: Exiting");
                    // Only emit the trace when the suffix actually moved.
                    if (haveValuesChanged)
                    {
                        GstUtil.Write(GstVerbosityLevel.Verbose, sb.ToString());
                    }
                }
            }
// Example #10
// 0
 /// <summary>
 ///     Pairs an edge with a depth value.
 /// </summary>
 /// <param name="e">The edge, stored in <c>Edge</c>.</param>
 /// <param name="d">The depth, stored in <c>Depth</c>.</param>
 public DepthTaggedGstEdge(GstEdge e, int d)
 {
     this.Edge  = e;
     this.Depth = d;
 }
                /// <summary>
                ///     Checks that, for every leaf edge and every word on it, the text accumulated along the
                ///     path from the root to that leaf equals the suffix of the same length of that word.
                /// </summary>
                /// <param name="tree">The suffix tree to validate.</param>
                /// <param name="failedLeafNodeIds">
                ///     Receives the ids of the child nodes of leaf edges whose accumulated text did not match
                ///     (at most one entry per leaf edge).
                /// </param>
                /// <returns><c>true</c> when no leaf failed the comparison; otherwise <c>false</c>.</returns>
                private static bool validateSuffixStrings(
                    GSuffixTree tree,
                    out List <int> failedLeafNodeIds)
                {
                    var edgeStringDicts = new Stack <EdgeStringDict>();

                    // Step 1: Populate edgeStrings with data from child edges of the root node.
                    //         Track any leaves that are immediate children of the root node.
                    var leafEdgeStringDicts = new List <EdgeStringDict>();

                    foreach (GstEdge edge in tree.Root.ChildEdges())
                    {
                        var edgeStringDict = new EdgeStringDict(edge, new Dictionary <int, string>());

                        foreach (int wordNum in edge.WordNums())
                        {
                            edgeStringDict.Item2[wordNum] = edge.GetText();
                        }
                        // Push each root edge exactly once (as Step 2 does for deeper edges), so its
                        // subtree is not walked once per word and leaves are not recorded repeatedly.
                        edgeStringDicts.Push(edgeStringDict);
                        if (!edge.ChildNode.HasChildEdges())
                        {
                            Console.WriteLine(String.Format(
                                                  "SuffixTreeTest: Found a leaf edge adjacent to the root: E{0:d}",
                                                  edge.Id));
                            leafEdgeStringDicts.Add(edgeStringDict);
                        }
                    }

                    // Step 2: Walk the tree, adding the remaining edges.  Keep track of leaf edges.
                    //      Also keep a running record of accumulated text for each edge.
                    while (edgeStringDicts.Count > 0)
                    {
                        EdgeStringDict edgeStringDict = edgeStringDicts.Pop();
                        foreach (GstEdge childEdge in edgeStringDict.Item1.ChildNode.ChildEdges())
                        {
                            EdgeStringDict newEdgeStringDict = new EdgeStringDict(childEdge, new Dictionary <int, string>());
                            foreach (int wordNum in childEdge.WordNums())
                            {
                                // Path text so far (from the parent edge) plus this edge's own text.
                                newEdgeStringDict.Item2[wordNum] = edgeStringDict.Item2[wordNum] + childEdge.GetText();
                            }
                            edgeStringDicts.Push(newEdgeStringDict);
                            if (!childEdge.ChildNode.HasChildEdges())
                            {
                                // The id uses the ":d" specifier like every other edge-id log line;
                                // ":s" is not a valid format specifier for an integer id.
                                Console.WriteLine(String.Format(
                                                      "SuffixTreeTest: Found a leaf not adjacent to the root: E{0:d}",
                                                      newEdgeStringDict.Item1.Id));
                                leafEdgeStringDicts.Add(newEdgeStringDict);
                            }
                        }
                    }

                    // Step 3: Inspect the leaf edge content (i.e., strings).  Keep track of failed leaf nodes
                    failedLeafNodeIds = new List <int>();
                    foreach (var leafEdgeStringDict in leafEdgeStringDicts)
                    {
                        // Accumulated string should equal the word's suffix of the same length.
                        GstEdge edge = leafEdgeStringDict.Item1;
                        foreach (int wordNum in leafEdgeStringDict.Item2.Keys)
                        {
                            string pathStr = leafEdgeStringDict.Item2[wordNum];
                            int    wordLen = tree.GetWord(wordNum).Length;
                            string textStr = tree.GetRangeString(wordNum, wordLen - pathStr.Length, wordLen - 1);
                            Console.WriteLine(
                                "SuffixTreeTest: Leaf edge #{0:d}, word#{1:d}.  Comparing \"{2:s}\" with \"{3:s}\"",
                                edge.Id, wordNum, pathStr, textStr);
                            if (pathStr != textStr)
                            {
                                failedLeafNodeIds.Add(edge.ChildNode.Id);
                                // One failure is enough to flag this leaf; skip its remaining words.
                                break;
                            }
                        }
                    }
                    return(failedLeafNodeIds.Count == 0);
                }
 /// <summary>
 ///     Registers <paramref name="edge"/> as a child edge under the character <paramref name="c"/>.
 /// </summary>
 /// <param name="c">The character used as the key in <c>childEdges</c>.</param>
 /// <param name="edge">The edge to register.</param>
 /// <remarks>
 ///     NOTE(review): if <c>childEdges</c> is a standard dictionary, adding a character that is
 ///     already registered throws -- confirm against the field's declaration.
 /// </remarks>
 public void AddChildEdge(char c, GstEdge edge)
 {
     this.childEdges.Add(c, edge);
 }