/// <summary>
            ///     Traverse the suffix tree, following the longest path from the root that matches a prefix of words[wordNum].
            ///     This allows the caller to skip over these duplicate characters, and process only the remaining part of the word.
            /// </summary>
            /// <param name="active">
            ///     The current active suffix. Its origin node and begin/end indexes are updated for every edge visited.
            /// </param>
            /// <param name="endIndex">
            ///     Incremented by the number of characters skipped (only fully covered edges are counted).
            /// </param>
            /// <param name="wordNum">The index of the current word being processed</param>
            /// <seealso href="http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides08.pdf">
            ///     The first 10 slides of this slideshow by Pekka Kilpeläinen
            ///     have useful tips on creating a generalized suffix tree.
            /// </seealso>
            /// <remarks>
            ///     TODO: Note: The following method is WORK IN PROGRESS, and does not yet work.
            ///     An edge is only treated as covered when the word matches every one of its characters.
            ///     A word that ends part-way along an edge is handled like a mismatch, so its remaining
            ///     characters are left for the caller to process.
            /// </remarks>
            private void skipDuplicateInitialSubstring(ref GstSuffix active, ref int endIndex, int wordNum)
            {
                GstNode curNode  = root;
                GstEdge nextEdge = null;

                // Traverse matching edges
                while (
                    (endIndex < wordDict[wordNum].Length) &&
                    ((nextEdge = curNode.GetChildEdge(GetWordChar(wordNum, endIndex))) != null)
                    )
                {
                    int strLen = nextEdge.Span(0) + 1;
                    // edgeStr = String in next edge
                    string edgeStr = nextEdge.GetText();
                    // wordStr = next segment of upcoming word that corresponds to edgeStr
                    // (clamped to the characters that remain in the word, so it may be shorter than strLen)
                    string wordStr = wordDict[wordNum].Substring(endIndex, Math.Min(strLen, wordDict[wordNum].Length - endIndex));

                    // A word that runs out before the end of the edge cannot cover the edge.
                    bool foundMismatch   = wordStr.Length < strLen;
                    int  numCharsMatched = 0;
                    // Traverse matching characters within edge.  The bound is wordStr.Length (never more
                    // than strLen) so that a short remaining word cannot be indexed out of range.
                    for (int i = 0; i < wordStr.Length; i++)
                    {
                        if (edgeStr[i] != wordStr[i])
                        {
                            foundMismatch = true;
                            break;
                        }
                        numCharsMatched++;
                    }

                    if (foundMismatch)
                    {
                        GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                                              "  skipDuplicateInitialSubstring: Word #{0:d} does not cover existing edge #{1:d}",
                                              wordNum, nextEdge.Id));
                        active.OriginNode = nextEdge.ParentNode;
                        active.EndIndex   = active.BeginIndex;
                        // endIndex is deliberately not advanced for a partially matched edge.
                        break;
                    }
                    else
                    {
                        nextEdge.SetBeginIndex(wordNum, endIndex);
                        nextEdge.SetEndIndex(wordNum, endIndex + strLen - 1);
                        GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                                              "  skipDuplicateInitialSubstring: Word #{0:d} covers existing edge #{1:d} ({2:s})",
                                              wordNum, nextEdge.Id, nextEdge.ToString(wordNum)));
                        active.OriginNode  = nextEdge.ChildNode;
                        active.BeginIndex += numCharsMatched;
                        active.EndIndex    = active.BeginIndex;
                    }
                    endIndex += numCharsMatched;

                    // Set up next iteration of loop
                    curNode = nextEdge.ChildNode;
                }
            }
            /// <summary>
            ///     Moves the active suffix on to the next suffix: at the root the begin index is advanced by one,
            ///     otherwise the origin node is replaced by its suffix node.  The suffix is then canonicalized,
            ///     and a verbose log line is written if anything about it changed.
            /// </summary>
            /// <param name="active">The active suffix to advance (modified in place).</param>
            /// <param name="wordNum">The index of the current word (not used by this method).</param>
            private void incrSuffix(ref GstSuffix active, int wordNum)
            {
                // Snapshot the suffix so that a change can be detected and reported afterwards.
                int previousNodeId = active.OriginNode.Id;
                int previousBegin  = active.BeginIndex;
                int previousEnd    = active.EndIndex;

                if (!active.OriginNode.IsRoot())
                {
                    active.OriginNode = active.OriginNode.SuffixNode;
                }
                else
                {
                    active.BeginIndex++;
                }
                active.Canonicalize();

                bool isUnchanged = previousNodeId == active.OriginNode.Id &&
                                   previousBegin == active.BeginIndex &&
                                   previousEnd == active.EndIndex;
                if (isUnchanged)
                {
                    return;
                }

                string before = GstSuffix.ToSuffixString(previousNodeId, previousBegin, previousEnd);
                string after  = GstSuffix.ToSuffixString(active.OriginNode.Id, active.BeginIndex, active.EndIndex);
                GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                                      "  incrSuffix: Active suffix changed from {0:s} to {1:s}",
                                      before, after));
            }
 /// <summary>
 ///     Rule #1 (Ukkonen's first group of t_i-transitions): Try to find matching edge for the parent node.
 ///     For an implicit active suffix whose next character does not match, the edge is split.
 /// </summary>
 /// <param name="active">The current active suffix.</param>
 /// <param name="parentNode">
 ///     This is a member of active.  It is kept separate for clarity.
 ///     It is overwritten with the node returned by the split when an edge is split.
 /// </param>
 /// <param name="endIndex">Index, within the current word, of the character being added.</param>
 /// <param name="wordNum">The index of the current word.</param>
 /// <returns>
 ///     <c>ExtensionResult.Done</c> when the character is already present; otherwise <c>ExtensionResult.NotDone</c>.
 /// </returns>
 private ExtensionResult extendSuffixByRuleOne(
     ref GstSuffix active, ref GstNode parentNode, int endIndex, int wordNum)
 {
     if (active.IsExplicit)
     {
         // Explicit suffix: look for a set child edge starting with the new character.
         GstEdge childEdge = active.OriginNode.GetChildEdge(GetWordChar(wordNum, endIndex));
         if (childEdge == null || !childEdge.IsSet())
         {
             return ExtensionResult.NotDone;
         }
         return ExtensionResult.Done;
     }

     // Implicit suffix: the active point lies part-way along an edge.
     GstEdge activeEdge = active.OriginNode.GetChildEdge(GetWordChar(wordNum, active.BeginIndex));
     int     offset     = active.EndIndex - active.BeginIndex;
     if (activeEdge == null)
     {
         return ExtensionResult.NotDone;
     }

     // Compare the character just past the active point on the edge with the character being added.
     int  extantWordNum   = activeEdge.GetExtantWordNum();
     bool nextCharMatches =
         GetWordChar(extantWordNum, activeEdge.GetBeginIndex(extantWordNum) + offset + 1)
         == GetWordChar(wordNum, endIndex);
     if (nextCharMatches)
     {
         return ExtensionResult.Done;
     }

     GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                           "  Rule #1: About to split edge E{0:d} (\"{1:s}\") at suffix {2:s}",
                           activeEdge.Id, activeEdge.GetText(), active.ToString()));
     parentNode = activeEdge.Split(active);
     return ExtensionResult.NotDone;
 }
            /// <summary>
            ///     Registers a word under the next word number and extends the suffix tree with it,
            ///     one character at a time.
            /// </summary>
            /// <param name="word">The word to add.</param>
            /// <param name="doConsoleVerbose">
            ///     When true, a completion message and the tree's string form are also written to the console.
            /// </param>
            /// <returns>False when <paramref name="word"/> is null or empty; otherwise true.</returns>
            private bool AddWord(string word, bool doConsoleVerbose = false)
            {
                if (string.IsNullOrEmpty(word))
                {
                    return false;
                }

                GstUtil.WriteLine(GstVerbosityLevel.Verbose, new String('-', 40));

                int wordNum = wordCount++;
                wordDict[wordNum] = word;
                GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                                      "Adding word #{0:d} (\"{1:s}\") to the suffix tree",
                                      wordNum, wordDict[wordNum]));

                GstSuffix active = new GstSuffix(this, root, wordNum, 0, GSuffixTree.InfiniteIndex);
                GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                                      "Created active (longest proper) suffix pointer: {0:s}",
                                      active.ToString()));

                // Only words after the first are checked for a prefix that is already in the tree.
                int position = 0;
                if (wordNum > 0)
                {
                    skipDuplicateInitialSubstring(ref active, ref position, wordNum);
                }
                if (position > 0)
                {
                    GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                                          "The first {0:d} letter(s) of word #{1:d} are already in the suffix tree",
                                          position, wordNum));
                }

                // Feed the remaining characters of the word into the tree.
                while (position < wordDict[wordNum].Length)
                {
                    GstUtil.WriteLine(GstVerbosityLevel.Verbose, this.ToString());
                    GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                                          "Calling extendSuffixes() for word #{0:d}, with endIndex = {1:d} ('{2:c}') and active suffix = {3:s}",
                                          wordNum, position, GetWordChar(wordNum, position), active.ToString()));
                    extendSuffixes(ref active, position, wordNum);
                    position++;
                }

                if (!doConsoleVerbose)
                {
                    return true;
                }

                string doneMessage = String.Format("Done adding word #{0:d} (\"{1:s}\") to the suffix tree",
                                                   wordNum, wordDict[wordNum]);
                GstUtil.WriteLine(GstVerbosityLevel.Verbose, doneMessage);
                Console.WriteLine(doneMessage);
                Console.WriteLine(this.ToString());
                return true;
            }
            /// <summary>
            ///     Rule #2 (Ukkonen's second group of t_i-transitions):
            ///         Create a new edge and add it to the tree at the parent's position.
            ///         Part of this is inserting the new edge into the hash table,
            ///         and creating a suffix link to the new node from the last one visited.
            /// </summary>
            /// <param name="active">The current active suffix (not read by this method).</param>
            /// <param name="parentNode">This is a member of active.  It is kept separate for clarity.</param>
            /// <param name="prevParentNode">
            ///     The parent node used by the previous extension; it is linked to
            ///     <paramref name="parentNode"/> and then updated to <paramref name="parentNode"/>.
            /// </param>
            /// <param name="endIndex">Index, within the current word, at which the new edge begins.</param>
            /// <param name="wordNum">The index of the current word.</param>
            private void extendSuffixByRuleTwo(
                ref GstSuffix active, GstNode parentNode, ref GstNode prevParentNode, int endIndex, int wordNum)
            {
                // The new edge runs from endIndex to the last character of the word.
                int     lastCharIndex = GetWord(wordNum).Length - 1;
                GstEdge addedEdge     = new GstEdge(this, parentNode, wordNum, endIndex, lastCharIndex);
                addedEdge.Add();

                string description = String.Format(
                    "  Rule #2: New edge E{0:d} (\"{1:s}\") connects N{2:d} (old parent) to N{3:d} (new child)",
                    addedEdge.Id,
                    addedEdge.GetText(),
                    addedEdge.ParentNode.Id,
                    addedEdge.ChildNode.Id);
                GstUtil.WriteLine(GstVerbosityLevel.Verbose, description);

                setSuffixLink(prevParentNode, parentNode);
                prevParentNode = parentNode;
            }
// Example #6
// 0
            /// <summary>
            ///     Splits this edge in two at the position described by the suffix <paramref name="s"/>.
            ///     A new edge is created starting at <c>s.OriginNode</c>, this edge's begin indexes are advanced by
            ///     <c>s.Span + 1</c>, and this edge is re-parented under the new edge's child node.
            /// </summary>
            /// <param name="s">The suffix whose origin node, word number, indexes and span determine the split.</param>
            /// <returns>The child node of the newly created edge, which becomes this edge's parent node.</returns>
            public GstNode Split(GstSuffix s)
            {
                // Create new edge
                int     wordNum = s.WordNum;
                GstEdge newEdge;

                // If this edge already has indexes for the suffix's word, derive the new edge's range from
                // this edge's own begin index; otherwise use the suffix's begin/end indexes directly.
                if (this.HasWordNum(s.WordNum))
                {
                    newEdge = new GstEdge(tree, s.OriginNode, wordNum, GetBeginIndex(wordNum), GetBeginIndex(wordNum) + s.Span);
                }
                else
                {
                    newEdge = new GstEdge(tree, s.OriginNode, wordNum, s.BeginIndex, s.EndIndex);
                }
                // Copy this edge's begin index for every word onto the new edge.
                foreach (int n in beginIndexes.Keys)
                {
                    newEdge.SetBeginIndex(n, beginIndexes[n]);
                }
                // For every word, the new edge ends s.Span characters after this edge's begin index.
                // NOTE(review): the keys come from endIndexes but the lookup is in beginIndexes; this
                // presumably relies on both holding the same word numbers -- confirm.
                foreach (int n in endIndexes.Keys)
                {
                    newEdge.SetEndIndex(n, beginIndexes[n] + s.Span);
                }
                newEdge.ChildNode.SuffixNode = s.OriginNode;

                // Remember the first character before the begin indexes are shifted below.
                char oldFirstChar = GetFirstChar();

                // Modify old edge
                // The keys are copied to an array first, presumably because IncBeginIndex changes
                // beginIndexes while it is being walked -- TODO confirm.
                int [] wordNums = beginIndexes.Keys.ToArray();
                foreach (int n in wordNums)
                {
                    IncBeginIndex(n, s.Span + 1);
                }

                // Perform switch
                // MoveFromTo receives the old parent and old first character, followed by the new parent
                // and the first character after the shift; it presumably re-registers this edge under the
                // new parent -- TODO confirm against its definition.
                MoveFromTo(ParentNode, oldFirstChar, newEdge.ChildNode, GetFirstChar());
                ParentNode = newEdge.ChildNode;
                newEdge.Add();
                GstUtil.WriteLine(GstVerbosityLevel.Normal, String.Format(
                                      "  Split E{0:d} into E{1:d} + E{0:d} = \"{2:s}\" + \"{3:s}\"",
                                      Id, newEdge.Id,
                                      newEdge.GetText(),
                                      this.GetText()
                                      ));

                return(newEdge.ChildNode);
            }
            /// <summary>
            ///     Extends the tree for the character at <paramref name="endIndex"/> of the current word.
            ///     Rule one and rule two are applied to successive suffixes until rule one reports
            ///     <c>ExtensionResult.Done</c>; the final suffix link is then set and the active suffix is
            ///     lengthened by one character and canonicalized.
            /// </summary>
            /// <param name="active">The active suffix (modified in place).</param>
            /// <param name="endIndex">Index, within the current word, of the character being added.</param>
            /// <param name="wordNum">The index of the current word.</param>
            private void extendSuffixes(ref GstSuffix active, int endIndex, int wordNum)
            {
                GstNode lastParentNode = null;
                GstNode parentNode;

                while (true)
                {
                    parentNode = active.OriginNode;

                    ExtensionResult outcome = extendSuffixByRuleOne(ref active, ref parentNode, endIndex, wordNum);
                    if (outcome == ExtensionResult.Done)
                    {
                        break;
                    }

                    extendSuffixByRuleTwo(ref active, parentNode, ref lastParentNode, endIndex, wordNum);
                    Debug.Assert(ValidateConsistentEdgeText(this, true));

                    // Move on to the next suffix before trying the rules again.
                    incrSuffix(ref active, wordNum);
                }

                setSuffixLink(lastParentNode, parentNode);
                active.EndIndex++;
                active.Canonicalize();
            }