/// <summary>
///     Points the suffix link of <paramref name="node"/> at <paramref name="suffixNode"/>,
///     logging whether the link is new, unchanged, or replaced.
///     Nothing happens when <paramref name="node"/> is null or is the root.
/// </summary>
/// <param name="node">The node whose suffix link is being assigned.</param>
/// <param name="suffixNode">The node the suffix link should lead to.</param>
private void setSuffixLink(GstNode node, GstNode suffixNode)
 {
     // A missing node and the root never receive a suffix link.
     if ((node == null) || (node == root))
     {
         return;
     }

     GstNode previousTarget = node.SuffixNode;
     string  logLine;

     if (previousTarget == null)
     {
         logLine = String.Format(
             "  New suffix link from N{0:d} to N{1:d}",
             node.Id, suffixNode.Id);
     }
     else if (previousTarget.Id == suffixNode.Id)
     {
         logLine = String.Format(
             "  Suffix link (N{0:d} to N{1:d}) retaining same value",
             node.Id, previousTarget.Id);
     }
     else
     {
         logLine = String.Format(
             "  Suffix link (N{0:d} to N{1:d}) set to new value (N{0:d} to N{2:d})",
             node.Id, previousTarget.Id, suffixNode.Id);
     }

     GstUtil.WriteLine(GstVerbosityLevel.Verbose, logLine);

     // The assignment happens in all three cases, including the "same value" one.
     node.SuffixNode = suffixNode;
 }
            /// <summary>
            ///     Traverse the suffix tree, following the longest path from the root that matches a prefix of words[wordNum].
            ///     This allows the caller to skip over these duplicate characters, and process only the remaining part of the upcoming word.
            /// </summary>
            /// <param name="active">The current active suffix; its origin node and indexes are updated as edges are matched.</param>
            /// <param name="endIndex">Position in the word at which matching starts; advanced by the number of characters skipped.</param>
            /// <param name="wordNum">The index of the current word being processed</param>
            /// <seealso href="http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides08.pdf">
            ///     The first 10 slides of this slideshow by Pekka Kilpeläinen
            ///     have useful tips on creating a generalized suffix tree.
            /// </seealso>
            /// <remarks>
            ///     TODO: Note: The following method is WORK IN PROGRESS, and does not yet work.
            ///     A word that ends part-way along an edge is treated the same as a character mismatch
            ///     on that edge (the edge is "not covered"), instead of indexing past the end of the word.
            /// </remarks>
            private void skipDuplicateInitialSubstring(ref GstSuffix active, ref int endIndex, int wordNum)
            {
                GstNode curNode  = root;
                GstEdge nextEdge = null;

                // Traverse matching edges
                while (
                    (endIndex < wordDict[wordNum].Length) &&
                    ((nextEdge = curNode.GetChildEdge(GetWordChar(wordNum, endIndex))) != null)
                    )
                {
                    int strLen = nextEdge.Span(0) + 1;
                    // edgeStr = String in next edge
                    string edgeStr = nextEdge.GetText();
                    // wordStr = next segment of upcoming word that corresponds to edgeStr
                    // (shorter than strLen when the word ends before the edge does)
                    string wordStr = wordDict[wordNum].Substring(endIndex, Math.Min(strLen, wordDict[wordNum].Length - endIndex));

                    // Only compare positions that exist in both strings; the original loop ran to
                    // strLen unconditionally and could index past the end of wordStr or edgeStr.
                    int comparableLen   = Math.Min(strLen, Math.Min(edgeStr.Length, wordStr.Length));
                    int numCharsMatched = 0;
                    // Traverse matching characters within edge
                    while ((numCharsMatched < comparableLen) &&
                           (edgeStr[numCharsMatched] == wordStr[numCharsMatched]))
                    {
                        numCharsMatched++;
                    }

                    // The edge is covered only if every one of its strLen characters matched.
                    bool foundMismatch = numCharsMatched < strLen;

                    if (foundMismatch)
                    {
                        GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                                              "  skipDuplicateInitialSubstring: Word #{0:d} does not cover existing edge #{1:d}",
                                              wordNum, nextEdge.Id));
                        active.OriginNode = nextEdge.ParentNode;
                        active.EndIndex   = active.BeginIndex;
                        break;
                    }

                    // Whole edge matched: record this word's index range on the edge and step past it.
                    nextEdge.SetBeginIndex(wordNum, endIndex);
                    nextEdge.SetEndIndex(wordNum, endIndex + strLen - 1);
                    GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                                          "  skipDuplicateInitialSubstring: Word #{0:d} covers existing edge #{1:d} ({2:s})",
                                          wordNum, nextEdge.Id, nextEdge.ToString(wordNum)));
                    active.OriginNode  = nextEdge.ChildNode;
                    active.BeginIndex += numCharsMatched;
                    active.EndIndex    = active.BeginIndex;

                    endIndex += numCharsMatched;

                    // Set up next iteration of loop
                    curNode = nextEdge.ChildNode;
                }
            }
 /// <summary>
 ///     Rule #1 (Ukkonen's first group of t_i-transitions): Try to find matching edge for the parent node.
 /// </summary>
 /// <param name="active">The active suffix being extended.</param>
 /// <param name="parentNode">
 ///     This is a member of active.  It is kept separate for clarity.
 ///     When an edge is split, it receives the node returned by the split.
 /// </param>
 /// <param name="endIndex">Index, within the word, of the character being added.</param>
 /// <param name="wordNum">Index of the word being processed.</param>
 /// <returns>
 ///     <c>ExtensionResult.Done</c> when the character is already present at the active point;
 ///     otherwise <c>ExtensionResult.NotDone</c>.
 /// </returns>
 private ExtensionResult extendSuffixByRuleOne(
     ref GstSuffix active, ref GstNode parentNode, int endIndex, int wordNum)
 {
     if (active.IsExplicit)
     {
         // Explicit suffix: done if the origin node already has a set edge for the new character.
         GstEdge matchingEdge = active.OriginNode.GetChildEdge(GetWordChar(wordNum, endIndex));
         if ((matchingEdge != null) && matchingEdge.IsSet())
         {
             return ExtensionResult.Done;
         }
         return ExtensionResult.NotDone;
     }

     // Implicit suffix: the active point lies inside the edge that starts with the suffix's first character.
     GstEdge activeEdge = active.OriginNode.GetChildEdge(GetWordChar(wordNum, active.BeginIndex));
     int     activeSpan = active.EndIndex - active.BeginIndex;
     if (activeEdge == null)
     {
         return ExtensionResult.NotDone;
     }

     // Compare the character just past the active span on the edge with the character being added.
     int extantWordNum = activeEdge.GetExtantWordNum();
     if (GetWordChar(extantWordNum, activeEdge.GetBeginIndex(extantWordNum) + activeSpan + 1)
         == GetWordChar(wordNum, endIndex))
     {
         return ExtensionResult.Done;
     }

     GstUtil.WriteLine(GstVerbosityLevel.Verbose, String.Format(
                           "  Rule #1: About to split edge E{0:d} (\"{1:s}\") at suffix {2:s}",
                           activeEdge.Id, activeEdge.GetText(), active.ToString()));
     parentNode = activeEdge.Split(active);
     return ExtensionResult.NotDone;
 }
Esempio n. 4
0
 /// <summary>
 ///     Creates a node belonging to <paramref name="tree"/> with an empty child-edge table.
 ///     The node takes the tree's current NodeCount as its id, and NodeCount is then incremented.
 /// </summary>
 /// <param name="tree">The suffix tree that owns this node.</param>
 /// <param name="suffixNode">Initial suffix-link target; may be null.</param>
 public GstNode(GSuffixTree tree, GstNode suffixNode)
 {
     this.tree       = tree;
     this.suffixNode = suffixNode;
     id              = tree.NodeCount++;
     childEdges      = new Dictionary <char, GstEdge>();
 }
Esempio n. 5
0
 /// <summary>
 ///     Creates an edge via the (tree, parentNode) constructor, then records the
 ///     begin and end indexes of the edge text for word <paramref name="wordNum"/>.
 /// </summary>
 /// <param name="tree">The suffix tree that owns this edge.</param>
 /// <param name="parentNode">The node this edge leaves from.</param>
 /// <param name="wordNum">Index of the word the index pair refers to.</param>
 /// <param name="beginIndex">Begin index stored for <paramref name="wordNum"/>.</param>
 /// <param name="endIndex">End index stored for <paramref name="wordNum"/>.</param>
 public GstEdge(
     GSuffixTree tree, GstNode parentNode, int wordNum, int beginIndex, int endIndex)
     : this(tree, parentNode)
 {
     beginIndexes[wordNum] = beginIndex;
     endIndexes[wordNum]   = endIndex;
 }
Esempio n. 6
0
 /// <summary>
 ///     Creates an edge with no index ranges yet. The edge takes the tree's current
 ///     EdgeCount as its id (EdgeCount is then incremented) and gets a freshly created
 ///     child node that has no suffix link.
 /// </summary>
 /// <param name="tree">The suffix tree that owns this edge.</param>
 /// <param name="parentNode">The node this edge leaves from.</param>
 public GstEdge(GSuffixTree tree, GstNode parentNode)
 {
     this.tree    = tree;
     id           = tree.EdgeCount++;
     beginIndexes = new Dictionary <int, int>();
     endIndexes   = new Dictionary <int, int>();

     ParentNode = parentNode;
     // Every edge owns a new node at its far end.
     ChildNode  = new GstNode(tree, null);
 }
Esempio n. 7
0
            /// <summary>
            ///     Detaches this edge from <paramref name="oldParentNode"/> (where it is stored under
            ///     <paramref name="oldFirstChar"/>) and attaches it to <paramref name="newParentNode"/>
            ///     under <paramref name="newFirstChar"/>.
            /// </summary>
            /// <param name="oldParentNode">Node that currently holds this edge as a child edge.</param>
            /// <param name="oldFirstChar">Character under which this edge is stored in the old parent.</param>
            /// <param name="newParentNode">Node that receives this edge.</param>
            /// <param name="newFirstChar">Character under which this edge is stored in the new parent.</param>
            /// <exception cref="ArgumentException">
            ///     The old parent does not hold this edge under <paramref name="oldFirstChar"/>.
            /// </exception>
            public void MoveFromTo(GstNode oldParentNode, char oldFirstChar, GstNode newParentNode, char newFirstChar)
            {
                // Verify the caller's view of the tree before mutating anything.
                GstEdge registeredEdge = oldParentNode.GetChildEdge(oldFirstChar);

                if (registeredEdge != this)
                {
                    // The message previously named a non-existent "MoveTo" method.
                    throw new ArgumentException("Error: MoveFromTo called with incorrect parent node and/or first char arguments");
                }
                oldParentNode.RemoveChildEdge(oldFirstChar);
                newParentNode.AddChildEdge(newFirstChar, this);
            }
 /// <summary>
 ///     Creates a suffix descriptor: an origin node plus a begin/end index pair into word
 ///     <paramref name="wordNum"/>.
 /// </summary>
 /// <param name="tree">The suffix tree this suffix belongs to.</param>
 /// <param name="originNode">Node from which the suffix starts.</param>
 /// <param name="wordNum">Index of the word the indexes refer to.</param>
 /// <param name="beginIndex">Begin index of the suffix.</param>
 /// <param name="endIndex">End index of the suffix.</param>
 public GstSuffix(
     GSuffixTree tree, GstNode originNode, int wordNum, int beginIndex, int endIndex)
 {
     this.tree       = tree;
     OriginNode      = originNode;
     WordNum         = wordNum;
     this.beginIndex = beginIndex;
     this.endIndex   = endIndex;
 }
 /// <summary>
 ///     Creates a suffix descriptor: an origin node plus a begin/end index pair into word
 ///     <paramref name="wordNum"/>.
 /// </summary>
 /// <param name="tree">The suffix tree this suffix belongs to.</param>
 /// <param name="originNode">Node from which the suffix starts.</param>
 /// <param name="wordNum">Index of the word the indexes refer to.</param>
 /// <param name="beginIndex">Begin index of the suffix.</param>
 /// <param name="endIndex">End index of the suffix.</param>
 public GstSuffix(
     GSuffixTree tree, GstNode originNode, int wordNum, int beginIndex, int endIndex)
 {
     this.tree       = tree;
     OriginNode      = originNode;
     WordNum         = wordNum;
     this.beginIndex = beginIndex;
     this.endIndex   = endIndex;
 }
            /// <summary>
            ///     Rule #2 (Ukkonen's second group of t_i-transitions):
            ///         Create a new edge and add it to the tree at the parent's position.
            ///         Part of this is inserting the new edge into the hash table,
            ///         and creating a suffix link to the new node from the last one visited.
            /// </summary>
            /// <param name="active">The active suffix; not read or modified by this method.</param>
            /// <param name="parentNode">This is a member of active.  It is kept separate for clarity.</param>
            /// <param name="prevParentNode">
            ///     The parent node used by the previous extension; it is linked to
            ///     <paramref name="parentNode"/> and then replaced by it.
            /// </param>
            /// <param name="endIndex">Index, within the word, at which the new edge's text begins.</param>
            /// <param name="wordNum">Index of the word being processed.</param>
            private void extendSuffixByRuleTwo(
                ref GstSuffix active, GstNode parentNode, ref GstNode prevParentNode, int endIndex, int wordNum)
            {
                // The new edge runs from endIndex to the last character of the word.
                int     lastCharIndex = GetWord(wordNum).Length - 1;
                GstEdge leafEdge      = new GstEdge(this, parentNode, wordNum, endIndex, lastCharIndex);

                leafEdge.Add();

                string logLine = String.Format(
                    "  Rule #2: New edge E{0:d} (\"{1:s}\") connects N{2:d} (old parent) to N{3:d} (new child)",
                    leafEdge.Id,
                    leafEdge.GetText(),
                    leafEdge.ParentNode.Id,
                    leafEdge.ChildNode.Id);
                GstUtil.WriteLine(GstVerbosityLevel.Verbose, logLine);

                // Link the previously used parent to this one, then remember this one for the next round.
                setSuffixLink(prevParentNode, parentNode);
                prevParentNode = parentNode;
            }
Esempio n. 11
0
            /// <summary>
            ///     Splits this edge at the point described by suffix <paramref name="s"/>.
            ///     A new edge is created leaving <c>s.OriginNode</c>; this edge is then re-attached
            ///     underneath the new edge's child node, and that node is returned.
            /// </summary>
            /// <param name="s">The suffix whose origin node, word number and span define the split point.</param>
            /// <returns>The child node of the newly created edge, which becomes this edge's parent.</returns>
            public GstNode Split(GstSuffix s)
            {
                // Create new edge
                int     wordNum = s.WordNum;
                GstEdge newEdge;

                // Seed the new edge with an index range for the suffix's word: taken from this edge
                // when this edge already has that word, otherwise from the suffix itself.
                if (this.HasWordNum(s.WordNum))
                {
                    newEdge = new GstEdge(tree, s.OriginNode, wordNum, GetBeginIndex(wordNum), GetBeginIndex(wordNum) + s.Span);
                }
                else
                {
                    newEdge = new GstEdge(tree, s.OriginNode, wordNum, s.BeginIndex, s.EndIndex);
                }
                // Give the new edge a begin index for every word recorded on this edge ...
                foreach (int n in beginIndexes.Keys)
                {
                    newEdge.SetBeginIndex(n, beginIndexes[n]);
                }
                // ... and an end index of (begin index + s.Span) for each of those words.
                // NOTE(review): indexes beginIndexes with keys taken from endIndexes, so this
                // assumes both dictionaries hold the same word numbers - confirm.
                foreach (int n in endIndexes.Keys)
                {
                    newEdge.SetEndIndex(n, beginIndexes[n] + s.Span);
                }
                // The node created in the middle gets the suffix's origin node as its suffix link.
                newEdge.ChildNode.SuffixNode = s.OriginNode;

                // Remember the first character before the begin indexes change; it is the key
                // MoveFromTo uses to find this edge in its current parent.
                char oldFirstChar = GetFirstChar();

                // Modify old edge
                // Keys are copied to an array first, presumably because IncBeginIndex modifies
                // beginIndexes and the dictionary must not change while it is being enumerated.
                int [] wordNums = beginIndexes.Keys.ToArray();
                foreach (int n in wordNums)
                {
                    IncBeginIndex(n, s.Span + 1);
                }

                // Perform switch
                // Re-key this edge from its old parent to the new middle node; the second
                // GetFirstChar() call is made after the begin indexes were incremented.
                MoveFromTo(ParentNode, oldFirstChar, newEdge.ChildNode, GetFirstChar());
                ParentNode = newEdge.ChildNode;
                newEdge.Add();
                GstUtil.WriteLine(GstVerbosityLevel.Normal, String.Format(
                                      "  Split E{0:d} into E{1:d} + E{0:d} = \"{2:s}\" + \"{3:s}\"",
                                      Id, newEdge.Id,
                                      newEdge.GetText(),
                                      this.GetText()
                                      ));

                return(newEdge.ChildNode);
            }
            /// <summary>
            ///     Repeatedly applies rule one and rule two to the active suffix, advancing the suffix
            ///     after each rule-two extension, until rule one reports that it is done. Afterwards
            ///     the last pending suffix link is set, the active suffix's end index is incremented,
            ///     and the suffix is canonicalized.
            /// </summary>
            /// <param name="active">The active suffix; updated in place.</param>
            /// <param name="endIndex">Index, within the word, of the character being added.</param>
            /// <param name="wordNum">Index of the word being processed.</param>
            private void extendSuffixes(ref GstSuffix active, int endIndex, int wordNum)
            {
                GstNode previousParent = null;
                GstNode currentParent;

                while (true)
                {
                    currentParent = active.OriginNode;

                    ExtensionResult outcome =
                        extendSuffixByRuleOne(ref active, ref currentParent, endIndex, wordNum);
                    if (outcome == ExtensionResult.Done)
                    {
                        // Leave without advancing the suffix.
                        break;
                    }

                    extendSuffixByRuleTwo(ref active, currentParent, ref previousParent, endIndex, wordNum);
                    Debug.Assert(ValidateConsistentEdgeText(this, true));

                    incrSuffix(ref active, wordNum);
                }

                setSuffixLink(previousParent, currentParent);
                active.EndIndex++;
                active.Canonicalize();
            }