Example #1
0
        /// <summary>
        ///     Walks the sub-tree below the specified edge depth-first, without recursion, and adds the
        ///     start index of every leaf reached to the startIndexes list.
        /// </summary>
        /// <param name="current">Edge to start traversing from.</param>
        /// <param name="length">
        ///     Number of symbols on the path above <paramref name="current" />; it is subtracted from a
        ///     leaf's StartIndex to obtain the reported start index.
        /// </param>
        /// <param name="startIndexes">List that receives the start indexes.</param>
        private static void DepthFirstIterativeTraversal(
            MultiWaySuffixEdge current,
            long length,
            List<long> startIndexes)
        {
            if (current.IsLeaf)
            {
                startIndexes.Add(current.StartIndex - length);
                return;
            }

            // Each entry remembers where to resume: the parent edge, the index of its next unvisited
            // child, and the path length that was in effect for that parent.
            // The child index is kept as an int: narrowing it to a byte would wrap 256 to 0 and
            // restart the scan of a node that has more than 255 children.
            var pending = new Stack<Tuple<MultiWaySuffixEdge, int, long>>();
            int childIndex = 0;

            while (true)
            {
                MultiWaySuffixEdge[] children = current.Children;

                // Symbols on the current edge: it ends just before its first child starts.
                long currentEdgeLength = children[0].StartIndex - current.StartIndex;
                bool descended = false;

                for (; childIndex < children.Length; childIndex++)
                {
                    MultiWaySuffixEdge child = children[childIndex];
                    if (child.IsLeaf)
                    {
                        startIndexes.Add(child.StartIndex - (length + currentEdgeLength));
                        continue;
                    }

                    // Intermediate edge: park the parent and continue inside the child.
                    pending.Push(Tuple.Create(current, childIndex + 1, length));

                    current = child;
                    childIndex = 0;
                    length += currentEdgeLength;
                    descended = true;
                    break;
                }

                if (descended)
                {
                    continue;
                }

                if (pending.Count == 0)
                {
                    return;
                }

                // All children of this edge are done; resume the parked parent.
                Tuple<MultiWaySuffixEdge, int, long> resume = pending.Pop();
                current = resume.Item1;
                childIndex = resume.Item2;
                length = resume.Item3;
            }
        }
Example #2
0
        /// <summary>
        ///     Creates a MultiWaySuffixTree for the given sequence: converts the symbols through the
        ///     alphabet's symbol value map, builds the tree and then updates its suffix links.
        /// </summary>
        /// <param name="sequence">Sequence to build the suffix tree.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="sequence" /> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequence" /> is empty.</exception>
        public MultiWaySuffixTree(ISequence sequence)
        {
            if (sequence == null)
            {
                throw new ArgumentNullException("sequence");
            }

            if (sequence.Count == 0)
            {
                throw new ArgumentOutOfRangeException("sequence", Resource.EmptySequence);
            }

            byte[] symbolValueMap = sequence.Alphabet.GetSymbolValueMap();
            var symbolTotal = sequence.Count;

            this.uniqueSymbolsInReference = new HashSet<byte>();
            this.uniqueSymbolsStartIndexes = new long[byte.MaxValue + 1];
            var mappedSymbols = new byte[symbolTotal];

            for (int position = 0; position < symbolTotal; position++)
            {
                byte mapped = symbolValueMap[sequence[position]];
                mappedSymbols[position] = mapped;

                // Add returns true only the first time a symbol is seen, so this records
                // the first position at which each distinct symbol occurs.
                if (this.uniqueSymbolsInReference.Add(mapped))
                {
                    this.uniqueSymbolsStartIndexes[mapped] = position;
                }
            }

            this.Sequence = sequence;
            this.referenceSequence = new Sequence(sequence.Alphabet, mappedSymbols, false);
            this.symbolsCount = symbolTotal;
            this.Name = Resource.MultiWaySuffixTreeName;
            this.MinLengthOfMatch = 20;
            this.NoAmbiguity = false;

            // The root edge counts as the first edge of the tree.
            this.rootEdge = new MultiWaySuffixEdge();
            this.edgesCount++;

            // Follow the alphabet-to-base-alphabet map until an alphabet with no base entry is reached.
            IAlphabet resolvedAlphabet = sequence.Alphabet;
            IAlphabet baseAlphabet;
            while (Alphabets.AlphabetToBaseAlphabetMap.TryGetValue(resolvedAlphabet, out baseAlphabet))
            {
                resolvedAlphabet = baseAlphabet;
            }

            this.supportedBaseAlphabet = resolvedAlphabet;

            this.BuildSuffixTree();
            this.UpdateSuffixLinks();
        }
Example #3
0
        /// <summary>
        ///     Sets the suffix link of one child of the specified edge by walking down from the
        ///     parent's suffix link along the symbols of that child.
        /// </summary>
        /// <param name="parenetEdge">Parent edge; its own suffix link is the starting point of the walk.</param>
        /// <param name="childIndex">Index of the child to update.</param>
        private void UpdateSuffixLinkForEdge(MultiWaySuffixEdge parenetEdge, int childIndex)
        {
            MultiWaySuffixEdge target = parenetEdge.Children[childIndex];

            // Position in the reference of the next symbol of the target edge still to be matched,
            // and how many of its symbols remain (the edge ends just before its first child starts).
            long position = target.StartIndex;
            long remaining = target.Children[0].StartIndex - position;

            MultiWaySuffixEdge linkOfParent = parenetEdge.SuffixLink[0];
            byte wanted = this.referenceSequence[position];

            foreach (MultiWaySuffixEdge candidate in linkOfParent.Children)
            {
                // Suffix links only ever point at intermediate edges, so leaves are skipped.
                if (candidate.IsLeaf || this.referenceSequence[candidate.StartIndex] != wanted)
                {
                    continue;
                }

                MultiWaySuffixEdge walker = candidate;
                while (true)
                {
                    long walkerLength = walker.Children[0].StartIndex - walker.StartIndex;
                    if (remaining == walkerLength)
                    {
                        // The walk consumed exactly the target's symbols: this is the link.
                        target.SuffixLink[0] = walker;
                        return;
                    }

                    remaining -= walkerLength;
                    position += walkerLength;
                    wanted = this.referenceSequence[position];

                    // Step into the child that starts with the next wanted symbol.
                    foreach (MultiWaySuffixEdge next in walker.Children)
                    {
                        if (this.referenceSequence[next.StartIndex] == wanted)
                        {
                            walker = next;
                            break;
                        }
                    }
                }
            }
        }
Example #4
0
        /// <summary>
        ///     Gets the intermediate edges present in the path of the match between the specified edges
        ///     for the next query index to match.
        /// </summary>
        /// <remarks>
        ///     The walk starts at the suffix link of fromEdge and stops at a leaf, at the edge whose
        ///     StartIndex equals that of toedge's suffix link, when the remaining length ends on or inside
        ///     the current edge, or when no child continues the match.
        /// </remarks>
        /// <param name="fromEdge">Edge from which to start the search.</param>
        /// <param name="toedge">Edge where to stop the search.</param>
        /// <param name="matchLengthOfFromEdge">Matching symbols count of the fromEdge.</param>
        /// <param name="lengthToSearch">Length to search.</param>
        /// <param name="nextQueryIndex">Next query index.</param>
        /// <param name="convertedSearchSeq">Converted search sequence.</param>
        /// <param name="minLengthOfMatch">Minimum length of match required.</param>
        /// <returns>
        ///     The intermediate edges found between fromEdge and toedge whose match length is at least
        ///     minLengthOfMatch; empty when either edge is a leaf or nothing is left to search.
        /// </returns>
        private Stack<EdgesFound> GetIntermediateEdges(
            MultiWaySuffixEdge fromEdge,
            MultiWaySuffixEdge toedge,
            long matchLengthOfFromEdge,
            long lengthToSearch,
            long nextQueryIndex,
            ISequence convertedSearchSeq,
            long minLengthOfMatch)
        {
            var edgesFoundForNextQueryIndex = new Stack<EdgesFound>();

            if (toedge.IsLeaf || fromEdge.IsLeaf)
            {
                return edgesFoundForNextQueryIndex;
            }

            matchLengthOfFromEdge--;
            MultiWaySuffixEdge startEdge = fromEdge.SuffixLink[0];
            long childIndexToStop = toedge.SuffixLink[0].StartIndex;
            long searchIndex = nextQueryIndex + matchLengthOfFromEdge;

            // if the suffix link of fromEdge is the rootEdge.
            if (startEdge.StartIndex == -1 && lengthToSearch > 0)
            {
                lengthToSearch--;
            }

            if (lengthToSearch <= 0)
            {
                return edgesFoundForNextQueryIndex;
            }

            int childPosition = this.IndexOfChildStartingWith(startEdge, convertedSearchSeq[searchIndex]);
            if (childPosition < 0)
            {
                // No child continues the match, so there is no path to report.
                return edgesFoundForNextQueryIndex;
            }

            MultiWaySuffixEdge edge = startEdge.Children[childPosition];

            while (!edge.IsLeaf && edge.StartIndex != childIndexToStop)
            {
                // An intermediate edge ends just before its first child starts.
                long edgeSymbolCount = edge.Children[0].StartIndex - edge.StartIndex;
                if (lengthToSearch <= edgeSymbolCount)
                {
                    // The remaining length ends on or inside this edge.
                    break;
                }

                lengthToSearch -= edgeSymbolCount;
                searchIndex += edgeSymbolCount;
                matchLengthOfFromEdge += edgeSymbolCount;

                childPosition = this.IndexOfChildStartingWith(edge, convertedSearchSeq[searchIndex]);
                if (childPosition < 0)
                {
                    // Stop here instead of re-walking the same edge with already advanced counters.
                    break;
                }

                if (matchLengthOfFromEdge >= minLengthOfMatch)
                {
                    edgesFoundForNextQueryIndex.Push(
                        new EdgesFound
                            {
                                Edge = edge,
                                LengthOfMatch = matchLengthOfFromEdge
                            });
                }

                // get the child of edge and continue searching.
                edge = edge.Children[childPosition];
            }

            return edgesFoundForNextQueryIndex;
        }

        /// <summary>
        ///     Finds the first child of the specified edge whose first symbol equals the given symbol.
        ///     A child that starts at or beyond the end of the reference is read as the terminating symbol.
        /// </summary>
        /// <param name="parent">Intermediate edge whose children are searched.</param>
        /// <param name="symbol">Symbol to look for.</param>
        /// <returns>Index of the matching child, or -1 when no child matches.</returns>
        private int IndexOfChildStartingWith(MultiWaySuffixEdge parent, byte symbol)
        {
            MultiWaySuffixEdge[] children = parent.Children;
            for (int index = 0; index < children.Length; index++)
            {
                long childStartIndex = children[index].StartIndex;
                byte refSymbol = childStartIndex < this.symbolsCount
                                     ? this.referenceSequence[childStartIndex]
                                     : TerminatingSymbol;
                if (refSymbol == symbol)
                {
                    return index;
                }
            }

            return -1;
        }
Example #5
0
        /// <summary>
        ///     Builds the suffix tree.
        /// </summary>
        /// <remarks>
        ///     One sub-tree is built per distinct symbol of the reference, in parallel; each worker inserts
        ///     only the suffixes that start with its own symbol. After all workers finish, the first child
        ///     of every per-symbol parent is attached to the root edge, followed by a leaf edge for the
        ///     terminating symbol.
        /// </remarks>
        private void BuildSuffixTree()
        {
            // Per-symbol parent edges are looked up by symbol value, so the array spans up to the largest symbol.
            int arraySize = this.uniqueSymbolsInReference.Max() + 1;
            var parentRootForSymbol = new MultiWaySuffixEdge[arraySize];

            Parallel.ForEach(
                this.uniqueSymbolsInReference,
                symbol =>
                    {
                        // Child edge matched most recently while descending; reassigned on every successful lookup.
                        var edge = new MultiWaySuffixEdge();

                        // Scanning starts at the position stored for this symbol in uniqueSymbolsStartIndexes.
                        for (long index = this.uniqueSymbolsStartIndexes[symbol]; index < this.symbolsCount; index++)
                        {
                            byte symbolAtIndex = this.referenceSequence[index];

                            // Only suffixes that begin with this worker's symbol are inserted here.
                            if (symbol != symbolAtIndex)
                            {
                                continue;
                            }

                            // Working copy of this symbol's parent edge. MultiWaySuffixEdge is a value type,
                            // so modifications are written back explicitly further down.
                            MultiWaySuffixEdge parent = parentRootForSymbol[symbol];

                            // startIndex walks along the suffix being inserted. The array/index pair remembers
                            // where the current parent is stored so that it can be assigned back after a change
                            // (null means the parent lives in parentRootForSymbol).
                            long startIndex = index;
                            MultiWaySuffixEdge[] arrayConainingParent = null;
                            int indexOfArrayContainingParent = -1;
                            bool continueInsert = true;
                            bool duplicatedConsecutiveSymbolsFound = true;

                            do
                            {
                                // Positions at or past the end of the reference read as the terminating symbol.
                                byte symbolAtStartIndex = TerminatingSymbol;
                                if (startIndex < this.symbolsCount)
                                {
                                    symbolAtStartIndex = this.referenceSequence[startIndex];
                                }

                                int indexOfEdgeFound = -1;
                                int childCount = 0;
                                if (!parent.IsLeaf)
                                {
                                    // Find edge start
                                    childCount = parent.Children.Length;
                                    for (int i = 0; i < childCount; i++)
                                    {
                                        MultiWaySuffixEdge childEdge = parent.Children[i];

                                        // Children starting at or past the end of the reference are not compared.
                                        if (childEdge.StartIndex < this.symbolsCount)
                                        {
                                            byte edgeSymbol = this.referenceSequence[childEdge.StartIndex];
                                            if (edgeSymbol == symbolAtStartIndex)
                                            {
                                                // First symbol of this child matches; consume it and remember the child.
                                                edge = childEdge;
                                                indexOfEdgeFound = i;
                                                startIndex++;
                                                if (edgeSymbol != symbolAtIndex)
                                                {
                                                    duplicatedConsecutiveSymbolsFound = false;
                                                }

                                                break;
                                            }
                                        }
                                    }
                                }

                                MultiWaySuffixEdge newEdge;
                                if (indexOfEdgeFound == -1)
                                {
                                    // Insert new child
                                    newEdge = new MultiWaySuffixEdge(startIndex);

                                    Array.Resize(ref parent.Children, childCount + 1);

                                    parent.Children[childCount] = newEdge;
                                    parent.SuffixLink = new MultiWaySuffixEdge[1];
                                    Interlocked.Increment(ref this.edgesCount);

                                    // Assign back modified edge.
                                    if (arrayConainingParent == null)
                                    {
                                        parentRootForSymbol[symbol] = parent;
                                    }
                                    else
                                    {
                                        arrayConainingParent[indexOfArrayContainingParent] = parent;
                                    }

                                    continueInsert = false;
                                    break;
                                }

                                // A leaf edge is treated as running to the end of the sequence.
                                long edgeEndIndex = this.symbolsCount;

                                if (!edge.IsLeaf)
                                {
                                    // return the minimum start index of children -1
                                    edgeEndIndex = edge.Children[0].StartIndex - 1;
                                }

                                // Do not enter if only one symbol is there in the edge.
                                if (edge.StartIndex < edgeEndIndex)
                                {
                                    long duplicatedConsicutiveSymbolsCount = 0;

                                    // Compare the rest of the edge with the suffix, one symbol at a time.
                                    for (long counter = edge.StartIndex + 1;
                                         counter <= edgeEndIndex;
                                         counter++, startIndex++)
                                    {
                                        symbolAtStartIndex = TerminatingSymbol;
                                        if (startIndex < this.symbolsCount)
                                        {
                                            symbolAtStartIndex = this.referenceSequence[startIndex];
                                        }

                                        byte symbolAtCounter = TerminatingSymbol;
                                        if (counter < this.symbolsCount)
                                        {
                                            symbolAtCounter = this.referenceSequence[counter];
                                        }

                                        if (symbolAtStartIndex != symbolAtCounter)
                                        {
                                            // Split the edge
                                            // Create the new edge
                                            // Copy the children of old edge to new edge
                                            newEdge = new MultiWaySuffixEdge(counter)
                                                          {
                                                              Children = edge.Children,
                                                              SuffixLink = edge.SuffixLink
                                                          };

                                            edge.Children = new MultiWaySuffixEdge[2]; // for split edge and leaf edge.

                                            // As this is an internal node allocate the array here itself to avoid updating 
                                            // the parent array with the new address of the edge (as MultiWaySuffixEdge is a value type).
                                            edge.SuffixLink = new MultiWaySuffixEdge[1];
                                            edge.SuffixLink[0].StartIndex = -1;

                                            // Create leaf edge.
                                            var leafEdge = new MultiWaySuffixEdge(startIndex);

                                            // NOTE(review): this looks like a shortcut for a run of the same symbol -
                                            // it wraps newEdge in a chain of nested two-child edges and then advances
                                            // index so the corresponding start positions are not inserted again.
                                            // Confirm against the intended algorithm.
                                            if (duplicatedConsecutiveSymbolsFound
                                                && duplicatedConsicutiveSymbolsCount > 1)
                                            {
                                                for (int duplicatedIndex = 1;
                                                     duplicatedIndex < duplicatedConsicutiveSymbolsCount;
                                                     duplicatedIndex++)
                                                {
                                                    var duplicateSymbolEdge =
                                                        new MultiWaySuffixEdge(newEdge.StartIndex - 1);
                                                    duplicateSymbolEdge.Children = new MultiWaySuffixEdge[2];
                                                    duplicateSymbolEdge.SuffixLink = new MultiWaySuffixEdge[1];
                                                    duplicateSymbolEdge.SuffixLink[0].StartIndex = -1;

                                                    duplicateSymbolEdge.Children[0] = newEdge;
                                                    duplicateSymbolEdge.Children[1] = leafEdge;

                                                    // we are adding two edges here - duplicatesymbol edge and leaf edge.
                                                    // leaf edge will be duplicated.
                                                    Interlocked.Increment(ref this.edgesCount);
                                                    Interlocked.Increment(ref this.edgesCount);

                                                    newEdge = duplicateSymbolEdge;
                                                }

                                                index += duplicatedConsicutiveSymbolsCount - 1;
                                            }

                                            // Update the old edge

                                            // Set new edge as child edge to old edge
                                            edge.Children[0] = newEdge;

                                            // Add the leaf edge.
                                            edge.Children[1] = leafEdge;
                                            Interlocked.Increment(ref this.edgesCount);
                                            Interlocked.Increment(ref this.edgesCount);

                                            // assign back edge that got modified.
                                            parent.Children[indexOfEdgeFound] = edge;

                                            continueInsert = false;
                                            duplicatedConsecutiveSymbolsFound = false;
                                            break;
                                        }

                                        // Still inside a run of symbols equal to the suffix's first symbol: count it,
                                        // and end the run as soon as a different symbol is seen.
                                        if (duplicatedConsecutiveSymbolsFound)
                                        {
                                            duplicatedConsicutiveSymbolsCount++;

                                            if (symbolAtIndex != symbolAtStartIndex)
                                            {
                                                duplicatedConsecutiveSymbolsFound = false;
                                            }
                                        }
                                    }
                                }

                                // The whole edge matched: descend into it, remembering where it is stored
                                // so a later modification can be assigned back.
                                if (continueInsert)
                                {
                                    arrayConainingParent = parent.Children;
                                    indexOfArrayContainingParent = indexOfEdgeFound;
                                    parent = edge;
                                }
                            }
                            while ((startIndex <= this.symbolsCount) && continueInsert);
                        }
                    });

            // The root edge is given StartIndex -1.
            this.rootEdge.StartIndex = -1;

            // One root child per distinct symbol, plus one for the terminating symbol.
            int rootChildrenCount = this.uniqueSymbolsInReference.Count + 1;

            Array.Resize(ref this.rootEdge.Children, rootChildrenCount);

            int rootChildIndex = 0;

            // Add all symbol root's child to the rootEdge.
            foreach (byte symbol in this.uniqueSymbolsInReference)
            {
                this.rootEdge.Children[rootChildIndex] = parentRootForSymbol[symbol].Children[0];
                rootChildIndex++;
            }

            // Add edge for $.
            this.rootEdge.Children[rootChildrenCount - 1] = new MultiWaySuffixEdge(this.symbolsCount);
            this.edgesCount++;
        }
Example #6
0
 public void ValidateEdgesForALeaf()
 {
     // Validates that a default-constructed edge reports itself as a leaf, has a null
     // Children array and a start index of zero.
     MultiWaySuffixEdge leafEdge = new MultiWaySuffixEdge();

     Assert.IsTrue(leafEdge.IsLeaf);
     ApplicationLog.WriteLine("MUMmer BVT : Successfully Validated Is leaf property.");

     Assert.IsNull(leafEdge.Children);
     ApplicationLog.WriteLine("MUMmer BVT : Successfully Validated Children property for a Leaf. ");

     // Expected value goes first so that a failure message reports expected/actual correctly.
     Assert.AreEqual(0, leafEdge.StartIndex);
     ApplicationLog.WriteLine("MUMmer BVT : Successfully Validated start index of a Leaf.");
 }