Ejemplo n.º 1
0
        /// <summary>
        /// Looks up the edge that leaves <paramref name="node"/> and begins with
        /// <paramref name="character"/> by probing the hashed edge table linearly.
        /// </summary>
        /// <param name="node">Node the edge must start from</param>
        /// <param name="character">First symbol on the edge</param>
        /// <param name="tree">Suffix tree whose edge table is searched</param>
        /// <returns>
        /// The first entry on the probe path that either matches (node, character) or has a
        /// start node of -1; null when an unoccupied slot is reached first.
        /// </returns>
        private Edge Find(int node, int character, SequenceSuffixTree tree)
        {
            Edge candidate;

            // Walk the probe sequence, wrapping at _maxHashKey, until an unoccupied slot is hit.
            for (int slot = Hash(node, character);
                 tree.Edges.TryGetValue(slot, out candidate);
                 slot = (slot + 1) % _maxHashKey)
            {
                bool isRequestedEdge = candidate.StartNode == node &&
                                       character == _referenceString[candidate.StartIndex];

                // An entry whose start node is -1 also ends the search and is handed back as-is.
                if (isRequestedEdge || candidate.StartNode == -1)
                {
                    return candidate;
                }
            }

            return null;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds the suffix tree using the Kurtz algorithm (hash table based).
        /// The start indices of all suffixes are grouped by the symbol each suffix begins with,
        /// one partial tree is built per group on its own task, and the edge tables of the
        /// partial trees are then merged into a single tree.
        /// </summary>
        /// <example>
        /// --------------------------------------------------
        /// Create the Sequence from a string (for example the DNA sequence "CACCAS")
        /// --------------------------------------------------
        /// string aOriginalStr = "CACCAS";
        /// Sequence aInput = new Sequence(Alphabets.DNA, aOriginalStr);
        /// --------------------------------------------------
        /// Instantiate and run the suffix tree builder
        /// --------------------------------------------------
        /// ISuffixTreeBuilder suffixTreeBuilder = new KurtzSuffixTreeBuilder();
        /// SuffixTree suffixTree = suffixTreeBuilder.BuildSuffixTree(aInput);
        /// </example>
        /// <param name="sequence">Input sequence</param>
        /// <returns>Suffix tree</returns>
        /// <exception cref="ArgumentNullException">Thrown when sequence is null</exception>
        public ISuffixTree BuildSuffixTree(ISequence sequence)
        {
            if (sequence == null)
            {
                throw new ArgumentNullException("sequence");
            }

            ValidateReferenceSequence(sequence);

            // Initialize
            Edge.NodeCount = 1;
            InitializeReferenceSequence(sequence);

            // One pass over the reference: bucket every suffix start index by its first symbol.
            Dictionary<byte, IList<int>> treeTasks = new Dictionary<byte, IList<int>>();

            for (int index = 0; index < ReferenceLength; index++)
            {
                byte symbol = GetReferenceSymbol(index);
                IList<int> startIndices;

                if (!treeTasks.TryGetValue(symbol, out startIndices))
                {
                    startIndices = new List<int>();
                    treeTasks.Add(symbol, startIndices);
                }

                startIndices.Add(index);
            }

            // A parameterless lambda is required here: with a one-argument lambda the call binds
            // to StartNew(Func<object, TResult>, object) and the options become the state object.
            IList<Task<SequenceSuffixTree>> tasks = treeTasks.Values.Select(
                indices => Task<SequenceSuffixTree>.Factory.StartNew(
                    () => AppendSuffix(indices, sequence), TaskCreationOptions.None)).ToList();

            // Wait for all the tasks
            Task.WaitAll(tasks.ToArray());

            // Merge the branches of the tree
            _suffixTree = new SequenceSuffixTree(sequence, tasks.Sum(task => task.Result.Edges.Count));

            foreach (Task<SequenceSuffixTree> task in tasks)
            {
                foreach (KeyValuePair<int, Edge> edge in task.Result.Edges)
                {
                    if (_suffixTree.Edges.ContainsKey(edge.Key))
                    {
                        // The slot is already taken by an edge from another partial tree:
                        // re-hash the incoming edge so that it is kept instead of being dropped.
                        Insert(edge.Value, _suffixTree);
                    }
                    else
                    {
                        _suffixTree.Edges.Add(edge.Key, edge.Value);
                    }
                }
            }

            return _suffixTree;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Builds a partial suffix tree from the suffixes that start at the given indices.
        /// For every suffix the tree is walked from node 0; the walk stops as soon as either
        ///     1. no edge continues the suffix (a new leaf edge is inserted), or
        ///     2. a symbol on an edge differs from the suffix (the edge is split and a new
        ///        leaf edge is inserted below the split), or
        ///     3. the end of the reference string is reached.
        /// </summary>
        /// <param name="startIndices">Index of the first character of each suffix to add</param>
        /// <param name="sequence">Reference sequence</param>
        /// <returns>Suffix tree holding the added suffixes</returns>
        private SequenceSuffixTree AppendSuffix(IList<int> startIndices, ISequence sequence)
        {
            SequenceSuffixTree tree = new SequenceSuffixTree(sequence);

            foreach (int suffixStart in startIndices)
            {
                int cursor = suffixStart;   // position in the reference string being matched
                int currentNode = 0;        // node the walk is currently standing on
                bool inserted = false;

                while (!inserted)
                {
                    Edge matched = Find(currentNode, _referenceString[cursor], tree);

                    if (matched == null)
                    {
                        // Nothing leaves this node with the required symbol: hang a new leaf here.
                        Insert(new Edge(cursor, _referenceString.Length - 1, currentNode), tree);
                        break;
                    }

                    // The first symbol of the edge matched; compare the remaining edge symbols.
                    cursor++;

                    int edgePosition = matched.StartIndex + 1;
                    while (edgePosition <= matched.EndIndex)
                    {
                        if (_referenceString[cursor] != _referenceString[edgePosition])
                        {
                            // Mismatch inside the edge: split just before it and add the leaf.
                            int splitNode = SplitEdge(matched, edgePosition - 1, currentNode, tree);
                            Insert(new Edge(cursor, _referenceString.Length - 1, splitNode), tree);
                            inserted = true;
                            break;
                        }

                        cursor++;
                        edgePosition++;
                    }

                    currentNode = matched.EndNode;

                    if (cursor >= _referenceString.Length)
                    {
                        break;
                    }
                }
            }

            return tree;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Removes the given edge from the hashed edge table and closes the gap it leaves.
        /// The entry is marked as vacated by setting its start node to -1; entries further
        /// along the probe path are then copied back into the vacated slot where the
        /// position test below allows it.
        /// </summary>
        /// <param name="edge">Edge that has to be removed</param>
        /// <param name="tree">Suffix tree whose edge table is modified</param>
        private void Remove(Edge edge, SequenceSuffixTree tree)
        {
            Edge edgeFound = null;

            // Hash position computed from the edge's start node and its first symbol.
            int position = Hash(
                edge.StartNode,
                _referenceString[edge.StartIndex]);

            // Probe forward (wrapping at _maxHashKey) until the entry with the same start node
            // and start index is found.
            // NOTE(review): this uses the indexer rather than TryGetValue, so it presumably
            // assumes the edge is present on the probe path - confirm against callers.
            while (tree.Edges[position].StartNode != edge.StartNode ||
                   tree.Edges[position].StartIndex != edge.StartIndex)
            {
                position = ++position % _maxHashKey;
            }

            // Each pass vacates the entry at 'position', then searches the following slots
            // for an entry that can be moved back into the vacated slot.
            while (tree.Edges.TryGetValue(position, out edgeFound))
            {
                // Mark the entry as vacated and remember where the gap is.
                edgeFound.StartNode = -1;
                int tempPosition = position;
                while (tree.Edges.TryGetValue(position, out edgeFound))
                {
                    position = ++position % _maxHashKey;

                    // An unoccupied slot ends the probe run: nothing more to move.
                    if (!tree.Edges.TryGetValue(position, out edgeFound))
                    {
                        return;
                    }

                    // A slot that is already marked as vacated also ends the run.
                    if (-1 == edgeFound.StartNode)
                    {
                        return;
                    }

                    int symbol = _referenceString[edgeFound.StartIndex];

                    // Hash position of the entry currently being examined.
                    int nextPosition = Hash(edgeFound.StartNode, symbol);

                    // The three tests below together check whether nextPosition lies, in cyclic
                    // order, after the gap (tempPosition) and at or before the current slot
                    // (position). If so the entry is left where it is and probing continues.

                    // No wrap-around: tempPosition < nextPosition <= position.
                    if (position >= nextPosition && nextPosition > tempPosition)
                    {
                        continue;
                    }

                    // Probe wrapped past the end; nextPosition is in the part before the wrap.
                    if (nextPosition > tempPosition && tempPosition > position)
                    {
                        continue;
                    }

                    // Probe wrapped past the end; nextPosition is in the part after the wrap.
                    if (tempPosition > position && position >= nextPosition)
                    {
                        continue;
                    }

                    // Otherwise this entry is the one to move into the gap.
                    break;
                }

                // Store a copy of the chosen entry in the gap; the outer loop then vacates
                // the slot the entry was copied from.
                tree.Edges[tempPosition] = new Edge(edgeFound);
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Splits an edge in two so that a new internal node exists at the split point:
        ///     1. the edge is removed from the edge table,
        ///     2. a new non-leaf edge covering [edge.StartIndex, splitAt] is inserted under
        ///        the given parent node,
        ///     3. the original edge is shortened to start at splitAt + 1, re-parented to the
        ///        end node of the new edge and inserted again.
        /// </summary>
        /// <param name="edge">Edge of the tree which has to be split</param>
        /// <param name="splitAt">Last index that stays on the upper part of the edge</param>
        /// <param name="parentNode">Parent node of the new edge</param>
        /// <param name="tree">Suffix tree</param>
        /// <returns>End node of the new edge</returns>
        private int SplitEdge(Edge edge, int splitAt, int parentNode, SequenceSuffixTree tree)
        {
            // Must happen first: Remove locates the entry through the edge's current
            // start node and start index, both of which are changed below.
            Remove(edge, tree);

            // Upper part: keeps the original first symbol and ends at the split index.
            Edge upperPart = new Edge(edge.StartIndex, splitAt, parentNode) { IsLeaf = false };
            Insert(upperPart, tree);

            int splitNode = upperPart.EndNode;

            // Lower part: the original edge, now hanging off the new node.
            edge.StartIndex = splitAt + 1;
            edge.StartNode = splitNode;
            Insert(edge, tree);

            return splitNode;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Stores an edge in the hashed edge table. Starting at the hash of the edge's start
        /// node and first symbol, slots are probed linearly (wrapping at _maxHashKey) until one
        /// is found that is either unoccupied or holds an entry whose start node is -1; the
        /// edge is written to that slot.
        /// </summary>
        /// <param name="edge">Edge that has to be inserted</param>
        /// <param name="tree">Suffix tree whose edge table receives the edge</param>
        private void Insert(Edge edge, SequenceSuffixTree tree)
        {
            int slot = Hash(edge.StartNode, _referenceString[edge.StartIndex]);
            Edge occupant;

            // Skip over every slot that is taken by a live entry.
            while (tree.Edges.TryGetValue(slot, out occupant) && occupant.StartNode != -1)
            {
                slot = (slot + 1) % _maxHashKey;
            }

            tree.Edges[slot] = edge;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Finds the matches of a query sequence in a suffix tree.
        /// The reference string is rebuilt from the tree's sequence, the query is cut into
        /// intervals that are searched on separate tasks, and the per-task results are merged
        /// in query order.
        /// </summary>
        /// <param name="suffixTree">Suffix tree to search in</param>
        /// <param name="searchSequence">Query sequence</param>
        /// <param name="lengthOfMUM">Minimum length of MUM</param>
        /// <returns>Matches found, numbered in the order they were merged</returns>
        /// <exception cref="ArgumentNullException">
        /// Thrown when suffixTree or searchSequence is null
        /// </exception>
        public IList<MaxUniqueMatch> FindMatches(
            SequenceSuffixTree suffixTree,
            ISequence searchSequence,
            long lengthOfMUM)
        {
            if (suffixTree == null)
            {
                throw new ArgumentNullException("suffixTree");
            }

            if (searchSequence == null)
            {
                throw new ArgumentNullException("searchSequence");
            }

            // Initialize
            _referenceString    = string.Empty;
            _minimumLengthOfMUM = lengthOfMUM;
            _suffixTree         = suffixTree;
            _searchSequence     = searchSequence;
            _queryString        = _searchSequence.ToString();
            SegmentedSequence referenceSequence = _suffixTree.Sequence as SegmentedSequence;

            if (null != referenceSequence)
            {
                // Accumulate in a StringBuilder: repeated string concatenation re-copies the
                // whole reference for every segment.
                System.Text.StringBuilder builder = new System.Text.StringBuilder();

                foreach (Sequence sequence in referenceSequence.Sequences)
                {
                    builder.Append(sequence.ToString()).Append(CONCATENATING_SYMBOL);
                }

                // Remove the concatenating symbol from the end and add the terminating symbol
                _referenceString = builder.ToString().TrimEnd(CONCATENATING_SYMBOL) + TERMINATING_SYMBOL;
            }
            else
            {
                _referenceString = _suffixTree.Sequence.ToString() + TERMINATING_SYMBOL;
            }

            // Number of query positions handed to each task (at least one).
            int interval = (int)(_queryString.Length - (_minimumLengthOfMUM - 1)) / Environment.ProcessorCount;

            if (interval < 1)
            {
                interval = 1;
            }

            IList<Task<List<MaxUniqueMatch>>> result = new List<Task<List<MaxUniqueMatch>>>();

            for (int index = 0; index < _queryString.Length - (_minimumLengthOfMUM - 1); index += interval)
            {
                // Copy the loop variable so that each task captures its own start index.
                int taskIndex = index;

                // A parameterless lambda is required here: with a one-argument lambda the call
                // binds to StartNew(Func<object, TResult>, object) and the options become state.
                result.Add(
                    Task.Factory.StartNew<List<MaxUniqueMatch>>(
                        () => FindMUMs(taskIndex, interval),
                        TaskCreationOptions.None));
            }

            List<MaxUniqueMatch> mergedList = new List<MaxUniqueMatch>();

            foreach (Task<List<MaxUniqueMatch>> task in result)
            {
                List<MaxUniqueMatch> local = task.Result;

                if (0 == local.Count)
                {
                    continue;
                }

                if (0 < mergedList.Count)
                {
                    // Drop the first MUM of this interval when it lies completely inside the
                    // last MUM that was already merged.
                    MaxUniqueMatch previous = mergedList[mergedList.Count - 1];
                    MaxUniqueMatch current  = local[0];

                    if ((current.SecondSequenceStart >= previous.SecondSequenceStart &&
                         current.SecondSequenceStart <= previous.SecondSequenceStart + previous.Length) &&
                        (current.SecondSequenceStart + current.Length >= previous.SecondSequenceStart &&
                         current.SecondSequenceStart + current.Length <= previous.SecondSequenceStart + previous.Length))
                    {
                        local.RemoveAt(0);
                    }
                }

                mergedList.AddRange(local);
            }

            // Order the MUM list with query sequence order
            for (int index = 0; index < mergedList.Count; index++)
            {
                mergedList[index].FirstSequenceMumOrder  = index + 1;
                mergedList[index].SecondSequenceMumOrder = index + 1;
            }

            return mergedList;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Builds the suffix tree using the Kurtz algorithm (hash table based).
        /// The reference string is built from the sequence (segments joined by the
        /// concatenating symbol, followed by the terminating symbol), the start indices of all
        /// suffixes are grouped by first character, one partial tree is built per group on its
        /// own task, and the partial edge tables are merged into a single tree.
        /// </summary>
        /// <example>
        /// --------------------------------------------------
        /// Create the Sequence from a string (for example the DNA sequence "CACCAS")
        /// --------------------------------------------------
        /// string aOriginalStr = "CACCAS";
        /// Sequence aInput = new Sequence(Alphabets.DNA, aOriginalStr);
        /// --------------------------------------------------
        /// Instantiate and run the suffix tree builder
        /// --------------------------------------------------
        /// ISuffixTreeBuilder suffixTreeBuilder = new KurtzSuffixTreeBuilder();
        /// SuffixTree suffixTree = suffixTreeBuilder.BuildSuffixTree(aInput);
        /// </example>
        /// <param name="sequence">Input sequence</param>
        /// <returns>Suffix tree</returns>
        /// <exception cref="ArgumentNullException">Thrown when sequence is null</exception>
        public SequenceSuffixTree BuildSuffixTree(ISequence sequence)
        {
            if (sequence == null)
            {
                throw new ArgumentNullException("sequence");
            }

            // Initialize
            Edge.NodeCount = 1;
            SegmentedSequence referenceSequence = sequence as SegmentedSequence;

            if (null != referenceSequence)
            {
                // Start from an empty buffer on every call; appending to the field would keep
                // whatever reference string a previous call left behind.
                System.Text.StringBuilder builder = new System.Text.StringBuilder();

                foreach (Sequence subSequence in referenceSequence.Sequences)
                {
                    builder.Append(subSequence.ToString()).Append(CONCATENATING_SYMBOL);
                }

                // Remove the concatenating symbol from the end and add the terminating symbol
                _referenceString = builder.ToString().TrimEnd(CONCATENATING_SYMBOL) + TERMINATING_SYMBOL;
            }
            else
            {
                _referenceString = sequence.ToString() + TERMINATING_SYMBOL;
            }

            _suffixTree = new SequenceSuffixTree(sequence);

            // Bucket every suffix start index by the character the suffix begins with.
            Dictionary<char, IList<int>> treeTasks = new Dictionary<char, IList<int>>();

            for (int index = 0; index < _referenceString.Length; index++)
            {
                char symbol = _referenceString[index];
                IList<int> startIndices;

                if (!treeTasks.TryGetValue(symbol, out startIndices))
                {
                    startIndices = new List<int>();
                    treeTasks.Add(symbol, startIndices);
                }

                startIndices.Add(index);
            }

            // A parameterless lambda is required here: with a one-argument lambda the call binds
            // to StartNew(Func<object, TResult>, object) and the options become the state object.
            IList<Task<SequenceSuffixTree>> tasks = treeTasks.Values.Select(
                indices => Task<SequenceSuffixTree>.Factory.StartNew(
                    () => AppendSuffix(indices, sequence), TaskCreationOptions.None)).ToList();

            // Wait for all the tasks
            Task.WaitAll(tasks.ToArray());

            // Merge the branches of the tree
            foreach (Task<SequenceSuffixTree> task in tasks)
            {
                foreach (KeyValuePair<int, Edge> edge in task.Result.Edges)
                {
                    if (_suffixTree.Edges.ContainsKey(edge.Key))
                    {
                        // The slot is already taken by an edge from another partial tree:
                        // re-hash the incoming edge so that it is kept instead of being dropped.
                        Insert(edge.Value, _suffixTree);
                    }
                    else
                    {
                        _suffixTree.Edges.Add(edge.Key, edge.Value);
                    }
                }
            }

            return _suffixTree;
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Finds the matches of a query sequence in a suffix tree.
        /// The query is cut into intervals that are searched on separate tasks, and the
        /// per-task results are merged in query order.
        /// </summary>
        /// <param name="suffixTree">Suffix tree to search on; must be a SequenceSuffixTree</param>
        /// <param name="searchSequence">Query sequence to find matches for</param>
        /// <param name="lengthOfMUM">Minimum length of the match</param>
        /// <returns>Matches found, numbered in the order they were merged</returns>
        /// <exception cref="ArgumentNullException">
        /// Thrown when suffixTree or searchSequence is null
        /// </exception>
        /// <exception cref="ArgumentException">
        /// Thrown when suffixTree is not a SequenceSuffixTree
        /// </exception>
        private IList<MaxUniqueMatch> FindMatchWithOption(
            ISuffixTree suffixTree,
            ISequence searchSequence,
            long lengthOfMUM)
        {
            if (suffixTree == null)
            {
                throw new ArgumentNullException("suffixTree");
            }

            if (searchSequence == null)
            {
                throw new ArgumentNullException("searchSequence");
            }

            ValidateSequence(suffixTree.Sequence, searchSequence);

            // Fail here with a clear message instead of storing null and failing later
            // when the tree is dereferenced.
            SequenceSuffixTree sequenceSuffixTree = suffixTree as SequenceSuffixTree;

            if (sequenceSuffixTree == null)
            {
                throw new ArgumentException("The suffix tree must be a SequenceSuffixTree.", "suffixTree");
            }

            // Initialize
            _minimumLengthOfMUM = lengthOfMUM;
            _suffixTree         = sequenceSuffixTree;
            InitializeReferenceSequence(suffixTree.Sequence);
            InitializeQuerySequence(searchSequence);

            // Number of query positions handed to each task (at least one).
            int interval = (int)(_querySequence.Count - (_minimumLengthOfMUM - 1)) / Environment.ProcessorCount;

            if (interval < 1)
            {
                interval = 1;
            }

            IList<Task<List<MaxUniqueMatch>>> result = new List<Task<List<MaxUniqueMatch>>>();

            for (int index = 0; index < _querySequence.Count - (_minimumLengthOfMUM - 1); index += interval)
            {
                // Copy the loop variable so that each task captures its own start index.
                int taskIndex = index;

                // A parameterless lambda is required here: with a one-argument lambda the call
                // binds to StartNew(Func<object, TResult>, object) and the options become state.
                result.Add(
                    Task.Factory.StartNew<List<MaxUniqueMatch>>(
                        () => FindMUMs(taskIndex, interval),
                        TaskCreationOptions.None));
            }

            List<MaxUniqueMatch> mergedList = new List<MaxUniqueMatch>();

            foreach (Task<List<MaxUniqueMatch>> task in result)
            {
                List<MaxUniqueMatch> local = task.Result;

                if (0 == local.Count)
                {
                    continue;
                }

                if (0 < mergedList.Count)
                {
                    // Drop the first MUM of this interval when it lies completely inside the
                    // last MUM that was already merged.
                    MaxUniqueMatch previous = mergedList[mergedList.Count - 1];
                    MaxUniqueMatch current  = local[0];

                    if ((current.SecondSequenceStart >= previous.SecondSequenceStart &&
                         current.SecondSequenceStart <= previous.SecondSequenceStart + previous.Length) &&
                        (current.SecondSequenceStart + current.Length >= previous.SecondSequenceStart &&
                         current.SecondSequenceStart + current.Length <= previous.SecondSequenceStart + previous.Length))
                    {
                        local.RemoveAt(0);
                    }
                }

                mergedList.AddRange(local);
            }

            // Order the MUM list with query sequence order
            for (int index = 0; index < mergedList.Count; index++)
            {
                mergedList[index].FirstSequenceMumOrder  = index + 1;
                mergedList[index].SecondSequenceMumOrder = index + 1;
            }

            return mergedList;
        }