Exemplo n.º 1
0
        /// <summary>
        /// Builds a suffix tree for the given reference sequence.
        /// The start positions of the sequence are grouped by the symbol found at that
        /// position, one partial tree is built per distinct symbol on its own task
        /// (see AppendSuffix), and the partial trees are then merged into a single tree.
        /// </summary>
        /// <param name="sequence">Reference sequence to index.</param>
        /// <returns>The merged suffix tree, which is also stored in _suffixTree.</returns>
        /// <exception cref="ArgumentNullException">Thrown when sequence is null.</exception>
        public ISuffixTree BuildSuffixTree(ISequence sequence)
        {
            if (sequence == null)
            {
                throw new ArgumentNullException("sequence");
            }

            // Initialize
            ValidateReferenceSequence(sequence);
            InitializeReferenceSequence(sequence);

            // Single pass over the reference: bucket every start index by the symbol at that index.
            Dictionary<byte, IList<int>> treeTasks = new Dictionary<byte, IList<int>>();

            for (int index = 0; index < ReferenceLength; index++)
            {
                // Read the symbol once; it is needed for both the lookup and a possible insert.
                var symbol = GetReferenceSymbol(index);

                IList<int> startIndices;
                if (!treeTasks.TryGetValue(symbol, out startIndices))
                {
                    startIndices = new List<int>();
                    treeTasks.Add(symbol, startIndices);
                }

                startIndices.Add(index);
            }

            // Must be set before any tree is created: CreateSuffixTree reads it.
            _distinctSymbolCount = treeTasks.Count;
            if (EdgeStorage == null)
            {
                EdgeStorage = new FileSuffixEdgeStorage();
            }

            // One task per distinct symbol. The parameterless delegate makes sure that
            // TaskCreationOptions.None binds to the creationOptions parameter instead of
            // being passed as the task's state object.
            Task<IMultiWaySuffixTree>[] tasks = treeTasks.Values
                .Select(indices => Task<IMultiWaySuffixTree>.Factory.StartNew(
                    () => AppendSuffix(indices),
                    TaskCreationOptions.None))
                .ToArray();

            // Wait for all partial trees; failures surface here as an AggregateException.
            Task.WaitAll(tasks);

            _suffixTree = CreateSuffixTree();

            // Merge the branches of the tree
            foreach (Task<IMultiWaySuffixTree> task in tasks)
            {
                _suffixTree.Merge(task.Result);
            }

            return _suffixTree;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Creates an empty suffix tree for the current reference.
        /// An estimate of the memory needed (2 * ReferenceLength edges) is compared with
        /// the value returned by GetPhysicalMemory: when the estimate is smaller a
        /// MultiWaySuffixTree is created, otherwise a PersistentMultiWaySuffixTree
        /// backed by EdgeStorage is created.
        /// </summary>
        /// <returns>A new, empty suffix tree.</returns>
        private IMultiWaySuffixTree CreateSuffixTree()
        {
            // Per-edge overhead figures used by the estimate, chosen by process bitness.
            bool is64Bit = Environment.Is64BitProcess;
            int perInstanceBytes = is64Bit ? 16 : 12;
            int perArrayBytes = is64Bit ? 32 : 16;

            // Two integers (StartIndex & EndIndex) plus the overhead above.
            long bytesPerEdge = 2 * sizeof(int) + perInstanceBytes + perArrayBytes;

            long estimatedBytes = bytesPerEdge * 2 * ReferenceLength;
            double availableBytes = GetPhysicalMemory();

            if (estimatedBytes < availableBytes)
            {
                return new MultiWaySuffixTree(_referenceSequences, _distinctSymbolCount);
            }

            // Persistent edges additionally carry one long (the key).
            bytesPerEdge += sizeof(long);

            // NOTE(review): this recomputed estimate is not read again in this method.
            estimatedBytes = bytesPerEdge * 2 * ReferenceLength;

            // -1 is treated as "not configured": derive a threshold from the memory figure.
            if (PersistenceThreshold == -1)
            {
                PersistenceThreshold = (int)(availableBytes / (bytesPerEdge * _distinctSymbolCount));
            }

            return new PersistentMultiWaySuffixTree(
                _referenceSequences,
                _distinctSymbolCount,
                PersistenceThreshold,
                EdgeStorage);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Builds a new tree (via CreateSuffixTree) and adds to it the suffix that starts
        /// at each of the given indices. For every suffix the tree is walked from the root;
        /// the walk for that suffix ends under one of these conditions:
        ///     1. No child edge matches the current symbol: a leaf edge is inserted.
        ///     2. A symbol mismatch occurs inside an edge: the edge is split and a leaf
        ///        edge is inserted under the split point.
        ///     3. The end of the reference is reached (startIndex == ReferenceLength).
        /// </summary>
        /// <param name="startIndices">List of index of the first character of suffix</param>
        /// <returns>Suffix tree containing the suffixes that start at the given indices</returns>
        private IMultiWaySuffixTree AppendSuffix(IList <int> startIndices)
        {
            IMultiWaySuffixTree tree = CreateSuffixTree();

            // Loop through subset of sequence string and build the suffix tree
            foreach (int index in startIndices)
            {
                // startIndex is the position in the reference of the next symbol still to be matched.
                int   startIndex     = index;
                IEdge parentEdge     = tree.Root;
                IEdge edge           = null;
                bool  continueInsert = true;

                do
                {
                    // Look for a child of parentEdge matching the next symbol of the suffix.
                    edge = tree.Find(parentEdge, GetReferenceSymbol(startIndex));

                    if (null == edge)
                    {
                        // No matching child: the rest of the suffix becomes a new edge
                        // running to the last reference position.
                        tree.Insert(parentEdge, startIndex, ReferenceLength - 1);
                        continueInsert = false;
                        break;
                    }
                    else
                    {
                        // The first symbol of the edge was matched by Find; move past it.
                        startIndex++;

                        if (edge.StartIndex < edge.EndIndex)
                        {
                            // Compare the remaining symbols of the edge against the suffix.
                            // NOTE(review): startIndex is not checked against ReferenceLength
                            // inside this loop, so GetReferenceSymbol(startIndex) may be called
                            // with startIndex == ReferenceLength - confirm the helper tolerates that.
                            for (int counter = edge.StartIndex + 1; counter <= edge.EndIndex; counter++)
                            {
                                if (GetReferenceSymbol(startIndex) != GetReferenceSymbol(counter))
                                {
                                    // Mismatch: split the edge after the last matching position;
                                    // the edge returned by Split becomes the parent of the new leaf.
                                    parentEdge = tree.Split(edge, counter - 1);

                                    // Add the leaf edge
                                    tree.Insert(parentEdge, startIndex, ReferenceLength - 1);
                                    continueInsert = false;
                                    break;
                                }

                                startIndex++;
                            }
                        }

                        // Descend into the matched edge. This also runs after a split, but
                        // continueInsert is false in that case so the outer loop ends.
                        parentEdge = edge;
                    }
                } while (startIndex < ReferenceLength && continueInsert);
            }

            return(tree);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Builds a suffix tree for the sequence "BANANA" with a default-configured
        /// SimpleSuffixTreeBuilder and verifies the edge count and the stored sequence.
        /// </summary>
        public void TestInMemorySimpleSequence()
        {
            string   sequenceString = "BANANA";
            Sequence sequence       = new Sequence(Alphabets.Protein, sequenceString);

            using (SimpleSuffixTreeBuilder simpleSuffixTreeBuilder = new SimpleSuffixTreeBuilder())
            {
                IMultiWaySuffixTree inMemorySuffixTree =
                    simpleSuffixTreeBuilder.BuildSuffixTree(sequence) as IMultiWaySuffixTree;

                // Fail with an assertion (not a NullReferenceException) if the builder
                // returned null or a tree of an unexpected type.
                Assert.IsNotNull(inMemorySuffixTree);

                // Verify the edges in Suffix Tree
                Assert.AreEqual(7, inMemorySuffixTree.Count);

                // Verify the sequence in Suffix Tree (expected value first, actual second)
                Assert.AreEqual(sequenceString, inMemorySuffixTree.Sequence.ToString());
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Builds a suffix tree for the sequence "BANANA" with a SimpleSuffixTreeBuilder
        /// whose PersistenceThreshold is set to 0 and verifies the edge count and the
        /// stored sequence.
        /// </summary>
        public void TestPersistentSimpleSequence()
        {
            string   sequenceString = "BANANA";
            Sequence sequence       = new Sequence(Alphabets.Protein, sequenceString);

            using (SimpleSuffixTreeBuilder simpleSuffixTreeBuilder = new SimpleSuffixTreeBuilder())
            {
                simpleSuffixTreeBuilder.PersistenceThreshold = 0;

                IMultiWaySuffixTree persistentSuffixTree =
                    simpleSuffixTreeBuilder.BuildSuffixTree(sequence) as IMultiWaySuffixTree;

                // Fail with an assertion (not a NullReferenceException) if the builder
                // returned null or a tree of an unexpected type.
                Assert.IsNotNull(persistentSuffixTree);

                // Verify the edges in Suffix Tree
                Assert.AreEqual(7, persistentSuffixTree.Count);

                // Verify the sequence in Suffix Tree (expected value first, actual second)
                Assert.AreEqual(sequenceString, persistentSuffixTree.Sequence.ToString());
            }
        }
Exemplo n.º 6
0
        /// <summary>
        /// Merge the given branch at the root of Suffix Tree.
        /// Assumption:
        ///  The root node of the given branch contains only one edge, which is the branch to be merged.
        ///  Only the first child of the branch root is attached to this tree's root.
        /// </summary>
        /// <param name="branch">Branch to be merged.</param>
        /// <returns>
        /// True when the branch edge was added under the root; false when the branch
        /// root has no children and there is nothing to merge.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown when branch is null.</exception>
        /// <exception cref="ArgumentException">Thrown when the root of branch is not a MultiWaySuffixEdge.</exception>
        /// <exception cref="InvalidOperationException">Thrown when the root already holds the maximum number of children.</exception>
        public bool Merge(IMultiWaySuffixTree branch)
        {
            if (branch == null)
            {
                throw new ArgumentNullException("branch");
            }

            MultiWaySuffixEdge mwBranchEdge = branch.Root as MultiWaySuffixEdge;
            if (mwBranchEdge == null)
            {
                throw new ArgumentException("Root of the branch must be a MultiWaySuffixEdge.", "branch");
            }

            var branchChildren = mwBranchEdge.GetChildren();
            if (branchChildren == null || branchChildren.Length == 0)
            {
                // Nothing to merge.
                return false;
            }

            MultiWaySuffixEdge mwRoot = (MultiWaySuffixEdge)Root;
            var rootChildren = mwRoot.GetChildren();

            // A root without a children array, or with room left, can take one more child.
            if (rootChildren != null && rootChildren.Length >= _maximumChildrenCount)
            {
                // No more children edge can be added.
                throw new InvalidOperationException(string.Format(
                    CultureInfo.CurrentCulture,
                    "Cannot add more than {0} child nodes to edge.",
                    _maximumChildrenCount));
            }

            mwRoot.AddChild(branchChildren[0]);
            Count += (branch.Count - 1); // - the original root edge of branch
            return true;
        }
Exemplo n.º 7
0
        /// <summary>
        /// Find the matches of a query sequence in the suffix tree.
        /// The query start positions are divided into intervals (about one per processor),
        /// each interval is searched on its own task via FindMUMs, and the per-task results
        /// are concatenated in query order. When the first match of a task's result lies
        /// entirely inside the last match already collected, that first match is dropped.
        /// </summary>
        /// <param name="suffixTree">Suffix tree to search on</param>
        /// <param name="searchSequence">Query sequence to find matches</param>
        /// <param name="lengthOfMUM">Minimum length of the match</param>
        /// <returns>Matches found, numbered in order starting from 1</returns>
        /// <exception cref="ArgumentNullException">
        /// Thrown when suffixTree or searchSequence is null, or when suffixTree is not an
        /// IMultiWaySuffixTree.
        /// </exception>
        private IList<MaxUniqueMatch> FindMatchWithOption(
            ISuffixTree suffixTree,
            ISequence searchSequence,
            long lengthOfMUM)
        {
            if (suffixTree == null)
            {
                throw new ArgumentNullException("suffixTree");
            }

            if (searchSequence == null)
            {
                throw new ArgumentNullException("searchSequence");
            }

            IMultiWaySuffixTree mwSuffixTree = suffixTree as IMultiWaySuffixTree;

            if (mwSuffixTree == null)
            {
                // NOTE(review): the argument is the wrong type rather than null here; the
                // exception type is kept as-is so existing callers are not affected.
                throw new ArgumentNullException("suffixTree");
            }

            ValidateSequence(suffixTree.Sequence, searchSequence);

            // Initialize
            _minimumLengthOfMUM = lengthOfMUM;
            _suffixTree         = mwSuffixTree;
            InitializeReferenceSequence(suffixTree.Sequence);
            InitializeQuerySequence(searchSequence);

            // Number of query start positions that leave room for a match of minimum length.
            var searchableLength = _querySequence.Count - (_minimumLengthOfMUM - 1);

            int interval = (int)searchableLength / Environment.ProcessorCount;

            if (interval < 1)
            {
                interval = 1;
            }

            List<Task<List<MaxUniqueMatch>>> result = new List<Task<List<MaxUniqueMatch>>>();

            for (int index = 0; index < searchableLength; index += interval)
            {
                // Copy the loop variable so each task captures its own start position.
                int taskIndex = index;

                // The parameterless delegate makes sure that TaskCreationOptions.None binds
                // to the creationOptions parameter instead of being passed as the state object.
                result.Add(
                    Task.Factory.StartNew<List<MaxUniqueMatch>>(
                        () => FindMUMs(taskIndex, interval),
                        TaskCreationOptions.None));
            }

            List<MaxUniqueMatch> mergedList = new List<MaxUniqueMatch>();

            foreach (Task<List<MaxUniqueMatch>> task in result)
            {
                // Blocks until this task has finished; results are consumed in start order.
                List<MaxUniqueMatch> local = task.Result;

                if (local.Count == 0)
                {
                    continue;
                }

                if (mergedList.Count > 0)
                {
                    // Check if there is overlap: drop the first MUM of local when both its
                    // start and its end fall inside the last MUM of mergedList.
                    MaxUniqueMatch previous = mergedList[mergedList.Count - 1];
                    MaxUniqueMatch current  = local[0];

                    if ((current.SecondSequenceStart >= previous.SecondSequenceStart &&
                         current.SecondSequenceStart <= previous.SecondSequenceStart + previous.Length) &&
                        (current.SecondSequenceStart + current.Length >= previous.SecondSequenceStart &&
                         current.SecondSequenceStart + current.Length <= previous.SecondSequenceStart + previous.Length))
                    {
                        local.RemoveAt(0);
                    }
                }

                mergedList.AddRange(local);
            }

            // Order the mum list with query sequence order
            for (int index = 0; index < mergedList.Count; index++)
            {
                mergedList[index].FirstSequenceMumOrder  = index + 1;
                mergedList[index].SecondSequenceMumOrder = index + 1;
            }

            return mergedList;
        }