/// <summary>
/// Builds clusters from three MUMs, the first two of which cross each other
/// (reference order 0, 4 versus query order 4, 0), and expects every MUM to
/// end up alone in its own cluster, in input order.
/// </summary>
public void TestClusterWithCross()
{
    // Input MUMs handed to the cluster builder.
    List<MaxUniqueMatch> matches = new List<MaxUniqueMatch>
    {
        new MaxUniqueMatch { FirstSequenceStart = 0, Length = 4, SecondSequenceStart = 4 },
        new MaxUniqueMatch { FirstSequenceStart = 4, Length = 3, SecondSequenceStart = 0 },
        new MaxUniqueMatch { FirstSequenceStart = 10, Length = 3, SecondSequenceStart = 10 }
    };

    IClusterBuilder clusterBuilder = new ClusterBuilder();
    clusterBuilder.MinimumScore = 2;
    clusterBuilder.FixedSeparation = 0;
    IList<Cluster> actualOutput = clusterBuilder.BuildClusters(matches);

    // Expected result: one single-match cluster per input MUM.
    MaxUniqueMatch[] expectedMatches = new[]
    {
        new MaxUniqueMatch { FirstSequenceStart = 0, Length = 4, SecondSequenceStart = 4 },
        new MaxUniqueMatch { FirstSequenceStart = 4, Length = 3, SecondSequenceStart = 0 },
        new MaxUniqueMatch { FirstSequenceStart = 10, Length = 3, SecondSequenceStart = 10 }
    };

    IList<Cluster> expectedOutput = new List<Cluster>();
    foreach (MaxUniqueMatch expectedMatch in expectedMatches)
    {
        IList<MaxUniqueMatchExtension> clusterMatches = new List<MaxUniqueMatchExtension>();
        clusterMatches.Add(new MaxUniqueMatchExtension(expectedMatch));
        expectedOutput.Add(new Cluster(clusterMatches));
    }

    Assert.IsTrue(this.CompareMumList(actualOutput, expectedOutput));
}
/// <summary>
/// Runs MUMmer3 with a minimum MUM length of 4 on a DNA reference/query pair
/// and compares the returned MUMs with three expected matches.
/// </summary>
public void TestMUMmer3GetMUMsMultipleMum()
{
    Sequence referenceSeq = new Sequence(Alphabets.DNA, "ATGCGCATCCCCTT");
    Sequence querySeq = new Sequence(Alphabets.DNA, "GCGCCCCCTA");
    List<ISequence> querySeqs = new List<ISequence> { querySeq };

    MUMmer mummer = new MUMmer3();
    mummer.LengthOfMUM = 4;
    var result = mummer.GetMUMs(referenceSeq, querySeqs);

    // Check if output is not null
    Assert.AreNotEqual(null, result);

    // Expected MUMs for the single query sequence, in MUM order.
    List<MaxUniqueMatch> expectedMums = new List<MaxUniqueMatch>
    {
        new MaxUniqueMatch
        {
            FirstSequenceMumOrder = 1,
            FirstSequenceStart = 2,
            SecondSequenceMumOrder = 1,
            SecondSequenceStart = 0,
            Length = 4,
            Query = querySeq
        },
        new MaxUniqueMatch
        {
            FirstSequenceMumOrder = 2,
            FirstSequenceStart = 8,
            SecondSequenceMumOrder = 2,
            SecondSequenceStart = 3,
            Length = 4,
            Query = querySeq
        },
        new MaxUniqueMatch
        {
            FirstSequenceMumOrder = 3,
            FirstSequenceStart = 8,
            SecondSequenceMumOrder = 3,
            SecondSequenceStart = 4,
            Length = 5,
            Query = querySeq
        }
    };

    IDictionary<ISequence, IList<MaxUniqueMatch>> expectedOutput =
        new Dictionary<ISequence, IList<MaxUniqueMatch>>();
    expectedOutput.Add(querySeq, expectedMums);

    Assert.IsTrue(CompareMUMs(result, expectedOutput));
}
/// <summary>
/// Copies the MUM orders, start positions, length and query of this
/// instance onto the given match.
/// </summary>
/// <param name="match">Maximum unique match that receives the values</param>
/// <exception cref="System.ArgumentNullException">
/// Thrown when <paramref name="match"/> is null.
/// </exception>
public void CopyTo(MaxUniqueMatch match)
{
    // Fail with a descriptive argument error before any value is copied.
    if (null == match)
    {
        throw new System.ArgumentNullException("match");
    }

    match.FirstSequenceMumOrder = FirstSequenceMumOrder;
    match.FirstSequenceStart = FirstSequenceStart;
    match.SecondSequenceMumOrder = SecondSequenceMumOrder;
    match.SecondSequenceStart = SecondSequenceStart;
    match.Length = Length;
    match.Query = Query;
}
/// <summary>
/// Feeds three MUMs, the first two of which cross each other, to the
/// longest-increasing-subsequence finder and expects the first and third
/// MUM back.
/// </summary>
public void TestLISWithCross1()
{
    // Input MUMs.
    List<MaxUniqueMatch> mums = new List<MaxUniqueMatch>
    {
        new MaxUniqueMatch
        {
            FirstSequenceStart = 0,
            FirstSequenceMumOrder = 1,
            Length = 4,
            SecondSequenceStart = 4,
            SecondSequenceMumOrder = 1
        },
        new MaxUniqueMatch
        {
            FirstSequenceStart = 4,
            FirstSequenceMumOrder = 2,
            Length = 3,
            SecondSequenceStart = 0,
            SecondSequenceMumOrder = 2
        },
        new MaxUniqueMatch
        {
            FirstSequenceStart = 10,
            FirstSequenceMumOrder = 3,
            Length = 3,
            SecondSequenceStart = 10,
            SecondSequenceMumOrder = 3
        }
    };

    ILongestIncreasingSubsequence lis = new LongestIncreasingSubsequence();
    IList<MaxUniqueMatch> lisList = lis.GetLongestSequence(mums);

    // Expected subsequence: the MUMs with order 1 and 3.
    List<MaxUniqueMatch> expectedOutput = new List<MaxUniqueMatch>
    {
        new MaxUniqueMatch
        {
            FirstSequenceStart = 0,
            FirstSequenceMumOrder = 1,
            Length = 4,
            SecondSequenceStart = 4,
            SecondSequenceMumOrder = 1
        },
        new MaxUniqueMatch
        {
            FirstSequenceStart = 10,
            FirstSequenceMumOrder = 3,
            Length = 3,
            SecondSequenceStart = 10,
            SecondSequenceMumOrder = 3
        }
    };

    Assert.IsTrue(this.CompareMumList(lisList, expectedOutput));
}
/// <summary>
/// Finds the MUMs for the suffixes of the query sequence that start inside
/// the given interval.
/// </summary>
/// <param name="startIndex">Index of the first query suffix to search</param>
/// <param name="interval">Number of consecutive suffixes to search</param>
/// <returns>
/// MUMs found in the interval; both MUM orders are numbered from 1 in the
/// order the MUMs were found.
/// </returns>
private List<MaxUniqueMatch> FindMUMs(int startIndex, int interval)
{
    List<MaxUniqueMatch> mumList = new List<MaxUniqueMatch>();

    // Query range [coveredStart, coveredEnd] of the most recently accepted MUM.
    int coveredStart = 0;
    int coveredEnd = 0;

    for (int index = startIndex; index < startIndex + interval && index < _searchSequence.Count; index++)
    {
        // Find the MUM (if any) for the suffix of the search sequence starting at index.
        MaxUniqueMatch match = Search(index);
        if (null == match)
        {
            continue;
        }

        int matchStart = match.SecondSequenceStart;
        int matchEnd = matchStart + match.Length;

        // Skip a MUM that lies completely inside the previously accepted one.
        bool startCovered = matchStart >= coveredStart && matchStart <= coveredEnd;
        bool endCovered = matchEnd >= coveredStart && matchEnd <= coveredEnd;
        if (startCovered && endCovered)
        {
            continue;
        }

        mumList.Add(match);
        coveredStart = matchStart;
        coveredEnd = matchEnd;

        // NOTE(review): _lastEdge is instance state written by Search; when
        // FindMUMs runs on several tasks at once it may belong to another
        // task's search. Fixing that needs a change to Search itself.
        if (_lastEdge.IsLeaf && matchEnd == _searchSequence.Count)
        {
            // The MUM reaches the end of the query, so no suffix starting
            // inside it can yield a longer MUM: skip past it. The local match
            // is used instead of the shared _lastMatch field so a concurrent
            // Search cannot change the skip distance.
            index += match.Length - 1;
        }
    }

    // Order the mum list with query sequence order
    for (int order = 0; order < mumList.Count; order++)
    {
        mumList[order].FirstSequenceMumOrder = order + 1;
        mumList[order].SecondSequenceMumOrder = order + 1;
    }

    return mumList;
}
/// <summary>
/// Checks that MUMmer3 reports no MUM for the query by default and reports
/// one match of length 4 once MaximumMatchEnabled is switched on.
/// </summary>
public void TestMUMmer3GetMUMsMaxMatch()
{
    Sequence referenceSeq = new Sequence(Alphabets.DNA, "TTTTAATTTTAG");
    Sequence querySeq = new Sequence(Alphabets.DNA, "ACTTTTGGA");
    List<ISequence> querySeqs = new List<ISequence> { querySeq };

    MUMmer mummer = new MUMmer3();
    mummer.LengthOfMUM = 3;
    var result = mummer.GetMUMs(referenceSeq, querySeqs);

    // Check if output is not null
    Assert.AreNotEqual(null, result);

    // Check the mums count: one entry for the query, holding no MUMs.
    Assert.AreEqual(1, result.Count);
    Assert.AreEqual(0, result.Values.First().Count);

    // Same search again with maximum-match mode enabled.
    mummer.MaximumMatchEnabled = true;
    result = mummer.GetMUMs(referenceSeq, querySeqs);

    MaxUniqueMatch mum = new MaxUniqueMatch
    {
        FirstSequenceMumOrder = 1,
        FirstSequenceStart = 0,
        SecondSequenceMumOrder = 1,
        SecondSequenceStart = 2,
        Length = 4,
        Query = querySeq
    };

    IDictionary<ISequence, IList<MaxUniqueMatch>> expectedOutput =
        new Dictionary<ISequence, IList<MaxUniqueMatch>>();
    expectedOutput.Add(querySeq, new List<MaxUniqueMatch> { mum });

    Assert.IsTrue(CompareMUMs(result, expectedOutput));
}
/// <summary>
/// Creates a MUM for a match found in the reference sequence, provided the
/// match is at least as long as the configured minimum MUM length.
/// </summary>
/// <param name="referenceEndIndex">End index of string found in reference sequence</param>
/// <param name="queryStartIndex">Start index of string found in query sequence</param>
/// <param name="length">Length of match</param>
/// <returns>Maximum Unique Match, or null when the match is too short</returns>
private MaxUniqueMatch CreateMUM(
    int referenceEndIndex,
    int queryStartIndex,
    int length)
{
    // Too short to qualify as a MUM.
    if (length < _minimumLengthOfMUM)
    {
        return null;
    }

    return new MaxUniqueMatch
    {
        Query = _searchSequence,

        // Convert the inclusive end index into the start index of the match.
        FirstSequenceStart = referenceEndIndex - (length - 1),
        SecondSequenceStart = queryStartIndex,
        Length = length
    };
}
/// <summary>
/// Runs the three-argument GetMUMs overload of MUMmer3 on an RNA
/// reference/query pair and expects a single MUM of length 4.
/// </summary>
public void TestMUMmer3GetFinalMUMsWithRNASingleMum()
{
    Sequence referenceSeq = new Sequence(Alphabets.RNA, "AUGCSWRYKMBVHDN");
    Sequence querySeq = new Sequence(Alphabets.RNA, "UAUASWRYBB");
    List<ISequence> querySeqs = new List<ISequence> { querySeq };

    MUMmer3 mummer = new MUMmer3();
    mummer.LengthOfMUM = 3;
    var result = mummer.GetMUMs(referenceSeq, querySeqs, true);

    // Check if output is not null
    Assert.AreNotEqual(null, result);

    MaxUniqueMatch mum1 = new MaxUniqueMatch
    {
        FirstSequenceMumOrder = 1,
        FirstSequenceStart = 4,
        SecondSequenceMumOrder = 1,
        SecondSequenceStart = 4,
        Length = 4,
        Query = querySeq
    };

    IDictionary<ISequence, IList<MaxUniqueMatch>> expectedOutput =
        new Dictionary<ISequence, IList<MaxUniqueMatch>>();
    expectedOutput.Add(querySeq, new List<MaxUniqueMatch> { mum1 });

    Assert.IsTrue(CompareMUMs(result, expectedOutput));
}
/// <summary>
/// Walks the suffix tree along the query suffix that starts at the given
/// index and builds the match for the longest stretch that could be followed.
/// </summary>
/// <param name="startIndex">Index of first suffix character in search sequence</param>
/// <returns>
/// The match found, or null when the query is empty, no usable starting edge
/// exists, CreateMUM produced no match, or the walk ended on a non-leaf edge.
/// </returns>
private MaxUniqueMatch Search(int startIndex)
{
    // Nothing to search for in an empty query.
    if (_queryString.Length == 0)
    {
        return null;
    }

    // Look up the edge keyed by node 0 and the first symbol of the suffix.
    Edge edge;
    int hash = Hash(0, _queryString[startIndex]);
    if (!_suffixTree.Edges.TryGetValue(hash, out edge) || -1 == edge.StartNode)
    {
        return null;
    }

    int queryIndex = startIndex;
    MaxUniqueMatch match;

    while (true)
    {
        // Follow the label of the current edge for as long as reference and
        // query agree and neither string is exhausted.
        bool stoppedOnEdge = false;
        int referenceIndex = edge.StartIndex;
        while (referenceIndex <= edge.EndIndex)
        {
            if (queryIndex == _queryString.Length
                || referenceIndex == _referenceString.Length
                || _referenceString[referenceIndex] != _queryString[queryIndex])
            {
                stoppedOnEdge = true;
                break;
            }

            queryIndex++;
            referenceIndex++;
        }

        if (stoppedOnEdge)
        {
            match = CreateMUM(referenceIndex - 1, startIndex, queryIndex - startIndex);
            break;
        }

        // The whole edge label matched: look for the edge that continues
        // with the next query symbol, probing successive hash keys until a
        // key is unused or the edge's first reference symbol matches.
        Edge nextEdge = null;
        if (queryIndex < _queryString.Length)
        {
            hash = Hash(edge.EndNode, _queryString[queryIndex]);
            while (_suffixTree.Edges.TryGetValue(hash, out nextEdge)
                && _queryString[queryIndex] != _referenceString[nextEdge.StartIndex])
            {
                hash = (hash + 1) % _maxHashKey;
            }
        }

        if (null == nextEdge)
        {
            // Query exhausted or no continuing edge: the match ends with this edge.
            match = CreateMUM(edge.EndIndex, startIndex, queryIndex - startIndex);
            break;
        }

        edge = nextEdge;
    }

    if (null == match)
    {
        return null;
    }

    // Remember where the search ended.
    _lastEdge = edge;
    _lastMatch = match;

    // A match that stops on a non-leaf edge sits at a split, i.e. it is a
    // duplicate in the reference and is ignored.
    return edge.IsLeaf ? match : null;
}
/// <summary>
/// Find the matches of sequence in suffix tree
/// </summary>
/// <param name="suffixTree">Suffix Tree</param>
/// <param name="searchSequence">Query searchSequence</param>
/// <param name="lengthOfMUM">Minimum length of MUM</param>
/// <returns>Matches found, numbered from 1 in merged order</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="suffixTree"/> or
/// <paramref name="searchSequence"/> is null.
/// </exception>
public IList<MaxUniqueMatch> FindMatches(
    SequenceSuffixTree suffixTree,
    ISequence searchSequence,
    long lengthOfMUM)
{
    if (suffixTree == null)
    {
        throw new ArgumentNullException("suffixTree");
    }

    if (searchSequence == null)
    {
        throw new ArgumentNullException("searchSequence");
    }

    // Initialize
    _referenceString = string.Empty;
    _minimumLengthOfMUM = lengthOfMUM;
    _suffixTree = suffixTree;
    _searchSequence = searchSequence;
    _queryString = _searchSequence.ToString();

    SegmentedSequence referenceSequence = _suffixTree.Sequence as SegmentedSequence;
    if (null != referenceSequence)
    {
        foreach (Sequence sequence in referenceSequence.Sequences)
        {
            _referenceString += sequence.ToString() + CONCATENATING_SYMBOL;
        }

        // remove the concatenating symbol from end and add terminating symbol
        _referenceString = _referenceString.TrimEnd(CONCATENATING_SYMBOL);
        _referenceString += TERMINATING_SYMBOL;
    }
    else
    {
        _referenceString = _suffixTree.Sequence.ToString() + TERMINATING_SYMBOL;
    }

    // Number of query suffixes long enough to hold a MUM of the minimum length.
    long searchableSuffixes = _queryString.Length - (_minimumLengthOfMUM - 1);

    // Split the suffixes into roughly one chunk per processor.
    int interval = (int)searchableSuffixes / Environment.ProcessorCount;
    if (interval < 1)
    {
        interval = 1;
    }

    IList<Task<List<MaxUniqueMatch>>> tasks = new List<Task<List<MaxUniqueMatch>>>();
    for (int index = 0; index < searchableSuffixes; index += interval)
    {
        // Copy the loop variable so each task captures its own start index.
        int taskIndex = index;
        tasks.Add(
            Task.Factory.StartNew<List<MaxUniqueMatch>>(
                () => FindMUMs(taskIndex, interval),
                TaskCreationOptions.None));
    }

    List<MaxUniqueMatch> mergedList = new List<MaxUniqueMatch>();
    foreach (Task<List<MaxUniqueMatch>> task in tasks)
    {
        // Blocks until the chunk is done; chunks are merged in query order.
        List<MaxUniqueMatch> local = task.Result;
        if (0 == local.Count)
        {
            continue;
        }

        if (0 < mergedList.Count)
        {
            // Drop the first MUM of this chunk when it lies completely
            // inside the last MUM merged so far.
            MaxUniqueMatch previous = mergedList[mergedList.Count - 1];
            MaxUniqueMatch current = local[0];
            int previousStart = previous.SecondSequenceStart;
            int previousEnd = previousStart + previous.Length;
            int currentStart = current.SecondSequenceStart;
            int currentEnd = currentStart + current.Length;

            if (currentStart >= previousStart && currentStart <= previousEnd
                && currentEnd >= previousStart && currentEnd <= previousEnd)
            {
                local.RemoveAt(0);
            }
        }

        mergedList.AddRange(local);
    }

    // Order the mum list with query sequence order
    for (int index = 0; index < mergedList.Count; index++)
    {
        mergedList[index].FirstSequenceMumOrder = index + 1;
        mergedList[index].SecondSequenceMumOrder = index + 1;
    }

    return mergedList;
}
/// <summary>
/// Initializes a new instance of the MaxUniqueMatchExtension class with the
/// values of the given MUM; IsGood and IsTentative start out false.
/// </summary>
/// <param name="mum">Maximum Unique Match whose values are copied</param>
/// <exception cref="System.ArgumentNullException">
/// Thrown when <paramref name="mum"/> is null.
/// </exception>
public MaxUniqueMatchExtension(MaxUniqueMatch mum)
{
    // Report a missing argument by name rather than failing on the dereference below.
    if (null == mum)
    {
        throw new System.ArgumentNullException("mum");
    }

    mum.CopyTo(this);
    IsGood = false;
    IsTentative = false;
}
/// <summary>
/// Walks the suffix tree along the query suffix that starts at the given
/// index and builds the match for the longest stretch that could be followed.
/// </summary>
/// <param name="startIndex">Index of first suffix character in search sequence</param>
/// <returns>
/// The match found, or null when the query is empty, no edge leaves the root
/// with the first symbol, CreateMUM produced no match, or the walk ended on a
/// non-leaf edge while maximum-match mode is off.
/// </returns>
private MaxUniqueMatch Search(int startIndex)
{
    // Nothing to search for in an empty query.
    if (_querySequence.Count == 0)
    {
        return null;
    }

    // Edge leaving the root with the first symbol of the suffix.
    IEdge edge = _suffixTree.Find(_suffixTree.Root, GetQuerySymbol(startIndex));
    if (edge == null)
    {
        return null;
    }

    int queryIndex = startIndex;
    MaxUniqueMatch match;

    while (true)
    {
        // Follow the label of the current edge for as long as reference and
        // query agree and neither sequence is exhausted.
        bool stoppedOnEdge = false;
        int referenceIndex = edge.StartIndex;
        while (referenceIndex <= edge.EndIndex)
        {
            if (queryIndex == _querySequence.Count
                || referenceIndex == ReferenceLength
                || GetReferenceSymbol(referenceIndex) != GetQuerySymbol(queryIndex))
            {
                stoppedOnEdge = true;
                break;
            }

            queryIndex++;
            referenceIndex++;
        }

        if (stoppedOnEdge)
        {
            match = CreateMUM(referenceIndex - 1, startIndex, queryIndex - startIndex);
            break;
        }

        // The whole edge label matched: continue with the child edge for the
        // next query symbol, if the query has one.
        IEdge nextEdge = null;
        if (queryIndex < _querySequence.Count)
        {
            nextEdge = _suffixTree.Find(edge, GetQuerySymbol(queryIndex));
        }

        if (null == nextEdge)
        {
            // Query exhausted or no continuing edge: the match ends with this edge.
            match = CreateMUM(edge.EndIndex, startIndex, queryIndex - startIndex);
            break;
        }

        edge = nextEdge;
    }

    if (null == match)
    {
        return null;
    }

    // Remember where the search ended.
    _lastEdge = edge;
    _lastMatch = match;

    // A match that stops on a non-leaf edge sits at a split, i.e. it is a
    // duplicate in the reference; it is only kept in maximum-match mode.
    if (!edge.IsLeaf && !_findMaximumMatch)
    {
        return null;
    }

    return match;
}
/// <summary>
/// Find the matches of sequence in suffix tree
/// </summary>
/// <param name="suffixTree">Suffix tree to search on</param>
/// <param name="searchSequence">query sequence to find matches</param>
/// <param name="lengthOfMUM">Minimum length of the match</param>
/// <returns>Matches found, numbered from 1 in merged order</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="suffixTree"/> or
/// <paramref name="searchSequence"/> is null, or when
/// <paramref name="suffixTree"/> is not an IMultiWaySuffixTree.
/// </exception>
private IList<MaxUniqueMatch> FindMatchWithOption(
    ISuffixTree suffixTree,
    ISequence searchSequence,
    long lengthOfMUM)
{
    if (suffixTree == null)
    {
        throw new ArgumentNullException("suffixTree");
    }

    if (searchSequence == null)
    {
        throw new ArgumentNullException("searchSequence");
    }

    IMultiWaySuffixTree mwSuffixTree = suffixTree as IMultiWaySuffixTree;
    if (mwSuffixTree == null)
    {
        // NOTE(review): the tree is non-null but of the wrong type here, so
        // ArgumentException would describe the problem better. The exception
        // type is left unchanged so existing callers keep working.
        throw new ArgumentNullException("suffixTree");
    }

    ValidateSequence(suffixTree.Sequence, searchSequence);

    // Initialize
    _minimumLengthOfMUM = lengthOfMUM;
    _suffixTree = mwSuffixTree;
    InitializeReferenceSequence(suffixTree.Sequence);
    InitializeQuerySequence(searchSequence);

    // Number of query suffixes long enough to hold a match of the minimum length.
    long searchableSuffixes = _querySequence.Count - (_minimumLengthOfMUM - 1);

    // Split the suffixes into roughly one chunk per processor.
    int interval = (int)searchableSuffixes / Environment.ProcessorCount;
    if (interval < 1)
    {
        interval = 1;
    }

    IList<Task<List<MaxUniqueMatch>>> tasks = new List<Task<List<MaxUniqueMatch>>>();
    for (int index = 0; index < searchableSuffixes; index += interval)
    {
        // Copy the loop variable so each task captures its own start index.
        int taskIndex = index;
        tasks.Add(
            Task.Factory.StartNew<List<MaxUniqueMatch>>(
                () => FindMUMs(taskIndex, interval),
                TaskCreationOptions.None));
    }

    List<MaxUniqueMatch> mergedList = new List<MaxUniqueMatch>();
    foreach (Task<List<MaxUniqueMatch>> task in tasks)
    {
        // Blocks until the chunk is done; chunks are merged in query order.
        List<MaxUniqueMatch> local = task.Result;
        if (0 == local.Count)
        {
            continue;
        }

        if (0 < mergedList.Count)
        {
            // Drop the first MUM of this chunk when it lies completely
            // inside the last MUM merged so far.
            MaxUniqueMatch previous = mergedList[mergedList.Count - 1];
            MaxUniqueMatch current = local[0];
            int previousStart = previous.SecondSequenceStart;
            int previousEnd = previousStart + previous.Length;
            int currentStart = current.SecondSequenceStart;
            int currentEnd = currentStart + current.Length;

            if (currentStart >= previousStart && currentStart <= previousEnd
                && currentEnd >= previousStart && currentEnd <= previousEnd)
            {
                local.RemoveAt(0);
            }
        }

        mergedList.AddRange(local);
    }

    // Order the mum list with query sequence order
    for (int index = 0; index < mergedList.Count; index++)
    {
        mergedList[index].FirstSequenceMumOrder = index + 1;
        mergedList[index].SecondSequenceMumOrder = index + 1;
    }

    return mergedList;
}