/// <summary>
/// Initializes a new instance of the MatchExtension class by copying the
/// offsets and length of an existing match.
/// </summary>
/// <param name="mum">Maximum Unique Match whose reference offset, query offset and length are copied.</param>
public MatchExtension(Match mum)
{
    // Copy the coordinates of the source match.
    ReferenceSequenceOffset = mum.ReferenceSequenceOffset;
    QuerySequenceOffset = mum.QuerySequenceOffset;
    Length = mum.Length;

    // A freshly created extension starts out neither good nor tentative.
    IsGood = false;
    IsTentative = false;
}
/// <summary>
/// Runs the longest increasing subsequence over three MUMs where the first two
/// cross each other (reference order 0 then 4, query order 4 then 0) and checks
/// that the first and third MUM are returned.
/// </summary>
public void TestLISWithCross1()
{
    // Input MUMs.
    var inputMums = new List<Match>
    {
        new Match { ReferenceSequenceOffset = 0, Length = 4, QuerySequenceOffset = 4 },
        new Match { ReferenceSequenceOffset = 4, Length = 3, QuerySequenceOffset = 0 },
        new Match { ReferenceSequenceOffset = 10, Length = 3, QuerySequenceOffset = 10 }
    };

    // Sort, then extract the longest increasing subsequence.
    var lis = new LongestIncreasingSubsequence();
    IList<Match> sortedMums = lis.SortMum(inputMums);
    IList<Match> actualOutput = lis.GetLongestSequence(sortedMums);

    // The second MUM is expected to be dropped.
    var expectedOutput = new List<Match>
    {
        new Match { ReferenceSequenceOffset = 0, Length = 4, QuerySequenceOffset = 4 },
        new Match { ReferenceSequenceOffset = 10, Length = 3, QuerySequenceOffset = 10 }
    };

    Assert.IsTrue(CompareMumList(actualOutput, expectedOutput));
}
/// <summary>
/// Find the longest increasing sub sequence from the given set of MUMs.
/// </summary>
/// <param name="sortedMums">List of sorted MUMs.</param>
/// <returns>
/// The MUMs on the highest-scoring chain, each trimmed by the overlap it has with
/// its predecessor on the chain (MUMs trimmed to zero length are dropped).
/// Returns null when <paramref name="sortedMums"/> is null and an empty list when
/// there are no MUMs to chain.
/// </returns>
public IList<Match> GetLongestSequence(IList<Match> sortedMums)
{
    if (sortedMums == null)
    {
        return null;
    }

    MatchExtension[] matches = ConvertToMUMExtension(sortedMums);

    // Nothing to chain. Without this guard the best-score lookup below reads
    // matches[0] and throws on an empty input list.
    if (matches.Length == 0)
    {
        return new List<Match>();
    }

    for (var currentIndex = 0; currentIndex < matches.Length; currentIndex++)
    {
        MatchExtension current = matches[currentIndex];

        // Initialize the MUM extension: a chain consisting of this MUM alone.
        current.Score = current.Length;
        current.WrapScore = current.Length;
        current.Adjacent = 0;
        current.From = -1;

        for (var previousIndex = 0; previousIndex < currentIndex; previousIndex++)
        {
            MatchExtension previous = matches[previousIndex];

            // Overlap of the two MUMs in the query sequence (never negative).
            var queryOverlap = previous.QuerySequenceOffset + previous.Length;
            queryOverlap -= current.QuerySequenceOffset;
            var overlap = queryOverlap > 0 ? queryOverlap : 0;

            // Score of chaining after `previous` when only the query overlap is deducted.
            // This candidate only feeds WrapScore.
            var score = previous.Score + current.Length - overlap;
            if (score > current.WrapScore)
            {
                current.WrapScore = score;
            }

            // Overlap in the reference sequence; the larger of the two overlaps is deducted.
            var referenceOverlap = previous.ReferenceSequenceOffset + previous.Length - current.ReferenceSequenceOffset;
            overlap = overlap > referenceOverlap ? overlap : referenceOverlap;

            score = previous.Score + current.Length - overlap;
            if (score > current.Score)
            {
                // To remove crosses, mark `previous` as the predecessor of `current`
                // and remember the score and the overlap that has to be trimmed.
                current.From = previousIndex;
                current.Score = score;
                current.Adjacent = overlap;
            }

            // Same candidate, but built on the predecessor's WrapScore.
            // NOTE(review): WrapScore is maintained here but is not consulted when the
            // result is selected in this method.
            score = previous.WrapScore + current.Length - overlap;
            if (score >= current.WrapScore)
            {
                current.WrapScore = score;
            }
        }
    }

    // Find the best longest increasing subsequence:
    // the chain end with the highest score (first one wins on ties).
    long best = 0;
    long bestScore = matches[best].Score;
    for (long candidate = 1; candidate < matches.Length; candidate++)
    {
        if (matches[candidate].Score > bestScore)
        {
            best = candidate;
            bestScore = matches[best].Score;
        }
    }

    // Walk the predecessor links back from the best chain end and mark every MUM on it as "Good".
    // From is -1 for the first MUM of a chain, which terminates the walk.
    for (long chainIndex = best; chainIndex >= 0; chainIndex = matches[chainIndex].From)
    {
        matches[chainIndex].IsGood = true;
    }

    IList<Match> outputMums = new List<Match>();
    foreach (MatchExtension extension in matches)
    {
        if (!extension.IsGood)
        {
            continue;
        }

        // Trim the part that overlaps the predecessor on the chain.
        var adjacent = extension.Adjacent;
        if (0 != adjacent)
        {
            extension.ReferenceSequenceOffset += adjacent;
            extension.QuerySequenceOffset += adjacent;
            extension.Length -= adjacent;
        }

        // A MUM that is entirely covered by its predecessor contributes nothing.
        if (0 < extension.Length)
        {
            Match match = new Match();
            match.Length = extension.Length;
            match.QuerySequenceOffset = extension.QuerySequenceOffset;
            match.ReferenceSequenceOffset = extension.ReferenceSequenceOffset;
            outputMums.Add(match);
        }
    }

    // Return the list of MUMs that represent the longest increasing subsequence
    return outputMums;
}
/// <summary>
/// Aligns the region between two MUMs and then appends the second MUM itself.
/// When both sequences have symbols in the gap a pairwise alignment is run; when only
/// one of them has symbols, the other side is padded with default gap symbols.
/// </summary>
/// <param name="referenceSequence">Reference sequence.</param>
/// <param name="querySequence">Query Sequence.</param>
/// <param name="sequenceResult1">Receives the aligned reference symbols (appended).</param>
/// <param name="sequenceResult2">Receives the aligned query symbols (appended).</param>
/// <param name="consensusResult">Receives the consensus symbols (appended).</param>
/// <param name="mum1">First MUM of Gap; a zero length means the gap starts at offset 0 of both sequences.</param>
/// <param name="mum2">Second MUM of Gap; a zero length means the gap runs to the end of both sequences.</param>
/// <param name="insertions">Two-element list: insertions made to the first and to the second aligned sequence.</param>
/// <returns>Score of alignment.</returns>
private long AlignGap(
    ISequence referenceSequence,
    ISequence querySequence,
    List<byte> sequenceResult1,
    List<byte> sequenceResult2,
    List<byte> consensusResult,
    Match mum1,
    Match mum2,
    out List<long> insertions)
{
    long score = 0;
    insertions = new List<long>(2) { 0, 0 };

    // Start of the gap: right after mum1, or the start of both sequences when mum1 is empty.
    bool hasFirstMum = mum1.Length != 0;
    long referenceGapStartIndex = hasFirstMum ? mum1.ReferenceSequenceOffset + mum1.Length : 0;
    long queryGapStartIndex = hasFirstMum ? mum1.QuerySequenceOffset + mum1.Length : 0;

    // End of the gap: start of mum2, or the end of both sequences when mum2 is empty.
    long mum2Length = mum2.Length;
    long mum2ReferenceStartIndex;
    long mum2QueryStartIndex;
    if (mum2Length != 0)
    {
        mum2ReferenceStartIndex = mum2.ReferenceSequenceOffset;
        mum2QueryStartIndex = mum2.QuerySequenceOffset;
    }
    else
    {
        mum2ReferenceStartIndex = referenceSequence.Count;
        mum2QueryStartIndex = querySequence.Count;
    }

    bool referenceHasGap = mum2ReferenceStartIndex > referenceGapStartIndex;
    bool queryHasGap = mum2QueryStartIndex > queryGapStartIndex;

    if (referenceHasGap && queryHasGap)
    {
        // Both sequences have symbols in the gap: run the pairwise aligner on the two gap regions.
        ISequence referenceGap = referenceSequence.GetSubSequence(
            referenceGapStartIndex,
            mum2ReferenceStartIndex - referenceGapStartIndex);
        ISequence queryGap = querySequence.GetSubSequence(
            queryGapStartIndex,
            mum2QueryStartIndex - queryGapStartIndex);

        IList<IPairwiseSequenceAlignment> gapAlignments = this.RunPairWise(referenceGap, queryGap);
        if (gapAlignments != null)
        {
            foreach (IPairwiseSequenceAlignment gapAlignment in gapAlignments)
            {
                foreach (PairwiseAlignedSequence aligned in gapAlignment.PairwiseAlignedSequences)
                {
                    sequenceResult1.AddRange(aligned.FirstSequence);
                    sequenceResult2.AddRange(aligned.SecondSequence);
                    consensusResult.AddRange(aligned.Consensus);
                    score += aligned.Score;

                    // Accumulate the insertion counts reported by the aligner, when present.
                    if (!aligned.Metadata.ContainsKey("Insertions"))
                    {
                        continue;
                    }

                    List<int> gapInsertions = aligned.Metadata["Insertions"] as List<int>;
                    if (gapInsertions == null)
                    {
                        continue;
                    }

                    if (gapInsertions.Count > 0)
                    {
                        insertions[0] += gapInsertions[0];
                    }

                    if (gapInsertions.Count > 1)
                    {
                        insertions[1] += gapInsertions[1];
                    }
                }
            }
        }
    }
    else if (referenceHasGap)
    {
        // Only the reference has symbols in the gap: pad the query side with gap symbols.
        ISequence referenceGap = referenceSequence.GetSubSequence(
            referenceGapStartIndex,
            mum2ReferenceStartIndex - referenceGapStartIndex);

        sequenceResult1.AddRange(referenceGap);
        sequenceResult2.AddRange(CreateDefaultGap(referenceGap.Count));
        consensusResult.AddRange(referenceGap);
        insertions[1] += referenceGap.Count;

        score = this.UseGapExtensionCost
            ? this.GapOpenCost + ((referenceGap.Count - 1) * this.GapExtensionCost)
            : referenceGap.Count * this.GapOpenCost;
    }
    else if (queryHasGap)
    {
        // Only the query has symbols in the gap: pad the reference side with gap symbols.
        ISequence queryGap = querySequence.GetSubSequence(
            queryGapStartIndex,
            mum2QueryStartIndex - queryGapStartIndex);

        sequenceResult1.AddRange(CreateDefaultGap(queryGap.Count));
        sequenceResult2.AddRange(queryGap);
        consensusResult.AddRange(queryGap);
        insertions[0] += queryGap.Count;

        score = this.UseGapExtensionCost
            ? this.GapOpenCost + ((queryGap.Count - 1) * this.GapExtensionCost)
            : queryGap.Count * this.GapOpenCost;
    }

    // Add the MUM to the result
    if (mum2Length > 0)
    {
        byte[] referenceMumSymbols = referenceSequence.GetSubSequence(
            mum2ReferenceStartIndex,
            mum2Length).ToArray();
        sequenceResult1.AddRange(referenceMumSymbols);

        byte[] queryMumSymbols = querySequence.GetSubSequence(
            mum2QueryStartIndex,
            mum2Length).ToArray();
        sequenceResult2.AddRange(queryMumSymbols);

        // The consensus of the MUM region is taken from the reference side.
        consensusResult.AddRange(referenceMumSymbols);

        // Each MUM symbol is scored against itself.
        foreach (byte symbol in referenceMumSymbols)
        {
            score += SimilarityMatrix[symbol, symbol];
        }
    }

    return score;
}
/// <summary>
/// Validates the ToString() output of Match and MatchExtension against the
/// expected strings stored in the test configuration XML.
/// </summary>
public void ValidateMatchAndMatchExtensionToString()
{
    var match = new Match();
    match.Length = 20;
    match.QuerySequenceOffset = 33;

    var matchExtn = new MatchExtension(match);
    matchExtn.ID = 1;
    matchExtn.Length = 20;

    string actualMatchExtnString = matchExtn.ToString();
    string actualMatchString = match.ToString();

    string expectedMatchExtnString = this.utilityObj.xmlUtil.GetTextValue(
        Constants.ToStringNodeName,
        Constants.ExpectedMatchExtnStringNode);
    string expectedMatchString = this.utilityObj.xmlUtil.GetTextValue(
        Constants.ToStringNodeName,
        Constants.ExpectedMatchStringNode);

    // Expected value goes first so that a failure message reports the values the right way round.
    Assert.AreEqual(expectedMatchExtnString, actualMatchExtnString);
    Assert.AreEqual(expectedMatchString, actualMatchString);
}
/// <summary>
/// Validates the ToString() output of a Cluster built from two match extensions
/// against the expected string stored in the test configuration XML.
/// </summary>
public void ValidateClusterToString()
{
    var match = new Match();

    var matchExtn1 = new MatchExtension(match);
    matchExtn1.ID = 1;
    matchExtn1.Length = 20;

    var matchExtn2 = new MatchExtension(match);
    matchExtn2.ID = 2;
    matchExtn2.Length = 30;

    IList<MatchExtension> extnList = new List<MatchExtension>();
    extnList.Add(matchExtn1);
    extnList.Add(matchExtn2);

    var clust = new Cluster(extnList);
    string actualString = clust.ToString();

    // The XML stores line breaks as the literal text "\r\n"; convert them to the platform line break.
    string expectedString = this.utilityObj.xmlUtil.GetTextValue(
        Constants.ToStringNodeName,
        Constants.ClusterExpectedNode).Replace("\\r\\n", System.Environment.NewLine);

    // Expected value goes first so that a failure message reports the values the right way round.
    Assert.AreEqual(expectedString, actualString);
}
/// <summary>
/// Gets the matches where length is greater than or equal to the MinLengthOfMatch.
/// </summary>
/// <remarks>
/// This method is an iterator (it uses yield return), so the validation below and the
/// search itself only run once the returned sequence is enumerated.
/// Every candidate is passed through ValidateMatch before it is yielded.
/// </remarks>
/// <param name="searchSequence">Query sequence to search.</param>
/// <returns>Returns IEnumerable of matches.</returns>
/// <exception cref="ArgumentOutOfRangeException">MinLengthOfMatch is less than or equal to zero.</exception>
/// <exception cref="ArgumentException">
/// The search sequence is not a Sequence instance, or its base alphabet differs from the supported base alphabet.
/// </exception>
public IEnumerable<Match> SearchMatches(ISequence searchSequence)
{
    // Matches reported so far, handed to ValidateMatch. Nesting of the keys:
    // LastQueryEndIndex -> (LastQueryStartIndex - LastRefStartIndex )-> LastRefEndIndex -> LastRefStartIndex
    var overlappingMatches = new SortedList<long, Dictionary<long, SortedList<long, SortedSet<long>>>>();

    // Edges on the current match path that are long enough to report. The stack is declared outside
    // the query loop and is refilled at the end of an iteration with entries for the next query index.
    var edgesFound = new Stack<EdgesFound>();
    long minLengthOfMatch = this.MinLengthOfMatch;
    bool noambiguity = this.NoAmbiguity;
    long queryIndex;
    long querySequenceLength = searchSequence.Count;
    long lengthOfMatchFound = 0;
    var match = new Match();

    if (minLengthOfMatch <= 0)
    {
        // NOTE(review): the single-string constructor of ArgumentOutOfRangeException takes the parameter
        // name, so the resource text ends up as ParamName rather than as the message - confirm intent.
        throw new ArgumentOutOfRangeException(Resource.MinLengthMustBeGreaterThanZero);
    }

    if (!(searchSequence is Sequence))
    {
        throw new ArgumentException(Resource.OnlySequenceClassSupported);
    }

    // Get base alphabet of the searchSequence by following the alphabet-to-base-alphabet map
    // until an alphabet without an entry is reached.
    IAlphabet searchSeqBaseAlphabet = searchSequence.Alphabet;
    IAlphabet alphabet;
    while (Alphabets.AlphabetToBaseAlphabetMap.TryGetValue(searchSeqBaseAlphabet, out alphabet))
    {
        searchSeqBaseAlphabet = alphabet;
    }

    // If base alphabets are not same then throw the exception.
    if (searchSeqBaseAlphabet != this.supportedBaseAlphabet)
    {
        throw new ArgumentException(Resource.AlphabetMisMatch);
    }

    ISequence convertedSearchSeq = ProcessQuerySequence(searchSequence, noambiguity);

    // State carried from one query index to the next.
    long lengthOfMatchInEdge = 0;
    long edgeStartIndex = 0;
    long childStartIndexToSkip = -1;
    MultiWaySuffixEdge edge = this.rootEdge;
    MultiWaySuffixEdge previousIntermediateEdge = this.rootEdge;

    // Query indexes beyond this bound cannot start a match of the minimum length.
    for (queryIndex = 0; queryIndex <= querySequenceLength - minLengthOfMatch; queryIndex++)
    {
        // if the previousIntermediateEdge is rootEdge then start from the beginning.
        if (previousIntermediateEdge.StartIndex == -1 && lengthOfMatchInEdge > 0)
        {
            lengthOfMatchInEdge--;
        }

        MultiWaySuffixEdge suffixLink = previousIntermediateEdge.SuffixLink[0];
        MultiWaySuffixEdge childEdgePointToParent = previousIntermediateEdge;
        bool suffixLinkPointsToParentEdge = false;

        // Verify whether SuffixLink points to its parent or not.
        // The check compares the Children arrays by reference.
        if (suffixLink.StartIndex == previousIntermediateEdge.StartIndex - 1
            && previousIntermediateEdge.SuffixLink[0].StartIndex != -1)
        {
            int suffixLinkChildCount = suffixLink.Children.Length;
            for (int suffixLinkChildIndex = 0; suffixLinkChildIndex < suffixLinkChildCount; suffixLinkChildIndex++)
            {
                if (suffixLink.Children[suffixLinkChildIndex].Children == previousIntermediateEdge.Children)
                {
                    suffixLinkPointsToParentEdge = true;

                    // Entries carried over from the previous query index are discarded in this case.
                    edgesFound.Clear();
                    break;
                }
            }
        }

        // Go to the next query index by following the suffix link of the previous intermediate edge.
        // This will reduce the searching from the root. In this case lengthOfMatchFound will be deducted by 1.
        // As suffix link always point to another intermediate edge.
        // Note: suffix link for the root is root itself.
        previousIntermediateEdge = suffixLink;
        lengthOfMatchFound--;
        if (lengthOfMatchFound < 0)
        {
            lengthOfMatchFound = 0;
        }

        long searchIndex = queryIndex + lengthOfMatchFound - lengthOfMatchInEdge;
        int childCount = previousIntermediateEdge.Children.Length;
        byte refSymbol, querySymbol;

        if (lengthOfMatchInEdge > 0)
        {
            // Re-locate the position inside the tree that corresponds to the symbols already matched,
            // by edge lengths only (no symbol-by-symbol comparison beyond the first symbol of each edge).
            querySymbol = convertedSearchSeq[searchIndex];
            for (int index = 0; index < childCount; index++)
            {
                edge = previousIntermediateEdge.Children[index];
                edgeStartIndex = edge.StartIndex;

                // Start indexes at or beyond symbolsCount stand for the terminating symbol.
                refSymbol = TerminatingSymbol;
                if (edgeStartIndex < this.symbolsCount)
                {
                    refSymbol = this.referenceSequence[edgeStartIndex];
                }

                if (refSymbol == querySymbol)
                {
                    break;
                }
            }

            // When lengthOfMatchInEdge >0 there will be an edge from the previousIntermediateEdge.
            while (!edge.IsLeaf)
            {
                long edgeEndIndex = edge.Children[0].StartIndex - 1;

                // Number of symbols on the edge.
                long edgeSymbolCount = edgeEndIndex - edgeStartIndex + 1;
                if (lengthOfMatchInEdge == edgeSymbolCount)
                {
                    // The already-matched part ends exactly at the end of this edge.
                    searchIndex += lengthOfMatchInEdge;
                    if (searchIndex != querySequenceLength)
                    {
                        lengthOfMatchInEdge = 0;
                        previousIntermediateEdge = edge;
                    }

                    break;
                }

                if (lengthOfMatchInEdge > edgeSymbolCount)
                {
                    // The already-matched part extends past this edge: descend into the matching child.
                    lengthOfMatchInEdge -= edgeSymbolCount;
                    searchIndex += edgeSymbolCount;
                    long edgeChildCount = edge.Children.Length;
                    querySymbol = convertedSearchSeq[searchIndex];
                    for (int edgeChildIndex = 0; edgeChildIndex < edgeChildCount; edgeChildIndex++)
                    {
                        if (this.referenceSequence[edge.Children[edgeChildIndex].StartIndex] == querySymbol)
                        {
                            // get the child of edge and continue searching.
                            previousIntermediateEdge = edge;
                            edgeStartIndex = edge.Children[edgeChildIndex].StartIndex;
                            if (lengthOfMatchFound - lengthOfMatchInEdge >= minLengthOfMatch)
                            {
                                // The path up to this intermediate edge is already long enough to report.
                                edgesFound.Push(
                                    new EdgesFound
                                    {
                                        Edge = previousIntermediateEdge,
                                        LengthOfMatch = lengthOfMatchFound - lengthOfMatchInEdge
                                    });
                                childStartIndexToSkip = edgeStartIndex;
                            }

                            edge = edge.Children[edgeChildIndex];
                            break;
                        }
                    }
                }
                else
                {
                    // The already-matched part ends inside this edge.
                    break;
                }
            }
        }

        bool continueSearch = true;
        if (lengthOfMatchInEdge > 0)
        {
            // no need to continue with search as search is ended inside the edge.
            continueSearch = false;
            if (lengthOfMatchFound >= minLengthOfMatch)
            {
                // Set -1 so that it won't match with start index of any child edge.
                edgesFound.Push(new EdgesFound { Edge = edge, LengthOfMatch = lengthOfMatchFound });
                childStartIndexToSkip = -1;
            }
        }

        if (queryIndex + lengthOfMatchFound >= querySequenceLength)
        {
            // no need to continue with the search as the entire query sequence is
            // searched and rest of the result can be found using suffix links.
            continueSearch = false;
        }

        // Extend the match symbol by symbol for as long as the tree allows.
        while (continueSearch)
        {
            // 0 is used as the query symbol once the end of the query is reached.
            querySymbol = 0;
            if (searchIndex < querySequenceLength)
            {
                querySymbol = convertedSearchSeq[searchIndex];
            }

            // Look for the child edge whose first symbol equals the current query symbol.
            int edgeIndex = -1;
            childCount = previousIntermediateEdge.Children.Length;
            for (int childIndex = 0; childIndex < childCount; childIndex++)
            {
                edge = previousIntermediateEdge.Children[childIndex];
                edgeStartIndex = edge.StartIndex;
                refSymbol = TerminatingSymbol;
                if (edgeStartIndex < this.symbolsCount)
                {
                    refSymbol = this.referenceSequence[edgeStartIndex];
                }

                if (refSymbol == querySymbol)
                {
                    edgeIndex = childIndex;
                    break;
                }
            }

            if (edgeIndex == -1)
            {
                // No child continues the match: the search ends at the intermediate edge.
                lengthOfMatchInEdge = 0;
                continueSearch = false;
                if (lengthOfMatchFound >= minLengthOfMatch)
                {
                    // Set -1 so that it won't match with start index of any child edge.
                    edgesFound.Push(
                        new EdgesFound
                        {
                            Edge = previousIntermediateEdge,
                            LengthOfMatch = lengthOfMatchFound
                        });
                    childStartIndexToSkip = -1;
                }
            }
            else
            {
                if (lengthOfMatchFound >= minLengthOfMatch)
                {
                    // Record the intermediate edge before descending; the child taken is remembered
                    // so it can be skipped when the intermediate edge is reported.
                    edgesFound.Push(
                        new EdgesFound
                        {
                            Edge = previousIntermediateEdge,
                            LengthOfMatch = lengthOfMatchFound
                        });
                    childStartIndexToSkip = edge.StartIndex;
                }

                // The first symbol of the edge has been matched by the lookup above.
                searchIndex++;
                lengthOfMatchFound++;
                lengthOfMatchInEdge = 1;

                // Get the endIndex of the edge found.
                long edgeEndIndex = this.symbolsCount;
                if (!edge.IsLeaf)
                {
                    // return the minimum start index of children -1
                    edgeEndIndex = edge.Children[0].StartIndex - 1;
                }

                long edgeLength = edgeEndIndex - edgeStartIndex + 1;

                for (long referenceIndex = edgeStartIndex + 1; referenceIndex <= edgeEndIndex; referenceIndex++)
                {
                    refSymbol = TerminatingSymbol;
                    if (referenceIndex < this.symbolsCount)
                    {
                        refSymbol = this.referenceSequence[referenceIndex];
                    }

                    querySymbol = 0;
                    if (searchIndex < querySequenceLength)
                    {
                        querySymbol = convertedSearchSeq[searchIndex];
                    }

                    // Stop searching if any one of the following conditions is true.
                    // 1. Reached end of the query sequence
                    // 2. Reached end of the leaf edge.
                    // 3. Symbols are not matching
                    if (refSymbol != querySymbol)
                    {
                        break;
                    }

                    searchIndex++;
                    lengthOfMatchFound++;
                    lengthOfMatchInEdge++;
                }

                // Can't continue with search if the following conditions met thus add the edge to the stack.
                // 1. Edge is a leaf edge regardless of where the search ended.
                // 2. Edge is an intermediate edge and search ended inside the edge.
                // 3. searchIndex is equal to the length of the search sequence. (as we increment the searchIndex in advance).
                if (edge.IsLeaf || lengthOfMatchInEdge < edgeLength || searchIndex == querySequenceLength)
                {
                    if (lengthOfMatchFound >= minLengthOfMatch)
                    {
                        // Set -1 so that it won't match with start index of any child edge.
                        edgesFound.Push(new EdgesFound { Edge = edge, LengthOfMatch = lengthOfMatchFound });
                        childStartIndexToSkip = -1;
                    }

                    // go to the next queryIndex
                    continueSearch = false;
                }
                else
                {
                    // if the edge is completely searched and edge is an intermediate edge then continue with the search.
                    previousIntermediateEdge = edge;
                }
            }
        }

        // first edge in the stack will be the search ended edge, so process it separately.
        if (edgesFound.Count > 0)
        {
            EdgesFound itemToDisplay = edgesFound.Pop();
            edge = itemToDisplay.Edge;
            long matchLength = itemToDisplay.LengthOfMatch;
            long refIndex;
            if (edge.IsLeaf)
            {
                // A leaf yields a single candidate reference position.
                refIndex = edge.StartIndex + lengthOfMatchInEdge - matchLength;
                if (ValidateMatch(queryIndex, refIndex, matchLength, overlappingMatches, out match))
                {
                    yield return match;
                }
            }
            else
            {
                childCount = edge.Children.Length;
                long edgeLength = edge.Children[0].StartIndex - edge.StartIndex;
                var startIndexes = new List<long>();

                // suffixLink.Children == edge.Children - reference check to identify the edge having suffix link pointing to its parent.
                if (suffixLinkPointsToParentEdge && childEdgePointToParent.Children == edge.Children)
                {
                    startIndexes.Add(edge.StartIndex);
                }
                else
                {
                    // Collect start indexes from every child except the one recorded in childStartIndexToSkip.
                    for (int childIndex = 0; childIndex < childCount; childIndex++)
                    {
                        if (edge.Children[childIndex].StartIndex == childStartIndexToSkip)
                        {
                            continue;
                        }

                        DepthFirstIterativeTraversal(edge.Children[childIndex], edgeLength, startIndexes);
                    }

                    startIndexes.Sort();
                }

                int listCount = startIndexes.Count;
                for (int matchIndex = 0; matchIndex < listCount; matchIndex++)
                {
                    long startIndex = startIndexes[matchIndex];

                    // When the search did not end inside the edge, the whole edge length is used.
                    long edgeLengthToAdd = lengthOfMatchInEdge == 0 ? edgeLength : lengthOfMatchInEdge;
                    refIndex = startIndex + edgeLengthToAdd - matchLength;
                    if (ValidateMatch(queryIndex, refIndex, matchLength, overlappingMatches, out match))
                    {
                        yield return match;
                    }
                }

                startIndexes.Clear();
            }

            // edgesFoundForNextQueryIndex is used for temporary storage and to maintain the order when it pushed to edgesFound stack.
            var edgesFoundForNextQueryIndex = new Stack<EdgesFound>();
            EdgesFound previousItemToDisplay = itemToDisplay;

            // return the output and add the output the list to ignore the outputs that are not required.
            while (edgesFound.Count > 0)
            {
                itemToDisplay = edgesFound.Pop();
                edge = itemToDisplay.Edge;
                matchLength = itemToDisplay.LengthOfMatch;

                if (!edge.IsLeaf
                    && !previousItemToDisplay.Edge.IsLeaf
                    && previousItemToDisplay.Edge.StartIndex != previousItemToDisplay.Edge.SuffixLink[0].StartIndex)
                {
                    // Entries returned by GetIntermediateEdges (called with queryIndex + 1) are queued
                    // for the next query index.
                    Stack<EdgesFound> tempStack = this.GetIntermediateEdges(
                        edge,
                        previousItemToDisplay.Edge,
                        matchLength,
                        previousItemToDisplay.LengthOfMatch - matchLength,
                        queryIndex + 1,
                        convertedSearchSeq,
                        minLengthOfMatch);

                    if (tempStack.Count > 0)
                    {
                        while (tempStack.Count > 0)
                        {
                            edgesFoundForNextQueryIndex.Push(tempStack.Pop());
                        }
                    }
                }

                childCount = edge.Children.Length;
                long edgeLength = edge.Children[0].StartIndex - edge.StartIndex;
                var startIndexes = new List<long>();
                HashSet<long> overlappingStartIndexes = itemToDisplay.StartIndexesFromPreviousMatchPathEdge;

                // suffixLink.Children == edge.Children - reference check to identify the edge having suffix link pointing to its parent.
                if (suffixLinkPointsToParentEdge && childEdgePointToParent.Children == edge.Children)
                {
                    startIndexes.Add(edge.StartIndex);
                }
                else
                {
                    for (int childIndex = 0; childIndex < childCount; childIndex++)
                    {
                        // Earlier variant of the check below, kept for reference:
                        // if (edge.Children[childIndex].StartIndex == itemToDisplay.ChildStartIndexToSkip)
                        if (edge.Children[childIndex].StartIndex == previousItemToDisplay.Edge.StartIndex)
                        {
                            continue;
                        }

                        DepthFirstIterativeTraversal(edge.Children[childIndex], edgeLength, startIndexes);
                    }

                    // Drop start indexes that were recorded on this entry as already covered.
                    if (overlappingStartIndexes != null)
                    {
                        for (int index = startIndexes.Count - 1; index >= 0; index--)
                        {
                            if (overlappingStartIndexes.Contains(startIndexes[index]))
                            {
                                startIndexes.RemoveAt(index);
                            }
                        }
                    }

                    startIndexes.Sort();
                }

                if (matchLength - 1 >= minLengthOfMatch)
                {
                    // One symbol shorter is still long enough: queue the suffix link of this edge
                    // for the next query index.
                    var newEdgeFound = new EdgesFound { Edge = edge.SuffixLink[0], LengthOfMatch = matchLength - 1 };
                    HashSet<long> overlappingStartIndexesForNextQueryIndex = null;
                    if (edge.StartIndex == edge.SuffixLink[0].StartIndex)
                    {
                        // Carry the start indexes handled here (plus any inherited ones) over to the queued entry.
                        overlappingStartIndexesForNextQueryIndex = new HashSet<long>();
                        if (overlappingStartIndexes != null)
                        {
                            foreach (long startIndex in overlappingStartIndexes)
                            {
                                overlappingStartIndexesForNextQueryIndex.Add(startIndex);
                            }
                        }

                        for (int index = startIndexes.Count - 1; index >= 0; index--)
                        {
                            overlappingStartIndexesForNextQueryIndex.Add(startIndexes[index]);
                        }
                    }

                    newEdgeFound.StartIndexesFromPreviousMatchPathEdge = overlappingStartIndexesForNextQueryIndex;

                    // get the suffix link for the edge and add them to the tempstack.
                    edgesFoundForNextQueryIndex.Push(newEdgeFound);
                }

                int listCount = startIndexes.Count;
                for (int matchIndex = 0; matchIndex < listCount; matchIndex++)
                {
                    long startIndex = startIndexes[matchIndex];
                    refIndex = startIndex + edgeLength - matchLength;
                    if (ValidateMatch(queryIndex, refIndex, matchLength, overlappingMatches, out match))
                    {
                        yield return match;
                    }
                }

                startIndexes.Clear();
                previousItemToDisplay = itemToDisplay;
            }

            // matchLength here is the length of the last entry processed above.
            if (matchLength > minLengthOfMatch && !suffixLinkPointsToParentEdge)
            {
                Stack<EdgesFound> tempStack = this.GetIntermediateEdges(
                    this.rootEdge,
                    previousItemToDisplay.Edge,
                    1,
                    previousItemToDisplay.LengthOfMatch,
                    queryIndex + 1,
                    convertedSearchSeq,
                    minLengthOfMatch);

                while (tempStack.Count > 0)
                {
                    edgesFoundForNextQueryIndex.Push(tempStack.Pop());
                }
            }

            // push the items in temp stack to the edgesFound stack
            while (edgesFoundForNextQueryIndex.Count > 0)
            {
                edgesFound.Push(edgesFoundForNextQueryIndex.Pop());
            }
        }
    }
}
/// <summary>
/// Checks a candidate match against the matches reported earlier. The candidate is rejected
/// when an earlier match on the same diagonal (query index minus reference index) starts at
/// or before it and ends at or after it in both sequences. An accepted candidate is
/// recorded in <paramref name="previousMatches"/> and returned through <paramref name="match"/>.
/// </summary>
/// <param name="queryIndex">Query index</param>
/// <param name="referenceIndex">Reference index</param>
/// <param name="matchLength">Match length</param>
/// <param name="previousMatches">
/// Previous matches, keyed by query end index, then diagonal, then reference end index,
/// holding the reference start indexes. Updated (and pruned) by this method.
/// </param>
/// <param name="match">New match when accepted; a default Match otherwise.</param>
/// <returns>Returns true if the new match is not an exact sub match with any of the previous matches, else returns false</returns>
private static bool ValidateMatch(
    long queryIndex,
    long referenceIndex,
    long matchLength,
    SortedList<long, Dictionary<long, SortedList<long, SortedSet<long>>>> previousMatches,
    out Match match)
{
    long queryEnd = queryIndex + matchLength;
    long referenceEnd = referenceIndex + matchLength;
    long diagonal = queryIndex - referenceIndex;

    // When even the furthest-reaching earlier match ends before this query index,
    // nothing recorded so far can cover the candidate any more.
    if (previousMatches.Count > 0 && previousMatches.Keys[previousMatches.Count - 1] < queryIndex)
    {
        previousMatches.Clear();
    }

    // Walk the earlier matches from the largest query end index downwards.
    bool isCovered = false;
    for (int keyPosition = previousMatches.Count - 1; keyPosition >= 0 && !isCovered; keyPosition--)
    {
        long earlierQueryEnd = previousMatches.Keys[keyPosition];
        if (earlierQueryEnd < queryEnd)
        {
            // Keys are sorted, so no remaining entry can reach the end of the candidate.
            // An entry that ends before the candidate even starts is pruned on the way out.
            if (earlierQueryEnd < queryIndex)
            {
                previousMatches.Remove(earlierQueryEnd);
            }

            break;
        }

        SortedList<long, SortedSet<long>> startsByReferenceEnd;
        if (!previousMatches[earlierQueryEnd].TryGetValue(diagonal, out startsByReferenceEnd))
        {
            continue;
        }

        for (int endPosition = startsByReferenceEnd.Count - 1; endPosition >= 0; endPosition--)
        {
            long earlierReferenceEnd = startsByReferenceEnd.Keys[endPosition];
            if (earlierReferenceEnd < referenceEnd)
            {
                continue;
            }

            if (startsByReferenceEnd[earlierReferenceEnd].Any(earlierStart => earlierStart <= referenceIndex))
            {
                isCovered = true;
                break;
            }
        }
    }

    match = new Match();
    if (isCovered)
    {
        return false;
    }

    match.ReferenceSequenceOffset = referenceIndex;
    match.QuerySequenceOffset = queryIndex;
    match.Length = matchLength;

    // Record the accepted match, creating each missing level of the nested map on the way down.
    Dictionary<long, SortedList<long, SortedSet<long>>> byDiagonal;
    if (!previousMatches.TryGetValue(queryEnd, out byDiagonal))
    {
        byDiagonal = new Dictionary<long, SortedList<long, SortedSet<long>>>();
        previousMatches.Add(queryEnd, byDiagonal);
    }

    SortedList<long, SortedSet<long>> byReferenceEnd;
    if (!byDiagonal.TryGetValue(diagonal, out byReferenceEnd))
    {
        byReferenceEnd = new SortedList<long, SortedSet<long>>();
        byDiagonal.Add(diagonal, byReferenceEnd);
    }

    SortedSet<long> referenceStarts;
    if (!byReferenceEnd.TryGetValue(referenceEnd, out referenceStarts))
    {
        referenceStarts = new SortedSet<long>();
        byReferenceEnd.Add(referenceEnd, referenceStarts);
    }

    referenceStarts.Add(referenceIndex);
    return true;
}
/// <summary>
/// Gets the matches unique in reference sequence where length is greater than or equal to the MinLengthOfMatch.
/// </summary>
/// <remarks>
/// This method is an iterator (it uses yield return), so the validation below and the
/// search itself only run once the returned sequence is enumerated.
/// A match is only yielded when the search ends on a leaf edge.
/// </remarks>
/// <param name="searchSequence">Sequence to search.</param>
/// <returns>Returns IEnumerable of matches.</returns>
/// <exception cref="ArgumentOutOfRangeException">MinLengthOfMatch is less than or equal to zero.</exception>
/// <exception cref="ArgumentException">
/// The search sequence is not a Sequence instance, or its base alphabet differs from the supported base alphabet.
/// </exception>
public IEnumerable<Match> SearchMatchesUniqueInReference(ISequence searchSequence)
{
    long minLengthOfMatch = this.MinLengthOfMatch;
    bool noambiguity = this.NoAmbiguity;
    long queryIndex = 0;
    long querySequenceLength = searchSequence.Count;

    // Query start and length of the most recently yielded match; used to suppress a later
    // match that does not extend beyond the end of that one.
    long lastMatchQueryStart = 0;
    long lastMatchLength = 0;
    long lengthOfMatchFound = 0;
    var match = new Match();

    // Get base alphabet of the searchSequence.
    IAlphabet searchSeqBaseAlphabet = searchSequence.Alphabet;
    IAlphabet alphabet;

    if (minLengthOfMatch <= 0)
    {
        // NOTE(review): the single-string constructor of ArgumentOutOfRangeException takes the parameter
        // name, so the resource text ends up as ParamName rather than as the message - confirm intent.
        throw new ArgumentOutOfRangeException(Resource.MinLengthMustBeGreaterThanZero);
    }

    if (!(searchSequence is Sequence))
    {
        throw new ArgumentException(Resource.OnlySequenceClassSupported);
    }

    // Follow the alphabet-to-base-alphabet map until an alphabet without an entry is reached.
    while (Alphabets.AlphabetToBaseAlphabetMap.TryGetValue(searchSeqBaseAlphabet, out alphabet))
    {
        searchSeqBaseAlphabet = alphabet;
    }

    // If base alphabets are not same then throw the exception.
    if (searchSeqBaseAlphabet != this.supportedBaseAlphabet)
    {
        throw new ArgumentException(Resource.AlphabetMisMatch);
    }

    ISequence convertedSearchSeq = ProcessQuerySequence(searchSequence, noambiguity);

    // State carried from one query index to the next.
    long lengthOfMatchInEdge = 0;
    long edgeStartIndex = 0;
    MultiWaySuffixEdge edge = this.rootEdge;
    MultiWaySuffixEdge previousIntermediateEdge = this.rootEdge;

    // Query indexes beyond this bound cannot start a match of the minimum length.
    for (queryIndex = 0; queryIndex <= querySequenceLength - minLengthOfMatch; queryIndex++)
    {
        // StartIndex == -1 is the check used here for the root edge.
        if (previousIntermediateEdge.StartIndex == -1 && lengthOfMatchInEdge > 0)
        {
            lengthOfMatchInEdge--;
        }

        // As suffix link always point to another intermediate edge.
        // Note: suffix link for the root is root itself.
        previousIntermediateEdge = previousIntermediateEdge.SuffixLink[0];
        int childCount = previousIntermediateEdge.Children.Length;
        lengthOfMatchFound--;

        if (lengthOfMatchFound < 0)
        {
            lengthOfMatchFound = 0;
        }

        long searchIndex = queryIndex + lengthOfMatchFound - lengthOfMatchInEdge;

        // if lengthOfMatchInEdge is greater than zero then instead of searching from the query index
        // try to jump to the edge starting at lengthOfMatchFound - lengthOfMatchInEdge distance from the root.
        // As previousIntermediateEdge is lengthOfMatchFound distance from the root find an edge in the path of
        // match such that lengthOfMatchInEdge will end inside that edge.
        byte refSymbol, querySymbol;
        if (lengthOfMatchInEdge > 0)
        {
            querySymbol = convertedSearchSeq[searchIndex];
            for (int index = 0; index < childCount; index++)
            {
                edge = previousIntermediateEdge.Children[index];
                edgeStartIndex = edge.StartIndex;

                // Start indexes at or beyond symbolsCount stand for the terminating symbol.
                refSymbol = TerminatingSymbol;
                if (edgeStartIndex < this.symbolsCount)
                {
                    refSymbol = this.referenceSequence[edgeStartIndex];
                }

                if (refSymbol == querySymbol)
                {
                    break;
                }
            }

            // When lengthOfMatchInEdge > 0 there will be an edge from the previousIntermediateEdge in the path of match.
            while (!edge.IsLeaf)
            {
                long edgeEndIndex = edge.Children[0].StartIndex - 1;

                // Number of symbols on the edge.
                long edgeSymbolCount = edgeEndIndex - edgeStartIndex + 1;
                if (lengthOfMatchInEdge == edgeSymbolCount)
                {
                    // The already-matched part ends exactly at the end of this edge.
                    previousIntermediateEdge = edge;
                    searchIndex += lengthOfMatchInEdge;
                    lengthOfMatchInEdge = 0;
                    break;
                }

                if (lengthOfMatchInEdge > edgeSymbolCount)
                {
                    // The already-matched part extends past this edge: descend into the matching child.
                    lengthOfMatchInEdge -= edgeSymbolCount;
                    searchIndex += edgeSymbolCount;
                    long edgeChildCount = edge.Children.Length;
                    querySymbol = convertedSearchSeq[searchIndex];
                    for (int edgeChildIndex = 0; edgeChildIndex < edgeChildCount; edgeChildIndex++)
                    {
                        if (this.referenceSequence[edge.Children[edgeChildIndex].StartIndex] == querySymbol)
                        {
                            // get the child of edge and continue searching.
                            previousIntermediateEdge = edge;
                            edge = edge.Children[edgeChildIndex];
                            edgeStartIndex = edge.StartIndex;
                            break;
                        }
                    }
                }
                else
                {
                    // The already-matched part ends inside this edge.
                    break;
                }
            }

            if (lengthOfMatchInEdge > 0)
            {
                // lengthOfMatchInEdge > 0 means search is not ending in an intermediate edge or at the endIndex of an edge,
                // so no need to continue with the search as there will be mismatch.
                continue;
            }
        }

        bool continueSearch = true;

        // start searching for the match by comparing the symbols.
        while (continueSearch)
        {
            // 0 is used as the query symbol once the end of the query is reached.
            querySymbol = 0;
            if (searchIndex < querySequenceLength)
            {
                querySymbol = convertedSearchSeq[searchIndex];
            }

            // Look for the child edge whose first symbol equals the current query symbol.
            int edgeIndex = -1;
            childCount = previousIntermediateEdge.Children.Length;
            for (int childIndex = 0; childIndex < childCount; childIndex++)
            {
                edge = previousIntermediateEdge.Children[childIndex];
                edgeStartIndex = edge.StartIndex;
                refSymbol = TerminatingSymbol;
                if (edgeStartIndex < this.symbolsCount)
                {
                    refSymbol = this.referenceSequence[edgeStartIndex];
                }

                if (refSymbol == querySymbol)
                {
                    // The first symbol of the edge counts as matched.
                    searchIndex++;
                    edgeIndex = childIndex;
                    lengthOfMatchFound++;
                    lengthOfMatchInEdge = 1;
                    break;
                }
            }

            // if edge not found.
            if (edgeIndex == -1)
            {
                // Since the previous edge is an intermediate edge the match is repeated in the reference sequence.
                // Thus even though the match length is greater than or equal to the MinLengthOfMatch don't consider the match.
                // Go to the next query index by following the suffix link of the previous intermediate edge.
                // This will reduce time required for searching from the root. In this case lengthOfMatchFound will be deducted by 1.
                break;
            }

            // Get the endIndex of the edge found.
            long edgeEndIndex = this.symbolsCount;
            if (!edge.IsLeaf)
            {
                // return the minimum start index of children -1
                edgeEndIndex = edge.Children[0].StartIndex - 1;
            }

            for (long referenceIndex = edgeStartIndex + 1; referenceIndex <= edgeEndIndex; referenceIndex++)
            {
                refSymbol = TerminatingSymbol;
                if (referenceIndex < this.symbolsCount)
                {
                    refSymbol = this.referenceSequence[referenceIndex];
                }

                querySymbol = 0;
                if (searchIndex < querySequenceLength)
                {
                    querySymbol = convertedSearchSeq[searchIndex];
                }

                // Stop searching if any one of the following conditions is true.
                // 1. Reached end of the query sequence
                // 2. Reached end of the leaf edge.
                // 3. Symbols are not matching
                if (refSymbol != querySymbol)
                {
                    break;
                }

                searchIndex++;
                lengthOfMatchFound++;
                lengthOfMatchInEdge++;
            }

            // if it is a leaf node
            if (edge.IsLeaf)
            {
                // if the match length is greater than or equal to the minLengthOfMatch then yield the match.
                // The second condition skips a match that does not reach past the end of the last yielded match.
                if (lengthOfMatchFound >= minLengthOfMatch
                    && queryIndex + lengthOfMatchFound > lastMatchQueryStart + lastMatchLength)
                {
                    match = new Match
                    {
                        ReferenceSequenceOffset = edgeStartIndex + lengthOfMatchInEdge - lengthOfMatchFound,
                        QuerySequenceOffset = queryIndex,
                        Length = lengthOfMatchFound
                    };

                    yield return match;

                    if (searchIndex >= querySequenceLength - 1)
                    {
                        // reached the end of the query sequence, no further search needed.
                        // Setting queryIndex to the query length also ends the outer loop.
                        continueSearch = false;
                        queryIndex = querySequenceLength;
                        break;
                    }

                    lastMatchLength = lengthOfMatchFound;
                    lastMatchQueryStart = queryIndex;
                }

                // go to the next queryIndex
                continueSearch = false;
            }
            else
            {
                // if the search is ended
                // if the edge is an intermediate node then ignore the match and go to the next queryIndex.
                if (lengthOfMatchInEdge < (edgeEndIndex - edgeStartIndex + 1))
                {
                    continueSearch = false;
                }
                else
                {
                    // if the edge is completely searched, then continue with the search.
                    lengthOfMatchInEdge = 0;
                    previousIntermediateEdge = edge;
                }
            }
        }
    }
}
/// <summary>
/// Parses MUMs from the input file.
/// </summary>
/// <remarks>
/// Lines that begin with '&gt;', or whose first token is '&gt;', are skipped,
/// as are blank lines. Every other line must begin with three space-separated
/// integers, read in this order: reference offset, query offset, length.
/// </remarks>
/// <param name="filename">MUM file name.</param>
/// <returns>List of MUMs.</returns>
/// <exception cref="FileFormatException">
/// Thrown for any failure while opening, reading or parsing the file.
/// </exception>
private static IList<Match> ParseMums(string filename)
{
    // TODO: Parse files with multiple query sequences
    IList<Match> mumList = new List<Match>();

    try
    {
        using (TextReader tr = File.OpenText(filename))
        {
            string line;
            while ((line = tr.ReadLine()) != null)
            {
                // Ordinal comparison: '>' is a format marker, not linguistic text.
                if (line.StartsWith(">", StringComparison.Ordinal))
                {
                    continue;
                }

                string[] items = line.Trim().Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                // Blank lines carry no data; previously they failed the whole file
                // by indexing into an empty array.
                if (items.Length == 0 || items[0] == ">")
                {
                    continue;
                }

                if (items.Length < 3)
                {
                    // Surfaced to the caller through the catch block below.
                    throw new FileFormatException(Resources.FileNotInProperFormat);
                }

                // The file is machine-readable, so parse with the invariant culture
                // rather than whatever culture the current thread happens to use.
                System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
                Match mum2 = new Match
                {
                    ReferenceSequenceOffset = Convert.ToInt32(items[0], invariant),
                    QuerySequenceOffset = Convert.ToInt32(items[1], invariant),
                    Length = Convert.ToInt32(items[2], invariant)
                };

                mumList.Add(mum2);
            }
        }

        return mumList;
    }
    catch
    {
        // Deliberately broad: callers receive FileFormatException for every
        // failure mode (I/O as well as malformed content), as before.
        throw new FileFormatException(Resources.FileNotInProperFormat);
    }
}
/// <summary>
/// Align the Gap by executing pairwise alignment.
/// </summary>
/// <remarks>
/// The gap is the region between the end of <paramref name="mum1"/> and the start of
/// <paramref name="mum2"/>. A MUM whose Length is 0 is treated as absent: an absent
/// mum1 makes the gap start at offset 0 in both sequences, and an absent mum2 makes
/// the gap run to the end of both sequences. After the gap is handled, the symbols
/// covered by mum2 (if any) are appended to the results and scored.
/// </remarks>
/// <param name="referenceSequence">Reference sequence.</param>
/// <param name="querySequence">Query Sequence.</param>
/// <param name="sequenceResult1">Editable sequence containing alignment first result.</param>
/// <param name="sequenceResult2">Editable sequence containing alignment second result.</param>
/// <param name="consensusResult">Editable sequence containing consensus sequence.</param>
/// <param name="mum1">First MUM of Gap.</param>
/// <param name="mum2">Second MUM of Gap.</param>
/// <param name="insertions">
/// Insertions made to the aligned sequences, as a two element list: element 0 counts
/// gap symbols added to <paramref name="sequenceResult1"/>, element 1 those added to
/// <paramref name="sequenceResult2"/>.
/// </param>
/// <returns>Score of alignment.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when the pairwise alignment result does not carry a two element
/// "Insertions" metadata entry.
/// </exception>
private long AlignGap(
    ISequence referenceSequence,
    ISequence querySequence,
    List<byte> sequenceResult1,
    List<byte> sequenceResult2,
    List<byte> consensusResult,
    Match mum1,
    Match mum2,
    out List<long> insertions)
{
    long score = 0;
    insertions = new List<long>(2) { 0, 0 };

    long mum1ReferenceStartIndex = 0;
    long mum1QueryStartIndex = 0;
    long mum1Length = 0;
    if (mum1.Length != 0)
    {
        mum1ReferenceStartIndex = mum1.ReferenceSequenceOffset;
        mum1QueryStartIndex = mum1.QuerySequenceOffset;
        mum1Length = mum1.Length;
    }

    long mum2ReferenceStartIndex;
    long mum2QueryStartIndex;
    long mum2Length = 0;
    if (mum2.Length != 0)
    {
        mum2ReferenceStartIndex = mum2.ReferenceSequenceOffset;
        mum2QueryStartIndex = mum2.QuerySequenceOffset;
        mum2Length = mum2.Length;
    }
    else
    {
        // No closing MUM: the gap extends to the end of both sequences.
        mum2ReferenceStartIndex = referenceSequence.Count;
        mum2QueryStartIndex = querySequence.Count;
    }

    long referenceGapStartIndex = mum1ReferenceStartIndex + mum1Length;
    long queryGapStartIndex = mum1QueryStartIndex + mum1Length;
    bool referenceHasGapData = mum2ReferenceStartIndex > referenceGapStartIndex;
    bool queryHasGapData = mum2QueryStartIndex > queryGapStartIndex;

    /* Stitch the exact matches together according to whether both sequences have
     * data in the gap (in which case use a global alignment) or only one does
     * (in which case just insert gaps).
     */
    if (referenceHasGapData && queryHasGapData)
    {
        // Both sequences have data in the gap: get the sequences in between.
        ISequence referenceGap = referenceSequence.GetSubSequence(
            referenceGapStartIndex,
            mum2ReferenceStartIndex - referenceGapStartIndex);
        ISequence queryGap = querySequence.GetSubSequence(
            queryGapStartIndex,
            mum2QueryStartIndex - queryGapStartIndex);

        // Do a pairwise alignment (must be Needleman-Wunsch).
        var alignment = this.RunPairWiseReturnJustAlignment(referenceGap, queryGap);
        sequenceResult1.AddRange(alignment.FirstSequence);
        sequenceResult2.AddRange(alignment.SecondSequence);
        consensusResult.AddRange(alignment.Consensus);
        score += alignment.Score;

        if (!alignment.Metadata.ContainsKey("Insertions"))
        {
            // Should never happen - can remove later.
            throw new InvalidOperationException("NeedlemanWunsch alignment did not have an insertion entry");
        }

        List<long> gapinsertions = alignment.Metadata["Insertions"] as List<long>;
        if (gapinsertions == null || gapinsertions.Count != 2)
        {
            // Should never happen - can remove later.
            throw new InvalidOperationException("Alignment Insertions were not available as a size 2 list");
        }

        insertions[0] += gapinsertions[0];
        insertions[1] += gapinsertions[1];
    }
    else if (referenceHasGapData)
    {
        // Only the reference has data, insert gaps for the query.
        ISequence referenceGap = referenceSequence.GetSubSequence(
            referenceGapStartIndex,
            mum2ReferenceStartIndex - referenceGapStartIndex);

        sequenceResult1.AddRange(referenceGap);
        sequenceResult2.AddRange(CreateDefaultGap(referenceGap.Count));
        consensusResult.AddRange(referenceGap);

        insertions[1] += referenceGap.Count;
        score += this.GetUnalignedGapScore(referenceGap.Count);
    }
    else if (queryHasGapData)
    {
        // Only the query has data, insert gaps for the reference.
        ISequence queryGap = querySequence.GetSubSequence(
            queryGapStartIndex,
            mum2QueryStartIndex - queryGapStartIndex);

        sequenceResult1.AddRange(CreateDefaultGap(queryGap.Count));
        sequenceResult2.AddRange(queryGap);
        consensusResult.AddRange(queryGap);

        insertions[0] += queryGap.Count;
        score += this.GetUnalignedGapScore(queryGap.Count);
    }

    // Add the MUM to the result.
    if (0 < mum2Length)
    {
        byte[] referenceMumSymbols = referenceSequence.GetSubSequence(
            mum2ReferenceStartIndex,
            mum2Length).ToArray();
        sequenceResult1.AddRange(referenceMumSymbols);

        byte[] queryMumSymbols = querySequence.GetSubSequence(
            mum2QueryStartIndex,
            mum2Length).ToArray();
        sequenceResult2.AddRange(queryMumSymbols);

        // The consensus for the MUM region is taken from the reference symbols.
        consensusResult.AddRange(referenceMumSymbols);

        // Each MUM position is scored as a symbol matched against itself.
        foreach (byte symbol in referenceMumSymbols)
        {
            score += SimilarityMatrix[symbol, symbol];
        }
    }

    return score;
}

/// <summary>
/// Scores a run of gap symbols that is inserted without a pairwise alignment.
/// </summary>
/// <param name="gapLength">Number of gap symbols in the run.</param>
/// <returns>
/// GapOpenCost + (gapLength - 1) * GapExtensionCost when UseGapExtensionCost is set;
/// otherwise gapLength * GapOpenCost.
/// </returns>
private long GetUnalignedGapScore(long gapLength)
{
    if (this.UseGapExtensionCost)
    {
        return this.GapOpenCost + ((gapLength - 1) * this.GapExtensionCost);
    }

    return gapLength * this.GapOpenCost;
}
/// <summary>
/// Checks the ToString() output of a Match and of a MatchExtension built from it.
/// </summary>
public void TestMatchAndMatchExtensionToString()
{
    Match match = new Match();
    match.Length = 20;
    match.QuerySequenceOffset = 33;

    MatchExtension matchExtn = new MatchExtension(match);
    matchExtn.ID = 1;
    matchExtn.Length = 20;

    string actualMatchExtnString = matchExtn.ToString();
    string actualMatchString = match.ToString();

    string expectedMatchExtnString = "RefStart=0 QueryStart=33 Length=20 Score=0 WrapScore=0 IsGood=False";
    string expectedMatchString = "RefStart=0 QueryStart=33 Length=20";

    // Expected value goes first so that a failure message labels the values correctly.
    Assert.AreEqual(expectedMatchExtnString, actualMatchExtnString);
    Assert.AreEqual(expectedMatchString, actualMatchString);
}
/// <summary>
/// Checks the ToString() output of a Cluster holding two MatchExtension entries.
/// </summary>
public void TestClusterToString()
{
    Match match = new Match();

    MatchExtension matchExtn1 = new MatchExtension(match);
    matchExtn1.ID = 1;
    matchExtn1.Length = 20;

    MatchExtension matchExtn2 = new MatchExtension(match);
    matchExtn2.ID = 2;
    matchExtn2.Length = 30;

    IList<MatchExtension> extnList = new List<MatchExtension>();
    extnList.Add(matchExtn1);
    extnList.Add(matchExtn2);

    Cluster clust = new Cluster(extnList);
    string actualString = clust.ToString();

    // The literal uses "\r\n"; normalise it to the platform line terminator.
    string expectedString = "RefStart=0 QueryStart=0 Length=20 Score=0 WrapScore=0 IsGood=False\r\nRefStart=0 QueryStart=0 Length=30 Score=0 WrapScore=0 IsGood=False\r\n".Replace("\r\n", Environment.NewLine);

    // Expected value goes first so that a failure message labels the values correctly.
    Assert.AreEqual(expectedString, actualString);
}