/// <summary> /// The only difference from SearchSubString is that the returning result is a List of IntexIndex. /// </summary> /// <param name="pairList"></param> /// <returns></returns> private List <IndexItem> SearchSubString4WildcardSearch(List <PairInfo> pairList) { List <GCHandle> gchandlers = null; if (!InMemory) { gchandlers = new List <GCHandle>(); } List <IndexItem> Result = new List <IndexItem>(); int pairCount = pairList.Count; if (pairCount == 0) { return(Result); } IndexItem *[] IndexItemList = new IndexItem *[pairCount]; int[] DistanceList = new int[pairCount]; int[] ItemCountList = new int[pairCount]; int[] IteratorList = new int[pairCount]; // with default initialized value 0 int offset = pairList[0].pos; for (int i = 0; i < pairCount; i++) { if (InMemory) { IndexItemList[i] = ReadIndexItemListFromRAM(pairList[i].A, pairList[i].B, out ItemCountList[i]); } else { IndexItemList[i] = ReadIndexItemListFromDisk(pairList[i].A, pairList[i].B, out ItemCountList[i], ref gchandlers); } DistanceList[i] = pairList[i].pos - offset; offset = pairList[i].pos; if (ItemCountList[i] == 0) { if (InMemory) { ReleaseGCHandle(gchandlers); } return(null); } } #region K-way search /**************************************************************************************************** * Each element in the IndexItemList represents a bigram pair (e.g., ab or bc). * Each IndexItem is sorted by IndexItem.m_cellId and IndexItem.Offset * We perform k-way search starting from IndexItemList[0] * For a cellId at IndexItemList[0][iterator_0], * if we can successfully go through all the pairCount elements in IndexItemList, * then we found a match. Otherwise, we continue by checking the next elements in IndexItemList[0]. * * Successfully going through all the elements in IndexItemList means: * The distance between the current item offset in IndexItemList[i] and * the current item offset in IndexItemList[i-1] equals DistanceList[i]. * ***************************************************************************************************/ int iterator_0 = 0; // iterator for the first IndexItem list IndexItemList[0] long cellId = long.MinValue; offset = 0; do { IndexItem current_item = IndexItemList[0][iterator_0]; if (current_item.m_cellId >= cellId) { offset = current_item.Offset; int i = 1; for (; i < pairCount; i++) { IndexItem item = new IndexItem(); bool match = false; int j = IteratorList[i]; // skip the items whose CellIds are smaller than the current CellId for (; j < ItemCountList[i]; j++) { item = IndexItemList[i][j]; if (item.m_cellId >= current_item.m_cellId) { break; } } if (j == ItemCountList[i]) // We reach the end of one index item list for a bigram pair { if (InMemory) { ReleaseGCHandle(gchandlers); } return(Result); } IteratorList[i] = j; if (item.m_cellId > current_item.m_cellId) // did not find any item whose CellId equals the current CellId { cellId = item.m_cellId; iterator_0++; // jump to the first index item, and check the next index item break; } // _item.CellId == item.CellId Debug.Assert(item.m_cellId == current_item.m_cellId); do { item = IndexItemList[i][j]; int distance = item.Offset - offset; if (distance == DistanceList[i]) { offset = item.Offset; match = true; break; } else if (distance > DistanceList[i]) { break; } j++; } while (j < ItemCountList[i] && IndexItemList[i][j].m_cellId == current_item.m_cellId); if (!match) { iterator_0++; break; } } if (i == pairCount) { Result.Add(current_item); while (iterator_0 < ItemCountList[0] && current_item.m_cellId == IndexItemList[0][iterator_0].m_cellId) { iterator_0++; } } } else { iterator_0++; // skip current item if its CellId is smaller than the current CellId } } while (iterator_0 < ItemCountList[0]); #endregion if (InMemory) { ReleaseGCHandle(gchandlers); } return(Result); }
/// <summary> /// Performs a substring search using the specified keywords. The match pattern is: /// keywords[0]*keywords[1]..., where * is the wildcard symbol. /// </summary> /// <param name="keywords">A list of keywords.</param> /// <returns>A list of matched cell Ids.</returns> internal List <long> SubstringSearch(params string[] keywords) { if (keywords == null || keywords.Length == 0) { return(new List <long>(0)); } if (keywords.Length == 1) { return(SubstringSearch(keywords[0])); } List <long> Result = new List <long>(); List <List <IndexItem> > partialResults = new List <List <IndexItem> >(keywords.Length); for (int i = 0; i < keywords.Length; i++) { string query = keywords[i]; if (query.Trim().Length == 0) { return(new List <long>(0)); } string q = query.ToLowerInvariant(); byte[] bytes = Encoding.UTF8.GetBytes(q); if (bytes.Length < 2 || bytes.Length > 255) { return(new List <long>(0)); } List <PairInfo> pairList = SplitKeyword(bytes); if (pairList.Count == 0) { return(new List <long>(0)); } var _result = SearchSubString4WildcardSearch(pairList); if (_result == null || _result.Count == 0) { return(new List <long>(0)); } partialResults.Add(_result); } #region Another K-way search, similar to that of SearchSubString /**************************************************************************************************** * Each element in the partialResults represents the result list of a substring in the keywords list. * Each IndexItem list is sorted by IndexItem.m_cellId and IndexItem.Offset * We perform k-way search starting from the first indexitem of the first list * For a cellId at partialResults[0][iterator_0], * if we can successfully go through all the pairCount elements in IndexItemList, * then we found a match. Otherwise, we continue by checking the next elements in IndexItemList[0]. * * Successfully going through all the elements in IndexItemList means: * The distance between the current item offset in partialResults[i] and * the current item offset in partialResults[i-1] >= keywords[i-1].Length. * ***************************************************************************************************/ int keywordCount = keywords.Length; int[] IteratorList = new int[keywordCount]; // with default initialized value 0 int iterator_0 = 0; // iterator for the first IndexItem list long cellId = long.MinValue; int offset = 0; do { IndexItem current_item = partialResults[0][iterator_0]; if (current_item.m_cellId >= cellId) { offset = current_item.Offset; int i = 1; for (; i < keywordCount; i++) { IndexItem item = new IndexItem(); bool match = false; int j = IteratorList[i]; // skip the items whose CellIds are smaller than the current CellId for (; j < partialResults[i].Count; j++) { item = partialResults[i][j]; if (item.m_cellId >= current_item.m_cellId) { break; } } if (j == partialResults[i].Count) // We reach the end of the current index item list { return(Result); } IteratorList[i] = j; if (item.m_cellId > current_item.m_cellId) // did not find any item whose CellId equals the current CellId { cellId = item.m_cellId; iterator_0++; // jump to the first index item, and check the next index item break; } // _item.m_cellId == item.m_cellId Debug.Assert(item.m_cellId == current_item.m_cellId); do { item = partialResults[i][j]; int distance = item.Offset - offset; if (distance >= keywords[i - 1].Length) { offset = item.Offset; match = true; break; } j++; } while (j < partialResults[i].Count && partialResults[i][j].m_cellId == current_item.m_cellId); if (!match) { iterator_0++; break; } } if (i == keywordCount) { Result.Add(current_item.m_cellId); while (iterator_0 < partialResults[0].Count && current_item.m_cellId == partialResults[0][iterator_0].m_cellId) { iterator_0++; } } } else { iterator_0++; // skip current item if its m_cellId is smaller than the current m_cellId } } while (iterator_0 < partialResults[0].Count); #endregion return(Result); }