示例#1
0
        /// <summary>
        /// Searches the bigram index for the substring described by <paramref name="pairList"/>.
        /// The only difference from SearchSubString is that the returned result is a List of IndexItem.
        /// </summary>
        /// <param name="pairList">
        /// The bigram pairs of the keyword. Each entry supplies the pair (A, B) used to look up
        /// its index item list and the position (pos) of the pair inside the keyword.
        /// </param>
        /// <returns>
        /// An empty list when <paramref name="pairList"/> is empty or nothing matches;
        /// <c>null</c> when at least one pair has no index items at all;
        /// otherwise one IndexItem per matching cell (the item of the first pair at which the match starts).
        /// </returns>
        private List<IndexItem> SearchSubString4WildcardSearch(List<PairInfo> pairList)
        {
            List<IndexItem> result = new List<IndexItem>();
            int pairCount = pairList.Count;

            if (pairCount == 0)
            {
                return result;
            }

            // Handles are only collected on the on-disk path (ReadIndexItemListFromDisk fills the list);
            // the in-memory path leaves this null.
            List<GCHandle> gchandlers = InMemory ? null : new List<GCHandle>();
            try
            {
                IndexItem*[] indexItemList = new IndexItem*[pairCount];
                int[] distanceList  = new int[pairCount];   // expected offset delta between pair i-1 and pair i
                int[] itemCountList = new int[pairCount];   // number of items in indexItemList[i]
                int[] iteratorList  = new int[pairCount];   // resume position in indexItemList[i]
                int previousPos     = pairList[0].pos;

                for (int i = 0; i < pairCount; i++)
                {
                    if (InMemory)
                    {
                        indexItemList[i] = ReadIndexItemListFromRAM(pairList[i].A, pairList[i].B, out itemCountList[i]);
                    }
                    else
                    {
                        indexItemList[i] = ReadIndexItemListFromDisk(pairList[i].A, pairList[i].B, out itemCountList[i], ref gchandlers);
                    }
                    distanceList[i] = pairList[i].pos - previousPos;
                    previousPos     = pairList[i].pos;
                    if (itemCountList[i] == 0)
                    {
                        // A pair that never occurs in the index makes a match impossible.
                        // Callers distinguish this case by the null return value.
                        return null;
                    }
                }

                #region K-way search

                /****************************************************************************************************
                 * Each element in indexItemList represents a bigram pair (e.g., ab or bc).
                 * Each item list is sorted by IndexItem.CellId and IndexItem.Offset.
                 * We perform a k-way search starting from indexItemList[0].
                 * For a cell id at indexItemList[0][iterator0],
                 * if we can successfully go through all the pairCount lists in indexItemList,
                 * then we found a match. Otherwise, we continue with the next item in indexItemList[0].
                 *
                 * Successfully going through all the lists means:
                 * the distance between the current item offset in indexItemList[i] and
                 * the current item offset in indexItemList[i-1] equals distanceList[i].
                 ****************************************************************************************************/
                int iterator0 = 0;
                // Lower bound for candidate cell ids: raised whenever a later list has already
                // moved past the current cell, so items of list 0 below it can be skipped.
                long minCellId = long.MinValue;
                do
                {
                    IndexItem currentItem = indexItemList[0][iterator0];
                    if (currentItem.CellId >= minCellId)
                    {
                        // Offset of the most recently matched pair in the chain.
                        int offset = currentItem.Offset;
                        int i = 1;
                        for (; i < pairCount; i++)
                        {
                            IndexItem item = new IndexItem();
                            bool match     = false;
                            int j          = iteratorList[i];

                            // Advance list i to the first item whose cell id is not below the current cell.
                            for (; j < itemCountList[i]; j++)
                            {
                                item = indexItemList[i][j];
                                if (item.CellId >= currentItem.CellId)
                                {
                                    break;
                                }
                            }
                            if (j == itemCountList[i])
                            {
                                // List i is exhausted: no further cell can contain every pair.
                                return result;
                            }
                            iteratorList[i] = j;
                            if (item.CellId > currentItem.CellId)
                            {
                                // The current cell does not occur in list i; skip ahead in list 0.
                                minCellId = item.CellId;
                                iterator0++;
                                break;
                            }
                            Debug.Assert(item.CellId == currentItem.CellId);

                            // Scan the items of this cell for one at exactly the expected distance.
                            do
                            {
                                item = indexItemList[i][j];
                                int distance = item.Offset - offset;
                                if (distance == distanceList[i])
                                {
                                    offset = item.Offset;
                                    match  = true;
                                    break;
                                }
                                else if (distance > distanceList[i])
                                {
                                    // Offsets are sorted within a cell, so later items are even farther away.
                                    break;
                                }
                                j++;
                            } while (j < itemCountList[i] && indexItemList[i][j].CellId == currentItem.CellId);
                            if (!match)
                            {
                                iterator0++;
                                break;
                            }
                        }
                        if (i == pairCount)
                        {
                            // Every pair matched: record the cell once and skip its remaining items in list 0.
                            result.Add(currentItem);
                            while (iterator0 < itemCountList[0] && currentItem.CellId == indexItemList[0][iterator0].CellId)
                            {
                                iterator0++;
                            }
                        }
                    }
                    else
                    {
                        iterator0++;
                    }
                } while (iterator0 < itemCountList[0]);
                #endregion

                return result;
            }
            finally
            {
                // Release on every exit path (including exceptions), and only when handles were
                // actually collected, i.e. on the on-disk path.
                if (gchandlers != null)
                {
                    ReleaseGCHandle(gchandlers);
                }
            }
        }
示例#2
0
        /// <summary>
        /// Performs a substring search using the specified keywords. The match pattern is:
        /// keywords[0]*keywords[1]..., where * is the wildcard symbol.
        /// </summary>
        /// <param name="keywords">
        /// A list of keywords. A single keyword is forwarded to the single-keyword overload.
        /// With several keywords, each one must be non-blank and its lower-cased UTF-8 encoding
        /// must be between 2 and 255 bytes long; otherwise the result is empty.
        /// </param>
        /// <returns>
        /// A list of matched cell Ids; empty when <paramref name="keywords"/> is null or empty,
        /// when any keyword is null, blank or out of the supported length range,
        /// or when any keyword has no match on its own.
        /// </returns>
        internal List<long> SubstringSearch(params string[] keywords)
        {
            if (keywords == null || keywords.Length == 0)
            {
                return new List<long>(0);
            }
            if (keywords.Length == 1)
            {
                return SubstringSearch(keywords[0]);
            }

            List<long> result = new List<long>();
            List<List<IndexItem>> partialResults = new List<List<IndexItem>>(keywords.Length);

            for (int i = 0; i < keywords.Length; i++)
            {
                string query = keywords[i];
                // A null element is treated like a blank one instead of throwing on Trim().
                if (string.IsNullOrWhiteSpace(query))
                {
                    return new List<long>(0);
                }
                // NOTE(review): ToLower() is culture-sensitive; it presumably has to match the
                // casing applied when the index was built - confirm before changing.
                string q     = query.ToLower();
                byte[] bytes = Encoding.UTF8.GetBytes(q);
                if (bytes.Length < 2 || bytes.Length > 255)
                {
                    return new List<long>(0);
                }
                List<PairInfo> pairList = SplitKeyword(bytes);
                if (pairList.Count == 0)
                {
                    return new List<long>(0);
                }
                List<IndexItem> partial = SearchSubString4WildcardSearch(pairList);
                if (partial == null || partial.Count == 0)
                {
                    // Every keyword must occur somewhere for the pattern to match.
                    return new List<long>(0);
                }
                partialResults.Add(partial);
            }

            #region Another K-way search, similar to that of SearchSubString

            /****************************************************************************************************
             * Each element in partialResults is the result list of one keyword.
             * Each IndexItem list is sorted by IndexItem.CellId and IndexItem.Offset.
             * We perform a k-way search starting from the first item of the first list.
             * For a cell id at partialResults[0][iterator0],
             * if we can successfully go through all the keywordCount lists in partialResults,
             * then we found a match. Otherwise, we continue with the next item in partialResults[0].
             *
             * Successfully going through all the lists means:
             * the distance between the current item offset in partialResults[i] and
             * the current item offset in partialResults[i-1] >= keywords[i-1].Length.
             ****************************************************************************************************/
            int keywordCount   = keywords.Length;
            int[] iteratorList = new int[keywordCount];   // resume position in partialResults[i]
            int iterator0      = 0;
            // Lower bound for candidate cell ids: raised whenever a later list has already
            // moved past the current cell, so items of list 0 below it can be skipped.
            long minCellId = long.MinValue;
            do
            {
                IndexItem currentItem = partialResults[0][iterator0];
                if (currentItem.CellId >= minCellId)
                {
                    // Offset of the most recently matched keyword in the chain.
                    int offset = currentItem.Offset;
                    int i = 1;
                    for (; i < keywordCount; i++)
                    {
                        List<IndexItem> candidates = partialResults[i];
                        IndexItem item = new IndexItem();
                        bool match     = false;
                        int j          = iteratorList[i];

                        // Advance list i to the first item whose cell id is not below the current cell.
                        for (; j < candidates.Count; j++)
                        {
                            item = candidates[j];
                            if (item.CellId >= currentItem.CellId)
                            {
                                break;
                            }
                        }
                        if (j == candidates.Count)
                        {
                            // List i is exhausted: no further cell can contain every keyword.
                            return result;
                        }
                        iteratorList[i] = j;
                        if (item.CellId > currentItem.CellId)
                        {
                            // The current cell does not occur in list i; skip ahead in list 0.
                            minCellId = item.CellId;
                            iterator0++;
                            break;
                        }
                        Debug.Assert(item.CellId == currentItem.CellId);

                        // Look for an occurrence of keyword i that starts after the previous keyword ends.
                        // NOTE(review): the gap is compared against the char count of keywords[i - 1],
                        // while offsets presumably count UTF-8 bytes (pairs are built from the encoded
                        // bytes) - the two differ for non-ASCII keywords; confirm.
                        do
                        {
                            item = candidates[j];
                            int distance = item.Offset - offset;
                            if (distance >= keywords[i - 1].Length)
                            {
                                offset = item.Offset;
                                match  = true;
                                break;
                            }
                            j++;
                        } while (j < candidates.Count && candidates[j].CellId == currentItem.CellId);
                        if (!match)
                        {
                            iterator0++;
                            break;
                        }
                    }
                    if (i == keywordCount)
                    {
                        // Every keyword matched in order: record the cell once and skip its remaining items.
                        result.Add(currentItem.CellId);
                        while (iterator0 < partialResults[0].Count && currentItem.CellId == partialResults[0][iterator0].CellId)
                        {
                            iterator0++;
                        }
                    }
                }
                else
                {
                    iterator0++;
                }
            } while (iterator0 < partialResults[0].Count);
            #endregion

            return result;
        }