Exemple #1
0
        /// <summary>
        /// Loads the temporary index file of one bigram, sorts its <c>IndexItem</c> records by
        /// <c>m_cellId</c> and then by <c>Offset</c>, and appends the result to <c>bw_index</c>
        /// as an item count followed by the sorted raw bytes.
        /// The position at which the block starts is recorded in <c>bg_index[(a &lt;&lt; 8) + b]</c>.
        /// </summary>
        /// <param name="file">Path of the temporary index file; nothing happens if it does not exist.</param>
        /// <param name="a">First component of the bigram (used as the high part of the table slot).</param>
        /// <param name="b">Second component of the bigram (used as the low part of the table slot).</param>
        /// <exception cref="EndOfStreamException">The file ended before its reported length was read.</exception>
        void SortBigram(string file, int a, int b)
        {
            if (!File.Exists(file))
            {
                return;
            }
            using (FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.Read, 1 << 10, false))
            {
                // NOTE(review): the whole file is loaded at once and the length is narrowed to int,
                // so a file of 2 GB or more is not handled correctly here.
                byte[] buffer = new byte[(int)fs.Length];

                // A single Read call is allowed to return fewer bytes than requested,
                // so keep reading until the buffer is completely filled.
                int filled = 0;
                while (filled < buffer.Length)
                {
                    int read = fs.Read(buffer, filled, buffer.Length - filled);
                    if (read == 0)
                    {
                        throw new EndOfStreamException("Unexpected end of file while reading '" + file + "'.");
                    }
                    filled += read;
                }

                // Trailing bytes that do not form a whole IndexItem are left untouched.
                int         count      = buffer.Length / sizeof(IndexItem);
                IndexItem[] indexItems = new IndexItem[count];
                fixed(byte *bp = buffer)
                {
                    IndexItem *p = (IndexItem *)bp;

                    // Copy the raw records into a managed array so that Array.Sort can be used.
                    for (int i = 0; i < count; i++)
                    {
                        indexItems[i] = p[i];
                    }

                    // Primary key: cell id; secondary key: offset inside the cell.
                    Array.Sort(indexItems, new Comparison <IndexItem>((x, y) =>
                    {
                        int ret = x.m_cellId.CompareTo(y.m_cellId);
                        if (ret != 0)
                        {
                            return(ret);
                        }

                        return(x.Offset.CompareTo(y.Offset));
                    }));

                    // Write the sorted records back over the raw buffer.
                    for (int i = 0; i < count; i++)
                    {
                        p[i] = indexItems[i];
                    }
                }

                bw_index.Write(count);
                bw_index.Write(buffer, 0, buffer.Length);

                // Remember where this bigram's block starts, then advance past the count and payload.
                bg_index[(a << 8) + b] = IndexFileOffset;
                IndexFileOffset       += (sizeof(int) + buffer.Length);
            }
        }
        /// <summary>
        /// Appends the buffered <c>IndexItem</c> records of bigram (a, b) to that bigram's
        /// temporary file and resets the buffered item count for the bigram to zero.
        /// </summary>
        /// <param name="a">First component of the bigram; also the sub-directory name under <c>temp_dir</c>.</param>
        /// <param name="b">Second component of the bigram; also the file name inside that sub-directory.</param>
        unsafe void FlushIndex(int a, int b)
        {
            int slot    = (a << 8) + b;
            var pending = indexItemCount[slot];

            // Serialize the pending items into a flat byte array by copying the structs verbatim.
            byte[] raw = new byte[pending * sizeof(IndexItem)];
            fixed (byte* rawPtr = raw)
            {
                IndexItem* dest = (IndexItem*)rawPtr;
                for (int k = 0; k < pending; k++)
                {
                    dest[k] = index[slot][k];
                }
            }

            string target = string.Format(CultureInfo.InvariantCulture, "{0}\\{1}\\{2}", temp_dir, a, b);

            // Append mode: earlier flushes for the same bigram are kept and extended.
            using (FileStream appender = new FileStream(target, FileMode.Append, FileAccess.Write, FileShare.Write, 1 << 10, false))
            {
                appender.Write(raw, 0, raw.Length);
            }

            // The in-memory buffer for this bigram is now considered empty.
            indexItemCount[slot] = 0;
        }
Exemple #3
0
        /// <summary>
        /// Finds the index items whose cell contains all bigram pairs of <paramref name="pairList"/>
        /// at the relative distances implied by the pairs' positions.
        /// The only difference from SearchSubString is that the returned result is a list of IndexItem.
        /// </summary>
        /// <param name="pairList">
        /// The bigram pairs (<c>A</c>, <c>B</c>) to look up together with their position (<c>pos</c>);
        /// the distance between consecutive pairs is derived from <c>pos</c>.
        /// </param>
        /// <returns>
        /// An empty list if <paramref name="pairList"/> is empty or nothing matches;
        /// <c>null</c> if at least one pair has no index items at all;
        /// otherwise the matching items taken from the first pair's index list
        /// (items that share the cell id of a reported match are skipped).
        /// </returns>
        private List <IndexItem> SearchSubString4WildcardSearch(List <PairInfo> pairList)
        {
            List <IndexItem> Result = new List <IndexItem>();
            int pairCount           = pairList.Count;

            if (pairCount == 0)
            {
                return(Result);
            }

            // Handles are only allocated for the on-disk path, which is the only path that
            // passes the list to ReadIndexItemListFromDisk.
            List <GCHandle> gchandlers = null;

            if (!InMemory)
            {
                gchandlers = new List <GCHandle>();
            }

            try
            {
                IndexItem *[] IndexItemList = new IndexItem *[pairCount];
                int[]         DistanceList  = new int[pairCount];
                int[]         ItemCountList = new int[pairCount];
                int[]         IteratorList  = new int[pairCount]; // with default initialized value 0

                int offset = pairList[0].pos;

                for (int i = 0; i < pairCount; i++)
                {
                    if (InMemory)
                    {
                        IndexItemList[i] = ReadIndexItemListFromRAM(pairList[i].A, pairList[i].B, out ItemCountList[i]);
                    }
                    else
                    {
                        IndexItemList[i] = ReadIndexItemListFromDisk(pairList[i].A, pairList[i].B, out ItemCountList[i], ref gchandlers);
                    }

                    // Required distance between this pair and the previous one (0 for the first pair).
                    DistanceList[i] = pairList[i].pos - offset;
                    offset          = pairList[i].pos;

                    if (ItemCountList[i] == 0)
                    {
                        // One of the bigrams never occurs, so no match is possible.
                        return(null);
                    }
                }

                #region K-way search

                /****************************************************************************************************
                 * Each element in the IndexItemList represents a bigram pair (e.g., ab or bc).
                 * Each IndexItem is sorted by IndexItem.m_cellId and IndexItem.Offset
                 * We perform k-way search starting from IndexItemList[0]
                 * For a cellId at IndexItemList[0][iterator_0],
                 * if we can successfully go through all the pairCount elements in IndexItemList,
                 * then we found a match. Otherwise, we continue by checking the next elements in IndexItemList[0].
                 *
                 * Successfully going through all the elements in IndexItemList means:
                 * The distance between the current item offset in IndexItemList[i] and
                 * the current item offset in IndexItemList[i-1] equals DistanceList[i].
                 * ***************************************************************************************************/
                int  iterator_0 = 0;             // iterator for the first IndexItem list IndexItemList[0]
                long cellId     = long.MinValue; // smallest cell id that can still produce a match
                do
                {
                    IndexItem current_item = IndexItemList[0][iterator_0];
                    if (current_item.m_cellId >= cellId)
                    {
                        offset = current_item.Offset;
                        int i = 1;
                        for (; i < pairCount; i++)
                        {
                            IndexItem item  = new IndexItem();
                            bool      match = false;
                            int       j     = IteratorList[i];
                            // skip the items whose CellIds are smaller than the current CellId
                            for (; j < ItemCountList[i]; j++)
                            {
                                item = IndexItemList[i][j];
                                if (item.m_cellId >= current_item.m_cellId)
                                {
                                    break;
                                }
                            }
                            if (j == ItemCountList[i]) // We reach the end of one index item list for a bigram pair
                            {
                                return(Result);
                            }

                            // Remember the first item of this cell so later candidates do not rescan from the start.
                            IteratorList[i] = j;
                            if (item.m_cellId > current_item.m_cellId) // did not find any item whose CellId equals the current CellId
                            {
                                cellId = item.m_cellId;
                                iterator_0++; // jump to the first index item, and check the next index item
                                break;
                            }

                            // item.m_cellId == current_item.m_cellId
                            Debug.Assert(item.m_cellId == current_item.m_cellId);

                            // Walk the items of this cell looking for one at exactly the required distance.
                            do
                            {
                                item = IndexItemList[i][j];
                                int distance = item.Offset - offset;
                                if (distance == DistanceList[i])
                                {
                                    offset = item.Offset;
                                    match  = true;
                                    break;
                                }
                                else if (distance > DistanceList[i])
                                {
                                    // Offsets are sorted within a cell, so later items are only farther away.
                                    break;
                                }
                                j++;
                            } while (j < ItemCountList[i] && IndexItemList[i][j].m_cellId == current_item.m_cellId);

                            if (!match)
                            {
                                iterator_0++;
                                break;
                            }
                        }
                        if (i == pairCount)
                        {
                            // Every pair matched: report the item and skip the remaining items of the same cell.
                            Result.Add(current_item);
                            while (iterator_0 < ItemCountList[0] && current_item.m_cellId == IndexItemList[0][iterator_0].m_cellId)
                            {
                                iterator_0++;
                            }
                        }
                    }
                    else
                    {
                        iterator_0++; // skip current item if its CellId is smaller than the current CellId
                    }
                } while (iterator_0 < ItemCountList[0]);
                #endregion

                return(Result);
            }
            finally
            {
                // Release whatever was registered while reading from disk, on every exit path
                // (including exceptions). Nothing was allocated for the in-memory path.
                if (gchandlers != null)
                {
                    ReleaseGCHandle(gchandlers);
                }
            }
        }