/// <summary>
/// Loads the index items stored in <paramref name="file"/>, sorts them by
/// <c>m_cellId</c> and then by <c>Offset</c>, and appends the sorted block
/// (item count followed by the raw item bytes) to <c>bw_index</c>.
/// The start offset of the block is recorded in <c>bg_index</c> at slot
/// <c>(a &lt;&lt; 8) + b</c>. Does nothing when the file does not exist.
/// </summary>
/// <param name="file">Path of the temporary index file for the bigram.</param>
/// <param name="a">First component of the bigram; forms the high part of the slot index.</param>
/// <param name="b">Second component of the bigram; forms the low part of the slot index.</param>
/// <exception cref="OverflowException">The file is too large to fit into a single byte array.</exception>
/// <exception cref="EndOfStreamException">The file ended before its reported length was read.</exception>
void SortBigram(string file, int a, int b)
{
    if (!File.Exists(file))
    {
        return;
    }

    using (FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.Read, 1 << 10, false))
    {
        //!!!!! Be aware of this line: the whole file is loaded into one array.
        // The cast is checked so that an oversized file fails loudly instead of
        // being silently truncated to the low 32 bits of its length.
        byte[] buffer = new byte[checked((int)fs.Length)];

        // Stream.Read may return fewer bytes than requested, so keep reading
        // until the buffer is full; otherwise zero-filled records would be
        // sorted and written out as if they were real index items.
        int totalRead = 0;
        while (totalRead < buffer.Length)
        {
            int read = fs.Read(buffer, totalRead, buffer.Length - totalRead);
            if (read == 0)
            {
                throw new EndOfStreamException("Unexpected end of file while reading bigram index file '" + file + "'.");
            }
            totalRead += read;
        }

        // Trailing bytes that do not form a complete IndexItem are not sorted,
        // but they are still written out below together with the rest of the buffer.
        int count = buffer.Length / sizeof(IndexItem);
        IndexItem[] indexItems = new IndexItem[count];

        fixed (byte* bp = buffer)
        {
            IndexItem* p = (IndexItem*)bp;

            // Copy the raw records into a managed array so Array.Sort can be used.
            for (int i = 0; i < count; i++)
            {
                indexItems[i] = *(p + i);
            }

            Array.Sort(indexItems, new Comparison<IndexItem>((x, y) =>
            {
                int ret = x.m_cellId.CompareTo(y.m_cellId);
                if (ret != 0)
                {
                    return ret;
                }
                return x.Offset.CompareTo(y.Offset);
            }));

            // Write the sorted records back over the original bytes.
            for (int i = 0; i < count; i++)
            {
                *(p + i) = indexItems[i];
            }
        }

        bw_index.Write(count);
        bw_index.Write(buffer, 0, buffer.Length);

        // Remember where this bigram's block starts, then advance past the
        // count field and the payload just written.
        bg_index[(a << 8) + b] = IndexFileOffset;
        IndexFileOffset += (sizeof(int) + buffer.Length);
    }
}
/// <summary>
/// Appends the buffered index items of the bigram (<paramref name="a"/>,
/// <paramref name="b"/>) to that bigram's file under <c>temp_dir</c> and
/// resets the buffered item count for the bigram to zero.
/// </summary>
/// <param name="a">First component of the bigram; also the sub-directory name.</param>
/// <param name="b">Second component of the bigram; also the file name.</param>
unsafe void FlushIndex(int a, int b)
{
    const int StreamBufferSize = 1 << 10;

    int slot = (a << 8) + b;
    var pendingCount = indexItemCount[slot];

    // Serialize the pending items into a contiguous byte image.
    byte[] payload = new byte[pendingCount * sizeof(IndexItem)];
    fixed (byte* rawBytes = payload)
    {
        IndexItem* items = (IndexItem*)rawBytes;
        for (int n = 0; n < pendingCount; n++)
        {
            items[n] = index[slot][n];
        }
    }

    string targetPath = string.Format(CultureInfo.InvariantCulture, "{0}\\{1}\\{2}", temp_dir, a, b);

    // FileMode.Append creates the file when it does not exist yet.
    using (FileStream output = new FileStream(targetPath, FileMode.Append, FileAccess.Write, FileShare.Write, StreamBufferSize, false))
    {
        output.Write(payload, 0, payload.Length);
    }

    indexItemCount[slot] = 0;
}
/// <summary>
/// The only difference from SearchSubString is that the returned result is a
/// list of <c>IndexItem</c>. At most one item is reported per cell id: the
/// first item of the first bigram pair from which the whole chain of pairs
/// could be matched.
/// </summary>
/// <param name="pairList">
/// The bigram pairs to match, in order. The difference between the
/// <c>pos</c> values of consecutive pairs is the offset distance required
/// between their matched index items.
/// </param>
/// <returns>
/// The matching index items; an empty list when <paramref name="pairList"/>
/// is empty or nothing matches; <c>null</c> when one of the bigram pairs has
/// no index items at all (kept as-is because callers may rely on it).
/// </returns>
private List<IndexItem> SearchSubString4WildcardSearch(List<PairInfo> pairList)
{
    List<IndexItem> Result = new List<IndexItem>();
    int pairCount = pairList.Count;
    if (pairCount == 0)
    {
        return Result;
    }

    // The handle list is only needed (and only filled) when reading from disk.
    List<GCHandle> gchandlers = null;
    if (!InMemory)
    {
        gchandlers = new List<GCHandle>();
    }

    try
    {
        IndexItem*[] IndexItemList = new IndexItem*[pairCount];
        int[] DistanceList = new int[pairCount];
        int[] ItemCountList = new int[pairCount];
        int[] IteratorList = new int[pairCount]; // with default initialized value 0

        int offset = pairList[0].pos;
        for (int i = 0; i < pairCount; i++)
        {
            if (InMemory)
            {
                IndexItemList[i] = ReadIndexItemListFromRAM(pairList[i].A, pairList[i].B, out ItemCountList[i]);
            }
            else
            {
                IndexItemList[i] = ReadIndexItemListFromDisk(pairList[i].A, pairList[i].B, out ItemCountList[i], ref gchandlers);
            }

            DistanceList[i] = pairList[i].pos - offset;
            offset = pairList[i].pos;

            if (ItemCountList[i] == 0)
            {
                // One of the bigram pairs never occurs, so nothing can match.
                return null;
            }
        }

        #region K-way search
        /****************************************************************************************************
         * Each element in the IndexItemList represents a bigram pair (e.g., ab or bc).
         * Each IndexItem list is sorted by IndexItem.m_cellId and IndexItem.Offset.
         * We perform k-way search starting from IndexItemList[0].
         * For a cellId at IndexItemList[0][iterator_0],
         * if we can successfully go through all the pairCount elements in IndexItemList,
         * then we found a match. Otherwise, we continue by checking the next elements in IndexItemList[0].
         *
         * Successfully going through all the elements in IndexItemList means:
         * The distance between the current item offset in IndexItemList[i] and
         * the current item offset in IndexItemList[i-1] equals DistanceList[i].
         ***************************************************************************************************/
        int iterator_0 = 0; // iterator for the first IndexItem list IndexItemList[0]
        long cellId = long.MinValue;
        offset = 0;

        do
        {
            IndexItem current_item = IndexItemList[0][iterator_0];
            if (current_item.m_cellId >= cellId)
            {
                offset = current_item.Offset;
                int i = 1;
                for (; i < pairCount; i++)
                {
                    IndexItem item = new IndexItem();
                    bool match = false;
                    int j = IteratorList[i];

                    // skip the items whose CellIds are smaller than the current CellId
                    for (; j < ItemCountList[i]; j++)
                    {
                        item = IndexItemList[i][j];
                        if (item.m_cellId >= current_item.m_cellId)
                        {
                            break;
                        }
                    }

                    if (j == ItemCountList[i]) // We reach the end of one index item list for a bigram pair
                    {
                        return Result;
                    }

                    IteratorList[i] = j;

                    if (item.m_cellId > current_item.m_cellId) // did not find any item whose CellId equals the current CellId
                    {
                        cellId = item.m_cellId;
                        iterator_0++; // jump to the first index item list, and check its next index item
                        break;
                    }

                    // item.m_cellId == current_item.m_cellId
                    Debug.Assert(item.m_cellId == current_item.m_cellId);

                    // Scan the items of this cell for one at the required distance.
                    // Items are sorted by Offset, so we can stop once the distance is exceeded.
                    do
                    {
                        item = IndexItemList[i][j];
                        int distance = item.Offset - offset;
                        if (distance == DistanceList[i])
                        {
                            offset = item.Offset;
                            match = true;
                            break;
                        }
                        else if (distance > DistanceList[i])
                        {
                            break;
                        }
                        j++;
                    } while (j < ItemCountList[i] && IndexItemList[i][j].m_cellId == current_item.m_cellId);

                    if (!match)
                    {
                        iterator_0++;
                        break;
                    }
                }

                if (i == pairCount)
                {
                    // Every pair matched: report the cell once and skip its remaining items.
                    Result.Add(current_item);
                    while (iterator_0 < ItemCountList[0] && current_item.m_cellId == IndexItemList[0][iterator_0].m_cellId)
                    {
                        iterator_0++;
                    }
                }
            }
            else
            {
                iterator_0++; // skip current item if its CellId is smaller than the current CellId
            }
        } while (iterator_0 < ItemCountList[0]);
        #endregion

        return Result;
    }
    finally
    {
        // The original code released the handles only when InMemory was true,
        // i.e. exactly when no handle list exists, so handles collected while
        // reading from disk were never released. Release them on every exit path.
        // NOTE(review): assumes ReleaseGCHandle frees the handles gathered by
        // ReadIndexItemListFromDisk - confirm against its implementation.
        if (gchandlers != null)
        {
            ReleaseGCHandle(gchandlers);
        }
    }
}