//======================================================//
/// <summary>
/// Builds the Int32 (file-pointer) hashtable index of a text data-file, working directly
/// against the index file on disk: every slot / collision tag is read and written through
/// the index file stream instead of being collected in memory first.
/// </summary>
/// <param name="dataFileFullName">Full name of the data-file to index; handed to the index header.</param>
/// <param name="dataFileEncoding">Encoding used to read the data-file; handed to the index header.</param>
/// <param name="hashTableSize">Requested hashtable size; handed to the index header.</param>
/// <param name="normlizeTextFunction">
/// Optional function applied to every non-empty line before hashing. When <c>null</c> the line
/// is hashed as read. Lines that are empty before or after normalization are skipped.
/// </param>
/// <exception cref="InvalidOperationException">
/// Thrown when a data-file position or the tag-area offset exceeds <c>MemorySlotInt32.MaxValue</c>.
/// </exception>
public static void BuildIndexInt32OnDisk(string dataFileFullName, Encoding dataFileEncoding, uint hashTableSize, NormlizeTextFunction normlizeTextFunction = null)
{
    //create header-of-disk-hashtable
    var indexHeader = new HashtableIndexFileHeader
    (
        dataFileFullName,
        dataFileEncoding,
        hashTableSize,
        HashtableIndexFileHeader.SlotType.Int32
    );

    //create index-file
    using (var indexFileStream = IndexFileHelper.CreateFileStreamReadWrite(indexHeader.GetIndexFileFullName()))
    using (var diskSlotInt32ReadBuffer = new ReadBuffer<DiskSlotInt32>())
    using (var diskTagInt32ReadBuffer = new ReadBuffer<DiskTagInt32>())
    {
        //Write header in index file
        indexFileStream.SeekFromBegin(0);
        indexHeader.SerializeIndexHeader(indexFileStream);

        //The tag area (collision chains) starts right after the header and the slot area
        Int64 tagAreaOffset = (indexHeader.SizeOf + indexHeader.HashtableSize * DiskSlotInt32.SizeOf);
        //Guard from tagAreaOffset > MemorySlotInt32.MaxValue
        if (tagAreaOffset > MemorySlotInt32.MaxValue)
        {
            throw (new InvalidOperationException("tag area offset more then allow possible value => tag area offset: " + tagAreaOffset + ", possible value: " + MemorySlotInt32.MaxValue));
        }

        //fill disk-hashtable with empty slots
        //(written sequentially; assumes the stream is positioned right after the serialized header - TODO confirm).
        //The slot count is taken from the header, the same value the tag-area offset and the
        //hash function below are based on, so the slot area always matches both.
        byte[] emptyDiskSlotBytes = DiskSlotInt32.GetEmptyDiskSlot().StructureToByteArray();
        for (var i = 0U; i < indexHeader.HashtableSize; i++)
        {
            indexFileStream.WriteBytes(emptyDiskSlotBytes);
        }

        DiskTagInt32 diskTag = new DiskTagInt32();

        //open data-file
        using (var dataFileTextLineReader = new TextLineReader(indexHeader.DataFileFullName, indexHeader.DataFileEncoding))
        {
            #region [.build index on disk.]
            int dataRecordCount = 0;
            int dataRecordMaxLenght = 0;
            while (!dataFileTextLineReader.EndOfStream)
            {
                //Position of the record is taken BEFORE the line is read
                Int64 _posInt64 = dataFileTextLineReader.StreamPosition;
                //Guard from file-pointer > MemorySlotInt32.MaxValue
                if (_posInt64 > MemorySlotInt32.MaxValue)
                {
                    throw (new InvalidOperationException("file-pointer more then allow possible value => file-pointer: " + _posInt64 + ", possible value: " + MemorySlotInt32.MaxValue));
                }
                Int32 position = (Int32)_posInt64;

                var text = dataFileTextLineReader.ReadLine4Indexing();
                if (text.IsEmptyOrNull())
                {
                    continue;
                }

                //Normalize text if allowed
                if (normlizeTextFunction != null)
                {
                    text = normlizeTextFunction(text);
                    if (text.IsEmptyOrNull())
                    {
                        continue;
                    }
                }

                uint hashCode = IndexFileHelper.HashFunction(text, indexHeader.HashtableSize);

                #region [.code.]
                DiskSlotInt32 diskSlot = indexFileStream.SeekAndReadDiskSlotInt32
                (
                    indexHeader.SizeOf + hashCode * DiskSlotInt32.SizeOf,
                    diskSlotInt32ReadBuffer
                );

                //Free slot => first record with this hash
                if (diskSlot.PositionInDataFile == Consts.FREE_INT32)
                {
                    //Calc data-slot properties
                    diskSlot.PositionInDataFile = position;
                    diskSlot.FirstTagOffset = 0;

                    //Write in index-file data-slot (step back over the slot that was just read)
                    indexFileStream.SeekFromCurrentAndWrite
                    (
                        -diskSlotInt32ReadBuffer.Size,
                        diskSlot.StructureToByteArray()
                    );
                }
                else
                {
                    if (diskSlot.FirstTagOffset == 0)
                    {
                        //Slot has no chain yet => the new tag becomes the first one
                        diskSlot.FirstTagOffset = (Int32)tagAreaOffset;

                        //ReWrite in index-file data-slot
                        indexFileStream.SeekFromCurrentAndWrite
                        (
                            -diskSlotInt32ReadBuffer.Size,
                            diskSlot.StructureToByteArray()
                        );
                    }
                    else //if ( diskSlot.FirstTagOffset != 0 )
                    {
                        //Walk the tag chain up to its last tag (the one with NextTagOffset == 0)
                        diskTag.NextTagOffset = diskSlot.FirstTagOffset;
                        while (0 < diskTag.NextTagOffset)
                        {
                            diskTag = indexFileStream.SeekAndReadDiskTagInt32(diskTag.NextTagOffset, diskTagInt32ReadBuffer);
                        }

                        //Link the last tag to the new one. Only the link changes: the tag keeps its
                        //own PositionInDataFile, otherwise the record it points to would be lost.
                        diskTag.NextTagOffset = (Int32)tagAreaOffset;

                        //ReWrite previous tag in chain (step back over the tag that was just read)
                        indexFileStream.SeekFromCurrentAndWrite
                        (
                            -diskTagInt32ReadBuffer.Size,
                            diskTag.StructureToByteArray()
                        );
                    }

                    //Calc disk-tag properties of the new (last in chain) tag
                    diskTag.PositionInDataFile = position;
                    diskTag.NextTagOffset = 0;

                    //Write in index-file
                    indexFileStream.SeekAndWrite
                    (
                        tagAreaOffset,
                        diskTag.StructureToByteArray()
                    );

                    //Calc disk-tag offset
                    tagAreaOffset += DiskTagInt32.SizeOf;
                    //Guard from tagAreaOffset > MemorySlotInt32.MaxValue
                    if (tagAreaOffset > MemorySlotInt32.MaxValue)
                    {
                        throw (new InvalidOperationException("tag area offset more then allow possible value => tag area offset: " + tagAreaOffset + ", possible value: " + MemorySlotInt32.MaxValue));
                    }
                }
                #endregion

                dataRecordCount++;
                if (dataRecordMaxLenght < text.Length)
                {
                    dataRecordMaxLenght = text.Length;
                }
            }

            indexHeader.SetDataRecordCount(dataRecordCount);
            indexHeader.SetDataRecordMaxBytesLenght(indexHeader.DataFileEncoding.GetMaxByteCount(dataRecordMaxLenght));
            #endregion
        }

        //one more time - Write header in index file (it now carries the record count / max length)
        indexFileStream.SeekFromBegin(0);
        indexHeader.SerializeIndexHeader(indexFileStream);
    }
}
/// <summary>
/// Int32 file-pointer version: the hashtable is first assembled in memory from the data-file,
/// tag-collision statistics are collected into the index header, and only then the header,
/// the slots and the collision tags are written to the index file.
/// </summary>
/// <param name="dataFileFullName">Full name of the data-file to index; handed to the index header.</param>
/// <param name="dataFileEncoding">Encoding used to read the data-file; handed to the index header.</param>
/// <param name="hashTableSize">Requested hashtable size; handed to the index header.</param>
/// <param name="normlizeTextFunction">
/// Function applied to every non-empty line before hashing; may be <c>null</c>, in which case
/// the line is hashed as read. Lines empty before or after normalization are skipped.
/// </param>
/// <exception cref="InvalidOperationException">
/// Thrown when a data-file position or the tag-area offset exceeds <c>MemorySlotInt32.MaxValue</c>.
/// </exception>
private static void BuildIndexInt32InMemory(string dataFileFullName, Encoding dataFileEncoding, uint hashTableSize, NormlizeTextFunction normlizeTextFunction)
{
    var indexHeader = new HashtableIndexFileHeader
    (
        dataFileFullName,
        dataFileEncoding,
        hashTableSize,
        HashtableIndexFileHeader.SlotType.Int32
    );

    //In-memory hashtable; a null entry is an empty slot
    var hashTable = new MemorySlotInt32[indexHeader.HashtableSize];

    using (var lineReader = new TextLineReader(indexHeader.DataFileFullName, indexHeader.DataFileEncoding))
    {
        #region [.2 calc index in memory.]
        int recordCount = 0;
        int longestRecord = 0;

        while (!lineReader.EndOfStream)
        {
            //Record position is captured before the line itself is consumed
            Int64 streamPosition = lineReader.StreamPosition;
            if (streamPosition > MemorySlotInt32.MaxValue)
            {
                throw (new InvalidOperationException("file-pointer more then allow possible value => file-pointer: " + streamPosition + ", possible value: " + MemorySlotInt32.MaxValue));
            }
            Int32 position = (Int32)streamPosition;

            var text = lineReader.ReadLine4Indexing();
            if (text.IsEmptyOrNull())
            {
                continue;
            }

            //Optional normalization; a line normalized to nothing is not indexed
            if (normlizeTextFunction != null)
            {
                text = normlizeTextFunction(text);
                if (text.IsEmptyOrNull())
                {
                    continue;
                }
            }

            uint hashCode = IndexFileHelper.HashFunction(text, indexHeader.HashtableSize);

            MemorySlotInt32 slot = hashTable[hashCode];
            if (slot == null)
            {
                //First record with this hash occupies the slot itself
                hashTable[hashCode] = new MemorySlotInt32(position);
            }
            else if (slot.FirstMemoryTag == null)
            {
                //First collision starts the tag chain
                slot.FirstMemoryTag = new MemoryTagInt32(position);
            }
            else
            {
                //Further collisions are appended to the end of the chain
                MemoryTagInt32 tail = slot.FirstMemoryTag;
                while (tail.NextMemoryTag != null)
                {
                    tail = tail.NextMemoryTag;
                }
                tail.NextMemoryTag = new MemoryTagInt32(position);
            }

            recordCount++;
            longestRecord = Math.Max(longestRecord, text.Length);
        }

        indexHeader.SetDataRecordCount(recordCount);
        indexHeader.SetDataRecordMaxBytesLenght(indexHeader.DataFileEncoding.GetMaxByteCount(longestRecord));
        #endregion
    }

    #region [.Calulate Tag collision statistica.]
    foreach (var slot in hashTable)
    {
        if (slot == null)
        {
            //Empty slots are counted under the key -1
            indexHeader.TagCollisionStatistica.IncremetByKey(-1);
            continue;
        }

        //Chain depth = number of tags hanging off the slot (0 when there was no collision)
        int chainDepth = 0;
        for (MemoryTagInt32 tag = slot.FirstMemoryTag; tag != null; tag = tag.NextMemoryTag)
        {
            chainDepth++;
        }
        indexHeader.TagCollisionStatistica.IncremetByKey(chainDepth);
    }
    #endregion

    using (var indexWriter = IndexFileHelper.CreateBinaryWriterRandomAccess(indexHeader.GetIndexFileFullName()))
    {
        #region [.3 write hash table on disk.]
        var diskSlot = new DiskSlotInt32();
        var diskTag = new DiskTagInt32();
        byte[] emptyDiskSlotBytes = DiskSlotInt32.GetEmptyDiskSlot().StructureToByteArray();

        //Header goes first
        indexWriter.SeekFromBegin(0);
        indexHeader.SerializeIndexHeader(indexWriter);

        //Tags are placed after the header and the whole slot area
        Int64 tagAreaOffset = (indexHeader.SizeOf + indexHeader.HashtableSize * DiskSlotInt32.SizeOf);
        if (tagAreaOffset > MemorySlotInt32.MaxValue)
        {
            throw (new InvalidOperationException("tag area offset more then allow possible value => tag area offset: " + tagAreaOffset + ", possible value: " + MemorySlotInt32.MaxValue));
        }

        #region [.iterate over memory-slot's.]
        for (uint slotIndex = 0; slotIndex < hashTable.Length; slotIndex++)
        {
            MemorySlotInt32 slot = hashTable[slotIndex];
            var slotOffset = indexHeader.SizeOf + slotIndex * DiskSlotInt32.SizeOf;

            if (slot == null)
            {
                //Empty hash-table slot
                indexWriter.SeekAndWrite(slotOffset, emptyDiskSlotBytes);
                continue;
            }

            //Occupied slot: its first tag (if any) will be written at the current tag-area offset
            diskSlot.PositionInDataFile = slot.PositionInDataFile;
            diskSlot.FirstTagOffset = (slot.FirstMemoryTag != null) ? (Int32)tagAreaOffset : 0;
            indexWriter.SeekAndWrite(slotOffset, diskSlot.StructureToByteArray());

            //Tags of one chain are laid out back to back, each pointing at the next one
            for (MemoryTagInt32 tag = slot.FirstMemoryTag; tag != null; tag = tag.NextMemoryTag)
            {
                diskTag.PositionInDataFile = tag.PositionInDataFile;
                diskTag.NextTagOffset = (tag.NextMemoryTag != null) ? (Int32)tagAreaOffset + DiskTagInt32.SizeOf : 0;

                indexWriter.SeekAndWrite(tagAreaOffset, diskTag.StructureToByteArray());

                //Advance to where the next tag will be written and re-check the limit
                tagAreaOffset += DiskTagInt32.SizeOf;
                if (tagAreaOffset > MemorySlotInt32.MaxValue)
                {
                    throw (new InvalidOperationException("tag area offset more then allow possible value => tag area offset: " + tagAreaOffset + ", possible value: " + MemorySlotInt32.MaxValue));
                }
            }
        }
        #endregion
        #endregion
    }

    //Free memory
    hashTable = null;
    GC.Collect();
}