//======================================================//
        /// <summary>
        /// Builds the hashtable index file (Int32 file-pointer version) for a text data file,
        /// updating slots and collision tags directly in the index file while the data file is read
        /// line by line.
        /// </summary>
        /// <param name="dataFileFullName">Name of the data file to index (passed to the index header).</param>
        /// <param name="dataFileEncoding">Encoding used to read the data file.</param>
        /// <param name="hashTableSize">Requested number of hashtable slots (passed to the index header).</param>
        /// <param name="normlizeTextFunction">
        /// Optional function applied to every line before hashing; <c>null</c> disables normalization.
        /// Lines that are empty before or after normalization are skipped.
        /// </param>
        /// <exception cref="InvalidOperationException">
        /// A data-file position or the tag-area offset exceeds <c>MemorySlotInt32.MaxValue</c>.
        /// </exception>
        public static void BuildIndexInt32OnDisk(string dataFileFullName, Encoding dataFileEncoding, uint hashTableSize, NormlizeTextFunction normlizeTextFunction = null)
        {
            //create header-of-disk-hashtable
            var indexHeader = new HashtableIndexFileHeader
            (
                dataFileFullName,
                dataFileEncoding,
                hashTableSize,
                HashtableIndexFileHeader.SlotType.Int32
            );

            //create index-file
            using (var indexFileStream = IndexFileHelper.CreateFileStreamReadWrite(indexHeader.GetIndexFileFullName()))
            using (var diskSlotInt32ReadBuffer = new ReadBuffer<DiskSlotInt32>())
            using (var diskTagInt32ReadBuffer = new ReadBuffer<DiskTagInt32>())
            {
                //Write header in index file (record count / max length are not known yet;
                //the header is written once more at the end)
                indexFileStream.SeekFromBegin(0);
                indexHeader.SerializeIndexHeader(indexFileStream);

                //The tag (collision) area starts right after the header and the slot area
                Int64 tagAreaOffset = (indexHeader.SizeOf + indexHeader.HashtableSize * DiskSlotInt32.SizeOf);

                //Guard from tagAreaOffset > MemorySlotInt32.MaxValue
                if (tagAreaOffset > MemorySlotInt32.MaxValue)
                {
                    throw (new InvalidOperationException("tag area offset more then allow possible value => tag area offset: " + tagAreaOffset + ", possible value: " + MemorySlotInt32.MaxValue));
                }

                //Fill the slot area with empty slots.
                //The slot count comes from the header (not from the raw argument) so that the slot area
                //always matches the tag-area offset computed above and the size used for hashing below.
                byte[] emptyDiskSlotBytes = DiskSlotInt32.GetEmptyDiskSlot().StructureToByteArray();
                for (var i = 0U; i < indexHeader.HashtableSize; i++)
                {
                    indexFileStream.WriteBytes(emptyDiskSlotBytes);
                }

                DiskTagInt32 diskTag = new DiskTagInt32();

                //open data-file
                using (var dataFileTextLineReader = new TextLineReader(indexHeader.DataFileFullName, indexHeader.DataFileEncoding))
                {
                    #region [.build index on disk.]
                    int dataRecordCount     = 0;
                    int dataRecordMaxLenght = 0;
                    while (!dataFileTextLineReader.EndOfStream)
                    {
                        //Remember where the current line starts, before reading it
                        Int64 _posInt64 = dataFileTextLineReader.StreamPosition;

                        //Guard from file-pointer > MemorySlotInt32.MaxValue
                        if (_posInt64 > MemorySlotInt32.MaxValue)
                        {
                            throw (new InvalidOperationException("file-pointer more then allow possible value => file-pointer: " + _posInt64 + ", possible value: " + MemorySlotInt32.MaxValue));
                        }

                        Int32 position = (Int32)_posInt64;

                        var text = dataFileTextLineReader.ReadLine4Indexing();
                        if (text.IsEmptyOrNull())
                        {
                            continue;
                        }
                        //Normalize text if allowed
                        if (normlizeTextFunction != null)
                        {
                            text = normlizeTextFunction(text);

                            if (text.IsEmptyOrNull())
                            {
                                continue;
                            }
                        }

                        uint hashCode = IndexFileHelper.HashFunction(text, indexHeader.HashtableSize);

                        #region [.code.]
                        DiskSlotInt32 diskSlot = indexFileStream.SeekAndReadDiskSlotInt32
                        (
                            indexHeader.SizeOf + hashCode * DiskSlotInt32.SizeOf,
                            diskSlotInt32ReadBuffer
                        );

                        if (diskSlot.PositionInDataFile == Consts.FREE_INT32)
                        {
                            //Free slot => the record goes into the slot itself, no tag chain yet
                            diskSlot.PositionInDataFile = position;
                            diskSlot.FirstTagOffset     = 0;

                            //Write the slot back over the one just read
                            //(presumably the stream is positioned right after it, hence the negative seek)
                            indexFileStream.SeekFromCurrentAndWrite
                            (
                                -diskSlotInt32ReadBuffer.Size,
                                diskSlot.StructureToByteArray()
                            );
                        }
                        else
                        {
                            if (diskSlot.FirstTagOffset == 0)
                            {
                                //First collision for this slot => the slot points to the new tag
                                diskSlot.FirstTagOffset = (Int32)tagAreaOffset;

                                //ReWrite in index-file data-slot
                                indexFileStream.SeekFromCurrentAndWrite
                                (
                                    -diskSlotInt32ReadBuffer.Size,
                                    diskSlot.StructureToByteArray()
                                );
                            }
                            else //if ( diskSlot.FirstTagOffset != 0 )
                            {
                                //Walk the tag chain up to its last tag
                                diskTag.NextTagOffset = diskSlot.FirstTagOffset;
                                while (0 < diskTag.NextTagOffset)
                                {
                                    diskTag = indexFileStream.SeekAndReadDiskTagInt32(diskTag.NextTagOffset, diskTagInt32ReadBuffer);
                                }

                                //Link the last tag to the new one.
                                //Only the link is changed: the tag keeps its own PositionInDataFile,
                                //otherwise the record it refers to would be dropped from the index.
                                diskTag.NextTagOffset = (Int32)tagAreaOffset;

                                //ReWrite previous tag in chain
                                indexFileStream.SeekFromCurrentAndWrite
                                (
                                    -diskTagInt32ReadBuffer.Size,
                                    diskTag.StructureToByteArray()
                                );
                            }

                            //New tag for the current record; it terminates the chain
                            diskTag.PositionInDataFile = position;
                            diskTag.NextTagOffset      = 0;

                            //Write in index-file
                            indexFileStream.SeekAndWrite
                            (
                                tagAreaOffset,
                                diskTag.StructureToByteArray()
                            );

                            //Calc disk-tag offset
                            tagAreaOffset += DiskTagInt32.SizeOf;

                            //Guard from tagAreaOffset > MemorySlotInt32.MaxValue
                            if (tagAreaOffset > MemorySlotInt32.MaxValue)
                            {
                                throw (new InvalidOperationException("tag area offset more then allow possible value => tag area offset: " + tagAreaOffset + ", possible value: " + MemorySlotInt32.MaxValue));
                            }
                        }
                        #endregion

                        dataRecordCount++;

                        //Track the longest indexed (normalized) text, in characters
                        if (dataRecordMaxLenght < text.Length)
                        {
                            dataRecordMaxLenght = text.Length;
                        }
                    }

                    indexHeader.SetDataRecordCount(dataRecordCount);
                    indexHeader.SetDataRecordMaxBytesLenght(indexHeader.DataFileEncoding.GetMaxByteCount(dataRecordMaxLenght));
                    #endregion
                }

                //one more time - Write header in index file, now with the record count and max record length
                indexFileStream.SeekFromBegin(0);
                indexHeader.SerializeIndexHeader(indexFileStream);
            }
        }
        //======================================================//
        // Example no. 2
        /// <summary>
        /// Int32 file-pointer version of the existence check: reads the hashtable slot for
        /// <paramref name="searchText"/> from the index file, compares the data-file record it refers to,
        /// and then follows the slot's tag chain until a match is found or the chain ends.
        /// </summary>
        /// <param name="searchText">Text to look for; passed through the normalize function first when one is set.</param>
        /// <returns><c>true</c> if a record equal to the (normalized) text is found; otherwise <c>false</c>.</returns>
        private bool IsExistsDiskRoutineIn32(string searchText)
        {
            // Bring the query into the same normalized form that is used for comparison below.
            if (_NormlizeTextFunction != null)
            {
                searchText = _NormlizeTextFunction(searchText);
            }

            // Hash the query and load the matching slot from the index file.
            uint slotIndex = IndexFileHelper.HashFunction(searchText, this.IndexHeader.HashtableSize);

            DiskSlotInt32 slot = _IndexFileBinaryReader.SeekAndReadDiskSlotInt32
            (
                this.IndexHeader.SizeOf + slotIndex * DiskSlotInt32.SizeOf,
                _DiskSlotInt32ReadBuffer
            );

            // A free slot means nothing was indexed under this hash.
            if (slot.PositionInDataFile == Consts.FREE_INT32)
            {
                return false;
            }

            // Step one: the record referenced by the slot itself.
            var candidate = _DataFileTextLineReader.SeekAndRead4Searching(slot.PositionInDataFile);
            if (_NormlizeTextFunction != null)
            {
                candidate = _NormlizeTextFunction(candidate);
            }
            if (candidate == searchText)
            {
                return true;
            }

            // Step two: every record referenced from the slot's tag chain.
            // A non-positive next offset ends the chain.
            DiskTagInt32 tag = new DiskTagInt32();
            tag.NextTagOffset = slot.FirstTagOffset;

            while (tag.NextTagOffset > 0)
            {
                tag = _IndexFileBinaryReader.SeekAndReadDiskTagInt32(tag.NextTagOffset, _DiskTagInt32ReadBuffer);

                candidate = _DataFileTextLineReader.SeekAndRead4Searching(tag.PositionInDataFile);
                if (_NormlizeTextFunction != null)
                {
                    candidate = _NormlizeTextFunction(candidate);
                }
                if (candidate == searchText)
                {
                    return true;
                }
            }

            // Chain exhausted without a match.
            return false;
        }
        /// <summary>
        /// Int32 file-pointer version of the index builder that first builds the whole hashtable
        /// (slots and collision tag chains) in memory, then collects tag-chain statistics into the
        /// header and finally writes header, slots and tags to the index file.
        /// </summary>
        /// <param name="dataFileFullName">Name of the data file to index (passed to the index header).</param>
        /// <param name="dataFileEncoding">Encoding used to read the data file.</param>
        /// <param name="hashTableSize">Requested number of hashtable slots (passed to the index header).</param>
        /// <param name="normlizeTextFunction">
        /// Optional function applied to every line before hashing; <c>null</c> disables normalization.
        /// Lines that are empty before or after normalization are skipped.
        /// </param>
        /// <exception cref="InvalidOperationException">
        /// A data-file position or the tag-area offset exceeds <c>MemorySlotInt32.MaxValue</c>.
        /// </exception>
        private static void BuildIndexInt32InMemory(string dataFileFullName, Encoding dataFileEncoding, uint hashTableSize, NormlizeTextFunction normlizeTextFunction)
        {
            //create header-of-disk-hashtable
            var indexHeader = new HashtableIndexFileHeader
            (
                dataFileFullName,
                dataFileEncoding,
                hashTableSize,
                HashtableIndexFileHeader.SlotType.Int32
            );

            //in-memory hashtable; a null element is an empty slot
            var hashTable = new MemorySlotInt32[indexHeader.HashtableSize];

            //open data-file
            using (var dataFileTextLineReader = new TextLineReader(indexHeader.DataFileFullName, indexHeader.DataFileEncoding))
            {
                #region [.2 calc index in memory.]
                int dataRecordCount     = 0;
                int dataRecordMaxLenght = 0;
                while (!dataFileTextLineReader.EndOfStream)
                {
                    //Remember where the current line starts, before reading it
                    Int64 _posInt64 = dataFileTextLineReader.StreamPosition;

                    //Guard from file-pointer > MemorySlotInt32.MaxValue
                    if (_posInt64 > MemorySlotInt32.MaxValue)
                    {
                        throw (new InvalidOperationException("file-pointer more then allow possible value => file-pointer: " + _posInt64 + ", possible value: " + MemorySlotInt32.MaxValue));
                    }

                    Int32 position = (Int32)_posInt64;

                    var text = dataFileTextLineReader.ReadLine4Indexing();
                    if (text.IsEmptyOrNull())
                    {
                        continue;
                    }
                    //Normalize text if allowed
                    if (normlizeTextFunction != null)
                    {
                        text = normlizeTextFunction(text);

                        if (text.IsEmptyOrNull())
                        {
                            continue;
                        }
                    }

                    uint hashCode = IndexFileHelper.HashFunction(text, indexHeader.HashtableSize);

                    MemorySlotInt32 memorySlot = hashTable[hashCode];
                    if (memorySlot == null)
                    {
                        //Free slot => the record goes into the slot itself
                        hashTable[hashCode] = new MemorySlotInt32(position);
                    }
                    else
                    {
                        //Collision => append a new tag at the end of the slot's tag chain
                        MemoryTagInt32 newMemoryTag = new MemoryTagInt32(position);

                        if (memorySlot.FirstMemoryTag == null)
                        {
                            memorySlot.FirstMemoryTag = newMemoryTag;
                        }
                        else
                        {
                            MemoryTagInt32 memoryTag = memorySlot.FirstMemoryTag;
                            while (memoryTag.NextMemoryTag != null)
                            {
                                memoryTag = memoryTag.NextMemoryTag;
                            }
                            memoryTag.NextMemoryTag = newMemoryTag;
                        }
                    }

                    dataRecordCount++;

                    //Track the longest indexed (normalized) text, in characters
                    if (dataRecordMaxLenght < text.Length)
                    {
                        dataRecordMaxLenght = text.Length;
                    }
                }

                indexHeader.SetDataRecordCount(dataRecordCount);
                indexHeader.SetDataRecordMaxBytesLenght(indexHeader.DataFileEncoding.GetMaxByteCount(dataRecordMaxLenght));
                #endregion
            }

            #region [.Calculate tag collision statistics.]
            foreach (var memorySlot in hashTable)
            {
                if (memorySlot == null)
                {
                    //empty slots in the hash-table are counted under key -1
                    indexHeader.TagCollisionStatistica.IncremetByKey(-1);
                }
                else
                {
                    //Count the tags chained to this slot (0 when the slot has no collisions)
                    int currentTagChainDepth = 0;

                    MemoryTagInt32 memoryTag = memorySlot.FirstMemoryTag;
                    while (memoryTag != null)
                    {
                        memoryTag = memoryTag.NextMemoryTag;
                        currentTagChainDepth++;
                    }

                    indexHeader.TagCollisionStatistica.IncremetByKey(currentTagChainDepth);
                }
            }
            #endregion

            //create index-file
            using (var indexFileBinaryWriter = IndexFileHelper.CreateBinaryWriterRandomAccess(indexHeader.GetIndexFileFullName()))
            {
                #region [.3 write hash table on disk.]

                var    diskSlot           = new DiskSlotInt32();
                var    diskTag            = new DiskTagInt32();
                byte[] emptyDiskSlotBytes = DiskSlotInt32.GetEmptyDiskSlot().StructureToByteArray();

                //Write header in index file
                indexFileBinaryWriter.SeekFromBegin(0);
                indexHeader.SerializeIndexHeader(indexFileBinaryWriter);

                //The tag (collision) area starts right after the header and the slot area
                Int64 tagAreaOffset = (indexHeader.SizeOf + indexHeader.HashtableSize * DiskSlotInt32.SizeOf);

                //Guard from tagAreaOffset > MemorySlotInt32.MaxValue
                if (tagAreaOffset > MemorySlotInt32.MaxValue)
                {
                    throw (new InvalidOperationException("tag area offset more then allow possible value => tag area offset: " + tagAreaOffset + ", possible value: " + MemorySlotInt32.MaxValue));
                }

                #region [.iterate over memory-slot's.]
                uint hashTableItemIndex = 0;
                foreach (var memorySlot in hashTable)
                {
                    //Empty hash-table slot
                    if (memorySlot == null)
                    {
                        //Write in index-file empty slot
                        indexFileBinaryWriter.SeekAndWrite
                        (
                            indexHeader.SizeOf + hashTableItemIndex * DiskSlotInt32.SizeOf,
                            emptyDiskSlotBytes
                        );
                    }
                    //Have a data hash-table slot
                    else
                    {
                        //The slot's chain (if any) is written contiguously starting at the current tag-area offset
                        diskSlot.PositionInDataFile = memorySlot.PositionInDataFile;
                        diskSlot.FirstTagOffset     = (memorySlot.FirstMemoryTag != null) ? (Int32)tagAreaOffset : 0;

                        //Write in index-file data-slot
                        indexFileBinaryWriter.SeekAndWrite
                        (
                            indexHeader.SizeOf + hashTableItemIndex * DiskSlotInt32.SizeOf,
                            diskSlot.StructureToByteArray()
                        );

                        //Write the slot's tag chain
                        MemoryTagInt32 memoryTag = memorySlot.FirstMemoryTag;
                        while (memoryTag != null)
                        {
                            //Each tag points to the tag written immediately after it; the last one gets 0
                            diskTag.PositionInDataFile = memoryTag.PositionInDataFile;
                            diskTag.NextTagOffset      = (memoryTag.NextMemoryTag != null) ? (Int32)tagAreaOffset + DiskTagInt32.SizeOf : 0;

                            //Write in index-file
                            indexFileBinaryWriter.SeekAndWrite
                            (
                                tagAreaOffset,
                                diskTag.StructureToByteArray()
                            );
                            //Calc disk-tag offset
                            tagAreaOffset += DiskTagInt32.SizeOf;

                            //Guard from tagAreaOffset > MemorySlotInt32.MaxValue
                            if (tagAreaOffset > MemorySlotInt32.MaxValue)
                            {
                                throw (new InvalidOperationException("tag area offset more then allow possible value => tag area offset: " + tagAreaOffset + ", possible value: " + MemorySlotInt32.MaxValue));
                            }

                            //Get next tag
                            memoryTag = memoryTag.NextMemoryTag;
                        }
                    }

                    hashTableItemIndex++;
                }
                #endregion

                #endregion
            }

            //Release the in-memory table and force a collection before returning
            //(kept from the original; the table can be large)
            hashTable = null;
            GC.Collect();
        }