/// <summary>
        /// Performs a search for a single word in the full-text index.
        /// </summary>
        /// <param name="word">word to search</param>
        /// <param name="MaxHits">maximal hits to return</param>
        /// <param name="partialMatches">true if partial word should be matched also
        /// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
        /// <param name="titleOnly">true if only search in titles</param>
        /// <returns>Returns true if the index was walked, false if the index depth
        /// is not supported (deeper than 2 levels)</returns>
        /// <remarks>
        /// With an index depth of 2 the root node is an index node whose entries point
        /// to leaf nodes; every leaf whose index word compares greater than or equal to
        /// the lower-cased search word is handed to <c>ReadLeafNode</c>.
        /// With an index depth of 1 the root node is the leaf node itself and is read
        /// directly (previously this case returned true without searching anything).
        /// </remarks>
        private bool SearchSingleWord(string word, int MaxHits, bool partialMatches, bool titleOnly)
        {
            string wordLower = word.ToLower();

            BinaryReader binReader = new BinaryReader(new MemoryStream(_binaryFileData));

            try
            {
                // seek to root node
                binReader.BaseStream.Seek(_header.RootOffset, SeekOrigin.Begin);

                if (_header.Depth > 2)
                {
                    // unsupported index depth
                    Trace.WriteLine("FullTextSearcher.SearchSingleWord() - Failed with message: Unsupported index depth !");
                    Trace.WriteLine("File: " + _associatedFile.ChmFilePath);
                    Trace.WriteLine(" ");
                    return(false);
                }

                if (_header.Depth > 1)
                {
                    // the root is an index node: skip its free-space word, the value is not needed
                    binReader.ReadInt16();

                    for (int i = 0; i < _header.PageCount; ++i)
                    {
                        // extract one index entry; the reads must stay in this order
                        int nWLength = (int)binReader.ReadByte();
                        binReader.ReadByte();                  // position byte, not needed here

                        string sName = BinaryReaderHelp.ExtractString(ref binReader, nWLength - 1, 0, true, _header.TextEncoder);

                        int nLeafOffset = binReader.ReadInt32();
                        binReader.ReadInt16();                 // unknown

                        if (sName.CompareTo(wordLower) >= 0)
                        {
                            // remember where the next index entry starts
                            long curPos = binReader.BaseStream.Position;

                            // jump to the leaf node and collect its hits
                            binReader.BaseStream.Seek(nLeafOffset, SeekOrigin.Begin);
                            ReadLeafNode(ref binReader, word, MaxHits, partialMatches, titleOnly);

                            // return to the index node and continue with the next entry
                            binReader.BaseStream.Seek(curPos, SeekOrigin.Begin);
                        }
                    }
                }
                else if (_header.Depth == 1)
                {
                    // no index level: the reader already points at the one and only leaf node
                    ReadLeafNode(ref binReader, word, MaxHits, partialMatches, titleOnly);
                }
            }
            finally
            {
                // also closes the underlying memory stream
                binReader.Close();
            }

            return(true);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Splits a string block into its individual strings and stores each of them
        /// in the string dictionary, keyed by the decimal text of its offset.
        /// </summary>
        /// <param name="stringBlock">byte array which represents the string block</param>
        /// <param name="nStringOffset">offset added to the position inside the block to
        /// form the key of a string (only read, never written here)</param>
        /// <param name="nSubsetOffset">0, or the offset of a preceding string that was read
        /// without a terminator; while it is non-zero the next string is stored under this
        /// offset instead of its own. Updated after every string.</param>
        /// <returns>always true</returns>
        /// <remarks>If a string crosses the end of a block then it will be cut off
        /// without a NT and repeated in full, with a NT, at the start of the next block.
        /// For eg "To customize the appearance of a contents file" might become
        /// "To customize the (block ending)To customize the appearance of a contents file"
        /// when there are 17 bytes left at the end of the block. </remarks>
        private bool DecodeBlock(byte[] stringBlock, ref int nStringOffset, ref int nSubsetOffset)
        {
            MemoryStream blockStream = new MemoryStream(stringBlock);
            BinaryReader blockReader = new BinaryReader(blockStream);

            while (blockStream.Position < blockStream.Length)
            {
                // offset of this string, taken before the reader advances
                int startOffset = nStringOffset + (int)blockStream.Position;

                bool terminated = false;
                string text = BinaryReaderHelp.ExtractString(ref blockReader, ref terminated, 0, true, _associatedFile.TextEncoding);

                // a pending cut-off string donates its offset as the key
                int keyOffset = (nSubsetOffset != 0) ? nSubsetOffset : startOffset;
                _stringDictionary[keyOffset.ToString()] = text.ToString();

                // no terminator seen: remember the offset for the next string
                nSubsetOffset = terminated ? 0 : startOffset;
            }

            return(true);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Decodes one block of the binary index and appends an <c>IndexItem</c> to the
        /// index list for every entry found in it.
        /// </summary>
        /// <param name="dataBlock">block of data</param>
        /// <param name="nOffset">current file offset (not used by this method)</param>
        /// <param name="indexBlocks">number of index blocks (not used by this method)</param>
        /// <returns>always true</returns>
        /// <remarks>
        /// While <c>_readListingBlocks</c> is set the block is treated as a listing block:
        /// see-also targets are HTML-decoded and every entry carries its own index.
        /// Otherwise see-also targets are stored as read and the entry index is -1.
        /// When a listing block whose next-block index is -1 contained at least one entry,
        /// <c>_readListingBlocks</c> is cleared.
        /// </remarks>
        private bool DecodeBlock(byte[] dataBlock, ref int nOffset, int indexBlocks)
        {
            BinaryReader blockReader = new BinaryReader(new MemoryStream(dataBlock));

            int freeSpace  = blockReader.ReadInt16();          // length of freespace
            int entryCount = blockReader.ReadInt16();          // number of entries

            int nextBlockIndex = -1;

            if (_readListingBlocks)
            {
                blockReader.ReadInt32();                       // previous block, -1 if this is the header (value not used)
                nextBlockIndex = blockReader.ReadInt32();      // -1 if this is the last block
            }
            else
            {
                blockReader.ReadInt32();                       // child block index (value not used)
            }

            bool listingEndReached = false;

            for (int entry = 0; entry < entryCount; entry++)
            {
                // evaluated once per entry, exactly where the layout decision is made
                bool listingEntry = _readListingBlocks;

                if (listingEntry)
                {
                    listingEndReached = (nextBlockIndex == -1);
                }

                // fields shared by both layouts; the read order defines the format
                string keyWord = BinaryReaderHelp.ExtractUTF16String(ref blockReader, 0, true, _associatedFile.TextEncoding);

                bool isSeeAlsoKeyword = (blockReader.ReadInt16() != 0);

                int indent    = blockReader.ReadInt16();       // indent of entry
                int charIndex = blockReader.ReadInt32();

                blockReader.ReadInt32();                       // skipped, meaning not known here

                int pairCount = blockReader.ReadInt32();

                int[]    topics  = new int[pairCount];
                string[] seeAlso = new string[pairCount];

                for (int pair = 0; pair < pairCount; pair++)
                {
                    if (!isSeeAlsoKeyword)
                    {
                        topics[pair] = blockReader.ReadInt32();
                        continue;
                    }

                    string target = BinaryReaderHelp.ExtractUTF16String(ref blockReader, 0, true, _associatedFile.TextEncoding);

                    // only listing blocks HTML-decode their see-also targets
                    seeAlso[pair] = listingEntry ? HttpUtility.HtmlDecode(target) : target;
                }

                int entryIndex = -1;

                if (listingEntry)
                {
                    blockReader.ReadInt32();                   // unknown
                    entryIndex = blockReader.ReadInt32();
                }
                else
                {
                    blockReader.ReadInt32();                   // child index (value not used)
                }

                _indexList.Add(new IndexItem(_associatedFile, keyWord, isSeeAlsoKeyword, indent, charIndex, entryIndex, seeAlso, topics));
            }

            // consume the free space at the end of the block
            blockReader.ReadBytes(freeSpace);

            if (listingEndReached)
            {
                _readListingBlocks = false;
            }

            return(true);
        }
        /// <summary>
        /// Decodes the s/r encoded WordCodeList (=wcl) and creates hit entries
        /// </summary>
        /// <param name="wclBytes">wcl encoded byte array</param>
        /// <param name="MaxHits">maximal hits; once this many hits are collected, decoding
        /// stops at the first document that is not yet in the hit list</param>
        /// <param name="word">the word to find</param>
        /// <remarks>
        /// Each record consists of a delta-encoded document index, a location-code count and
        /// that many location codes, followed by padding up to the next byte boundary.
        /// The rating of the document's hit is updated once per location code.
        /// The hit limit is enforced with <c>&gt;=</c>; the former <c>&gt;</c> comparison
        /// let one hit more than <paramref name="MaxHits"/> be collected.
        /// </remarks>
        private void DecodeWCL(byte[] wclBytes, int MaxHits, string word)
        {
            // expand the bytes into one array element per bit, most significant bit first
            byte[] wclBits = new byte[wclBytes.Length * 8];

            for (int i = 0; i < wclBytes.Length; i++)
            {
                for (int j = 0; j < 8; j++)
                {
                    wclBits[(i * 8) + j] = (byte)((wclBytes[i] >> (7 - j)) & 0x1);
                }
            }

            int nBitIdx = 0;
            int nDocIdx = 0;             // delta encoded

            while (nBitIdx < wclBits.Length)
            {
                nDocIdx += BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleDocumentIndex, _header.RootDocumentIndex, ref nBitIdx);
                int nCodeCnt = BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleCodeCount, _header.RootCodeCount, ref nBitIdx);

                // the word locations themselves are not used, but they have to be
                // read so that the bit index moves past them
                for (int locidx = 0; locidx < nCodeCnt; locidx++)
                {
                    BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleLocationCodes, _header.RootLocationCodes, ref nBitIdx);
                }

                // apply padding up to the next byte boundary
                while ((nBitIdx % 8) != 0)
                {
                    nBitIdx++;
                }

                // record hit
                HitHelper hitObj   = DocumentHit(nDocIdx);
                bool      isNewHit = (hitObj == null);

                if (isNewHit)
                {
                    if (_hitsHelper.Count >= MaxHits)
                    {
                        return;
                    }

                    TopicEntry topic = (TopicEntry)(_associatedFile.TopicsFile.TopicTable[nDocIdx]);

                    hitObj = new HitHelper(nDocIdx, topic.Title, topic.Locale, _associatedFile.CompileFile, topic.URL, 0.0);
                }

                for (int k = 0; k < nCodeCnt; k++)
                {
                    hitObj.UpdateRating(word);
                }

                // a new hit is added only after it has been rated, as before
                if (isNewHit)
                {
                    _hitsHelper.Add(hitObj);
                }
            }
        }
        /// <summary>
        /// Reads a leaf node of the full-text index, starting at the reader's current
        /// position, and decodes the word code list of every entry matching the word.
        /// </summary>
        /// <param name="binReader">reference to the reader</param>
        /// <param name="word">word to search</param>
        /// <param name="MaxHits">maximal hits to return</param>
        /// <param name="partialMatches">true if partial word should be matched also
        /// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
        /// <param name="titleOnly">true if only search in titles</param>
        /// <remarks>
        /// Reading stops at an entry with word length 0 or at the end of the stream.
        /// For every matching entry the reader jumps to the entry's WCL data, passes it to
        /// <c>DecodeWCL</c> and then returns to the position after the entry.
        /// </remarks>
        private void ReadLeafNode(ref BinaryReader binReader, string word, int MaxHits, bool partialMatches, bool titleOnly)
        {
            // leaf header, none of its values are needed here
            binReader.ReadInt32();             // offset of the next page
            binReader.ReadInt16();             // unknown
            binReader.ReadInt16();             // free space

            string searchTerm  = word.ToLower();
            string currentWord = "";

            while (binReader.BaseStream.Position < binReader.BaseStream.Length)
            {
                int entryLength = (int)binReader.ReadByte();

                if (entryLength == 0)
                {
                    break;
                }

                // NOTE(review): presumably the position at which this entry's text is
                // joined onto the previous word - confirm against CombineStrings
                int combinePosition = (int)binReader.ReadByte();

                string entryText = BinaryReaderHelp.ExtractString(ref binReader, entryLength - 1, 0, true, _header.TextEncoder);

                int context = (int)binReader.ReadByte();       // 0...body tag, 1...title tag, others unknown

                BinaryReaderHelp.ReadENCINT(ref binReader);    // number of WCL entries, only skipped
                int wclOffset = binReader.ReadInt32();

                binReader.ReadInt16();                         // unknown

                long wclByteCount = BinaryReaderHelp.ReadENCINT(ref binReader);

                // rebuild the full word of this entry
                currentWord = (combinePosition > 0)
                    ? CombineStrings(currentWord, entryText, combinePosition)
                    : entryText;

                bool matches = partialMatches
                    ? (currentWord.IndexOf(searchTerm) >= 0)
                    : (currentWord == searchTerm);

                if (!matches)
                {
                    continue;
                }

                // in title-only mode everything but title context is ignored
                if (titleOnly && (context != 1))
                {
                    continue;
                }

                // remember where the next entry starts
                long resumePosition = binReader.BaseStream.Position;

                // found the word, decode its WCL data
                binReader.BaseStream.Seek(wclOffset, SeekOrigin.Begin);

                byte[] wclBytes = binReader.ReadBytes((int)wclByteCount);

                DecodeWCL(wclBytes, MaxHits, word);

                // back and continue reading leaf entries
                binReader.BaseStream.Seek(resumePosition, SeekOrigin.Begin);
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Decodes an #system file entry
        /// </summary>
        /// <param name="binReader">binary reader reference, positioned at the start of an
        /// entry (WORD code, WORD length, then the entry data)</param>
        /// <returns>always true</returns>
        /// <remarks>
        /// Code and length are WORDs and are read as unsigned 16-bit values, so data
        /// lengths above 32767 no longer turn negative.
        /// Entries whose content is not needed are skipped by reading <c>length</c> bytes.
        /// Code 4 may replace <c>_associatedFile.TextEncoding</c>, which affects the decoding
        /// of all string entries read afterwards.
        /// </remarks>
        private bool DecodeEntry(ref BinaryReader binReader)
        {
            int code   = binReader.ReadUInt16();            // entry code, WORD
            int length = binReader.ReadUInt16();            // length of entry, WORD

            switch (code)
            {
                case 0:
                    _contentsFile = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
                    break;

                case 1:
                    _indexFile = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
                    break;

                case 2:
                    _defaultTopic = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
                    break;

                case 3:
                    _title = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
                    break;

                case 4:
                {
                    // DWORD LCID; the culture's ANSI code page becomes the text encoding
                    _culture = new CultureInfo(binReader.ReadInt32());
                    _associatedFile.TextEncoding = Encoding.GetEncoding(_culture.TextInfo.ANSICodePage);

                    _dbcs           = (binReader.ReadInt32() == 1);     // DWORD DBCS
                    _fullTextSearch = (binReader.ReadInt32() == 1);     // DWORD Fulltext search
                    _hasKLinks      = (binReader.ReadInt32() != 0);     // DWORD has klinks
                    _hasALinks      = (binReader.ReadInt32() != 0);     // DWORD has alinks

                    // skip the rest of code 4 (not important for us); five DWORDs were
                    // consumed above, a shorter entry has nothing left to skip
                    int remaining = length - (5 * 4);

                    if (remaining > 0)
                    {
                        binReader.ReadBytes(remaining);
                    }

                    break;
                }

                case 5:
                    _defaultWindow = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
                    break;

                case 6:
                    _compileFile = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
                    break;

                case 7:
                    if (_fileVersion > 2)
                    {
                        _binaryIndexURLTableID = binReader.ReadUInt32();
                    }
                    else
                    {
                        binReader.ReadBytes(length);
                    }
                    break;

                case 9:
                    _compilerVersion = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
                    break;

                case 11:
                    if (_fileVersion > 2)
                    {
                        _binaryTOCURLTableID = binReader.ReadUInt32();
                    }
                    else
                    {
                        binReader.ReadBytes(length);
                    }
                    break;

                case 16:
                    // Default Font=string,number,number
                    // The string is the name of the font, the first number is the
                    // point size & the last number is the character set used by the font.
                    // For acceptable values see *_CHARSET defines in wingdi.h from the
                    // Windows SDK or the same file in MinGW or Wine.
                    // Most of the time you will only want to use 0, which is the value for ANSI,
                    // which is the subset of ASCII used by Windows.
                    _defaultFont = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
                    break;

                case 8:         // abbreviation (not interesting for us)
                case 10:        // timestamp of the file (not interesting for us)
                case 12:        // number of information bytes
                case 13:        // copy of file #idxhdr
                case 14:        // custom tabs for HH viewer
                case 15:        // a checksum
                default:        // unknown code
                    // skip the entry data so the reader ends up at the next entry
                    binReader.ReadBytes(length);
                    break;
            }

            return(true);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Decodes a block of url-string data and stores every non-empty string in both
        /// the url dictionary and the framename dictionary.
        /// </summary>
        /// <param name="dataBlock">block of data</param>
        /// <param name="nOffset">current file offset; it is added to the position inside
        /// the block to form an entry's key, and 0 marks the first block</param>
        /// <returns>always true</returns>
        /// <remarks>
        /// Each entry is read as two DWORDs (url offset and frame offset, both ignored)
        /// followed by one string. The first empty string ends the decoding of the block.
        /// TODO: use the two offsets to read the url and frame name separately; at the
        /// moment the same string is stored as both.
        /// </remarks>
        private bool DecodeBlock(byte[] dataBlock, ref int nOffset)
        {
            int blockStart = nOffset;

            MemoryStream blockStream = new MemoryStream(dataBlock);
            BinaryReader blockReader = new BinaryReader(blockStream);

            if (nOffset == 0)
            {
                blockReader.ReadByte();                 // first block starts with an unknown byte
            }

            // an entry is only read while more than its two leading DWORDs remain
            long readLimit = blockStream.Length - 8;

            while (blockStream.Position < readLimit)
            {
                // key of this entry, taken before the reader advances
                int entryOffset = blockStart + (int)blockStream.Position;

                blockReader.ReadInt32();                // url offset, not used
                blockReader.ReadInt32();                // frame offset, not used

                bool terminatorFound = false;

                string text = BinaryReaderHelp.ExtractString(ref blockReader, ref terminatorFound, 0, true, _associatedFile.TextEncoding);

                if (text == "")
                {
                    // nothing more to decode: move to the last byte, which ends the loop
                    blockStream.Seek(blockStream.Length - 1, SeekOrigin.Begin);
                    continue;
                }

                string key   = entryOffset.ToString();
                string value = text.ToString();

                _urlDictionary[key]       = value;
                _framenameDictionary[key] = value;
            }

            return(true);
        }