/// <summary>
/// Performs a search for a single word in the full-text index.
/// </summary>
/// <param name="word">word to search</param>
/// <param name="MaxHits">maximal hits to return</param>
/// <param name="partialMatches">true if partial words should be matched also
/// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
/// <param name="titleOnly">true if only titles should be searched</param>
/// <returns>false if the index tree is deeper than two levels (unsupported), otherwise true</returns>
/// <remarks>
/// A depth of 1 means the root node itself is a leaf node; a depth of 2 means the root
/// is an index node whose entries point to leaf nodes.
/// </remarks>
private bool SearchSingleWord(string word, int MaxHits, bool partialMatches, bool titleOnly)
{
    string wordLower = word.ToLower();

    if (_header.Depth > 2)
    {
        // unsupported index depth
        Trace.WriteLine("FullTextSearcher.SearchSingleWord() - Failed with message: Unsupported index depth !");
        Trace.WriteLine("File: " + _associatedFile.ChmFilePath);
        Trace.WriteLine(" ");
        return(false);
    }

    // The reader is handed to helpers by ref, so it cannot be a using-variable;
    // close it explicitly instead (this also closes the underlying memory stream).
    BinaryReader binReader = new BinaryReader(new MemoryStream(_binaryFileData));
    try
    {
        // seek to root node
        binReader.BaseStream.Seek(_header.RootOffset, SeekOrigin.Begin);

        if (_header.Depth == 1)
        {
            // No index level: the root node is the (only) leaf node, read it directly.
            ReadLeafNode(ref binReader, word, MaxHits, partialMatches, titleOnly);
        }
        else if (_header.Depth > 1)
        {
            binReader.ReadInt16(); // free space of the index node (not needed)

            for (int i = 0; i < _header.PageCount; ++i)
            {
                // extract one index entry
                int nWLength = (int)binReader.ReadByte();
                binReader.ReadByte(); // position byte (not needed for index entries)
                string sName = BinaryReaderHelp.ExtractString(ref binReader, nWLength - 1, 0, true, _header.TextEncoder);
                int nLeafOffset = binReader.ReadInt32();
                binReader.ReadInt16(); // unknown

                if (sName.CompareTo(wordLower) >= 0)
                {
                    // remember where the next index entry starts
                    long curPos = binReader.BaseStream.Position;

                    // jump to the leaf node this entry points to and scan it
                    binReader.BaseStream.Seek(nLeafOffset, SeekOrigin.Begin);
                    ReadLeafNode(ref binReader, word, MaxHits, partialMatches, titleOnly);

                    // return to the index node and continue with the next entry
                    binReader.BaseStream.Seek(curPos, SeekOrigin.Begin);
                }
            }
        }

        return(true);
    }
    finally
    {
        binReader.Close();
    }
}
/// <summary>
/// Splits a string block into its individual strings and stores each one in the
/// string dictionary, keyed by its offset.
/// </summary>
/// <param name="stringBlock">byte array holding the string block</param>
/// <param name="nStringOffset">offset of this block; the position inside the block is added
/// to it to form the offset of each string (only read here)</param>
/// <param name="nSubsetOffset">offset of a preceding string that had no terminator (0 if none).
/// When non-zero, the next string is stored under this offset instead of its own.
/// Updated after every string: 0 if a terminator was found, otherwise that string's offset.</param>
/// <returns>always true</returns>
/// <remarks>If a string crosses the end of a block then it will be cut off
/// without a NT and repeated in full, with a NT, at the start of the next block.
/// For eg "To customize the appearance of a contents file" might become
/// "To customize the (block ending)To customize the appearance of a contents file"
/// when there are 17 bytes left at the end of the block. </remarks>
private bool DecodeBlock(byte[] stringBlock, ref int nStringOffset, ref int nSubsetOffset)
{
    MemoryStream blockStream = new MemoryStream(stringBlock);
    BinaryReader reader = new BinaryReader(blockStream);

    while (blockStream.Position < blockStream.Length)
    {
        // offset of the string that starts at the current position
        int startOffset = nStringOffset + (int)blockStream.Position;

        bool terminated = false;
        string text = BinaryReaderHelp.ExtractString(ref reader, ref terminated, 0, true, _associatedFile.TextEncoding);

        // a pending unterminated string is replaced by its full repetition
        int keyOffset = (nSubsetOffset != 0) ? nSubsetOffset : startOffset;
        _stringDictionary[keyOffset.ToString()] = text;

        nSubsetOffset = terminated ? 0 : startOffset;
    }

    return(true);
}
/// <summary>
/// Decodes one block of binary index data and appends an <c>IndexItem</c> for every
/// entry in it to the index list.
/// </summary>
/// <param name="dataBlock">block of data</param>
/// <param name="nOffset">current file offset (not used for decoding)</param>
/// <param name="indexBlocks">number of index blocks (not used for decoding)</param>
/// <returns>always true</returns>
/// <remarks>
/// While <c>_readListingBlocks</c> is set the block is read as a listing block
/// (previous/next block indices in the header, see-also strings HTML-decoded, entry index
/// stored per entry). Otherwise it is read as an index block (child block index in the
/// header, entry index fixed to -1). Once a listing block with entries and without a
/// successor has been read, <c>_readListingBlocks</c> is cleared.
/// </remarks>
private bool DecodeBlock(byte[] dataBlock, ref int nOffset, int indexBlocks)
{
    MemoryStream blockStream = new MemoryStream(dataBlock);
    BinaryReader reader = new BinaryReader(blockStream);

    // block header
    int freeBytes = reader.ReadInt16();   // length of free space at the end of the block
    int entryCount = reader.ReadInt16();  // number of entries in the block

    int nextBlockIndex = -1;
    if (_readListingBlocks)
    {
        reader.ReadInt32();                  // index of previous block, -1 if this is the header
        nextBlockIndex = reader.ReadInt32(); // index of next block, -1 if this is the last block
    }
    else
    {
        reader.ReadInt32();                  // index of child block
    }

    bool lastListingBlockSeen = false;

    for (int entry = 0; entry < entryCount; entry++)
    {
        bool listing = _readListingBlocks;
        if (listing)
        {
            // only evaluated when the block actually contains entries
            lastListingBlockSeen = (nextBlockIndex == -1);
        }

        // fields shared by both block kinds
        string keyWord = BinaryReaderHelp.ExtractUTF16String(ref reader, 0, true, _associatedFile.TextEncoding);
        bool isSeeAlso = (reader.ReadInt16() != 0);
        int indent = reader.ReadInt16();
        int charIndex = reader.ReadInt32();
        reader.ReadInt32(); // skipped
        int pairCount = reader.ReadInt32();

        int[] topics = new int[pairCount];
        string[] seeAlso = new string[pairCount];

        for (int pair = 0; pair < pairCount; pair++)
        {
            if (!isSeeAlso)
            {
                topics[pair] = reader.ReadInt32();
                continue;
            }

            string target = BinaryReaderHelp.ExtractUTF16String(ref reader, 0, true, _associatedFile.TextEncoding);
            // only listing blocks get their see-also text HTML-decoded
            seeAlso[pair] = listing ? HttpUtility.HtmlDecode(target) : target;
        }

        // entry trailer differs between the two block kinds
        int entryIndex;
        if (listing)
        {
            reader.ReadInt32(); // unknown
            entryIndex = reader.ReadInt32();
        }
        else
        {
            reader.ReadInt32(); // index of child block for this entry
            entryIndex = -1;
        }

        _indexList.Add(new IndexItem(_associatedFile, keyWord, isSeeAlso, indent, charIndex, entryIndex, seeAlso, topics));
    }

    // consume the free space at the end of the block
    reader.ReadBytes(freeBytes);

    if (lastListingBlockSeen)
    {
        _readListingBlocks = false;
    }

    return(true);
}
/// <summary>
/// Decodes the s/r encoded WordCodeList (=wcl) and creates or updates hit entries.
/// </summary>
/// <param name="wclBytes">wcl encoded byte array</param>
/// <param name="MaxHits">maximal number of hits; once this many hits exist, decoding stops
/// at the first document that would need a new hit entry</param>
/// <param name="word">the word to find (passed on to the hit rating)</param>
/// <remarks>
/// Each wcl record holds a delta encoded document index, a code count and that many
/// location codes, followed by padding up to the next byte boundary. The rating of the
/// document's hit is updated once per location code.
/// </remarks>
private void DecodeWCL(byte[] wclBytes, int MaxHits, string word)
{
    // Expand the bytes to one array element per bit, most significant bit first,
    // because the s/r reader addresses single bits.
    byte[] wclBits = new byte[wclBytes.Length * 8];
    for (int i = 0; i < wclBytes.Length; i++)
    {
        for (int j = 0; j < 8; j++)
        {
            wclBits[(i * 8) + j] = (byte)((wclBytes[i] >> (7 - j)) & 1);
        }
    }

    int nBitIdx = 0;
    int nDocIdx = 0; // delta encoded

    while (nBitIdx < wclBits.Length)
    {
        nDocIdx += BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleDocumentIndex, _header.RootDocumentIndex, ref nBitIdx);
        int nCodeCnt = BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleCodeCount, _header.RootCodeCount, ref nBitIdx);

        // The word locations themselves are not used, but they have to be read
        // to advance the bit index past them.
        for (int locidx = 0; locidx < nCodeCnt; locidx++)
        {
            BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleLocationCodes, _header.RootLocationCodes, ref nBitIdx);
        }

        // apply padding: every record starts on a byte boundary
        while ((nBitIdx % 8) != 0)
        {
            nBitIdx++;
        }

        // Record hit
        HitHelper hitObj = DocumentHit(nDocIdx);
        bool isNewHit = (hitObj == null);

        if (isNewHit)
        {
            // '>=' so that never more than MaxHits entries are created
            if (_hitsHelper.Count >= MaxHits)
            {
                return;
            }

            TopicEntry topic = (TopicEntry)(_associatedFile.TopicsFile.TopicTable[nDocIdx]);
            hitObj = new HitHelper(nDocIdx, topic.Title, topic.Locale, _associatedFile.CompileFile, topic.URL, 0.0);
        }

        for (int k = 0; k < nCodeCnt; k++)
        {
            hitObj.UpdateRating(word);
        }

        if (isNewHit)
        {
            _hitsHelper.Add(hitObj);
        }
    }
}
/// <summary>
/// Scans the leaf node at the reader's current position and decodes the word code list
/// of every entry whose word matches the searched word.
/// </summary>
/// <param name="binReader">reference to the reader, positioned at the start of the leaf node</param>
/// <param name="word">word to search</param>
/// <param name="MaxHits">maximal hits to return</param>
/// <param name="partialMatches">true if partial words should be matched also
/// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
/// <param name="titleOnly">true if only entries from titles should be considered</param>
/// <remarks>
/// Scanning stops at an entry with a word length of 0 or at the end of the stream.
/// </remarks>
private void ReadLeafNode(ref BinaryReader binReader, string word, int MaxHits, bool partialMatches, bool titleOnly)
{
    // node header, none of it is needed for scanning
    binReader.ReadInt32(); // offset of the next page
    binReader.ReadInt16(); // unknown
    binReader.ReadInt16(); // free space

    string previousWord = "";
    string needle = word.ToLower();

    while (binReader.BaseStream.Position < binReader.BaseStream.Length)
    {
        int entryLength = (int)binReader.ReadByte();
        if (entryLength == 0)
        {
            break;
        }

        // position handed to CombineStrings; presumably how much of the previous word is reused
        int combinePosition = (int)binReader.ReadByte();
        string wordPart = BinaryReaderHelp.ExtractString(ref binReader, entryLength - 1, 0, true, _header.TextEncoder);

        int context = (int)binReader.ReadByte(); // 0...body tag, 1...title tag, others unknown
        BinaryReaderHelp.ReadENCINT(ref binReader); // number of wcl entries (not needed)
        int wclOffset = binReader.ReadInt32();
        binReader.ReadInt16(); // unknown
        long wclByteCount = BinaryReaderHelp.ReadENCINT(ref binReader);

        previousWord = (combinePosition > 0)
            ? CombineStrings(previousWord, wordPart, combinePosition)
            : wordPart;

        bool matches = partialMatches
            ? (previousWord.IndexOf(needle) >= 0)
            : (previousWord == needle);

        if (!matches)
        {
            continue;
        }

        if (titleOnly && (context != 1))
        {
            continue;
        }

        // remember where the next entry starts
        long resumePosition = binReader.BaseStream.Position;

        // found the word, decode its word code list
        binReader.BaseStream.Seek(wclOffset, SeekOrigin.Begin);
        byte[] wclBytes = binReader.ReadBytes((int)wclByteCount);
        DecodeWCL(wclBytes, MaxHits, word);

        // back to the leaf node, continue with the next entry
        binReader.BaseStream.Seek(resumePosition, SeekOrigin.Begin);
    }
}
/// <summary>
/// Decodes one entry of the #SYSTEM file and stores its value in the matching field.
/// </summary>
/// <param name="binReader">binary reader reference, positioned at the start of an entry</param>
/// <returns>always true</returns>
/// <remarks>
/// Every entry consists of a 16 bit code, a 16 bit length and <c>length</c> bytes of data.
/// Exactly <c>length</c> data bytes are consumed for every code, so the reader stays
/// aligned on the next entry even for codes that are not evaluated.
/// </remarks>
private bool DecodeEntry(ref BinaryReader binReader)
{
    int code = (int)binReader.ReadInt16();   // entry code, WORD
    int length = (int)binReader.ReadInt16(); // length of entry

    switch (code)
    {
        case 0:
            _contentsFile = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
            break;

        case 1:
            _indexFile = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
            break;

        case 2:
            _defaultTopic = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
            break;

        case 3:
            _title = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
            break;

        case 4:
        {
            // DWORD LCID; it also selects the text encoding used for all following strings
            _culture = new CultureInfo(binReader.ReadInt32());
            _associatedFile.TextEncoding = Encoding.GetEncoding(_culture.TextInfo.ANSICodePage);

            _dbcs = (binReader.ReadInt32() == 1);           // DWORD DBCS
            _fullTextSearch = (binReader.ReadInt32() == 1); // DWORD Fulltext search
            _hasKLinks = (binReader.ReadInt32() != 0);      // DWORD has klinks
            _hasALinks = (binReader.ReadInt32() != 0);      // DWORD has alinks

            // skip the rest of code 4 (not important for us)
            int remaining = length - (5 * 4);
            if (remaining > 0)
            {
                binReader.ReadBytes(remaining);
            }
            break;
        }

        case 5:
            _defaultWindow = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
            break;

        case 6:
            _compileFile = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
            break;

        case 7:
            if ((_fileVersion > 2) && (length >= 4))
            {
                _binaryIndexURLTableID = (uint)binReader.ReadInt32();
                if (length > 4)
                {
                    // keep the reader aligned if the entry is longer than the DWORD
                    binReader.ReadBytes(length - 4);
                }
            }
            else
            {
                binReader.ReadBytes(length);
            }
            break;

        case 9:
            _compilerVersion = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
            break;

        case 11:
            if ((_fileVersion > 2) && (length >= 4))
            {
                _binaryTOCURLTableID = (uint)binReader.ReadInt32();
                if (length > 4)
                {
                    // keep the reader aligned if the entry is longer than the DWORD
                    binReader.ReadBytes(length - 4);
                }
            }
            else
            {
                binReader.ReadBytes(length);
            }
            break;

        case 16:
            // Default Font=string,number,number
            // The string is the name of the font, the first number is the
            // point size & the last number is the character set used by the font.
            // For acceptable values see *_CHARSET defines in wingdi.h from the
            // Windows SDK or the same file in MinGW or Wine.
            // Most of the time you will only want to use 0, which is the value for ANSI,
            // which is the subset of ASCII used by Windows.
            _defaultFont = BinaryReaderHelp.ExtractString(ref binReader, length, 0, true, _associatedFile.TextEncoding);
            break;

        case 8:  // abbreviation (not interesting for us)
        case 10: // timestamp of the file (not interesting for us)
        case 12: // number of information bytes
        case 13: // copy of file #idxhdr
        case 14: // custom tabs for HH viewer
        case 15: // a checksum
        default: // unknown code
            // skip the entry data
            binReader.ReadBytes(length);
            break;
    }

    return(true);
}
/// <summary>
/// Decodes a block of url-string data and stores every string found in the url and
/// framename dictionaries, keyed by the offset of its entry.
/// </summary>
/// <param name="dataBlock">block of data</param>
/// <param name="nOffset">file offset of this block; 0 marks the first block (only read here)</param>
/// <returns>always true</returns>
/// <remarks>
/// Each entry is read as two 32 bit offsets (url and frame name, both ignored) followed by
/// one string. The same string is stored in both dictionaries. An empty string ends the
/// decoding of the block.
/// </remarks>
private bool DecodeBlock(byte[] dataBlock, ref int nOffset)
{
    int baseOffset = nOffset;

    MemoryStream blockStream = new MemoryStream(dataBlock);
    BinaryReader reader = new BinaryReader(blockStream);

    if (baseOffset == 0)
    {
        reader.ReadByte(); // first block starts with an unknown byte
    }

    // an entry needs at least the two leading DWORDs
    long lastEntryStart = blockStream.Length - 8;

    while (blockStream.Position < lastEntryStart)
    {
        string entryKey = (baseOffset + (int)blockStream.Position).ToString();

        // NOTE: the two offsets could be used to detect where the URL/FrameName pairs end
        // and the local strings start (both smaller than the current offset); that
        // detection is not implemented, the values are skipped.
        reader.ReadInt32(); // url offset
        reader.ReadInt32(); // frame name offset

        bool terminated = false;
        string text = BinaryReaderHelp.ExtractString(ref reader, ref terminated, 0, true, _associatedFile.TextEncoding);

        if (text == "")
        {
            // nothing more to read: move to the last byte so the loop condition fails
            blockStream.Seek(blockStream.Length - 1, SeekOrigin.Begin);
            continue;
        }

        _urlDictionary[entryKey] = text;
        _framenameDictionary[entryKey] = text;
    }

    return(true);
}