// 32-bit optimized version of above
private static int FindMatchLength32(byte[] s1, int s1Index, int s2Index, int s2Limit)
{
    Debug.Assert(s2Limit >= s2Index);

    var length = 0;

    // Compare four bytes at a time while at least a full word remains.
    for (; s2Index <= s2Limit - 4; s2Index += 4, length += 4)
    {
        var ahead = Utilities.GetFourBytes(s1, s2Index);
        var behind = Utilities.GetFourBytes(s1, s1Index + length);
        if (ahead != behind)
        {
            // The words differ: the position of the lowest set bit of the XOR
            // tells how many whole bytes still matched before the difference.
            var diff = ahead ^ behind;
            return length + ((int)Utilities.NumberOfTrailingZeros(diff) >> 3);
        }
    }

    // Fewer than four bytes remain; finish the comparison one byte at a time.
    for (; s2Index < s2Limit; ++s2Index, ++length)
    {
        if (s1[s1Index + length] != s1[s2Index])
        {
            break;
        }
    }

    return length;
}
/// <summary>
/// Compresses one fragment of <paramref name="input"/> into <paramref name="output"/>,
/// returning the output index just past the last byte written. This is a port of
/// Snappy's CompressFragment: it scans for 4-byte matches against earlier positions
/// remembered in <paramref name="hashTable"/>, emitting alternating literal runs
/// (EmitLiteral) and back-reference copies (EmitCopy).
/// </summary>
/// <param name="input">Source buffer.</param>
/// <param name="inputOffset">Start of the fragment within <paramref name="input"/>.</param>
/// <param name="inputSize">Fragment length in bytes; asserted to be at most BlockSize.</param>
/// <param name="output">Destination buffer; the caller is responsible for sizing it.</param>
/// <param name="outputIndex">Current write position within <paramref name="output"/>.</param>
/// <param name="hashTable">Scratch table mapping hashed 4-byte sequences to offsets from
/// the fragment base; its length must be a power of two.
/// NOTE(review): entries are stored as <c>short</c>, so (inputIndex - baseInputIndex)
/// must stay within short range — confirm BlockSize guarantees this.</param>
/// <returns>The output index one past the last compressed byte.</returns>
internal int CompressFragment(byte[] input, int inputOffset, int inputSize, byte[] output, int outputIndex, short[] hashTable)
{
    // "ip" is the input pointer, and "op" is the output pointer
    // (names kept from the original C++ in the comments below).
    var inputIndex = inputOffset;
    Debug.Assert(inputSize <= BlockSize);
    Debug.Assert((hashTable.Length & (hashTable.Length - 1)) == 0, "hashTable size must be a power of 2");
    // With a power-of-two table, shifting a 32-bit hash right by this amount
    // yields an index in [0, hashTable.Length).
    var shift = (int)(32 - Utilities.Log2Floor((uint)hashTable.Length));
    //DCHECK_EQ(static_cast<int>(kuint32max >> shift), table_size - 1);
    var inputEnd = inputOffset + inputSize;
    var baseInputIndex = inputIndex;
    // Bytes in [next_emit, ip) will be emitted as literal bytes. Or
    // [next_emit, ip_end) after the main loop.
    var nextEmitIndex = inputIndex;

    // Fragments shorter than the margin skip straight to the literal tail below;
    // the margin guarantees the 4-byte reads inside the loop stay in bounds.
    if (inputSize >= InputMarginBytes)
    {
        var ipLimit = inputOffset + inputSize - InputMarginBytes;

        var currentIndexBytes = Utilities.GetFourBytes(input, ++inputIndex);
        for (var nextHash = Hash(currentIndexBytes, shift);;)
        {
            Debug.Assert(nextEmitIndex < inputIndex);
            // The body of this loop calls EmitLiteral once and then EmitCopy one or
            // more times. (The exception is that when we're close to exhausting
            // the input we goto emit_remainder.)
            //
            // In the first iteration of this loop we're just starting, so
            // there's nothing to copy, so calling EmitLiteral once is
            // necessary. And we only start a new iteration when the
            // current iteration has determined that a call to EmitLiteral will
            // precede the next call to EmitCopy (if any).
            //
            // Step 1: Scan forward in the input looking for a 4-byte-long match.
            // If we get close to exhausting the input then goto emit_remainder.
            //
            // Heuristic match skipping: If 32 bytes are scanned with no matches
            // found, start looking only at every other byte. If 32 more bytes are
            // scanned, look at every third byte, etc.. When a match is found,
            // immediately go back to looking at every byte. This is a small loss
            // (~5% performance, ~0.1% density) for compressible data due to more
            // bookkeeping, but for non-compressible data (such as JPEG) it's a huge
            // win since the compressor quickly "realizes" the data is incompressible
            // and doesn't bother looking for matches everywhere.
            //
            // The "skip" variable keeps track of how many bytes there are since the
            // last match; dividing it by 32 (ie. right-shifting by five) gives the
            // number of bytes to move ahead for each iteration.
            uint skip = 32;

            var nextIp = inputIndex;
            int candidate;
            do
            {
                inputIndex = nextIp;
                var hash = nextHash;
                Debug.Assert(hash == Hash(Utilities.GetFourBytes(input, inputIndex), shift));
                // Advance by 1 byte for the first 32 probes, then 2, then 3, ...
                nextIp = (int)(inputIndex + (skip++ >> 5));
                if (nextIp > ipLimit)
                {
                    goto emit_remainder;
                }

                currentIndexBytes = Utilities.GetFourBytes(input, nextIp);
                nextHash = Hash(currentIndexBytes, shift);
                candidate = baseInputIndex + hashTable[hash];
                Debug.Assert(candidate >= baseInputIndex);
                Debug.Assert(candidate < inputIndex);

                // Record the current position before testing the candidate.
                hashTable[hash] = (short)(inputIndex - baseInputIndex);
            }
            while (Utilities.GetFourBytes(input, inputIndex) != Utilities.GetFourBytes(input, candidate));

            // Step 2: A 4-byte match has been found. We'll later see if more
            // than 4 bytes match. But, prior to the match, input
            // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
            Debug.Assert(nextEmitIndex + 16 < inputEnd);
            outputIndex = EmitLiteral(output, outputIndex, input, nextEmitIndex, inputIndex - nextEmitIndex, true);

            // Step 3: Call EmitCopy, and then see if another EmitCopy could
            // be our next move. Repeat until we find no match for the
            // input immediately after what was consumed by the last EmitCopy call.
            //
            // If we exit this loop normally then we need to call EmitLiteral next,
            // though we don't yet know how big the literal will be. We handle that
            // by proceeding to the next iteration of the main loop. We also can exit
            // this loop via goto if we get close to exhausting the input.
            uint candidateBytes;
            int insertTail;
            do
            {
                // We have a 4-byte match at ip, and no need to emit any
                // "literal bytes" prior to ip.
                var baseIndex = inputIndex;
                // Extend the match past the guaranteed 4 bytes.
                var matched = 4 + _findMatchLength(input, candidate + 4, inputIndex + 4, inputEnd);
                inputIndex += matched;
                var offset = baseIndex - candidate;
                //DCHECK_EQ(0, memcmp(baseIndex, candidate, matched));
                outputIndex = EmitCopy(output, outputIndex, offset, matched);
                // We could immediately start working at ip now, but to improve
                // compression we first update table[Hash(ip - 1, ...)].
                insertTail = inputIndex - 1;
                nextEmitIndex = inputIndex;
                if (inputIndex >= ipLimit)
                {
                    goto emit_remainder;
                }

                var prevHash = Hash(Utilities.GetFourBytes(input, insertTail), shift);
                hashTable[prevHash] = (short)(inputIndex - baseInputIndex - 1);
                var curHash = Hash(Utilities.GetFourBytes(input, insertTail + 1), shift);
                // Read the old candidate before overwriting the slot with the
                // current position; order matters here.
                candidate = baseInputIndex + hashTable[curHash];
                candidateBytes = Utilities.GetFourBytes(input, candidate);
                hashTable[curHash] = (short)(inputIndex - baseInputIndex);
            }
            while (Utilities.GetFourBytes(input, insertTail + 1) == candidateBytes);

            // No immediate follow-on match; prime the hash for the next scan
            // and resume the outer literal/match loop one byte further on.
            nextHash = Hash(Utilities.GetFourBytes(input, insertTail + 2), shift);
            ++inputIndex;
        }
    }

    emit_remainder:
    // Emit the remaining bytes as a literal
    if (nextEmitIndex < inputEnd)
    {
        outputIndex = EmitLiteral(output, outputIndex, input, nextEmitIndex, inputEnd - nextEmitIndex, false);
    }

    return(outputIndex);
}