/// <summary>
/// Creates a mapper from the gap data stored in <paramref name="fileName"/> and
/// builds a sparse index over it: for every <paramref name="strideLength"/>-th
/// item the decompressor position and the sum of all gaps read so far are recorded.
/// </summary>
/// <param name="strideLength">Distance (in items) between two consecutive index entries.</param>
/// <param name="newBaseUid">Stored in the newBaseUid field; not otherwise used here.</param>
/// <param name="oldBaseUid">Stored in the oldBaseUid field; not otherwise used here.</param>
/// <param name="numUrls">Number of gap values that are read from the file.</param>
/// <param name="fileName">Path of the file holding the compressed gap values.</param>
internal UidSpaceMapper(long strideLength, long newBaseUid, long oldBaseUid, long numUrls, string fileName) {
    this.strideLength = strideLength;
    this.numUrls = numUrls;
    this.newBaseUid = newBaseUid;
    this.oldBaseUid = oldBaseUid;

    // NOTE(review): this.bytes / this.deco are used after the file stream has been
    // disposed, so CachedStream presumably holds its own copy of the data -- confirm.
    using (var file = new BufferedStream(new FileStream(fileName, FileMode.Open, FileAccess.Read))) {
        var fileLength = (ulong)file.Length;
        this.bytes = new CachedStream(file, fileLength);
        this.deco = new VarNybbleIntStreamDecompressor(this.bytes);
    }

    // Build the index by decoding this.bytes once from front to back.
    long numIdxItems = (numUrls / strideLength) + 1;
    this.idxGapSum = new ulong[numIdxItems];
    this.idxPosition = new ulong[numIdxItems];

    int slot = 0;
    ulong runningGapSum = 0;
    for (long item = 0; item <= numUrls; item++) {
        bool onStrideBoundary = (item % strideLength) == 0;
        if (onStrideBoundary) {
            this.idxPosition[slot] = this.deco.GetPosition();
            this.idxGapSum[slot] = runningGapSum;
            slot++;
        }

        // The final iteration (item == numUrls) may only record an index entry;
        // there is no further gap value to read.
        if (item < numUrls) {
            runningGapSum += this.deco.GetUInt64();
        }
    }

    Contract.Assert(slot == numIdxItems);
    Contract.Assert(this.deco.AtEnd());
}
private UInt64 word; // bytes 7 (highest-order byte) to lastContBit are payload
#endregion Fields

#region Constructors
/// <summary>
/// Creates a decompressor over <paramref name="main"/> with pos set to 0.
/// </summary>
/// <param name="main">The stream to decode; its Size must be a multiple of 8.</param>
/// <exception cref="FileFormatException">
/// Thrown when the size of <paramref name="main"/> is not a multiple of 8.
/// </exception>
internal VarByteIntStreamDecompressor(CachedStream main) {
    this.main = main;
    this.pos = 0;

    UInt64 streamLength = main.Size;
    bool isWholeWords = (streamLength % 8) == 0;
    if (!isWholeWords) {
        string message = "stream length (" + streamLength + ") is not a multiple of 8";
        throw new FileFormatException(message);
    }

    this.lastContBit = 0;
    this.nextContBit = -1;
}
private UInt64 pos; // mutable state
#endregion Fields

#region Constructors
/// <summary>
/// Creates a decompressor over <paramref name="main"/> with pos initialised to 0.
/// </summary>
/// <param name="main">The stream this decompressor keeps a reference to.</param>
internal DummyIntStreamDecompressor(CachedStream main) {
    this.pos = 0;
    this.main = main;
}
private UInt64 pos; // mutable state

/// <summary>
/// Creates a decompressor over <paramref name="main"/> with pos initialised to 0.
/// </summary>
/// <param name="main">The stream this decompressor keeps a reference to.</param>
internal DummyIntStreamDecompressor(CachedStream main) {
    this.pos = 0;
    this.main = main;
}
/// <summary>
/// Drops the references to the loaded byte data and to both index arrays
/// (idxUrls and idxOffsets) by setting the three fields to null.
/// </summary>
internal void Unload() {
    // Index arrays first, then the raw data; the assignments are independent.
    this.idxOffsets = null;
    this.idxUrls = null;
    this.bytes = null;
}
/// <summary>
/// Reads this cell's URL bytes (numBytes bytes starting at startPos of the cell
/// file) into this.bytes and rebuilds the sparse index idxUrls / idxOffsets.
/// </summary>
/// <remarks>
/// Each URL is stored as two compressed sizes (prefix length, suffix length)
/// followed by the suffix bytes; the URL is the first prefLen bytes of the
/// previously decoded URL followed by that suffix. After every indexStride-th
/// URL an index entry is written holding a copy of that URL and the byte offset
/// at which the run of URLs ending there begins. If the URL count is not a
/// multiple of indexStride, a final sentinel entry is written for the last URL.
/// </remarks>
internal void Load() {
    Contract.Assert(this.numBytes >= 0);
    var sw = System.Diagnostics.Stopwatch.StartNew();
    using (var stream = new BufferedStream(new FileStream(this.cell.fileName, FileMode.Open, FileAccess.Read))) {
        stream.Seek(this.startPos, SeekOrigin.Begin);
        this.bytes = new CachedStream(stream, (ulong)this.numBytes);
    }
    var secs1 = 0.001 * sw.ElapsedMilliseconds;
    sw.Restart();

    // Given the data in this.bytes, construct idxUrls and idxOffsets.
    // Note that for numUrls=0 the rounding-up form below is NOT the same as
    // (int)((this.cell.numUrls - 1) / this.indexStride) + 1 !!
    var numIdxItems = (int)((this.cell.numUrls + this.indexStride - 1) / this.indexStride);
    this.idxUrls = new byte[numIdxItems][];
    this.idxOffsets = new ulong[numIdxItems];

    ulong pos = 0;      // position in this.bytes -- starts at 0
    ulong lastPos = 0;  // position just after the most recent index point
    var res = new byte[0];
    int resLen = 0;
    long cuid = 0;
    while (cuid < this.cell.numUrls) {
        int prefLen = this.ReadCompressedSize(ref pos);
        int suffLen = this.ReadCompressedSize(ref pos);
        resLen = prefLen + suffLen;

        // Enlarge the result buffer if necessary
        if (resLen > res.Length) {
            var tmp = new byte[resLen];
            for (int i = 0; i < prefLen; i++) {  // care only about first prefLen bytes
                tmp[i] = res[i];
            }
            res = tmp;
        }

        // Read the suffix; the shared prefix is already in res[0..prefLen)
        for (int i = prefLen; i < resLen; i++) {
            res[i] = this.bytes.GetUInt8(pos + (ulong)(i - prefLen));
        }
        pos += (ulong)suffLen;

        if (++cuid % indexStride == 0) {
            // Save an index item
            int idx = (int)(cuid / indexStride) - 1;
            idxUrls[idx] = SubArray(res, 0, resLen);
            idxOffsets[idx] = lastPos;
            lastPos = pos;
        }
    }

    // Finally, store a sentinel
    if (cuid % indexStride != 0) {
        idxUrls[numIdxItems - 1] = SubArray(res, 0, resLen);
        idxOffsets[numIdxItems - 1] = lastPos;
    }
    var secs2 = 0.001 * sw.ElapsedMilliseconds;
    //Console.Error.WriteLine("PERF: Cell {0} url portion: Loading took {1} seconds, indexing took {2} seconds", this.cell.fileName, secs1, secs2);

    // Contract.ForAll only returns a bool; it must be wrapped in Contract.Assert,
    // otherwise the result is discarded and nothing is actually checked.
    Contract.Assert(Contract.ForAll(this.idxUrls, url => url != null));
}
private byte[][] idxUrls; // each byte[] is logically a UTF8-encoded string
#endregion Fields

#region Constructors
/// <summary>
/// Creates an unloaded URL cell, reading its header from <paramref name="rd"/>:
/// first a 64-bit byte count, then a 32-bit index stride.
/// </summary>
/// <param name="cell">The owning cell; stored in the cell field.</param>
/// <param name="rd">Reader positioned at this cell's header values.</param>
internal UrlCell(Cell cell, BinaryReader rd) {
    // Header values must be consumed in exactly this order.
    var byteCount = rd.ReadInt64();
    var stride = rd.ReadInt32();

    this.cell = cell;
    this.numBytes = byteCount;
    this.indexStride = stride;
    this.startPos = -1;

    // Nothing is loaded yet: data, index arrays and cache all start out as null.
    this.bytes = null;
    this.idxUrls = null;
    this.idxOffsets = null;
    this.hashToUidCache = null;

    this.hasher = new Hash64();
}
/// <summary>
/// Reads this link cell's bytes (numBytes bytes starting at startPos of the cell
/// file) into this.bytes and rebuilds idxOffsets, which records the decompressor
/// position at every indexStride-th PUID.
/// </summary>
internal void Load() {
    var sw = Stopwatch.StartNew();

    // Somewhat of a hack: an unsealed LinkCell (numBytes == -1) is skipped entirely.
    // This is OK as long as no one tries to use the cell subsequently.
    // A better solution would be not to call Load on unsealed LinkCell objects.
    if (this.numBytes == -1) {
        return;
    }

    using (var stream = new BufferedStream(new FileStream(this.cell.fileName, FileMode.Open, FileAccess.Read, FileShare.Read))) {
        stream.Seek(this.startPos, SeekOrigin.Begin);
        this.bytes = new CachedStream(stream, (ulong)this.numBytes);
    }
    var secs1 = 0.001 * sw.ElapsedMilliseconds;
    sw.Restart();

    // Construct idxOffsets from the freshly loaded bytes.
    // NOTE(review): for supraPuid == 0 and indexStride > 1 this expression yields 1,
    // although the loop below then writes no entry -- confirm that is intended.
    var supraPuid = this.cell.part.ping.PUID(this.cell.supraUID);
    long numIdxItems = (int)((supraPuid - 1) / this.indexStride) + 1;
    this.idxOffsets = new ulong[numIdxItems];

    var decoder = this.NewDecompressor();
    int nextSlot = 0;
    for (long puid = 0; puid < supraPuid; puid++) {
        bool onStrideBoundary = (puid % this.indexStride) == 0;
        if (onStrideBoundary) {
            this.idxOffsets[nextSlot] = decoder.GetPosition();
            nextSlot++;
        }

        // Skip over this page's links: a count followed by that many values.
        // The values themselves are irrelevant here (don't care that the first gap is signed).
        uint linkCount = decoder.GetUInt32();
        for (uint remaining = linkCount; remaining > 0; remaining--) {
            decoder.GetUInt64();
        }
    }
    var secs2 = 0.001 * sw.ElapsedMilliseconds;
    //Console.Error.WriteLine("PERF: Cell {0} {1} portion: Loading took {2} seconds, indexing took {3} seconds", this.cell.fileName, this == this.cell.fwdCell ? "fwd" : "bwd", secs1, secs2);
}
/// <summary>
/// Creates an unloaded link cell, reading its header from <paramref name="rd"/>:
/// a 64-bit byte count, a 64-bit link count, a 32-bit index stride and a
/// 32-bit unsigned compression code, in that order.
/// </summary>
/// <param name="cell">The owning cell; stored in the cell field.</param>
/// <param name="rd">Reader positioned at this cell's header values.</param>
internal LinkCell(Cell cell, BinaryReader rd) {
    // Header values must be consumed in exactly this order.
    var byteCount = rd.ReadInt64();
    var linkCount = rd.ReadInt64();
    var stride = rd.ReadInt32();
    var rawCompressionCode = rd.ReadUInt32();

    this.cell = cell;
    this.numBytes = byteCount;
    this.numLinks = linkCount;
    this.indexStride = stride;
    this.compressionCode = (LinkCompression)rawCompressionCode;
    this.startPos = -1;

    // Nothing is loaded yet.
    this.bytes = null;
    this.idxOffsets = null;
}