/// <summary>
/// Rewinds <c>target</c> and indexes it from the start. Every loop pass opens a new
/// <see cref="GZipStream"/> over the wrapped stream, passes a reader on it to
/// <c>IIndexer.HandleEntry</c>, and then reads the decompressed data to its end.
/// The collected requests and responses are assigned to <c>WarcRequests</c> and
/// <c>WarcResponses</c> once the end of <c>target</c> has been reached.
/// </summary>
public override void Scan()
{
    target.Position = 0;
    StreamBrake brake = new StreamBrake(target); //GZIP Streams need to be wrapped, otherwise they won't play nicely...
    long counter = 0;
    List<WarcRequest> requests = new List<WarcRequest>();
    List<WarcResponse> responses = new List<WarcResponse>();

    Debug.Write("Parsing WARC..");
    while (target.Position != target.Length)
    {
        // Progress indicator: one dot for every 2000 loop passes.
        if (counter++ % 2000 == 0)
        {
            Debug.Write(".");
        }

        // Remember where this pass started (as reported by the brake) before decompressing.
        GzipLocation location = new GzipLocation(target, brake.Position);

        // NOTE(review): neither the GZipStream nor the StreamReader is disposed. Disposing them
        // would also close the stream they wrap unless a leaveOpen overload is used, so this is
        // left unchanged - confirm how StreamBrake reacts to Close/Dispose before changing it.
        GZipStream gzStream = new GZipStream(brake, CompressionMode.Decompress);
        StreamReader str = new StreamReader(gzStream, Encoding.ASCII);

        IIndexer.HandleEntry(str, location, requests, responses);

        // Read whatever HandleEntry left unread until the decompressed stream is exhausted.
        // Presumably this is what positions the underlying stream at the next record - TODO confirm.
        while (str.ReadLine() != null)
        {
        }
    }

    WarcRequests = requests;
    WarcResponses = responses;
}
/// <summary>
/// Reads the first two lines of a WARC record from <paramref name="str"/>, determines the
/// record type from the second line, and constructs the matching record object from the
/// reader. Requests and responses are tagged with <paramref name="location"/> and appended
/// to the supplied lists; warcinfo, metadata and resource records are constructed but not kept.
/// </summary>
/// <param name="str">Reader positioned at the first line of a record.</param>
/// <param name="location">Location value stored on the created record objects.</param>
/// <param name="requests">List that receives parsed request records.</param>
/// <param name="responses">List that receives parsed response records.</param>
/// <exception cref="InvalidDataException">
/// The first line is missing or does not start with "WARC", or the second line is missing
/// or has no second space-separated token.
/// </exception>
/// <exception cref="NotImplementedException">The record type is not one of the handled types.</exception>
protected static void HandleEntry(StreamReader str, GzipLocation location, List<WarcRequest> requests, List<WarcResponse> responses)
{
    // ReadLine returns null at end of stream; report that as bad data rather than
    // letting it surface as a NullReferenceException.
    string warcVersion = str.ReadLine();
    if (warcVersion == null || !warcVersion.StartsWith("WARC", StringComparison.Ordinal))
    {
        throw new InvalidDataException("doesn't look like a WARC file to me.");
    }

    // The record type is the second space-separated token of the second line.
    string typeLine = str.ReadLine();
    string[] typeParts = typeLine == null ? new string[0] : typeLine.Split(' ');
    if (typeParts.Length < 2)
    {
        throw new InvalidDataException("WARC record is missing its type header.");
    }
    string type = typeParts[1];

    switch (type)
    {
        case "warcinfo":
            // NOTE: several warcinfo records in one file ("megawarcs") cannot be detected here,
            // because no state survives between calls; such a check has to live in the caller.
            WarcInfo info = new WarcInfo(str);
            info.GzipLocation = location;
            break;
        case "request":
            WarcRequest requ = new WarcRequest(str);
            requ.GzipLocation = location;
            requests.Add(requ);
            break;
        case "response":
            WarcResponse resp = new WarcResponse(str);
            resp.GzipLocation = location;
            responses.Add(resp);
            break;
        case "metadata":
            new WarcMetadata(str); //I don't care about metadata, so I'll just discard these.
            break;
        case "resource":
            new WarcResource(str); //The same applies to resources.
            break;
        default:
            throw new NotImplementedException(string.Format("Don't know what {0} means.", type));
    }
}
/// <summary>
/// Indexes <c>parentStream</c> from its beginning. For every pass the GZIP header is decoded
/// field by field into this object's members, the payload is exposed through a
/// <see cref="DeflateStream"/> and handed to <c>IIndexer.HandleEntry</c>, and finally two
/// 32-bit values are read from the last eight bytes of the entry (as located through
/// <c>CompressedSize</c>). The gathered records are published through
/// <c>WarcResponses</c> and <c>WarcRequests</c>.
/// </summary>
/// <exception cref="InvalidDataException">
/// The magic number does not match, or an extra field is present whose length is not 12.
/// </exception>
public override void Scan()
{
    List<WarcRequest> foundRequests = new List<WarcRequest>();
    List<WarcResponse> foundResponses = new List<WarcResponse>();

    parentStream.Position = 0;
    while (parentStream.Position != parentStream.Length)
    {
        // Capture the starting offset before any header byte is consumed.
        GzipLocation entryStart = new GzipLocation(parentStream, parentStream.Position);
        BinaryReader reader = new BinaryReader(parentStream);

        ushort magic = reader.ReadUInt16();
        if (magic != GzipUtils.GzipMagicNumber)
        {
            throw new InvalidDataException("GZIP Header not found.");
        }

        method = (CompressionMethod)reader.ReadByte();

        // Flag byte: one bit per optional header feature.
        byte flags = reader.ReadByte();
        isText = (flags & 0x01) != 0;
        hasChecksum = (flags & 0x02) != 0;
        hasExtraFields = (flags & 0x04) != 0;
        hasOriginalFileNameString = (flags & 0x08) != 0;
        containsComment = (flags & 0x10) != 0;

        lastModificationTime = reader.ReadUInt32();
        extraFlags = reader.ReadByte();
        operatingSystem = (GzipOperatingSystem)reader.ReadByte();

        // Optional sections follow in the same order as their flag bits.
        if (hasExtraFields)
        {
            extraSize = reader.ReadUInt16();
            extraDataOffset = reader.BaseStream.Position;
            if (extraSize != 12)
            {
                throw new InvalidDataException("That doesn't look like a WARC file, because the Gzip Extra Data Length seems uncommon.");
            }
            extraData = new WARC_EXTRA_DATA(reader.ReadBytes(extraSize));
        }

        if (hasOriginalFileNameString)
        {
            originalFilename = readNullTerminatedString(reader);
        }

        if (containsComment)
        {
            comment = readNullTerminatedString(reader);
        }

        if (hasChecksum)
        {
            crc = reader.ReadUInt16();
        }

        // The header is done; the reader below decompresses straight from the parent stream.
        DeflateStream inflater = new DeflateStream(parentStream, CompressionMode.Decompress);
        StreamReader payloadReader = new StreamReader(inflater, Encoding.ASCII);
        IIndexer.HandleEntry(payloadReader, entryStart, foundRequests, foundResponses);

        // Jump to the final eight bytes of this entry, regardless of how far decompression read.
        // NOTE(review): CompressedSize is defined elsewhere; presumably it is derived from the
        // extra data parsed above - confirm what happens when no extra field is present.
        reader.BaseStream.Position = entryStart.offset + (CompressedSize - 8);
        crc32 = reader.ReadUInt32();
        uncompressedSize = reader.ReadUInt32();
    }

    WarcResponses = foundResponses;
    WarcRequests = foundRequests;
}