/// <summary>
/// Walks the target stream from its beginning, decompressing one GZIP member
/// per iteration, and passes each decompressed entry to
/// <c>IIndexer.HandleEntry</c>. The request and response records collected
/// that way are published through <c>WarcRequests</c> and
/// <c>WarcResponses</c> once the whole stream has been consumed.
/// </summary>
public override void Scan()
{
    target.Position = 0;

    // GZIP streams need to be wrapped, otherwise they won't play nicely...
    StreamBrake brake = new StreamBrake(target);

    long counter = 0;
    List<WarcRequest> requests = new List<WarcRequest>();
    List<WarcResponse> responses = new List<WarcResponse>();

    Debug.Write("Parsing WARC..");
    while (target.Position != target.Length)
    {
        // Emit a progress dot for the first entry and every 2000th one after it.
        if (counter++ % 2000 == 0)
        {
            Debug.Write(".");
        }

        // Captured before the decompressor is created, i.e. before anything
        // of this entry has been read through the brake.
        GzipLocation location = new GzipLocation(target, brake.Position);

        // leaveOpen = true: disposing the decompressor must not close the
        // shared brake/target streams, which are reused for the next entry.
        // Disposing per entry releases the inflater deterministically instead
        // of leaving one undisposed decompressor behind for every record.
        using (GZipStream gzStream = new GZipStream(brake, CompressionMode.Decompress, true))
        using (StreamReader str = new StreamReader(gzStream, Encoding.ASCII))
        {
            IIndexer.HandleEntry(str, location, requests, responses);

            // Drain whatever the handler left unread. Presumably this moves
            // the underlying stream to the start of the next GZIP member -
            // TODO confirm against StreamBrake.
            while (str.ReadLine() != null)
            {
            }
        }
    }

    WarcRequests = requests;
    WarcResponses = responses;
}
/// <summary>
/// Reads the first two lines of one decompressed entry from
/// <paramref name="str"/>, takes the record type from the second line and
/// lets the matching record class consume the rest of the reader.
/// Request and response records are tagged with <paramref name="location"/>
/// and appended to the supplied lists; warcinfo, metadata and resource
/// records are parsed and then discarded.
/// </summary>
/// <param name="str">Reader positioned at the first line of the entry.</param>
/// <param name="location">Location assigned to the <c>GzipLocation</c> property of the created record.</param>
/// <param name="requests">List that receives every "request" record.</param>
/// <param name="responses">List that receives every "response" record.</param>
/// <exception cref="InvalidDataException">
/// The reader is empty, the first line does not start with "WARC", or the
/// second line is missing or has no space-separated second token.
/// </exception>
/// <exception cref="NotImplementedException">The record type is not one of the handled types.</exception>
protected static void HandleEntry(StreamReader str, GzipLocation location, List<WarcRequest> requests, List<WarcResponse> responses)
{
    // ReadLine returns null at end of stream; treat that like any other
    // non-WARC input instead of failing with a NullReferenceException.
    string warcVersion = str.ReadLine();
    if (warcVersion == null || !warcVersion.StartsWith("WARC", StringComparison.Ordinal))
    {
        throw new InvalidDataException("doesn't look like a WARC file to me.");
    }

    // The record type is the second space-separated token of the second line.
    string typeLine = str.ReadLine();
    string[] typeTokens = typeLine == null ? new string[0] : typeLine.Split(' ');
    if (typeTokens.Length < 2)
    {
        throw new InvalidDataException("WARC record is missing its type header line.");
    }

    string type = typeTokens[1];
    switch (type)
    {
        case "warcinfo":
            // NOTE(review): the original guarded this case with a
            // "megawarcs are not yet supported" check against a local that
            // was always null, so it could never fire. Detecting a second
            // warcinfo record needs state that outlives a single call.
            WarcInfo info = new WarcInfo(str);
            info.GzipLocation = location;
            break;

        case "request":
            WarcRequest requ = new WarcRequest(str);
            requ.GzipLocation = location;
            requests.Add(requ);
            break;

        case "response":
            WarcResponse resp = new WarcResponse(str);
            resp.GzipLocation = location;
            responses.Add(resp);
            break;

        case "metadata":
            // I don't care about metadata, so I'll just discard these.
            new WarcMetadata(str);
            break;

        case "resource":
            // The same applies to resources.
            new WarcResource(str);
            break;

        default:
            throw new NotImplementedException(string.Format("Don't know what {0} means.", type));
    }
}