/// <summary>
/// Builds a CDX index for a set of WARC files.
/// The index is first written to <c>cdx + ".tmp"</c>; once every WARC has been processed,
/// any existing file at <paramref name="cdx"/> is deleted and the temporary file is moved
/// into its place.
/// </summary>
/// <remarks>
/// The first output line is <c>WarcColumns</c>. After that, one line is written per record
/// that carries the <c>Warc_Response</c> header line; other records are read and validated
/// but not indexed. Fields are space separated, in this order: URL (the <c>Warc_Shaman_URI</c>
/// value when present, otherwise the <c>Warc_URL</c> value), start offset of the record in
/// the WARC file, number of bytes the record occupies in the WARC file, 14-byte date (or
/// <c>-</c>), WARC file name, HTTP response code (or <c>-</c>), content type, payload length,
/// last-modified date (or <c>-</c>). Each line ends with a trailing space before the newline.
/// The file name of every WARC is printed to the console as it is processed.
/// If an exception is raised while the WARC file position is at the end of the file, the
/// file is treated as truncated: a warning is printed and processing moves on to the next
/// WARC. Any other exception propagates and the temporary file is left behind.
/// </remarks>
/// <param name="cdx">Path of the index file to create or replace.</param>
/// <param name="warcs">Paths of the WARC files to index, processed in enumeration order.</param>
/// <exception cref="EndOfStreamException">A record ends before its headers or body are complete.</exception>
/// <exception cref="InvalidOperationException">A record has no <c>Warc_ContentLength</c> header.</exception>
/// <exception cref="InvalidDataException">
/// A record is not terminated as expected, or the HTTP payload length disagrees with the
/// length reported by the HTTP headers.
/// </exception>
public static void GenerateCdx(string cdx, IEnumerable<string> warcs)
{
    var scratchpad = new Scratchpad();
    var buf = new byte[16 * 1024];

    using (var output = File.Open(cdx + ".tmp", FileMode.Create, FileAccess.Write, FileShare.Delete | FileShare.Read))
    using (var writer = new Utf8StreamWriter(output))
    {
        writer.WriteClrStringLine(WarcColumns);

        foreach (var warc in warcs)
        {
            Console.WriteLine(Path.GetFileName(warc));

            using (var warcStream = File.Open(warc, FileMode.Open, FileAccess.Read, FileShare.Delete | FileShare.Read))
            {
                try
                {
                    var warcname = new Utf8String(Path.GetFileName(warc));

                    // Every pass of this loop consumes exactly one record of the WARC file.
                    while (warcStream.Position != warcStream.Length)
                    {
                        var startPosition = warcStream.Position;
                        long warcContentLength = -1;
                        long payloadLength = -1;
                        int responseCode = -1;
                        var contentType = Utf8String.Empty;

                        // A zero first byte marks "no date header seen".
                        var date = scratchpad.Use(14);
                        date[0] = 0;

                        Utf8String url = Utf8String.Empty;
                        Utf8String shamanUrl = Utf8String.Empty;
                        DateTime? lastModified = null;
                        bool isResponse = false;

                        // A fresh decompressor is opened per record; warcStream is left open.
                        using (var gz = new GZipStream(warcStream, CompressionMode.Decompress, true))
                        {
                            using (var reader = new Utf8StreamReader(gz, true))
                            {
                                // WARC headers: read up to the first empty line.
                                while (true)
                                {
                                    if (reader.IsCompleted)
                                    {
                                        throw new EndOfStreamException();
                                    }

                                    var line = reader.ReadLine();
                                    if (line.Length == 0)
                                    {
                                        break;
                                    }

                                    if (line.Equals(Warc_Response))
                                    {
                                        isResponse = true;
                                    }

                                    if (line.StartsWith(Warc_ContentLength))
                                    {
                                        warcContentLength = Utf8Utils.ParseInt64(WarcItem.GetHeaderValue(line));
                                    }
                                    else if (line.StartsWith(Warc_Date))
                                    {
                                        // Copies the digit groups at offsets 0, 5, 8, 11, 14 and 17 of the
                                        // header value into 14 contiguous bytes (presumably turning
                                        // "YYYY-MM-DDThh:mm:ss" into "YYYYMMDDhhmmss").
                                        var val = WarcItem.GetHeaderValue(line).Bytes;
                                        val.Slice(0, 4).CopyTo(date.Slice(0));
                                        val.Slice(5, 2).CopyTo(date.Slice(4));
                                        val.Slice(8, 2).CopyTo(date.Slice(6));
                                        val.Slice(11, 2).CopyTo(date.Slice(8));
                                        val.Slice(14, 2).CopyTo(date.Slice(10));
                                        val.Slice(17, 2).CopyTo(date.Slice(12));
                                    }
                                    else if (line.StartsWith(Warc_URL))
                                    {
                                        url = scratchpad.Copy(WarcItem.GetHeaderValue(line));
                                    }

                                    if (line.StartsWith(Warc_Shaman_URI))
                                    {
                                        shamanUrl = scratchpad.Copy(WarcItem.GetHeaderValue(line));
                                    }
                                }

                                if (warcContentLength == -1)
                                {
                                    throw new InvalidOperationException();
                                }

                                if (isResponse)
                                {
                                    using (var s = WarcItem.OpenHttp(reader, scratchpad, new Uri(url.ToString()), warcContentLength, out var payloadLengthFromHeader, out _, out responseCode, out contentType, out lastModified, null))
                                    {
                                        // Drain the HTTP payload only to learn its length.
                                        long totalRead = 0;
                                        while (true)
                                        {
                                            var read = s.Read(buf, 0, buf.Length);
                                            if (read == 0)
                                            {
                                                break;
                                            }

                                            totalRead += read;
                                        }

                                        payloadLength = totalRead;
                                        if (payloadLengthFromHeader != -1 && payloadLengthFromHeader != payloadLength)
                                        {
                                            throw new InvalidDataException("Content-Length mismatch.");
                                        }
                                    }

                                    // The record body must be followed by CR LF CR LF.
                                    if (reader.ReadByte() != 13)
                                    {
                                        throw new InvalidDataException();
                                    }

                                    if (reader.ReadByte() != 10)
                                    {
                                        throw new InvalidDataException();
                                    }

                                    if (reader.ReadByte() != 13)
                                    {
                                        throw new InvalidDataException();
                                    }

                                    if (reader.ReadByte() != 10)
                                    {
                                        throw new InvalidDataException();
                                    }
                                }
                                else
                                {
                                    // Not indexed: skip over the body of the record.
                                    var remaining = warcContentLength;
                                    while (remaining != 0)
                                    {
                                        var chunk = reader.Read((int)Math.Min(remaining, int.MaxValue));
                                        if (chunk.Count == 0)
                                        {
                                            // The data ended before the declared content length was consumed.
                                            throw new EndOfStreamException();
                                        }

                                        remaining -= chunk.Count;
                                    }

                                    // Two empty lines must follow the body...
                                    if (reader.ReadLine().Length != 0)
                                    {
                                        throw new InvalidDataException();
                                    }

                                    if (reader.ReadLine().Length != 0)
                                    {
                                        throw new InvalidDataException();
                                    }

                                    // ...and one more read attempt must leave the reader completed.
                                    reader.ReadLine();
                                    if (!reader.IsCompleted)
                                    {
                                        throw new InvalidDataException();
                                    }
                                }

                                // Nothing else may remain in this record's decompressed data.
                                if (reader.ReadByte() != -1)
                                {
                                    throw new InvalidDataException();
                                }
                            }

                            // Rewind by the amount GetRemainingUnusedBytes reports for the decompressor
                            // (presumably input it buffered past the end of this record) so the next
                            // record starts at the right offset. Must run after the reader is disposed
                            // and before the GZipStream is.
                            warcStream.Position -= GetRemainingUnusedBytes(gz);
                        }

                        if (isResponse)
                        {
                            if (shamanUrl.Length > 0)
                            {
                                writer.Write(shamanUrl);
                            }
                            else
                            {
                                writer.Write(url);
                            }

                            writer.Write((byte)' ');
                            writer.Write(startPosition);
                            writer.Write((byte)' ');
                            writer.Write(warcStream.Position - startPosition);
                            writer.Write((byte)' ');

                            if (date[0] != 0)
                            {
                                writer.Write(date);
                            }
                            else
                            {
                                writer.Write((byte)'-');
                            }

                            writer.Write((byte)' ');
                            writer.Write(warcname);
                            writer.Write((byte)' ');

                            if (responseCode != -1)
                            {
                                writer.Write(responseCode);
                            }
                            else
                            {
                                writer.Write((byte)'-');
                            }

                            writer.Write((byte)' ');
                            writer.Write(contentType);
                            writer.Write((byte)' ');
                            writer.Write(payloadLength);
                            writer.Write((byte)' ');

                            if (lastModified != null)
                            {
                                WriteDate(writer, lastModified.Value);
                            }
                            else
                            {
                                writer.Write((byte)'-');
                            }

                            writer.Write((byte)' ');
                            writer.WriteLine();
                        }

                        scratchpad.Reset();
                    }
                }
                catch
                {
                    // Deliberate best effort: a failure with the file fully consumed is taken to
                    // mean the WARC was cut short. Anything else is a real error.
                    if (warcStream.Position != warcStream.Length)
                    {
                        throw;
                    }

                    Console.WriteLine("WARNING: truncated WARC.");

                    // The failed record never reached its end-of-record reset; do it here so its
                    // scratch allocations do not carry over into the next file.
                    scratchpad.Reset();
                }
            }
        }
    }

    File.Delete(cdx);
    File.Move(cdx + ".tmp", cdx);
}