/// <summary>
/// Builds a <see cref="WarcItem"/> from the fields of this raw CDX entry.
/// </summary>
/// <param name="folder">Directory that this entry's file name is combined with.</param>
/// <param name="fileNameBytes">
/// Caller-held cache key: the raw bytes of the most recently resolved file name.
/// Replaced with a copy of this entry's file name bytes when they differ.
/// </param>
/// <param name="fileNameString">
/// Caller-held cache value: the combined path that belongs to <paramref name="fileNameBytes"/>.
/// Recomputed together with the key.
/// </param>
/// <returns>The populated item; its <c>WarcFile</c> is the (possibly cached) combined path.</returns>
public WarcItem ToWarcItem(string folder, ref byte[] fileNameBytes, ref string fileNameString)
{
    // The cache is valid only when the stored bytes equal this entry's file name.
    var cachedNameMatches =
        fileNameBytes != null &&
        fileNameBytes.Length == this.FileName.Length &&
        this.FileName.Bytes.BlockEquals((ReadOnlySpan<byte>)fileNameBytes.Slice());

    if (!cachedNameMatches)
    {
        // Refresh both halves of the cache so the next entry with the same
        // file name can reuse the combined path string.
        fileNameBytes = new byte[this.FileName.Length];
        this.FileName.Bytes.CopyTo(fileNameBytes);
        fileNameString = Path.Combine(folder, this.FileName.ToString());
    }

    // Fields are converted in the same order in which they are assigned below.
    var url = this.OriginalUrl.ToString();
    var compressedOffset = Utf8Utils.ParseInt64(this.CompressedArcFileOffset);
    var compressedLength = Utf8Utils.ParseInt64(this.CompressedRecordSize);
    var date = ParseDate(this.Date);

    // An empty payload-length field is reported as -1.
    long payloadLength = -1;
    if (this.PayloadLength.Length != 0)
    {
        payloadLength = Utf8Utils.ParseInt64(this.PayloadLength);
    }

    // A last-modified field of at most one byte is treated as "no value".
    DateTime? lastModified = null;
    if (this.LastModified.Length > 1)
    {
        lastModified = ParseDate(this.LastModified);
    }

    var contentType = this.MimeTypeOfOriginalDocument.ToStringCached();

    // An empty response-code field maps to the default HttpStatusCode value.
    var responseCode = default(HttpStatusCode);
    if (this.ResponseCode.Length != 0)
    {
        responseCode = (HttpStatusCode)Utf8Utils.ParseInt32(this.ResponseCode);
    }

    return new WarcItem()
    {
        Url = url,
        CompressedOffset = compressedOffset,
        CompressedLength = compressedLength,
        Date = date,
        PayloadLength = payloadLength,
        WarcFile = fileNameString,
        LastModified = lastModified,
        ContentType = contentType,
        ResponseCode = responseCode,
    };
}
/// <summary>
/// Parses a date field that is either a 14-byte concatenated timestamp or a
/// number of seconds since 1970-01-01 UTC.
/// </summary>
/// <param name="date">The raw date field.</param>
/// <returns>The parsed date.</returns>
/// <exception cref="ArgumentException">
/// The value is not 14 bytes long and its numeric value is not below 1980_00_00_000000.
/// </exception>
private DateTime ParseDate(Utf8String date)
{
    // Exactly 14 bytes: handled by the concatenated-date parser
    // (presumably yyyyMMddHHmmss - confirm against Utf8Utils.ParseDateConcatenated).
    if (date.Length == 14)
    {
        return Utf8Utils.ParseDateConcatenated(date);
    }

    // Any other length is parsed as a number; values below the threshold
    // are interpreted as seconds since the Unix epoch.
    var num = Utf8Utils.ParseInt64(date);
    if (num < 1980_00_00_000000)
    {
        return new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc).AddSeconds(num);
    }

    throw new ArgumentException(
        $"Unsupported date value '{date.ToString()}': expected a 14-character timestamp or a number of seconds since 1970-01-01.",
        nameof(date));
}
/// <summary>
/// Parses a date made of space- and colon-delimited tokens: a leading token that is
/// discarded, then day, month (via <c>ParseMonth</c>), year, and hour:minute:second.
/// </summary>
/// <param name="str">The date text; it is consumed token by token.</param>
/// <returns>The parsed date with <see cref="DateTimeKind.Utc"/>.</returns>
internal static DateTime ParseHttpDate(Utf8String str)
{
    // The first token is skipped (presumably the weekday name, e.g. "Sun," - confirm).
    Utf8Utils.ReadTo(ref str, (byte)' ');

    var dayToken = Utf8Utils.ReadTo(ref str, (byte)' ');
    var day = Utf8Utils.ParseInt32(dayToken);

    var monthToken = Utf8Utils.ReadTo(ref str, (byte)' ');
    var month = ParseMonth(monthToken);

    var yearToken = Utf8Utils.ReadTo(ref str, (byte)' ');
    var year = Utf8Utils.ParseInt32(yearToken);

    // Time of day: hour and minute end at ':', seconds end at the next space.
    var hour = Utf8Utils.ParseInt32(Utf8Utils.ReadTo(ref str, (byte)':'));
    var minute = Utf8Utils.ParseInt32(Utf8Utils.ReadTo(ref str, (byte)':'));
    var second = Utf8Utils.ParseInt32(Utf8Utils.ReadTo(ref str, (byte)' '));

    return new DateTime(year, month, day, hour, minute, second, DateTimeKind.Utc);
}
/// <summary>
/// Builds the virtual file system tree from a CDX index file.
/// </summary>
/// <remarks>
/// Entries whose response code is present and outside 200-299 are skipped. Every remaining
/// entry is mapped to a path, tagged as virtual and registered in <c>urlToFsNode</c>.
/// A lazily populated <c>_raw</c> folder mirroring the tree (with <c>Tag = null</c>) is added
/// to the root, and a stream cache is set up that rewrites HTML for virtual nodes and copies
/// the stored stream for everything else.
/// </remarks>
/// <param name="cdx">Path of the CDX file; WARC file names are resolved relative to its directory.</param>
public WarcFs(string cdx)
{
    this.cdx = cdx;

    // One-entry file-name cache shared by all ToWarcItem calls below.
    byte[] fileNameBytes = null;
    string fileNameString = null;
    var folder = Path.GetDirectoryName(cdx);

    this.Root = CreateTree<WarcItem>(
        WarcCdxItemRaw.Read(cdx).Select(x =>
        {
            var response = x.ResponseCode;
            if (response.Length != 0)
            {
                // Only 2xx responses become file system entries.
                var responseCode = Utf8Utils.ParseInt32(response);
                if (responseCode < 200 || responseCode >= 300)
                {
                    return null;
                }
            }

            return x.ToWarcItem(folder, ref fileNameBytes, ref fileNameString);
        }).Where(x => x != null),
        x =>
        {
            var url = new Uri(x.Url);

            // Per-site value forwarded to WebsiteScraper.GetPathInternal; -1 when no rule matches.
            // URL hosts and paths are compared ordinally: they are identifiers, not linguistic text.
            var keep = -1;
            if (url.AbsolutePath.StartsWith("/w/images/", StringComparison.Ordinal))
            {
                keep = 2;
            }
            else if (url.AbsolutePath.StartsWith("/wiki/", StringComparison.Ordinal))
            {
                keep = 1;
            }
            else if (url.Host.EndsWith(".fbcdn.net", StringComparison.Ordinal))
            {
                keep = 0;
            }
            else if (url.Host.EndsWith(".media.tumblr.com", StringComparison.Ordinal))
            {
                keep = 0;
            }
            else if (url.Host.EndsWith(".bp.blogspot.com", StringComparison.Ordinal))
            {
                keep = 0;
            }
            else if (url.Host.EndsWith(".reddit.com", StringComparison.Ordinal) && url.AbsolutePath.Contains("/comments/"))
            {
                keep = 3;
            }
            else if (url.Host.EndsWith(".staticflickr.com", StringComparison.Ordinal))
            {
                keep = 0;
            }
            else if (url.Host.EndsWith(".giphy.com", StringComparison.Ordinal) && url.Host.Contains("media"))
            {
                keep = 0;
            }

            var path = WebsiteScraper.GetPathInternal(null, url, x.ContentType, keep);
            path = path.Replace('/', '\\');

            if (path.Length > 150)
            {
                // Long paths: everything after the '‽' marker is replaced by a hash of the
                // full path, keeping the extension.
                // NOTE(review): string.GetHashCode is not guaranteed to be stable across
                // processes/runtimes, so these names may differ between runs.
                var z = path.IndexOf('‽');
                if (z != -1)
                {
                    path = path.Substring(0, z) + "‽{" + Math.Abs((long)path.GetHashCode()) + "}" + Path.GetExtension(path);
                }
            }

            if (url.IsHostedOn("facebook.com") && url.AbsolutePath.StartsWith("/pages_reaction_units/", StringComparison.Ordinal))
            {
                // These responses are converted to HTML by the cache below, so expose them as .html.
                path = path.TrimEnd(".js");
                path += ".html";
            }

            return path;
        },
        null,
        x =>
        {
            x.Tag = TagVirtual;
            if (x.Info != null)
            {
                urlToFsNode[x.Info.Url] = x;
            }
        });

    // Declared (and assigned null) before the local function is invoked, because the
    // lambdas created inside it capture rawRoot to exclude it from the mirrored tree.
    FsNode<WarcItem> rawRoot = null;
    rawRoot = new FsNode<WarcItem>()
    {
        Name = "_raw",
        GetChildrenDelegate = CreateGetChildrenDelegate(this.Root)
    };

    // Returns a delegate that lazily mirrors the children of 'reference' as untagged nodes.
    Func<List<FsNode<WarcItem>>> CreateGetChildrenDelegate(FsNode<WarcItem> reference)
    {
        if (reference.Children == null)
        {
            return () => null;
        }

        return () => reference.Children
            .Where(x => x != rawRoot)
            .Select(x => new FsNode<WarcItem>()
            {
                Info = x.Info,
                Name = x.Name,
                GetChildrenDelegate = CreateGetChildrenDelegate(x),
                Tag = null,
                FullName = x.FullName != null ? "_raw\\" + x.FullName : null
            })
            .ToList();
    }

    this.Root.Children.Add(rawRoot);

    cache = new MemoryStreamCache<FsNode<WarcItem>>((item, dest) =>
    {
        if (item.Tag == TagVirtual)
        {
            var ct = item.Info.ContentType;
            var isHtml = ct != null && ct.Contains("/html");
            if (isHtml || item.Info.Url.Contains("facebook.com/pages_reaction_units/"))
            {
                HtmlNode doc;
                var pagePath = item.FullName;
                if (item.Info.Url.Contains("/pages_reaction_units/"))
                {
                    // The response is text containing a JSON object; the HTML fragments found
                    // under "domops" are appended to the body of a synthetic document.
                    var jsontext = item.Info.ReadText();
                    var idx = jsontext.IndexOf('{');
                    var json = (JObject)HttpUtils.ReadJsonToken(jsontext, idx);
                    doc = new HtmlDocument("<!doctype html><html><head><meta charset=\"utf-8\"></head><body></body></html>").DocumentNode;
                    doc.OwnerDocument.SetPageUrl(item.Info.Url.AsUri());
                    var body = doc.Descendants("body").First();
                    foreach (var domop in (JArray)json["domops"])
                    {
                        var html = ((JArray)domop).First(x => x is JObject)["__html"].Value<string>();
                        body.AppendChild(html.AsHtmlNode());
                    }
                }
                else
                {
                    doc = item.Info.ReadHtml();
                }

                ProcessHtml(ref doc, pagePath);

                // NOTE(review): hard-coded, machine-specific stylesheet location.
                var simpleStyle = doc.OwnerDocument.CreateElement("link");
                simpleStyle.SetAttributeValue("rel", "stylesheet");
                simpleStyle.SetAttributeValue("href", @"file:///C:\Users\Andrea\Desktop\facebook-simple-css.css");
                (doc.FindSingle("head") ?? doc).AppendChild(simpleStyle);

                // leaveOpen: true - the destination stream must stay usable after the writer is disposed.
                using (var sw = new StreamWriter(dest, Encoding.UTF8, 16 * 1024, true))
                {
                    doc.WriteTo(sw);
                }

                return;
            }
        }

        // Everything else is served as the stored bytes.
        using (var k = item.Info.OpenStream())
        {
            k.CopyTo(dest);
        }
    });
}
/// <summary>
/// Scans the given WARC files record by record and writes a CDX index with one line per
/// response record.
/// </summary>
/// <remarks>
/// The index is written to <c>cdx + ".tmp"</c> and only replaces <paramref name="cdx"/> once
/// every file has been processed. Each line holds, separated by single spaces: URL (the
/// Shaman URI header wins over the target URL when present), record start offset, compressed
/// record length, record date or <c>-</c>, WARC file name, response code or <c>-</c>, content
/// type, payload length, last-modified date or <c>-</c>, followed by a trailing space.
/// A failure that leaves the WARC stream positioned at its end is reported as a truncated
/// file and does not abort the run; any other failure propagates.
/// </remarks>
/// <param name="cdx">Destination path of the CDX file.</param>
/// <param name="warcs">Paths of the WARC files to index.</param>
public static void GenerateCdx(string cdx, IEnumerable<string> warcs)
{
    var scratchpad = new Scratchpad();
    var buf = new byte[16 * 1024];
    using (var output = File.Open(cdx + ".tmp", FileMode.Create, FileAccess.Write, FileShare.Delete | FileShare.Read))
    {
        using (var writer = new Utf8StreamWriter(output))
        {
            writer.WriteClrStringLine(WarcColumns);
            foreach (var warc in warcs)
            {
                Console.WriteLine(Path.GetFileName(warc));
                using (var warcStream = File.Open(warc, FileMode.Open, FileAccess.Read, FileShare.Delete | FileShare.Read))
                {
                    try
                    {
                        var warcname = new Utf8String(Path.GetFileName(warc));
                        while (warcStream.Position != warcStream.Length)
                        {
                            var startPosition = warcStream.Position;
                            long warcContentLength = -1;
                            long payloadLength = -1;
                            int responseCode = -1;
                            var contentType = Utf8String.Empty;

                            // 14-byte buffer for the record date; a leading 0 byte means "no date seen".
                            var date = scratchpad.Use(14);
                            date[0] = 0;

                            Utf8String url = Utf8String.Empty;
                            Utf8String shamanUrl = Utf8String.Empty;
                            DateTime? lastModified = null;
                            bool isresponse = false;

                            // Each record is decompressed with its own GZipStream; the file stream is left open.
                            using (var gz = new GZipStream(warcStream, CompressionMode.Decompress, true))
                            {
                                using (var reader = new Utf8StreamReader(gz, true))
                                {
                                    // WARC record header: read lines up to the first empty one.
                                    while (true)
                                    {
                                        if (reader.IsCompleted)
                                        {
                                            throw new EndOfStreamException();
                                        }

                                        var line = reader.ReadLine();
                                        if (line.Length == 0)
                                        {
                                            break;
                                        }

                                        if (line.Equals(Warc_Response))
                                        {
                                            isresponse = true;
                                        }

                                        if (line.StartsWith(Warc_ContentLength))
                                        {
                                            warcContentLength = Utf8Utils.ParseInt64(WarcItem.GetHeaderValue(line));
                                        }
                                        else if (line.StartsWith(Warc_Date))
                                        {
                                            // Copy the digit groups at fixed offsets into the 14-byte buffer,
                                            // skipping the separator byte between them
                                            // (presumably "yyyy-MM-ddTHH:mm:ss" -> "yyyyMMddHHmmss" - confirm).
                                            var val = WarcItem.GetHeaderValue(line).Bytes;
                                            val.Slice(0, 4).CopyTo(date.Slice(0));
                                            val.Slice(5, 2).CopyTo(date.Slice(4));
                                            val.Slice(8, 2).CopyTo(date.Slice(6));
                                            val.Slice(11, 2).CopyTo(date.Slice(8));
                                            val.Slice(14, 2).CopyTo(date.Slice(10));
                                            val.Slice(17, 2).CopyTo(date.Slice(12));
                                        }
                                        else if (line.StartsWith(Warc_URL))
                                        {
                                            url = scratchpad.Copy(WarcItem.GetHeaderValue(line));
                                        }

                                        // Checked independently of the chain above.
                                        if (line.StartsWith(Warc_Shaman_URI))
                                        {
                                            shamanUrl = scratchpad.Copy(WarcItem.GetHeaderValue(line));
                                        }
                                    }

                                    if (warcContentLength == -1)
                                    {
                                        throw new InvalidOperationException("WARC record header does not specify a content length.");
                                    }

                                    if (isresponse)
                                    {
                                        using (var s = WarcItem.OpenHttp(reader, scratchpad, new Uri(url.ToString()), warcContentLength, out var payloadLengthFromHeader, out _, out responseCode, out contentType, out lastModified, null))
                                        {
                                            // Drain the payload only to measure its length.
                                            long l = 0;
                                            while (true)
                                            {
                                                var m = s.Read(buf, 0, buf.Length);
                                                if (m == 0)
                                                {
                                                    break;
                                                }

                                                l += m;
                                            }

                                            payloadLength = l;
                                            if (payloadLengthFromHeader != -1 && payloadLengthFromHeader != payloadLength)
                                            {
                                                throw new InvalidDataException("Content-Length mismatch.");
                                            }
                                        }

                                        // The record body must be followed by CR LF CR LF.
                                        var cr = reader.ReadByte();
                                        if (cr != 13)
                                        {
                                            throw new InvalidDataException("Expected CR after the HTTP payload.");
                                        }

                                        var lf = reader.ReadByte();
                                        if (lf != 10)
                                        {
                                            throw new InvalidDataException("Expected LF after the HTTP payload.");
                                        }

                                        cr = reader.ReadByte();
                                        if (cr != 13)
                                        {
                                            throw new InvalidDataException("Expected a second CR after the HTTP payload.");
                                        }

                                        lf = reader.ReadByte();
                                        if (lf != 10)
                                        {
                                            throw new InvalidDataException("Expected a second LF after the HTTP payload.");
                                        }
                                    }
                                    else
                                    {
                                        // Non-response record: skip exactly warcContentLength bytes.
                                        var remaining = warcContentLength;
                                        while (remaining != 0)
                                        {
                                            var m = reader.Read((int)Math.Min(remaining, int.MaxValue));
                                            if (m.Count == 0)
                                            {
                                                throw new EndOfStreamException("WARC record body ended before its declared length.");
                                            }

                                            remaining -= m.Count;
                                        }

                                        var e = reader.ReadLine();
                                        if (e.Length != 0)
                                        {
                                            throw new InvalidDataException("Expected an empty line after the WARC record body.");
                                        }

                                        e = reader.ReadLine();
                                        if (e.Length != 0)
                                        {
                                            throw new InvalidDataException("Expected a second empty line after the WARC record body.");
                                        }

                                        // One more read; the reader must then report completion.
                                        reader.ReadLine();
                                        if (!reader.IsCompleted)
                                        {
                                            throw new InvalidDataException("Unexpected data after the WARC record trailer.");
                                        }
                                    }

                                    // The decompressed record must be fully consumed at this point.
                                    var end = reader.ReadByte();
                                    if (end != -1)
                                    {
                                        throw new InvalidDataException("Unexpected data at the end of the WARC record.");
                                    }
                                }

                                // Rewind by the bytes the gzip stream read but did not use, so the file
                                // stream is positioned for the following record.
                                warcStream.Position -= GetRemainingUnusedBytes(gz);
                            }

                            if (isresponse)
                            {
                                if (shamanUrl.Length > 0)
                                {
                                    writer.Write(shamanUrl);
                                }
                                else
                                {
                                    writer.Write(url);
                                }

                                writer.Write((byte)' ');
                                writer.Write(startPosition);
                                writer.Write((byte)' ');
                                writer.Write(warcStream.Position - startPosition);
                                writer.Write((byte)' ');
                                if (date[0] != 0)
                                {
                                    writer.Write(date);
                                }
                                else
                                {
                                    writer.Write((byte)'-');
                                }

                                writer.Write((byte)' ');
                                writer.Write(warcname);
                                writer.Write((byte)' ');
                                if (responseCode != -1)
                                {
                                    writer.Write(responseCode);
                                }
                                else
                                {
                                    writer.Write((byte)'-');
                                }

                                writer.Write((byte)' ');
                                writer.Write(contentType);
                                writer.Write((byte)' ');
                                writer.Write(payloadLength);
                                writer.Write((byte)' ');
                                if (lastModified != null)
                                {
                                    WriteDate(writer, lastModified.Value);
                                }
                                else
                                {
                                    writer.Write((byte)'-');
                                }

                                writer.Write((byte)' ');
                                writer.WriteLine();
                            }

                            scratchpad.Reset();
                        }
                    }
                    catch
                    {
                        // Deliberately broad: any failure that leaves the stream at its end is
                        // treated as a truncated archive; everything else is rethrown unchanged.
                        if (warcStream.Position == warcStream.Length)
                        {
                            Console.WriteLine("WARNING: truncated WARC.");
                        }
                        else
                        {
                            throw;
                        }
                    }
                }
            }
        }
    }

    // Replace the previous index only after the new one is complete.
    File.Delete(cdx);
    File.Move(cdx + ".tmp", cdx);
}
/// <summary>
/// Reads an HTTP status line and headers from <paramref name="httpReader"/> and returns a
/// stream over the response body, with chunked transfer encoding and gzip/brotli content
/// encoding unwrapped.
/// </summary>
/// <param name="httpReader">Reader positioned at the start of the HTTP status line.</param>
/// <param name="scratchpad">Storage for the content type copy; when null the content type is not captured.</param>
/// <param name="requestedUrl">URL of the request, used to resolve the Location header.</param>
/// <param name="responseLength">Total length of status line, headers and body, or -1 when unknown.</param>
/// <param name="payloadLength">Content-Length header value, or -1 when absent or when the body is chunked or compressed.</param>
/// <param name="location">Resolved Location header, or null when absent or not a valid URI.</param>
/// <param name="responseCode">Status code parsed from the status line.</param>
/// <param name="contentType">Content-Type value cut at the first space or ';', or empty when not captured.</param>
/// <param name="lastModified">Parsed Last-Modified header, or null when absent or unparsable.</param>
/// <param name="onHttpHeader">Optional callback invoked with the trimmed name and value of each header line.</param>
/// <returns>A stream over the decoded response body.</returns>
/// <exception cref="InvalidDataException">
/// The input ends inside the headers, or an uncompressed, non-chunked body does not match Content-Length.
/// </exception>
public static Stream OpenHttp(Utf8StreamReader httpReader, Scratchpad scratchpad, Uri requestedUrl, long responseLength, out long payloadLength, out Uri location, out int responseCode, out Utf8String contentType, out DateTime? lastModified, Action<Utf8String, Utf8String> onHttpHeader)
{
    var startPosition = httpReader.Position;
    payloadLength = -1;
    location = null;
    lastModified = null;

    // Assigned up front so the out parameter has a defined value when no
    // Content-Type header is seen or no scratchpad was supplied.
    contentType = Utf8String.Empty;

    bool chunked = false;
    bool gzipped = false;
    bool brotli = false;

    // Status line: the code is the text between the first two spaces, or everything
    // after the first space when there is no second one.
    var responseLine = httpReader.ReadLine();
    responseCode = (int)Utf8Utils.ParseInt64(responseLine.TryCaptureBetween((byte)' ', (byte)' ') ?? responseLine.CaptureAfter((byte)' '));

    while (true)
    {
        var line = httpReader.ReadLine();
        if (httpReader.IsCompleted)
        {
            throw new InvalidDataException();
        }

        if (line.Length == 0)
        {
            break;
        }

        if (onHttpHeader != null)
        {
            // Lines without a colon cannot be split into name and value; they are not
            // reported instead of failing in Substring with a negative length.
            var d = line.IndexOf((byte)':');
            if (d >= 0)
            {
                onHttpHeader(line.Substring(0, d).Trim(), line.Substring(d + 1).Trim());
            }
        }

        if (line.StartsWith(Http_TransferEncoding))
        {
            var value = GetHeaderValue(line);
            if (value.Equals("chunked"))
            {
                chunked = true;
            }
        }
        else if (line.StartsWith(Http_ContentLength))
        {
            payloadLength = Utf8Utils.ParseInt64(GetHeaderValue(line));
        }
        else if (line.StartsWith(Http_ContentEncoding))
        {
            var value = GetHeaderValue(line);
            if (value == Http_Gzip)
            {
                gzipped = true;
            }
            else if (value == Http_Brotli)
            {
                brotli = true;
            }
        }
        else if (line.StartsWith(Http_Location))
        {
            var val = GetHeaderValue(line).ToString();
            try
            {
                if (val.StartsWith("//", StringComparison.Ordinal))
                {
                    // Scheme-relative redirect: reuse the scheme of the request.
                    location = new Uri(requestedUrl.Scheme + ":" + val);
                }
                else
                {
                    location = new Uri(requestedUrl, val);
                }
            }
            catch (Exception)
            {
                // Best effort: a Location that cannot be resolved leaves 'location' null.
            }
        }
        else if (line.StartsWith(Http_ContentType) && scratchpad != null)
        {
            // Keep only the media type: cut at the first space, then at the first ';'.
            var value = GetHeaderValue(line);
            value = value.TryCaptureBefore((byte)' ') ?? value;
            value = value.TryCaptureBefore((byte)';') ?? value;
            contentType = scratchpad.Copy(value);
        }
        else if (line.StartsWith(Http_LastModified))
        {
            try
            {
                lastModified = WarcCdxItemRaw.ParseHttpDate(GetHeaderValue(line));
            }
            catch
            {
                // Best effort: an unparsable date leaves 'lastModified' null.
            }
        }
    }

    var compressed = gzipped || brotli;
    if (compressed || chunked)
    {
        // Content-Length does not describe the decoded payload in these cases.
        payloadLength = -1;
    }

    Stream s;
    if (responseLength != -1)
    {
        // The body length is whatever remains of the response after the bytes consumed so far.
        var currentPos = httpReader.Position - startPosition;
        var httpBodyLength = responseLength - currentPos;
        if (!compressed && !chunked && payloadLength != -1 && httpBodyLength != payloadLength)
        {
            throw new InvalidDataException("Unexpected Content-Length.");
        }

        s = new LimitedStream(httpReader, httpBodyLength);
    }
    else
    {
        s = httpReader;
    }

    // Wrapping order matters: chunk framing is removed before decompression.
    if (chunked)
    {
        s = new ChunkedStream(s);
    }

    if (compressed && chunked)
    {
        s = new OnDisposeConsumeStream(s);
    }

    if (gzipped)
    {
        s = new GZipStream(s, CompressionMode.Decompress);
    }
    else if (brotli)
    {
        s = new BrotliStream(s, CompressionMode.Decompress);
    }

    return s;
}
/// <summary>
/// Extracts the numeric value that follows <c>Content-Length:</c> in a WARC record header.
/// </summary>
/// <param name="warcHeader">The header text to search.</param>
/// <returns>The parsed content length.</returns>
private static long ReadWarcRecordContentLength(Utf8String warcHeader)
{
    // The value runs from the header name up to the next line feed.
    var rawValue = warcHeader.CaptureBetween((Utf8String)"Content-Length:", (Utf8String)"\n");
    var trimmedValue = rawValue.Trim();
    return Utf8Utils.ParseInt64(trimmedValue);
}