Ejemplo n.º 1
0
        /// <summary>
        /// Converts this raw CDX entry into a <see cref="WarcItem"/>.
        /// </summary>
        /// <remarks>
        /// <paramref name="fileNameBytes"/> and <paramref name="fileNameString"/> form a one-entry cache owned
        /// by the caller: when the cached bytes equal this entry's file name the cached string is reused,
        /// otherwise both are overwritten with a copy of the file name bytes and the path obtained by
        /// combining <paramref name="folder"/> with the file name.
        /// </remarks>
        /// <param name="folder">Folder that is combined with the entry's file name.</param>
        /// <param name="fileNameBytes">Bytes of the most recently converted file name, or null.</param>
        /// <param name="fileNameString">Combined path that corresponds to <paramref name="fileNameBytes"/>.</param>
        /// <returns>A new <see cref="WarcItem"/> populated from the fields of this entry.</returns>
        public WarcItem ToWarcItem(string folder, ref byte[] fileNameBytes, ref string fileNameString)
        {
            // The cached string is only valid when the cached bytes match this entry's file name exactly.
            var cacheHit = fileNameBytes != null
                           && fileNameBytes.Length == this.FileName.Length
                           && this.FileName.Bytes.BlockEquals((ReadOnlySpan <byte>)fileNameBytes.Slice());

            if (!cacheHit)
            {
                // Refresh the caller's cache with this entry's file name.
                fileNameBytes = new byte[this.FileName.Length];
                this.FileName.Bytes.CopyTo(fileNameBytes);
                fileNameString = Path.Combine(folder, this.FileName.ToString());
            }

            return new WarcItem
            {
                Url              = this.OriginalUrl.ToString(),
                CompressedOffset = Utf8Utils.ParseInt64(this.CompressedArcFileOffset),
                CompressedLength = Utf8Utils.ParseInt64(this.CompressedRecordSize),
                Date             = ParseDate(this.Date),
                // An empty column maps to -1.
                PayloadLength    = this.PayloadLength.Length == 0 ? -1 : Utf8Utils.ParseInt64(this.PayloadLength),
                WarcFile         = fileNameString,
                // Values of length 0 or 1 map to null.
                LastModified     = this.LastModified.Length <= 1 ? (DateTime?)null : ParseDate(this.LastModified),
                ContentType      = this.MimeTypeOfOriginalDocument.ToStringCached(),
                ResponseCode     = this.ResponseCode.Length == 0 ? default(HttpStatusCode) : (HttpStatusCode)Utf8Utils.ParseInt32(this.ResponseCode),
            };
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Parses a date column that is either a 14-long concatenated timestamp or a number of seconds
        /// since 1970-01-01 00:00:00 UTC.
        /// </summary>
        /// <param name="date">The raw date value.</param>
        /// <returns>The parsed date.</returns>
        /// <exception cref="ArgumentException">
        /// The value is not 14 long and its numeric value is not below 19800000000000.
        /// </exception>
        private DateTime ParseDate(Utf8String date)
        {
            if (date.Length != 14)
            {
                var seconds = Utf8Utils.ParseInt64(date);

                // Anything at or above this bound is rejected rather than interpreted as seconds.
                if (seconds >= 1980_00_00_000000)
                {
                    throw new ArgumentException();
                }

                var epoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);
                return epoch.AddSeconds(seconds);
            }

            // Length 14: presumably yyyyMMddHHmmss - handled by the concatenated-date parser.
            return Utf8Utils.ParseDateConcatenated(date);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Parses a date of the form "&lt;token&gt; &lt;day&gt; &lt;month&gt; &lt;year&gt; &lt;hour&gt;:&lt;minute&gt;:&lt;second&gt; ..."
        /// (presumably the HTTP-date layout, e.g. "Sun, 06 Nov 1994 08:49:37 GMT") into a UTC
        /// <see cref="DateTime"/>.
        /// </summary>
        /// <param name="str">The header value to parse; consumed token by token.</param>
        /// <returns>The parsed date with <see cref="DateTimeKind.Utc"/>.</returns>
        internal static DateTime ParseHttpDate(Utf8String str)
        {
            // The first space-delimited token is read and discarded.
            Utf8Utils.ReadTo(ref str, (byte)' ');

            var dayToken = Utf8Utils.ReadTo(ref str, (byte)' ');
            var day      = Utf8Utils.ParseInt32(dayToken);

            var monthToken = Utf8Utils.ReadTo(ref str, (byte)' ');
            var month      = ParseMonth(monthToken);

            var yearToken = Utf8Utils.ReadTo(ref str, (byte)' ');
            var year      = Utf8Utils.ParseInt32(yearToken);

            var hourToken = Utf8Utils.ReadTo(ref str, (byte)':');
            var hour      = Utf8Utils.ParseInt32(hourToken);

            var minuteToken = Utf8Utils.ReadTo(ref str, (byte)':');
            var minute      = Utf8Utils.ParseInt32(minuteToken);

            var secondToken = Utf8Utils.ReadTo(ref str, (byte)' ');
            var second      = Utf8Utils.ParseInt32(secondToken);

            return new DateTime(year, month, day, hour, minute, second, DateTimeKind.Utc);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Builds a virtual file-system tree from the records listed in a CDX index file.
        /// </summary>
        /// <remarks>
        /// The constructor does three things:
        /// 1. Reads the CDX, drops records whose response code is present and outside 200-299, and places the
        ///    remaining items in a tree keyed by a path derived from each item's URL.
        /// 2. Adds a lazily evaluated "_raw" node that mirrors the tree with <c>Tag = null</c>.
        /// 3. Creates the content cache: nodes tagged <c>TagVirtual</c> whose content is HTML (or a Facebook
        ///    "pages_reaction_units" response) are rewritten before being served; everything else is copied as is.
        /// URL prefix/suffix checks use ordinal comparison so that matching does not depend on the current culture.
        /// </remarks>
        /// <param name="cdx">Path of the CDX index file. Its directory is the folder handed to <c>ToWarcItem</c>.</param>
        public WarcFs(string cdx)
        {
            this.cdx = cdx;

            // One-entry file name cache shared by all ToWarcItem calls made from the projection below.
            byte[] fileNameBytes  = null;
            string fileNameString = null;
            var    folder         = Path.GetDirectoryName(cdx);

            this.Root = CreateTree <WarcItem>(WarcCdxItemRaw.Read(cdx).Select(x =>
            {
                // Skip records that carry a response code outside the 2xx range.
                // Records without a response code are kept.
                var response = x.ResponseCode;
                if (response.Length != 0)
                {
                    var responseCode = Utf8Utils.ParseInt32(response);
                    if (responseCode < 200 || responseCode >= 300)
                    {
                        return(null);
                    }
                }
                return(x.ToWarcItem(folder, ref fileNameBytes, ref fileNameString));
            }).Where(x => x != null), x =>
            {
                // Maps an item to its path inside the virtual tree.
                var url = new Uri(x.Url);

                // Per-site value forwarded to GetPathInternal; -1 is used when no rule matches.
                // NOTE(review): the exact meaning of "keep" is defined by WebsiteScraper.GetPathInternal - confirm there.
                var keep = -1;
                if (url.AbsolutePath.StartsWith("/w/images/", StringComparison.Ordinal))
                {
                    keep = 2;
                }
                else if (url.AbsolutePath.StartsWith("/wiki/", StringComparison.Ordinal))
                {
                    keep = 1;
                }
                else if (url.Host.EndsWith(".fbcdn.net", StringComparison.Ordinal))
                {
                    keep = 0;
                }
                else if (url.Host.EndsWith(".media.tumblr.com", StringComparison.Ordinal))
                {
                    keep = 0;
                }
                else if (url.Host.EndsWith(".bp.blogspot.com", StringComparison.Ordinal))
                {
                    keep = 0;
                }
                else if (url.Host.EndsWith(".reddit.com", StringComparison.Ordinal) && url.AbsolutePath.Contains("/comments/"))
                {
                    keep = 3;
                }
                else if (url.Host.EndsWith(".staticflickr.com", StringComparison.Ordinal))
                {
                    keep = 0;
                }
                else if (url.Host.EndsWith(".giphy.com", StringComparison.Ordinal) && url.Host.Contains("media"))
                {
                    keep = 0;
                }
                var path = WebsiteScraper.GetPathInternal(null, url, x.ContentType, keep);
                path     = path.Replace('/', '\\');

                // Shorten long paths: everything from the '‽' marker onwards is replaced by a hash of the
                // full path, keeping the extension.
                // NOTE(review): string.GetHashCode() is not guaranteed to be stable across processes or
                // runtime versions, so shortened names may differ between runs.
                if (path.Length > 150)
                {
                    var z = path.IndexOf('‽');
                    if (z != -1)
                    {
                        path = path.Substring(0, z) + "‽{" + Math.Abs((long)path.GetHashCode()) + "}" + Path.GetExtension(path);
                    }
                }

                // These Facebook responses are served as HTML by the cache below, so name them accordingly.
                if (url.IsHostedOn("facebook.com") && url.AbsolutePath.StartsWith("/pages_reaction_units/", StringComparison.Ordinal))
                {
                    path  = path.TrimEnd(".js");
                    path += ".html";
                }

                return(path);
            }, null, x =>
            {
                // Every node of the main tree is tagged as virtual; nodes that carry an item are indexed by URL.
                x.Tag = TagVirtual;
                if (x.Info != null)
                {
                    urlToFsNode[x.Info.Url] = x;
                }
            });

            // Declared (and assigned) before the initializer below because the local function captures it.
            FsNode <WarcItem> rawRoot = null;

            rawRoot = new FsNode <WarcItem>()
            {
                Name = "_raw", GetChildrenDelegate = CreateGetChildrenDelegate(this.Root)
            };

            // Returns a delegate that lazily builds untagged copies of reference's children, recursively.
            Func <List <FsNode <WarcItem> > > CreateGetChildrenDelegate(FsNode <WarcItem> reference)
            {
                if (reference.Children == null)
                {
                    return(() => null);
                }
                return(new Func <List <FsNode <WarcItem> > >(() =>
                {
                    // rawRoot is itself a child of Root; exclude it so the mirror does not contain itself.
                    return reference.Children.Where(x => x != rawRoot).Select(x =>
                    {
                        var k = new FsNode <WarcItem>()
                        {
                            Info = x.Info,
                            Name = x.Name,
                            GetChildrenDelegate = CreateGetChildrenDelegate(x),
                            // Not TagVirtual, so the cache below copies the content unmodified.
                            Tag = null,
                            FullName = x.FullName != null ? "_raw\\" + x.FullName : null
                        };
                        return k;
                    }).ToList();
                }));
            }

            this.Root.Children.Add(rawRoot);


            cache = new MemoryStreamCache <FsNode <WarcItem> >((item, dest) =>
            {
                if (item.Tag == TagVirtual)
                {
                    var ct = item.Info.ContentType;
                    if ((ct != null && ct.Contains("/html")) || item.Info.Url.Contains("facebook.com/pages_reaction_units/"))
                    {
                        HtmlNode doc;
                        var pagePath = item.FullName;
                        if (item.Info.Url.Contains("/pages_reaction_units/"))
                        {
                            // The response is text with a JSON object starting at the first '{'.
                            // Each entry of "domops" contains an object whose "__html" fragment is appended
                            // to the body of a synthetic document.
                            var jsontext = item.Info.ReadText();
                            var idx      = jsontext.IndexOf('{');
                            var json     = (JObject)HttpUtils.ReadJsonToken(jsontext, idx);
                            doc          = new HtmlDocument("<!doctype html><html><head><meta charset=\"utf-8\"></head><body></body></html>").DocumentNode;
                            doc.OwnerDocument.SetPageUrl(item.Info.Url.AsUri());
                            var body = doc.Descendants("body").First();

                            foreach (var domop in (JArray)json["domops"])
                            {
                                var html = ((JArray)domop).First(x => x is JObject)["__html"].Value <string>();
                                body.AppendChild(html.AsHtmlNode());
                            }
                        }
                        else
                        {
                            doc = item.Info.ReadHtml();
                        }
                        ProcessHtml(ref doc, pagePath);

                        // NOTE(review): the stylesheet location is a hard-coded, machine-specific path.
                        var simpleStyle = doc.OwnerDocument.CreateElement("link");
                        simpleStyle.SetAttributeValue("rel", "stylesheet");
                        simpleStyle.SetAttributeValue("href", @"file:///C:\Users\Andrea\Desktop\facebook-simple-css.css");
                        (doc.FindSingle("head") ?? doc).AppendChild(simpleStyle);

                        // leaveOpen: true - dest is not closed when the writer is disposed.
                        using (var sw = new StreamWriter(dest, Encoding.UTF8, 16 * 1024, true))
                        {
                            doc.WriteTo(sw);
                        }
                        return;
                    }
                }

                // Everything else is copied unmodified.
                using (var k = item.Info.OpenStream())
                {
                    k.CopyTo(dest);
                }
            });
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Scans the given WARC files and writes a CDX index with one line per response record.
        /// </summary>
        /// <remarks>
        /// The index is written to <c>cdx + ".tmp"</c>; once all files have been processed the existing
        /// <paramref name="cdx"/> file is deleted and the temporary file is moved into its place.
        /// Each output line holds, separated by spaces: URL, start offset of the record in the WARC file,
        /// compressed record length, date (or '-'), WARC file name, response code (or '-'), content type,
        /// payload length, last-modified date (or '-'), followed by a trailing space.
        /// The name of every WARC file is printed to the console as it is processed.
        /// </remarks>
        /// <param name="cdx">Path of the CDX file to create or replace.</param>
        /// <param name="warcs">Paths of the WARC files to index.</param>
        public static void GenerateCdx(string cdx, IEnumerable <string> warcs)
        {
            var scratchpad = new Scratchpad();
            // Scratch buffer used only to drain response payloads while counting their length.
            var buf        = new byte[16 * 1024];

            using (var output = File.Open(cdx + ".tmp", FileMode.Create, FileAccess.Write, FileShare.Delete | FileShare.Read))
            {
                using (var writer = new Utf8StreamWriter(output))
                {
                    // Header line of the index.
                    writer.WriteClrStringLine(WarcColumns);
                    foreach (var warc in warcs)
                    {
                        Console.WriteLine(Path.GetFileName(warc));

                        using (var warcStream = File.Open(warc, FileMode.Open, FileAccess.Read, FileShare.Delete | FileShare.Read))
                        {
                            try
                            {
                                var warcname = new Utf8String(Path.GetFileName(warc));
                                // One iteration per record, until the whole file has been consumed.
                                while (warcStream.Position != warcStream.Length)
                                {
                                    var  startPosition     = warcStream.Position;
                                    long warcContentLength = -1;
                                    long payloadLength     = -1;
                                    int  responseCode      = -1;
                                    var  contentType       = Utf8String.Empty;
                                    // 14 bytes for the date; a zero first byte means "no date header seen".
                                    var  date = scratchpad.Use(14);
                                    date[0] = 0;
                                    Utf8String url          = Utf8String.Empty;
                                    Utf8String shamanUrl    = Utf8String.Empty;
                                    DateTime?  lastModified = null;
                                    bool       isresponse   = false;
                                    // A fresh decompressor per record (presumably each record is its own gzip member);
                                    // leaveOpen: true keeps warcStream usable for the next record.
                                    using (var gz = new GZipStream(warcStream, CompressionMode.Decompress, true))
                                    {
                                        using (var reader = new Utf8StreamReader(gz, true))
                                        {
                                            // Read the WARC headers up to the first empty line.
                                            while (true)
                                            {
                                                if (reader.IsCompleted)
                                                {
                                                    throw new EndOfStreamException();
                                                }
                                                var line = reader.ReadLine();
                                                if (line.Length == 0)
                                                {
                                                    break;
                                                }
                                                if (line.Equals(Warc_Response))
                                                {
                                                    isresponse = true;
                                                }
                                                if (line.StartsWith(Warc_ContentLength))
                                                {
                                                    warcContentLength = Utf8Utils.ParseInt64(WarcItem.GetHeaderValue(line));
                                                }
                                                else if (line.StartsWith(Warc_Date))
                                                {
                                                    // Copy the digit groups at offsets 0, 5, 8, 11, 14 and 17 into the 14-byte
                                                    // date buffer, skipping the separators in between
                                                    // (presumably "YYYY-MM-DDThh:mm:ss" -> "YYYYMMDDhhmmss").
                                                    var val = WarcItem.GetHeaderValue(line).Bytes;
                                                    val.Slice(0, 4).CopyTo(date.Slice(0));
                                                    val.Slice(5, 2).CopyTo(date.Slice(4));
                                                    val.Slice(8, 2).CopyTo(date.Slice(6));
                                                    val.Slice(11, 2).CopyTo(date.Slice(8));
                                                    val.Slice(14, 2).CopyTo(date.Slice(10));
                                                    val.Slice(17, 2).CopyTo(date.Slice(12));
                                                }
                                                else if (line.StartsWith(Warc_URL))
                                                {
                                                    url = scratchpad.Copy(WarcItem.GetHeaderValue(line));
                                                }
                                                if (line.StartsWith(Warc_Shaman_URI))
                                                {
                                                    shamanUrl = scratchpad.Copy(WarcItem.GetHeaderValue(line));
                                                }
                                            }
                                            // A record without a content length cannot be processed.
                                            if (warcContentLength == -1)
                                            {
                                                throw new InvalidOperationException();
                                            }


                                            if (isresponse)
                                            {
                                                // Parse the HTTP response and drain its payload to measure the payload length.
                                                using (var s = WarcItem.OpenHttp(reader, scratchpad, new Uri(url.ToString()), warcContentLength, out var payloadLengthFromHeader, out var redirectLocation, out responseCode, out contentType, out lastModified, null))
                                                {
                                                    long l = 0;
                                                    while (true)
                                                    {
                                                        var m = s.Read(buf, 0, buf.Length);
                                                        if (m == 0)
                                                        {
                                                            break;
                                                        }
                                                        l += m;
                                                    }
                                                    payloadLength = l;
                                                    // When OpenHttp reports a payload length, it must match what was actually read.
                                                    if (payloadLengthFromHeader != -1 && payloadLengthFromHeader != payloadLength)
                                                    {
                                                        throw new Exception("Content-Length mismatch.");
                                                    }
                                                }
                                                //var httpData = new LimitedStream(reader, contentLength);
                                                // The record body must be followed by exactly CR LF CR LF.
                                                var cr = reader.ReadByte();
                                                if (cr != 13)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                var lf = reader.ReadByte();
                                                if (lf != 10)
                                                {
                                                    throw new InvalidDataException();
                                                }


                                                cr = reader.ReadByte();
                                                if (cr != 13)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                lf = reader.ReadByte();
                                                if (lf != 10)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                //if (reader.ReadByte() != 13) throw new Exception();
                                                //if (reader.ReadByte() != 10) throw new Exception();
                                            }
                                            else
                                            {
                                                // Not a response record: skip its content without interpreting it.
                                                var remaining = warcContentLength;
                                                while (remaining != 0)
                                                {
                                                    var m = reader.Read((int)Math.Min(remaining, int.MaxValue));
                                                    if (m.Count == 0)
                                                    {
                                                        throw new Exception();
                                                    }
                                                    remaining -= m.Count;
                                                }

                                                // Two empty lines must follow the content, and one more ReadLine must
                                                // leave the reader completed.
                                                var e = reader.ReadLine();
                                                if (e.Length != 0)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                e = reader.ReadLine();
                                                if (e.Length != 0)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                                e = reader.ReadLine();
                                                if (!reader.IsCompleted)
                                                {
                                                    throw new InvalidDataException();
                                                }
                                            }

                                            //var r = reader.RemainingBufferedData;
                                            // Nothing may remain in the decompressed stream of this record.
                                            var end = reader.ReadByte();
                                            if (end != -1)
                                            {
                                                throw new InvalidDataException();
                                            }
                                            //Console.WriteLine($"Remaining: {r.Length}");
                                        }



                                        // Rewind by the amount reported by GetRemainingUnusedBytes (not shown here;
                                        // presumably input the decompressor read past the end of this record) so that
                                        // warcStream.Position points at the start of the next record.
                                        warcStream.Position -= GetRemainingUnusedBytes(gz);
                                    }



                                    // Only response records produce an index line.
                                    if (isresponse)
                                    {
                                        // The Shaman URI header, when present, takes precedence over the WARC URL header.
                                        if (shamanUrl.Length > 0)
                                        {
                                            writer.Write(shamanUrl);
                                        }
                                        else
                                        {
                                            writer.Write(url);
                                        }
                                        writer.Write((byte)' ');
                                        writer.Write(startPosition);
                                        writer.Write((byte)' ');
                                        // Compressed length of the record within the WARC file.
                                        writer.Write(warcStream.Position - startPosition);
                                        writer.Write((byte)' ');
                                        if (date[0] != 0)
                                        {
                                            writer.Write(date);
                                        }
                                        else
                                        {
                                            writer.Write((byte)'-');
                                        }
                                        writer.Write((byte)' ');
                                        writer.Write(warcname);
                                        writer.Write((byte)' ');
                                        if (responseCode != -1)
                                        {
                                            writer.Write(responseCode);
                                        }
                                        else
                                        {
                                            writer.Write((byte)'-');
                                        }
                                        writer.Write((byte)' ');
                                        writer.Write(contentType);
                                        writer.Write((byte)' ');
                                        writer.Write(payloadLength);
                                        writer.Write((byte)' ');
                                        if (lastModified != null)
                                        {
                                            WriteDate(writer, lastModified.Value);
                                        }
                                        else
                                        {
                                            writer.Write((byte)'-');
                                        }
                                        writer.Write((byte)' ');

                                        writer.WriteLine();
                                    }
                                    // Release the per-record scratch allocations (date, url, shamanUrl, ...).
                                    scratchpad.Reset();
                                }
                            }
                            catch
                            {
                                // Any failure that occurs once the whole file has been read is reported as a
                                // truncated WARC and processing continues with the next file; the lines already
                                // written for this file are kept. Failures elsewhere are rethrown.
                                if (warcStream.Position == warcStream.Length)
                                {
                                    Console.WriteLine("WARNING: truncated WARC.");;
                                }
                                else
                                {
                                    throw;
                                }
                            }
                        }
                    }
                }
            }
            // Replace the previous index with the freshly written one.
            File.Delete(cdx);
            File.Move(cdx + ".tmp", cdx);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Parses the status line and headers of an HTTP response from <paramref name="httpReader"/> and
        /// returns a stream over the decoded response body.
        /// </summary>
        /// <param name="httpReader">Reader positioned at the start of the HTTP status line.</param>
        /// <param name="scratchpad">Used to copy the content type; when null the content type is not captured.</param>
        /// <param name="requestedUrl">URL of the request, used to resolve a Location header.</param>
        /// <param name="responseLength">
        /// Total length of the HTTP response (status line, headers and body), or -1 when unknown. When known,
        /// the body is limited to the bytes that remain after the headers.
        /// </param>
        /// <param name="payloadLength">
        /// Value of the Content-Length header, or -1 when it is absent or when the body is chunked or compressed.
        /// </param>
        /// <param name="location">Resolved Location header, or null when absent or not parseable.</param>
        /// <param name="responseCode">Status code taken from the status line.</param>
        /// <param name="contentType">
        /// Content type without parameters, or <c>Utf8String.Empty</c> when there is no Content-Type header
        /// or <paramref name="scratchpad"/> is null.
        /// </param>
        /// <param name="lastModified">Parsed Last-Modified header, or null when absent or not parseable.</param>
        /// <param name="onHttpHeader">Optional callback invoked with the trimmed name and value of every header line.</param>
        /// <returns>A stream that yields the response body with chunking and gzip/brotli encoding removed.</returns>
        /// <exception cref="InvalidDataException">
        /// The reader completed before the end of the headers, or the Content-Length header disagrees with
        /// the body length derived from <paramref name="responseLength"/>.
        /// </exception>
        public static Stream OpenHttp(Utf8StreamReader httpReader, Scratchpad scratchpad, Uri requestedUrl, long responseLength, out long payloadLength, out Uri location, out int responseCode, out Utf8String contentType, out DateTime?lastModified, Action <Utf8String, Utf8String> onHttpHeader)
        {
            var startPosition = httpReader.Position;

            payloadLength = -1;
            location      = null;
            lastModified  = null;
            // Out parameters must be assigned on every path; not every response has a Content-Type header.
            contentType   = Utf8String.Empty;

            bool chunked      = false;
            bool gzipped      = false;
            bool brotli       = false;
            var  responseLine = httpReader.ReadLine();

            // The status code sits between the first two spaces, or after the first space when there is
            // no second one.
            responseCode = (int)Utf8Utils.ParseInt64(responseLine.TryCaptureBetween((byte)' ', (byte)' ') ?? responseLine.CaptureAfter((byte)' '));
            while (true)
            {
                var line = httpReader.ReadLine();
                if (httpReader.IsCompleted)
                {
                    throw new InvalidDataException();
                }
                // An empty line terminates the headers.
                if (line.Length == 0)
                {
                    break;
                }
                if (onHttpHeader != null)
                {
                    var d = line.IndexOf((byte)':');
                    onHttpHeader(line.Substring(0, d).Trim(), line.Substring(d + 1).Trim());
                }
                if (line.StartsWith(Http_TransferEncoding))
                {
                    var value = GetHeaderValue(line);
                    if (value.Equals("chunked"))
                    {
                        chunked = true;
                    }
                }
                else if (line.StartsWith(Http_ContentLength))
                {
                    payloadLength = Utf8Utils.ParseInt64(GetHeaderValue(line));
                }
                else if (line.StartsWith(Http_ContentEncoding))
                {
                    var value = GetHeaderValue(line);
                    if (value == Http_Gzip)
                    {
                        gzipped = true;
                    }
                    else if (value == Http_Brotli)
                    {
                        brotli = true;
                    }
                }
                else if (line.StartsWith(Http_Location))
                {
                    var val = GetHeaderValue(line).ToString();
                    try
                    {
                        // Scheme-relative locations ("//host/path") inherit the scheme of the request.
                        if (val.StartsWith("//"))
                        {
                            location = new Uri(requestedUrl.Scheme + ":" + val);
                        }
                        else
                        {
                            location = new Uri(requestedUrl, val);
                        }
                    }
                    catch (Exception)
                    {
                        // Deliberately best effort: a Location value that cannot be turned into a Uri is ignored.
                    }
                }
                else if (line.StartsWith(Http_ContentType) && scratchpad != null)
                {
                    // Keep only the part before the first space and before the first ';' (drops parameters).
                    var value = GetHeaderValue(line);
                    value       = value.TryCaptureBefore((byte)' ') ?? value;
                    value       = value.TryCaptureBefore((byte)';') ?? value;
                    contentType = scratchpad.Copy(value);
                }
                else if (line.StartsWith(Http_LastModified))
                {
                    try
                    {
                        lastModified = WarcCdxItemRaw.ParseHttpDate(GetHeaderValue(line));
                    }
                    catch
                    {
                        // Deliberately best effort: an unparseable Last-Modified value is ignored.
                    }
                }
            }

            var compressed = gzipped || brotli;

            // Content-Length is not reported for chunked or compressed bodies
            // (presumably because it does not describe the decoded payload).
            if (compressed || chunked)
            {
                payloadLength = -1;
            }
            Stream s;

            if (responseLength != -1)
            {
                // Bytes consumed so far are the status line and headers; the remainder is the body.
                var currentPos     = httpReader.Position - startPosition;
                var httpBodyLength = responseLength - currentPos;

                if (!compressed && !chunked && payloadLength != -1 && httpBodyLength != payloadLength)
                {
                    throw new InvalidDataException("Unexpected Content-Length.");
                }
                s = new LimitedStream(httpReader, httpBodyLength);
            }
            else
            {
                s = httpReader;
            }
            if (chunked)
            {
                s = new ChunkedStream(s);
            }
            if (compressed && chunked)
            {
                // NOTE(review): presumably drains the rest of the chunked data when the decompressor is
                // disposed early - confirm against OnDisposeConsumeStream.
                s = new OnDisposeConsumeStream(s);
            }

            if (gzipped)
            {
                s = new GZipStream(s, CompressionMode.Decompress);
            }
            else if (brotli)
            {
                s = new BrotliStream(s, CompressionMode.Decompress);
            }

            return(s);
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Extracts the numeric value of the "Content-Length:" header from a WARC record header block.
 /// </summary>
 /// <param name="warcHeader">The header block to search.</param>
 /// <returns>The value found between "Content-Length:" and the following line feed, trimmed and parsed.</returns>
 private static long ReadWarcRecordContentLength(Utf8String warcHeader)
 {
     var headerName = (Utf8String)"Content-Length:";
     var lineFeed   = (Utf8String)"\n";

     // The raw value may carry surrounding whitespace (e.g. a trailing CR), hence the Trim.
     var rawValue = warcHeader.CaptureBetween(headerName, lineFeed);
     return Utf8Utils.ParseInt64(rawValue.Trim());
 }