Пример #1
0
        /// <summary>
        /// Builds a <see cref="WarcItem"/> from the fields of this raw CDX record.
        /// </summary>
        /// <param name="folder">Directory combined with this record's file name to form the WARC file path.</param>
        /// <param name="fileNameBytes">
        /// Caller-held copy of the last file name bytes seen; replaced when this record's file name differs.
        /// </param>
        /// <param name="fileNameString">
        /// Caller-held combined path that corresponds to <paramref name="fileNameBytes"/>; replaced together with it.
        /// </param>
        /// <returns>A new <see cref="WarcItem"/> populated from this record.</returns>
        public WarcItem ToWarcItem(string folder, ref byte[] fileNameBytes, ref string fileNameString)
        {
            // The cached path string is reused only when the cached bytes equal this record's file name.
            var cachedNameMatches =
                fileNameBytes != null &&
                fileNameBytes.Length == this.FileName.Length &&
                this.FileName.Bytes.BlockEquals((ReadOnlySpan <byte>)fileNameBytes.Slice());

            if (!cachedNameMatches)
            {
                // Different file name: refresh both halves of the caller's cache.
                fileNameBytes = new byte[this.FileName.Length];
                this.FileName.Bytes.CopyTo(fileNameBytes);
                fileNameString = Path.Combine(folder, this.FileName.ToString());
            }

            // Fields are converted in declaration order so parsing failures surface in the same sequence.
            var url              = this.OriginalUrl.ToString();
            var compressedOffset = Utf8Utils.ParseInt64(this.CompressedArcFileOffset);
            var compressedLength = Utf8Utils.ParseInt64(this.CompressedRecordSize);
            var date             = ParseDate(this.Date);

            // An empty payload-length field is reported as -1.
            long payloadLength = -1;
            if (this.PayloadLength.Length != 0)
            {
                payloadLength = Utf8Utils.ParseInt64(this.PayloadLength);
            }

            var warcFile = fileNameString;

            // A last-modified field of at most one byte is treated as absent.
            DateTime? lastModified = null;
            if (this.LastModified.Length > 1)
            {
                lastModified = ParseDate(this.LastModified);
            }

            var contentType = this.MimeTypeOfOriginalDocument.ToStringCached();

            // An empty response-code field maps to the default HttpStatusCode value.
            var responseCode = default(HttpStatusCode);
            if (this.ResponseCode.Length != 0)
            {
                responseCode = (HttpStatusCode)Utf8Utils.ParseInt32(this.ResponseCode);
            }

            return new WarcItem()
            {
                Url = url,
                CompressedOffset = compressedOffset,
                CompressedLength = compressedLength,
                Date = date,
                PayloadLength = payloadLength,
                WarcFile = warcFile,
                LastModified = lastModified,
                ContentType = contentType,
                ResponseCode = responseCode,
            };
        }
Пример #2
0
        /// <summary>
        /// Parses a UTC <see cref="DateTime"/> from a space- and colon-delimited date string.
        /// </summary>
        /// <remarks>
        /// The first space-terminated token is discarded; the following tokens are read as
        /// day, month (via <c>ParseMonth</c>), year, and then hour, minute and second.
        /// NOTE(review): this matches a layout such as "Sun, 06 Nov 1994 08:49:37 GMT" -
        /// confirm that no other date layouts are passed in.
        /// </remarks>
        /// <param name="str">The text to parse; it is consumed token by token.</param>
        /// <returns>The parsed date and time with <see cref="DateTimeKind.Utc"/>.</returns>
        internal static DateTime ParseHttpDate(Utf8String str)
        {
            // Drop the leading token (everything up to the first space).
            Utf8Utils.ReadTo(ref str, (byte)' ');

            // Date part: three space-terminated tokens.
            var dayToken   = Utf8Utils.ReadTo(ref str, (byte)' ');
            var dayOfMonth = Utf8Utils.ParseInt32(dayToken);

            var monthToken  = Utf8Utils.ReadTo(ref str, (byte)' ');
            var monthNumber = ParseMonth(monthToken);

            var yearToken  = Utf8Utils.ReadTo(ref str, (byte)' ');
            var yearNumber = Utf8Utils.ParseInt32(yearToken);

            // Time part: hour and minute end at ':', seconds end at the next space.
            var hourToken = Utf8Utils.ReadTo(ref str, (byte)':');
            var hours     = Utf8Utils.ParseInt32(hourToken);

            var minuteToken = Utf8Utils.ReadTo(ref str, (byte)':');
            var minutes     = Utf8Utils.ParseInt32(minuteToken);

            var secondToken = Utf8Utils.ReadTo(ref str, (byte)' ');
            var seconds     = Utf8Utils.ParseInt32(secondToken);

            return new DateTime(yearNumber, monthNumber, dayOfMonth, hours, minutes, seconds, DateTimeKind.Utc);
        }
Пример #3
0
        /// <summary>
        /// Builds a virtual file system from the CDX index at <paramref name="cdx"/>.
        /// </summary>
        /// <remarks>
        /// Records that carry a response code outside the 200-299 range are skipped. Every
        /// remaining record is mapped to a backslash-separated path and placed in the tree
        /// exposed through <c>Root</c>. A lazily populated <c>_raw</c> node mirrors that tree
        /// with a null tag. The stream cache rewrites HTML-like entries tagged
        /// <c>TagVirtual</c> and copies everything else from the item's stream unchanged.
        /// </remarks>
        /// <param name="cdx">
        /// Path of the CDX index file; WARC file names are combined with its directory.
        /// </param>
        public WarcFs(string cdx)
        {
            this.cdx = cdx;

            // Shared between all ToWarcItem calls so consecutive records that name the
            // same WARC file reuse one path string.
            byte[] fileNameBytes  = null;
            string fileNameString = null;
            var    folder         = Path.GetDirectoryName(cdx);

            this.Root = CreateTree<WarcItem>(WarcCdxItemRaw.Read(cdx).Select(x =>
            {
                // Drop records with a response code that is present and not 2xx.
                var response = x.ResponseCode;
                if (response.Length != 0)
                {
                    var responseCode = Utf8Utils.ParseInt32(response);
                    if (responseCode < 200 || responseCode >= 300)
                    {
                        return null;
                    }
                }
                return x.ToWarcItem(folder, ref fileNameBytes, ref fileNameString);
            }).Where(x => x != null), x =>
            {
                var url = new Uri(x.Url);

                // Host and path rules are matched ordinally: URL components are
                // identifiers, not linguistic text, so culture rules must not apply.
                // NOTE(review): the meaning of each "keep" value is defined by
                // WebsiteScraper.GetPathInternal; -1 is passed when no rule matches.
                var keep = -1;
                if (url.AbsolutePath.StartsWith("/w/images/", StringComparison.Ordinal))
                {
                    keep = 2;
                }
                else if (url.AbsolutePath.StartsWith("/wiki/", StringComparison.Ordinal))
                {
                    keep = 1;
                }
                else if (url.Host.EndsWith(".fbcdn.net", StringComparison.Ordinal))
                {
                    keep = 0;
                }
                else if (url.Host.EndsWith(".media.tumblr.com", StringComparison.Ordinal))
                {
                    keep = 0;
                }
                else if (url.Host.EndsWith(".bp.blogspot.com", StringComparison.Ordinal))
                {
                    keep = 0;
                }
                else if (url.Host.EndsWith(".reddit.com", StringComparison.Ordinal) && url.AbsolutePath.Contains("/comments/"))
                {
                    keep = 3;
                }
                else if (url.Host.EndsWith(".staticflickr.com", StringComparison.Ordinal))
                {
                    keep = 0;
                }
                else if (url.Host.EndsWith(".giphy.com", StringComparison.Ordinal) && url.Host.Contains("media"))
                {
                    keep = 0;
                }

                var path = WebsiteScraper.GetPathInternal(null, url, x.ContentType, keep);
                path     = path.Replace('/', '\\');

                // Shorten long paths: everything from the first '‽' onwards is replaced
                // by a hash of the full path, keeping the extension.
                if (path.Length > 150)
                {
                    var z = path.IndexOf('‽');
                    if (z != -1)
                    {
                        // NOTE(review): string.GetHashCode() is not guaranteed to be stable
                        // across processes or runtimes, so the shortened name may differ
                        // between runs. Consider a stable hash if names must persist.
                        path = path.Substring(0, z) + "‽{" + Math.Abs((long)path.GetHashCode()) + "}" + Path.GetExtension(path);
                    }
                }

                // These Facebook responses are rendered as HTML by the cache below,
                // so the ".js" suffix is swapped for ".html".
                if (url.IsHostedOn("facebook.com") && url.AbsolutePath.StartsWith("/pages_reaction_units/", StringComparison.Ordinal))
                {
                    path  = path.TrimEnd(".js");
                    path += ".html";
                }

                return path;
            }, null, x =>
            {
                // Mark every node of the main tree as virtual and index file nodes by URL.
                x.Tag = TagVirtual;
                if (x.Info != null)
                {
                    urlToFsNode[x.Info.Url] = x;
                }
            });

            // rawRoot is declared before it is constructed because the local function
            // called from its initializer captures it.
            FsNode<WarcItem> rawRoot = null;

            rawRoot = new FsNode<WarcItem>()
            {
                Name = "_raw", GetChildrenDelegate = CreateGetChildrenDelegate(this.Root)
            };

            // Returns a delegate that, when invoked, mirrors the children of
            // "reference" as new nodes with a null tag and a "_raw\" name prefix.
            Func<List<FsNode<WarcItem>>> CreateGetChildrenDelegate(FsNode<WarcItem> reference)
            {
                if (reference.Children == null)
                {
                    return () => null;
                }
                return new Func<List<FsNode<WarcItem>>>(() =>
                {
                    // rawRoot itself is added to Root.Children below; exclude it so the
                    // mirror does not contain itself.
                    return reference.Children.Where(x => x != rawRoot).Select(x =>
                    {
                        var k = new FsNode<WarcItem>()
                        {
                            Info = x.Info,
                            Name = x.Name,
                            GetChildrenDelegate = CreateGetChildrenDelegate(x),
                            Tag = null,
                            FullName = x.FullName != null ? "_raw\\" + x.FullName : null
                        };
                        return k;
                    }).ToList();
                });
            }

            this.Root.Children.Add(rawRoot);


            cache = new MemoryStreamCache<FsNode<WarcItem>>((item, dest) =>
            {
                if (item.Tag == TagVirtual)
                {
                    var ct = item.Info.ContentType;

                    // Parentheses make the existing precedence explicit: HTML content
                    // types, or Facebook reaction-unit URLs regardless of content type.
                    if ((ct != null && ct.Contains("/html")) || item.Info.Url.Contains("facebook.com/pages_reaction_units/"))
                    {
                        HtmlNode doc;
                        var pagePath = item.FullName;
                        if (item.Info.Url.Contains("/pages_reaction_units/"))
                        {
                            // The response is text with an embedded JSON object; build a
                            // synthetic page from the "__html" fragments in "domops".
                            var jsontext = item.Info.ReadText();
                            var idx      = jsontext.IndexOf('{');
                            var json     = (JObject)HttpUtils.ReadJsonToken(jsontext, idx);
                            doc          = new HtmlDocument("<!doctype html><html><head><meta charset=\"utf-8\"></head><body></body></html>").DocumentNode;
                            doc.OwnerDocument.SetPageUrl(item.Info.Url.AsUri());
                            var body = doc.Descendants("body").First();

                            foreach (var domop in (JArray)json["domops"])
                            {
                                var html = ((JArray)domop).First(x => x is JObject)["__html"].Value<string>();
                                body.AppendChild(html.AsHtmlNode());
                            }
                        }
                        else
                        {
                            doc = item.Info.ReadHtml();
                        }

                        ProcessHtml(ref doc, pagePath);

                        // NOTE(review): the stylesheet href is a hard-coded absolute path
                        // on one developer machine; it will not resolve elsewhere.
                        var simpleStyle = doc.OwnerDocument.CreateElement("link");
                        simpleStyle.SetAttributeValue("rel", "stylesheet");
                        simpleStyle.SetAttributeValue("href", @"file:///C:\Users\Andrea\Desktop\facebook-simple-css.css");
                        (doc.FindSingle("head") ?? doc).AppendChild(simpleStyle);

                        // leaveOpen is true: the destination stream is not closed with the writer.
                        using (var sw = new StreamWriter(dest, Encoding.UTF8, 16 * 1024, true))
                        {
                            doc.WriteTo(sw);
                        }
                        return;
                    }
                }

                // Everything else is copied from the item's stream unchanged.
                using (var k = item.Info.OpenStream())
                {
                    k.CopyTo(dest);
                }
            });
        }