Пример #1
0
        /// <summary>
        /// Sample entry point: scrapes the configured localized home pages, rewrites the scraped
        /// links to internal paths, serializes the result to XML files and copies the output to a
        /// local test website.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Start pages keyed by the culture of their content - this example imports two languages.
            var localizedHomepages = new Dictionary <CultureInfo, Uri>
            {
                { CultureInfo.GetCultureInfo("en-US"), new Uri("http://www.denmarkvac.cn/index.html") },
                { CultureInfo.GetCultureInfo("zh-CN"), new Uri("http://www.denmarkvac.cn/chinese/index.html") },
            };

            // Declare your providers here - the sample ones will probably not work out of the box,
            // so the next step is to write your own.
            IContentParser   contentParser   = new CustomProviders.Samples.ContentParser();
            ITemplateChooser templateChooser = new CustomProviders.Samples.TemplateChooser();

            // Scrape everything reachable from the home pages.
            var scrapeResult = new WebsiteScraper(contentParser).Scrape(localizedHomepages);

            // Rewrite links in the scraped result to internal paths, then serialize it.
            new UriRewriter(scrapeResult).MakePathsInternal();
            new DataSerializer(templateChooser).WriteToXmlFiles(scrapeResult);

            // Path to a temporary C1 site - data/media is copied here for an immediate test.
            string pathToTestWebsite = @"C:\Users\marcus.wendt\Documents\My Web Sites\CompositeC19";
            CopyToTestWebsite(pathToTestWebsite);

            Console.Write("All Done...");
        }
Пример #2
0
        /// <summary>
        /// Callback for a scraper that has completed its pull: logs the scraper's name, advances
        /// the current scraper index and starts the next scraper.
        /// </summary>
        /// <param name="inputScraper">The scraper that just finished.</param>
        public void finishedAPull(WebsiteScraper inputScraper)
        {
            var finishedName = inputScraper.getName();
            Console.WriteLine($"--Finished pull of {finishedName}--");

            // Move on to the next scraper in the sequence.
            currentScraperIndex += 1;
            startNextScraper();
        }
Пример #3
0
        /// <summary>
        /// Rewrites a link value found on a page so that it points to the matching node in
        /// <c>urlToFsNode</c> (as a path relative to the page) when one exists.
        /// </summary>
        /// <param name="baseUrl">URL against which relative link values are resolved.</param>
        /// <param name="value">Raw link value (for example an href or src attribute).</param>
        /// <param name="pagePath">Path of the containing page; the result is made relative to it.</param>
        /// <returns>
        /// <list type="bullet">
        /// <item>The unchanged <paramref name="value"/> when it is null/empty, a fragment ("#..."),
        /// uses a scheme other than http/https, or cannot be parsed as a URI.</item>
        /// <item><c>javascript:void(0)</c> for any <c>javascript:</c> link.</item>
        /// <item>The relative path to the mapped node when the absolute URL is in <c>urlToFsNode</c>.</item>
        /// <item>Otherwise the absolute URL (for facebook.com <c>l.php</c> links, the value of their
        /// <c>u</c> query parameter).</item>
        /// </list>
        /// </returns>
        private string MakeRelativeFsUrl(Uri baseUrl, string value, string pagePath)
        {
            // URL syntax is not linguistic text: compare ordinally so the current culture cannot
            // influence the result.
            if (string.IsNullOrEmpty(value) || value.StartsWith("#", StringComparison.Ordinal))
            {
                return(value);
            }
            // URI schemes are case-insensitive, so "JavaScript:" must be neutralized as well.
            if (value.Trim().StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
            {
                return("javascript:void(0)");
            }
            // Protocol-relative URL: give it an explicit scheme before resolving.
            if (value.StartsWith("//", StringComparison.Ordinal))
            {
                value = "http:" + value;
            }

            // The value carries a scheme only when ':' occurs before any '?' or '/'.
            var firstDelimiter = value.IndexOfAny(new[] { ':', '?', '/' });
            var hasScheme      = firstDelimiter != -1 && value[firstDelimiter] == ':';

            if (hasScheme &&
                !value.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
                !value.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                // mailto:, tel:, data:, ... are left untouched.
                return(value);
            }

            // A malformed link must not abort the rewrite; leave it as it was written.
            if (!Uri.TryCreate(baseUrl, value, out var abs))
            {
                return(value);
            }

            if (!urlToFsNode.TryGetValue(abs.AbsoluteUri, out var fsnode))
            {
                // Facebook outbound-link redirector: return the value of its "u" parameter
                // (presumably the real destination - confirm against GetQueryParameter).
                if (abs.IsHostedOnAndPathStartsWith("facebook.com", "l.php"))
                {
                    return(abs.GetQueryParameter("u"));
                }
                return(abs.AbsoluteUri);
            }

            // Both paths get the same "Z:\" prefix before being handed to GetRelativePath
            // (presumably because it expects rooted paths - confirm).
            return(WebsiteScraper.GetRelativePath("Z:\\" + pagePath, "Z:\\" + fsnode.FullName));
        }
Пример #4
0
 /// <summary>
 /// Creates and initializes the shared <c>Scraper</c>, points its WARC output at the "WARC"
 /// folder, wires up console progress reporting and Ctrl+C cleanup, and installs the predicate
 /// that decides which URLs are scraped.
 /// </summary>
 private static void SetupScraper()
 {
     const string warcFolder = "WARC";

     Scraper = new WebsiteScraper();
     Scraper.PerformInitialization();
     Directory.CreateDirectory(warcFolder);

     Scraper.CreateThreadProgressDelegate = () => Program.CreateSimpleConsoleProgress("Crawler thread", true);
     Scraper.CreateMainProgressDelegate   = () => Program.CreateSimpleConsoleProgress("Crawler");

     // On Ctrl+C dispose the scraper and `t` (a member declared outside this method).
     Console.CancelKeyPress += (sender, eventArgs) =>
     {
         Scraper.Dispose();
         t.Dispose();
     };

     Scraper.OutputAsWarc         = true;
     Scraper.DestinationDirectory = Path.GetFullPath(warcFolder);
     Scraper.DatabaseSaveInterval = TimeSpan.FromMinutes(1);

     // URL filter; the rules are evaluated in order and the first match wins.
     Scraper.ShouldScrape = (url, prereq) =>
     {
         var text = url.ToString();

         // Stylesheets and scripts are always fetched.
         if (text.Contains(".css") || text.Contains(".js"))
         {
             return true;
         }

         // Skip language variants and the mobile/publish Twitter hosts.
         if (text.Contains("lang=") || text.Contains("mobile.twitter.com") || text.Contains("publish.twitter.com"))
         {
             return false;
         }

         // Anything on twimg.com and individual status pages are fetched.
         if (url.IsHostedOn("twimg.com") || text.Contains("/status/"))
         {
             return true;
         }

         // Everything else is fetched only when flagged as a prerequisite.
         return prereq;
     };

     // Left disabled, as in the original: Scraper.ReconsiderSkippedUrls();
 }
Пример #5
0
        /// <summary>
        /// Builds the node tree for the records listed in a CDX file, adds a "_raw" subtree that
        /// mirrors it, and creates the cache whose callback writes each node's content to a stream.
        /// </summary>
        /// <param name="cdx">Path of the CDX file; its directory is passed to <c>ToWarcItem</c>.</param>
        public WarcFs(string cdx)
        {
            this.cdx = cdx;
            // Passed by ref to every ToWarcItem call below, so their values carry over from one
            // record to the next (presumably to reuse the last decoded file name - confirm).
            byte[] fileNameBytes  = null;
            string fileNameString = null;
            var    folder         = Path.GetDirectoryName(cdx);

            // First lambda: turn each raw CDX record into a WarcItem, dropping records that have
            // a response code outside 200-299. Records with an empty response code are kept.
            this.Root = CreateTree <WarcItem>(WarcCdxItemRaw.Read(cdx).Select(x =>
            {
                var response = x.ResponseCode;
                if (response.Length != 0)
                {
                    var responseCode = Utf8Utils.ParseInt32(response);
                    if (responseCode < 200 || responseCode >= 300)
                    {
                        return(null);
                    }
                }
                return(x.ToWarcItem(folder, ref fileNameBytes, ref fileNameString));
            }).Where(x => x != null), x =>
            {
                // Second lambda: compute the path of the item inside the tree.
                var url = new Uri(x.Url);

                // `keep` is forwarded as the last argument of GetPathInternal and chosen per
                // site; -1 is the default when no rule matches.
                // NOTE(review): its exact meaning is defined by GetPathInternal - confirm there.
                var keep = -1;
                if (url.AbsolutePath.StartsWith("/w/images/"))
                {
                    keep = 2;
                }
                else if (url.AbsolutePath.StartsWith("/wiki/"))
                {
                    keep = 1;
                }
                else if (url.Host.EndsWith(".fbcdn.net"))
                {
                    keep = 0;
                }
                else if (url.Host.EndsWith(".media.tumblr.com"))
                {
                    keep = 0;
                }
                else if (url.Host.EndsWith(".bp.blogspot.com"))
                {
                    keep = 0;
                }
                else if (url.Host.EndsWith(".reddit.com") && url.AbsolutePath.Contains("/comments/"))
                {
                    keep = 3;
                }
                else if (url.Host.EndsWith(".staticflickr.com"))
                {
                    keep = 0;
                }
                else if (url.Host.EndsWith(".giphy.com") && url.Host.Contains("media"))
                {
                    keep = 0;
                }
                var path = WebsiteScraper.GetPathInternal(null, url, x.ContentType, keep);
                // Switch to backslash-separated paths.
                path     = path.Replace('/', '\\');

                // Shorten paths over 150 characters: everything from the first '‽' onwards is
                // replaced by "‽{hash}" plus the original extension. Long paths without '‽' are
                // left as they are.
                // NOTE(review): '‽' is presumably the marker GetPathInternal emits for the query
                // part - confirm. string.GetHashCode() is not guaranteed to be stable across
                // runs on every runtime, so these names should not be persisted - confirm.
                if (path.Length > 150)
                {
                    var z = path.IndexOf('‽');
                    if (z != -1)
                    {
                        path = path.Substring(0, z) + "‽{" + Math.Abs((long)path.GetHashCode()) + "}" + Path.GetExtension(path);
                    }
                }

                // Facebook "pages_reaction_units" responses are exposed with an ".html" extension
                // (the cache callback below renders them as HTML).
                // NOTE(review): TrimEnd(string) is not a BCL overload - presumably a project
                // extension that strips a trailing ".js"; confirm.
                if (url.IsHostedOn("facebook.com") && url.AbsolutePath.StartsWith("/pages_reaction_units/"))
                {
                    path  = path.TrimEnd(".js");
                    path += ".html";
                }

                return(path);
            }, null, x =>
            {
                // Last lambda: tag every node as virtual and index the nodes that carry an item
                // by that item's URL.
                x.Tag = TagVirtual;
                if (x.Info != null)
                {
                    urlToFsNode[x.Info.Url] = x;
                }
            });

            // Declared and null-initialized before the real assignment because the lambda built
            // inside CreateGetChildrenDelegate (called from the initializer below) captures it.
            FsNode <WarcItem> rawRoot = null;

            rawRoot = new FsNode <WarcItem>()
            {
                Name = "_raw", GetChildrenDelegate = CreateGetChildrenDelegate(this.Root)
            };
            // Returns a delegate that, each time it is invoked, builds copies of `reference`'s
            // children (skipping rawRoot itself). Whether `reference` has children is checked
            // once, when the delegate is created; if it has none the delegate returns null.
            Func <List <FsNode <WarcItem> > > CreateGetChildrenDelegate(FsNode <WarcItem> reference)
            {
                if (reference.Children == null)
                {
                    return(() => null);
                }
                return(new Func <List <FsNode <WarcItem> > >(() =>
                {
                    return reference.Children.Where(x => x != rawRoot).Select(x =>
                    {
                        // Copy of the node for the "_raw" subtree. Tag is null, so the cache
                        // callback below does not treat it as virtual and copies its stream
                        // unchanged.
                        var k = new FsNode <WarcItem>()
                        {
                            Info = x.Info,
                            Name = x.Name,
                            GetChildrenDelegate = CreateGetChildrenDelegate(x),
                            Tag = null,
                            FullName = x.FullName != null ? "_raw\\" + x.FullName : null
                        };
                        return k;
                    }).ToList();
                }));
            }

            this.Root.Children.Add(rawRoot);


            // Callback that writes the content of `item` into `dest`.
            // NOTE(review): item.Info is dereferenced without a null check although nodes can
            // have Info == null (see the check above) - presumably the cache is only asked for
            // nodes that carry an item; confirm.
            cache = new MemoryStreamCache <FsNode <WarcItem> >((item, dest) =>
            {
                if (item.Tag == TagVirtual)
                {
                    var ct = item.Info.ContentType;
                    // `&&` binds tighter than `||`: the branch is taken for an HTML content type
                    // OR for a Facebook pages_reaction_units URL, even when ct is null.
                    if (ct != null && ct.Contains("/html") || item.Info.Url.Contains("facebook.com/pages_reaction_units/"))
                    {
                        HtmlNode doc;
                        var pagePath = item.FullName;
                        // NOTE(review): this test omits the "facebook.com" part used in the
                        // condition above, so an HTML page on another host whose URL contains
                        // "/pages_reaction_units/" would also take the JSON path.
                        if (item.Info.Url.Contains("/pages_reaction_units/"))
                        {
                            // Parse the JSON starting at the first '{' and append the "__html"
                            // fragment of every entry in "domops" to the body of an otherwise
                            // empty document.
                            var jsontext = item.Info.ReadText();
                            var idx      = jsontext.IndexOf('{');
                            var json     = (JObject)HttpUtils.ReadJsonToken(jsontext, idx);
                            doc          = new HtmlDocument("<!doctype html><html><head><meta charset=\"utf-8\"></head><body></body></html>").DocumentNode;
                            doc.OwnerDocument.SetPageUrl(item.Info.Url.AsUri());
                            var body = doc.Descendants("body").First();

                            foreach (var domop in (JArray)json["domops"])
                            {
                                var html = ((JArray)domop).First(x => x is JObject)["__html"].Value <string>();
                                body.AppendChild(html.AsHtmlNode());
                            }
                        }
                        else
                        {
                            doc = item.Info.ReadHtml();
                        }
                        ProcessHtml(ref doc, pagePath);
                        // Append a stylesheet link to <head> (or to the document node itself
                        // when no head is found).
                        // NOTE(review): the href is a hard-coded path on a developer machine.
                        var simpleStyle = doc.OwnerDocument.CreateElement("link");
                        simpleStyle.SetAttributeValue("rel", "stylesheet");
                        simpleStyle.SetAttributeValue("href", @"file:///C:\Users\Andrea\Desktop\facebook-simple-css.css");
                        (doc.FindSingle("head") ?? doc).AppendChild(simpleStyle);
                        // UTF-8, 16 KiB buffer; leaveOpen = true so disposing the writer does
                        // not close `dest`.
                        using (var sw = new StreamWriter(dest, Encoding.UTF8, 16 * 1024, true))
                        {
                            doc.WriteTo(sw);
                        }
                        return;
                    }
                }

                // Every other node: copy the item's stream unchanged.
                using (var k = item.Info.OpenStream())
                {
                    k.CopyTo(dest);
                }
            });
        }
Пример #6
0
        /// <summary>
        /// Sends <paramref name="request"/> through <c>WebsiteScraper.ScrapeAsync</c> using a pooled
        /// curl handle and pooled request/response buffers, then wraps the HTTP response held in
        /// the response buffer into an <see cref="HttpResponseMessage"/>.
        /// </summary>
        /// <param name="request">
        /// The request to send. If it carries a "ShamanURL" property of type <c>LazyUri</c>, the
        /// "$assume-text" fragment parameter is removed from it first.
        /// </param>
        /// <param name="cancellationToken">Forwarded to <c>WebsiteScraper.ScrapeAsync</c>.</param>
        /// <returns>
        /// The response supplied by <c>TryGetCached</c> when it returns one; otherwise a new
        /// response. The pooled objects are handed to the <c>DisposeCallbackStream</c> callback of
        /// its content stream (presumably invoked when that stream is disposed).
        /// </returns>
        /// <exception cref="ObjectDisposedException">The handler has been disposed.</exception>
        /// <exception cref="WebException">curl reported a code other than <c>CurlCode.Ok</c>.</exception>
        private async Task <HttpResponseMessage> SendAsyncInternal(HttpRequestMessage request, CancellationToken cancellationToken)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(nameof(CurlWarcHandler));
            }
            if (request.Properties.TryGetValue("ShamanURL", out var shamanUrlObj) && shamanUrlObj is LazyUri shamanUrl)
            {
                shamanUrl.RemoveFragmentParameter("$assume-text");
                request.Properties["ShamanURL"] = shamanUrl;
            }
            if (TryGetCached != null)
            {
                var cached = TryGetCached(request);
                if (cached != null)
                {
                    return(cached);
                }
            }

            CurlEasy     easy       = null;
            MemoryStream requestMs  = null;
            MemoryStream responseMs = null;

            lock (lockObj)
            {
                easy       = BorrowPooled(pooledEasyHandles);
                requestMs  = BorrowPooled(pooledRequestMemoryStreams);
                responseMs = BorrowPooled(pooledResponseMemoryStreams);
            }
            Sanity.Assert(requestMs != null);

            var response = new HttpResponseMessage();

            // Becomes true once the content stream's callback is responsible for calling Release.
            // Until then every failure must return the borrowed objects here, otherwise each
            // failed request permanently removes a handle and two buffers from the pools.
            var handedOff = false;

            try
            {
                var(httpCode, curlCode, _) = await WebsiteScraper.ScrapeAsync(easy, request, request.RequestUri.AbsoluteUri, requestMs, responseMs, ea =>
                {
                    return(GetDestinationWarc(request.RequestUri, easy, requestMs, responseMs));
                }, syncObj, cancellationToken);

                if (curlCode != CurlCode.Ok)
                {
                    // The catch block below releases the pooled objects.
                    throw new WebException("Curl: " + curlCode, (WebExceptionStatus)(800 + curlCode));
                }

                // Rewind so the HTTP response can be parsed from the beginning of the buffer.
                responseMs.Seek(0, SeekOrigin.Begin);
                var httpResponse = new Utf8StreamReader(responseMs);

                response.RequestMessage = request;
                response.StatusCode     = httpCode;

                // NOTE(review): the stream returned by OpenHttp outlives `scratchpad`, as in the
                // original - confirm it does not depend on scratchpad after this using block.
                using (var scratchpad = new Scratchpad())
                {
                    var stream = WarcItem.OpenHttp(httpResponse, scratchpad, request.RequestUri, responseMs.Length, out long _, out var _, out var _, out var _, out var _, (key, val) =>
                    {
                        response.Headers.TryAddWithoutValidation(key.ToString(), val.ToString());
                    });
                    response.Content = new System.Net.Http.StreamContent(new DisposeCallbackStream(stream, () =>
                    {
                        Release(easy, requestMs, responseMs);
                    }));
                    handedOff = true;
                }
            }
            catch
            {
                if (!handedOff)
                {
                    Release(easy, requestMs, responseMs);
                }
                // Disposes the content (and with it the content stream) if one was attached.
                response.Dispose();
                throw;
            }

            OnResponseReceived?.Invoke(response, easy, requestMs, responseMs);
            return(response);
        }