/// <summary>
/// Sample entry point: scrapes the configured localized homepages, rewrites the scraped
/// links to internal paths, serializes the result to XML files and copies the output
/// to a local test website.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Put your URLs and their content cultures here - this example imports two languages.
    var localizedHomepages = new Dictionary<CultureInfo, Uri>
    {
        { CultureInfo.GetCultureInfo("en-US"), new Uri("http://www.denmarkvac.cn/index.html") },
        { CultureInfo.GetCultureInfo("zh-CN"), new Uri("http://www.denmarkvac.cn/chinese/index.html") },
    };

    // Path to a temporary C1 site - data/media is copied there for an immediate test.
    const string pathToTestWebsite = @"C:\Users\marcus.wendt\Documents\My Web Sites\CompositeC19";

    // Declare your providers here. The sample ones will probably not work out of the box,
    // so the next step is to write your own.
    IContentParser contentParser = new CustomProviders.Samples.ContentParser();
    ITemplateChooser templateChooser = new CustomProviders.Samples.TemplateChooser();

    // And off we go: scrape, rewrite links, serialize, deploy.
    var scrapeResult = new WebsiteScraper(contentParser).Scrape(localizedHomepages);

    new UriRewriter(scrapeResult).MakePathsInternal();

    new DataSerializer(templateChooser).WriteToXmlFiles(scrapeResult);

    CopyToTestWebsite(pathToTestWebsite);

    Console.Write("All Done...");
}
/// <summary>
/// Reports on the console that <paramref name="inputScraper"/> has finished its pull,
/// advances <c>currentScraperIndex</c> and starts the next scraper.
/// </summary>
/// <param name="inputScraper">The scraper whose pull just completed; its name is logged.</param>
public void finishedAPull(WebsiteScraper inputScraper)
{
    var message = string.Concat("--Finished pull of ", inputScraper.getName(), "--");
    Console.WriteLine(message);

    // Move on to the next scraper in the sequence.
    ++currentScraperIndex;
    startNextScraper();
}
/// <summary>
/// Rewrites a link found in a page so that, when its target is known to
/// <c>urlToFsNode</c>, it points at that node relative to the page's own path.
/// </summary>
/// <param name="baseUrl">URL against which relative links are resolved.</param>
/// <param name="value">The raw link value (for example an attribute value); may be null or empty.</param>
/// <param name="pagePath">Path of the page containing the link, relative to the same root as the nodes' <c>FullName</c>.</param>
/// <returns>
/// <paramref name="value"/> unchanged when it is null/empty, a fragment-only link, or uses a
/// scheme other than http/https; <c>javascript:void(0)</c> for javascript links; a relative
/// path when the target is in <c>urlToFsNode</c>; otherwise the absolute URL (or, for
/// Facebook <c>l.php</c> redirect links, the value of their <c>u</c> query parameter).
/// </returns>
private string MakeRelativeFsUrl(Uri baseUrl, string value, string pagePath)
{
    // Nothing to rewrite for empty links or same-page fragments.
    if (string.IsNullOrEmpty(value) || value.StartsWith("#", StringComparison.Ordinal))
    {
        return value;
    }

    // Neutralise script links. Schemes are case-insensitive, so "JavaScript:" counts too.
    if (value.Trim().StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
    {
        return "javascript:void(0)";
    }

    // Protocol-relative link: give it an explicit scheme so it can be resolved below.
    if (value.StartsWith("//", StringComparison.Ordinal))
    {
        value = "http:" + value;
    }

    // A ':' that appears before any '/' or '?' means the value starts with a scheme.
    // Anything that is not http/https (mailto:, tel:, data:, ...) is left as it is.
    var colonIndex = value.IndexOf(':');
    if (colonIndex != -1)
    {
        var delimiterIndex = value.IndexOfAny(new[] { '?', '/' });
        var startsWithScheme = delimiterIndex == -1 || colonIndex < delimiterIndex;
        var isHttp = value.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                     || value.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (startsWithScheme && !isHttp)
        {
            return value;
        }
    }

    var abs = new Uri(baseUrl, value);

    if (urlToFsNode.TryGetValue(abs.AbsoluteUri, out var fsnode))
    {
        // Both paths get the same dummy "Z:\" root before being handed to GetRelativePath
        // (presumably because it expects rooted paths; confirm against its implementation).
        return WebsiteScraper.GetRelativePath("Z:\\" + pagePath, "Z:\\" + fsnode.FullName);
    }

    if (abs.IsHostedOnAndPathStartsWith("facebook.com", "l.php"))
    {
        // Facebook redirect link: use the "u" parameter as the link. Fall back to the
        // redirect URL itself when the parameter is missing, rather than returning null.
        return abs.GetQueryParameter("u") ?? abs.AbsoluteUri;
    }

    return abs.AbsoluteUri;
}
/// <summary>
/// Creates and configures the shared <c>Scraper</c>: progress reporting, Ctrl+C cleanup,
/// WARC output into the local "WARC" directory, the database save interval and the rules
/// that decide which URLs are scraped.
/// </summary>
private static void SetupScraper()
{
    Scraper = new WebsiteScraper();
    Scraper.PerformInitialization();

    Directory.CreateDirectory("WARC");

    // Console progress reporting for the worker threads and for the crawl as a whole.
    Scraper.CreateThreadProgressDelegate = () => Program.CreateSimpleConsoleProgress("Crawler thread", true);
    Scraper.CreateMainProgressDelegate = () => Program.CreateSimpleConsoleProgress("Crawler");

    // On Ctrl+C, dispose the scraper and then `t` (declared outside this method).
    Console.CancelKeyPress += (sender, eventArgs) =>
    {
        Scraper.Dispose();
        t.Dispose();
    };

    Scraper.OutputAsWarc = true;
    Scraper.DestinationDirectory = Path.GetFullPath("WARC");
    Scraper.DatabaseSaveInterval = TimeSpan.FromMinutes(1);

    Scraper.ShouldScrape = (url, prereq) =>
    {
        var text = url.ToString();

        // Stylesheets and scripts are always fetched.
        if (text.Contains(".css") || text.Contains(".js"))
        {
            return true;
        }

        // Skip language variants and the mobile/publish Twitter hosts.
        if (text.Contains("lang=") || text.Contains("mobile.twitter.com") || text.Contains("publish.twitter.com"))
        {
            return false;
        }

        // Always fetch twimg.com resources and status pages.
        if (url.IsHostedOn("twimg.com") || text.Contains("/status/"))
        {
            return true;
        }

        // Everything else is fetched only when flagged as a prerequisite.
        return prereq;
    };

    //Scraper.ReconsiderSkippedUrls();
}
/// <summary>
/// Builds the virtual file system from a CDX index: creates the node tree from the index
/// records, adds a "_raw" folder that mirrors the tree, and sets up the cache that
/// produces each node's content on demand.
/// </summary>
/// <param name="cdx">Path of the CDX index file. Its directory is handed to <c>ToWarcItem</c> as the folder.</param>
public WarcFs(string cdx) {
    this.cdx = cdx;
    // Passed by ref to every ToWarcItem call, so the same two variables are shared by all records
    // (presumably a cache of the most recent WARC file name - confirm in ToWarcItem).
    byte[] fileNameBytes = null;
    string fileNameString = null;
    var folder = Path.GetDirectoryName(cdx);
    // First argument: the items to put in the tree. A record with a non-empty response code
    // outside 200-299 is dropped; a record with an empty response code is kept.
    this.Root = CreateTree <WarcItem>(WarcCdxItemRaw.Read(cdx).Select(x => {
        var response = x.ResponseCode;
        if (response.Length != 0) {
            var responseCode = Utf8Utils.ParseInt32(response);
            if (responseCode < 200 || responseCode >= 300) { return(null); }
        }
        return(x.ToWarcItem(folder, ref fileNameBytes, ref fileNameString));
    }).Where(x => x != null),
    // Second argument: computes the path inside the tree for each item.
    x => {
        var url = new Uri(x.Url);
        // `keep` is chosen per site and forwarded to GetPathInternal; what the number means is not
        // visible here (presumably how many path segments to preserve - TODO confirm).
        var keep = -1;
        if (url.AbsolutePath.StartsWith("/w/images/")) { keep = 2; }
        else if (url.AbsolutePath.StartsWith("/wiki/")) { keep = 1; }
        else if (url.Host.EndsWith(".fbcdn.net")) { keep = 0; }
        else if (url.Host.EndsWith(".media.tumblr.com")) { keep = 0; }
        else if (url.Host.EndsWith(".bp.blogspot.com")) { keep = 0; }
        else if (url.Host.EndsWith(".reddit.com") && url.AbsolutePath.Contains("/comments/")) { keep = 3; }
        else if (url.Host.EndsWith(".staticflickr.com")) { keep = 0; }
        else if (url.Host.EndsWith(".giphy.com") && url.Host.Contains("media")) { keep = 0; }
        var path = WebsiteScraper.GetPathInternal(null, url, x.ContentType, keep);
        path = path.Replace('/', '\\');
        // Shorten long paths: everything from the first '‽' onwards is replaced by "‽{hash}"
        // followed by the extension of the original path.
        // NOTE(review): string.GetHashCode() is not guaranteed to be stable across processes or
        // runtime versions, so these shortened names may differ between runs - confirm this is acceptable.
        if (path.Length > 150) {
            var z = path.IndexOf('‽');
            if (z != -1) { path = path.Substring(0, z) + "‽{" + Math.Abs((long)path.GetHashCode()) + "}" + Path.GetExtension(path); }
        }
        // Facebook "pages_reaction_units" responses are exposed with an .html extension, matching
        // the HTML that the cache callback below generates for them.
        if (url.IsHostedOn("facebook.com") && url.AbsolutePath.StartsWith("/pages_reaction_units/")) { path = path.TrimEnd(".js"); path += ".html"; }
        return(path);
    },
    null,
    // Last argument: run for each node - tags it as virtual and, when it carries an item,
    // indexes it by URL in urlToFsNode.
    x => { x.Tag = TagVirtual; if (x.Info != null) { urlToFsNode[x.Info.Url] = x; } });
    // Declared before it is assigned so the lambdas inside CreateGetChildrenDelegate can capture it.
    FsNode <WarcItem> rawRoot = null;
    rawRoot = new FsNode <WarcItem>() { Name = "_raw", GetChildrenDelegate = CreateGetChildrenDelegate(this.Root) };
    // Returns a delegate that lists copies of `reference`'s children. The copies are created each
    // time the delegate is invoked, carry Tag = null and have "_raw\" prepended to FullName.
    // If `reference.Children` is null when this is called, the delegate just returns null.
    Func <List <FsNode <WarcItem> > > CreateGetChildrenDelegate(FsNode <WarcItem> reference) {
        if (reference.Children == null) { return(() => null); }
        return(new Func <List <FsNode <WarcItem> > >(() => {
            return
            // rawRoot is itself added to Root.Children below, so it is excluded from the mirror.
            reference.Children.Where(x => x != rawRoot).Select(x => {
                var k = new FsNode <WarcItem>() { Info = x.Info, Name = x.Name, GetChildrenDelegate = CreateGetChildrenDelegate(x), Tag = null, FullName = x.FullName != null ? "_raw\\" + x.FullName : null };
                return k;
            }).ToList();
        }));
    }
    // NOTE(review): this dereferences Root.Children without a null check, although
    // CreateGetChildrenDelegate above allows for Children being null - confirm it cannot be null here.
    this.Root.Children.Add(rawRoot);
    // Content producer: writes the bytes for `item` into `dest`.
    cache = new MemoryStreamCache <FsNode <WarcItem> >((item, dest) => {
        // Only nodes tagged TagVirtual get processed HTML; the "_raw" copies have Tag = null
        // (assuming TagVirtual itself is not null) and fall through to the plain copy at the end.
        if (item.Tag == TagVirtual) {
            var ct = item.Info.ContentType;
            // && binds tighter than ||: a pages_reaction_units URL qualifies whatever its content type.
            if (ct != null && ct.Contains("/html") || item.Info.Url.Contains("facebook.com/pages_reaction_units/")) {
                HtmlNode doc;
                var pagePath = item.FullName;
                // NOTE(review): this inner test omits the "facebook.com" prefix used by the outer one.
                if (item.Info.Url.Contains("/pages_reaction_units/")) {
                    // The response text contains a JSON object starting at its first '{'. Each entry of
                    // its "domops" array holds an object with an "__html" string; those fragments are
                    // appended to the body of an empty HTML document.
                    var jsontext = item.Info.ReadText();
                    var idx = jsontext.IndexOf('{');
                    var json = (JObject)HttpUtils.ReadJsonToken(jsontext, idx);
                    doc = new HtmlDocument("<!doctype html><html><head><meta charset=\"utf-8\"></head><body></body></html>").DocumentNode;
                    doc.OwnerDocument.SetPageUrl(item.Info.Url.AsUri());
                    var body = doc.Descendants("body").First();
                    foreach (var domop in (JArray)json["domops"]) {
                        var html = ((JArray)domop).First(x => x is JObject)["__html"].Value <string>();
                        body.AppendChild(html.AsHtmlNode());
                    }
                }
                else { doc = item.Info.ReadHtml(); }
                ProcessHtml(ref doc, pagePath);
                // Append a stylesheet link to <head>, or to the document node when there is no <head>.
                // NOTE(review): the href is an absolute path on one developer's machine.
                var simpleStyle = doc.OwnerDocument.CreateElement("link");
                simpleStyle.SetAttributeValue("rel", "stylesheet");
                simpleStyle.SetAttributeValue("href", @"file:///C:\Users\Andrea\Desktop\facebook-simple-css.css");
                (doc.FindSingle("head") ?? doc).AppendChild(simpleStyle);
                // The final `true` is leaveOpen: disposing the writer flushes it without closing `dest`.
                using (var sw = new StreamWriter(dest, Encoding.UTF8, 16 * 1024, true)) { doc.WriteTo(sw); }
                return;
            }
        }
        // Everything else: copy the item's stream to `dest` unchanged.
        using (var k = item.Info.OpenStream()) { k.CopyTo(dest); }
    });
}
/// <summary>
/// Sends <paramref name="request"/> through <c>WebsiteScraper.ScrapeAsync</c> using a pooled
/// curl handle and pooled request/response buffers, and wraps the HTTP response it produced
/// in an <see cref="HttpResponseMessage"/>.
/// </summary>
/// <param name="request">The request to send. When <c>TryGetCached</c> returns a response for it, that response is returned and nothing is sent.</param>
/// <param name="cancellationToken">Forwarded to <c>WebsiteScraper.ScrapeAsync</c>.</param>
/// <returns>
/// The response. Its content stream hands the pooled objects to <c>Release</c> when it is
/// disposed, so callers must dispose the response.
/// </returns>
/// <exception cref="ObjectDisposedException">This handler has been disposed.</exception>
/// <exception cref="WebException">curl finished with a code other than <c>CurlCode.Ok</c>.</exception>
private async Task<HttpResponseMessage> SendAsyncInternal(HttpRequestMessage request, CancellationToken cancellationToken)
{
    if (disposed)
    {
        throw new ObjectDisposedException(nameof(CurlWarcHandler));
    }

    // Remove the "$assume-text" fragment parameter from the request's ShamanURL property, if any.
    if (request.Properties.TryGetValue("ShamanURL", out var shamanUrlObj) && shamanUrlObj is LazyUri shamanUrl)
    {
        shamanUrl.RemoveFragmentParameter("$assume-text");
        request.Properties["ShamanURL"] = shamanUrl;
    }

    var cached = TryGetCached?.Invoke(request);
    if (cached != null)
    {
        return cached;
    }

    CurlEasy easy;
    MemoryStream requestMs;
    MemoryStream responseMs;
    lock (lockObj)
    {
        easy = BorrowPooled(pooledEasyHandles);
        requestMs = BorrowPooled(pooledRequestMemoryStreams);
        responseMs = BorrowPooled(pooledResponseMemoryStreams);
    }
    Sanity.Assert(requestMs != null);

    // Release must run exactly once per borrow: from the error path, from the failure handler
    // below, or when the response content stream is disposed - whichever happens first.
    var released = 0;
    void ReleaseOnce()
    {
        if (Interlocked.Exchange(ref released, 1) == 0)
        {
            Release(easy, requestMs, responseMs);
        }
    }

    // NOTE(review): if ScrapeAsync itself throws (e.g. on cancellation) the borrowed objects are
    // not released, as before; whether the curl handle may be reused at that point cannot be
    // determined from this method.
    var (httpCode, curlCode, _) = await WebsiteScraper.ScrapeAsync(
        easy,
        request,
        request.RequestUri.AbsoluteUri,
        requestMs,
        responseMs,
        ea => GetDestinationWarc(request.RequestUri, easy, requestMs, responseMs),
        syncObj,
        cancellationToken);

    if (curlCode != CurlCode.Ok)
    {
        ReleaseOnce();
        throw new WebException("Curl: " + curlCode, (WebExceptionStatus)(800 + curlCode));
    }

    try
    {
        // The transfer is complete: parse the HTTP response that was written to responseMs.
        responseMs.Seek(0, SeekOrigin.Begin);
        var httpResponse = new Utf8StreamReader(responseMs);

        var response = new HttpResponseMessage();
        response.RequestMessage = request;
        response.StatusCode = httpCode;

        using (var scratchpad = new Scratchpad())
        {
            var stream = WarcItem.OpenHttp(
                httpResponse,
                scratchpad,
                request.RequestUri,
                responseMs.Length,
                out long _,
                out var _,
                out var _,
                out var _,
                out var _,
                (key, val) => { response.Headers.TryAddWithoutValidation(key.ToString(), val.ToString()); });

            // The pooled objects back the content stream, so they are released only when the
            // consumer disposes that stream.
            response.Content = new System.Net.Http.StreamContent(new DisposeCallbackStream(stream, () => { ReleaseOnce(); }));
        }

        OnResponseReceived?.Invoke(response, easy, requestMs, responseMs);
        return response;
    }
    catch
    {
        // Parsing or a response handler failed: the caller never receives the response, so
        // nobody would dispose its stream. Release the pooled objects here instead.
        ReleaseOnce();
        throw;
    }
}