private static void InitScraperDefaults(WebsiteScraper scraper) { scraper.CreateThreadProgressDelegate = () => Program.CreateSimpleConsoleProgress("Crawler thread", true); scraper.CreateMainProgressDelegate = () => Program.CreateSimpleConsoleProgress("Crawler"); Console.CancelKeyPress += (s, e) => { scraper.Dispose(); }; scraper.Parallelism = 10; scraper.DatabaseSaveInterval = TimeSpan.FromMinutes(1); scraper.OutputAsWarc = true; scraper.DestinationBaseDirectory = Configuration_DestinationBaseDirectory; }
public static async Task MainAsync() { Shaman.Runtime.Tld.GetTldRulesCallback = () => File.ReadAllText(ConfigurationManager.CombineRepositoryOrEntrypointPath("Awdee2.Declarative/effective_tld_names.dat")); HtmlDocument.CustomPageUrlTypeConverter = x => ((LazyUri)x).Url; if (Configuration_MakeCdx != null) { WarcCdxItemRaw.GenerateCdx(string.IsNullOrEmpty(Configuration_MakeCdx) ? Directory.GetCurrentDirectory() : Configuration_MakeCdx); return; } var cookies = (Configuration_Cookies ?? (Configuration_CookieFile != null ? File.ReadAllText(Configuration_CookieFile) : null))?.Trim().TrimStart("Cookie:").Trim(); var allDynamicParameters = typeof(Program) .GetTypeInfo() .Assembly .GetTypes() .Where(x => typeof(WebsiteScraper).IsAssignableFrom(x)) .SelectMany(x => ((IEnumerable <MemberInfo>)x.GetFields(BindingFlags.Public | BindingFlags.Instance | BindingFlags.DeclaredOnly).Where(y => IsMemberTypeOkForDynamicParameter(y.FieldType))).Concat(x.GetProperties(BindingFlags.Public | BindingFlags.Instance | BindingFlags.DeclaredOnly).Where(y => IsMemberTypeOkForDynamicParameter(y.PropertyType)))); var dynamicParameters = new Dictionary <string, MemberInfo>(); foreach (var item in allDynamicParameters) { var name = "--" + Dasherize((item.DeclaringType == typeof(WebsiteScraper) ? "Site" : item.DeclaringType.Name.TrimEnd("Scraper"))) + "-" + Dasherize(item.Name); dynamicParameters.Add(name, item); } //File.WriteAllLines(@"c:\temp\-scraping-extra-parameters.txt", dynamicParameters.Keys.OrderBy(x => x), Encoding.UTF8); var positional = Environment.GetCommandLineArgs(); //Caching.EnableWebCache("/Awdee/Cache/DomEmulation"); WebsiteScraper commandLineScraper = null; if (Configuration_SiteUrl != null) { commandLineScraper = new WebsiteScraper(); Program.InitScraperDefaults(commandLineScraper); if (Configuration_SiteRules != null) { commandLineScraper.Rules = Configuration_SiteRules; } commandLineScraper.Cookies = cookies; } for (var i = 0; i < positional.Length; i++) { if (dynamicParameters.TryGetValue(positional[i], out var member)) { var t = member.DeclaringType; if (commandLineScraper == null || !(t.IsAssignableFrom(commandLineScraper.GetType()))) { commandLineScraper = (WebsiteScraper)Activator.CreateInstance(t); Program.InitScraperDefaults(commandLineScraper); commandLineScraper.Cookies = cookies; } var next = i + 1 < positional.Length ? positional[i + 1] : null; string val = string.Empty; var hasValue = false; if (next != null && !next.StartsWith("--")) { val = next; i++; hasValue = true; } object v = null; var mt = (member as PropertyInfo)?.PropertyType ?? ((FieldInfo)member).FieldType; mt = Nullable.GetUnderlyingType(mt) ?? mt; if (mt == typeof(bool)) { if (val.In(string.Empty, "1", "y", "yes", "true")) { v = true; } else if (val.In("0", "n", "no", "false")) { v = false; } else { throw new Exception("Cannot parse boolean: " + val); } } else { if (!hasValue) { throw new Exception("Missing value for parameter " + positional[i]); } if (mt == typeof(string)) { v = val; } else if (mt == typeof(string[])) { v = val.SplitFast(',', StringSplitOptions.RemoveEmptyEntries); } else if (mt == typeof(double) || mt == typeof(float)) { v = Convert.ChangeType(double.Parse(val), mt); } else { v = Convert.ChangeType(decimal.Parse(val), mt); } } if (member is PropertyInfo p) { p.SetValue(commandLineScraper, v); } else { ((FieldInfo)member).SetValue(commandLineScraper, v); } } } if (commandLineScraper != null) { using (commandLineScraper) { //commandLineScraper.DatabaseSaveInterval = TimeSpan.FromMinutes(10); if (Configuration_Destination != null) { if (Configuration_Destination.Contains("/") || Configuration_Destination.Contains("\\")) { commandLineScraper.DestinationDirectory = Path.GetFullPath(Configuration_Destination); } else { commandLineScraper.DestinationSuggestedName = Configuration_Destination; } } if (Configuration_SiteUrl != null) { commandLineScraper.AddToCrawl(Configuration_SiteUrl.AsUri()); } commandLineScraper.PerformInitialization(); if (commandLineScraper is FacebookScraper) { var postsCsv = Path.Combine(commandLineScraper.DestinationDirectory, "Posts.csv"); if ( !Configuration_RetryFailed && !Configuration_ReconsiderAll && !Configuration_FacebookUpdate && File.Exists(postsCsv)) { return; } if (Configuration_FacebookUpdate) { var candidates = WarcItem.ReadIndex(Path.Combine(commandLineScraper.DestinationDirectory, "index.cdx")); /*string user = null; * first.OpenStream((key, value) => { * if (key == "Cookie") user = value.TryCaptureBetween((Utf8String)"c_user="******";")?.ToString(); * }).Dispose();*/ // the id might also be in the redirected page var user = candidates.Take(2).Select(x => x.ReadText().TryCaptureBetween("USER_ID\":\"", "\"")).FirstOrDefault(x => x != null); string filename = user == "100009411610822" ? "facebookcookiesanisea.txt" : user == "1485885333" ? "facebookcookies.txt" : user == "100021796298990" ? "facebookcookiesgloria.txt" : user == null || user == "0" ? (string)null : throw new ArgumentException("Unknown facebook user: "******","), "yyyy-MM-dd HH:mm:ss", CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal); var fb = (FacebookScraper)commandLineScraper; fb.UpdateUpTo = stopAt; if (File.Exists(Path.Combine(commandLineScraper.DestinationDirectory, "fbgroup-not-a-member"))) { return; } if (File.Exists(Path.Combine(commandLineScraper.DestinationDirectory, "fberror"))) { return; } fb.SetStatus(fb.Root, UrlStatus.ToCrawl); if (long.TryParse(fb.Page, out var id)) { var url = "https://www.facebook.com/profile.php?id=" + id; if (fb.GetStatus(url) != UrlStatus.UnknownUrl) { fb.SetStatus(url.AsUri(), UrlStatus.ToCrawl); } } if (filename != null) { filename = Path.Combine(@"C:\Users\Andrea\OneDrive\QwawaDesktop\Scraping", filename); commandLineScraper.Cookies = File.ReadAllText(filename).Trim().TrimStart("Cookie:").Trim(); } } } if (Configuration_ReconsiderAll) { commandLineScraper.ReconsiderForScraping("**"); } else { commandLineScraper.ReconsiderSkippedUrls(); } if (Configuration_RetryFailed) { commandLineScraper.ReconsiderFailedUrls(); } /* * foreach (var line in File.ReadAllLines(@"C:\Users\Andrea\Desktop\arcilesbica-robots.txt")) * { * var l = line.Trim(); * if (l.Length == 0) continue; * commandLineScraper.AddToCrawl("http://www.arcilesbica.it"+l); * } */ if (!Configuration_FacebookMakeCsv) { await commandLineScraper.ScrapeAsync(); } if (commandLineScraper is FacebookScraper f) { //f.AddImagesToCrawl(); //f.ReconsiderSkippedUrls(); //await f.ScrapeAsync(); Console.WriteLine("Creating post list"); f.CreatePostList(); } } return; } #if SCRAPING_SANDBOX using (var scraper = new WebsiteScraper()) { InitScraperDefaults(scraper); /* * scraper.DestinationSuggestedName = "fb-anisea-friends"; * scraper.Cookies = File.ReadAllText(@"C:\Repositories\Awdee\Awdee2.Declarative\phantomjs\facebookcookiesanisea.txt").Trim(); * scraper.ShouldScrape = (url, prereq) => * { * if (prereq) return true; * return false; * }; * scraper.ForceLinks = "a:link-has-host-path('m.facebook.com'; '/friends/center/friends/')"; * scraper.AddToCrawl("https://m.facebook.com/friends/center/friends/", true); */ /* * var friends = scraper.CdxIndex.Values.Where(x => x.Url.Contains("/friends/center/friends/")) * .SelectMany(x => * { * var zz = x.ReadHtml(); * return zz.FindAll("div.w.bk").Select(z=> { * var m = z.TryGetValue("div.bn.bo", pattern: @"^([,\d]+) mutual"); * return new FacebookFriend * { * Name = z.TryGetValue("img", "alt"), * MutualFriends = m != null ? int.Parse(m.Replace(",", string.Empty)) : -1, * Id = long.Parse(z.GetLinkUrl("a").GetQueryParameter("uid")) * }; * }); * }).ToList(); * JsonFile.Save(friends, Path.Combine(scraper.DestinationDirectory, "friends.json")); */ /* * var friends = JsonFile.Read<List<FacebookFriend>>(Path.Combine(scraper.DestinationDirectory, "friends.json")) * .OrderByDescending(x => x.MutualFriends); */ //friends.OpenExcel(); /* * await friends.ForEachThrottledAsync(async x => * { * Console.WriteLine(x.Name); * var p = await scraper.GetHtmlNodeAsync(("https://m.facebook.com/" + x.Id).AsUri()); * }, Debugger.IsAttached ? 1 : scraper.Parallelism); */ /* * foreach (var item in scraper.CdxIndex.Values.Where(x=>x.Url.EndsWith("_rdr"))) * { * Console.WriteLine(item.Url); * scraper.CrawlLinks(item.ReadHtml()); * } * scraper.SaveDatabase(); * await scraper.ScrapeAsync(); */ /* * var actuallyExisting = new HashSet<string>(scraper.CdxIndex.Keys); * * foreach (var item in scraper.GetScrapedUrls(false)) * { * if (!actuallyExisting.Contains(item.AbsoluteUri)) * { * scraper.SetStatus(item, UrlStatus.ToCrawl); * } * } * return;*/ var resourcesOnly = true; scraper.AddToCrawl("http://knowyourmeme.com/"); if (!resourcesOnly) { scraper.Parallelism = 2; scraper.InterRequestDelay = TimeSpan.FromSeconds(0.4); } scraper.HtmlReceived += (a, b, page) => { if (page.FindSingle("h3:contains('include your ip address then there')") != null) { scraper.Dispose(); throw new Exception("IP banned."); } }; scraper.RewriteLink = (url) => { if (url.Query == "?fb") { return(url.GetLeftPart(UriPartial.Path).AsUri()); } return(url); }; scraper.ShouldScrape = (url, prereq) => { if (url.Contains("kym") && url.EndsWith(".jpg")) { } if (resourcesOnly && url.IsHostedOn("knowyourmeme.com")) { return(false); } if (!resourcesOnly && !url.IsHostedOn("knowyourmeme.com")) { return(false); } //if (url.IsHostedOn("kym-cdn.com")) return false; // todo if (url.IsHostedOn("imgur.com")) { return(false); // todo } if (url.IsHostedOn("meme.am")) { return(false); } //if (!url.IsHostedOn("kym-cdn.com")) return false; if (prereq) { return(true); } if (!scraper.IsSubfolderOfFirstUrl(url)) { return(false); } if (url.HasQueryParameters()) { return(false); } if (url.AbsolutePath == "/") { return(true); } if (url.PathEndsWith("/photos/page/1")) { return(false); } if (url.PathEndsWith("/videos/page/1")) { return(false); } if (url.PathEndsWith("/editorships")) { return(false); } if (url.PathEndsWith("/ask")) { return(false); } if (url.PathContainsComponent("/photos/trending")) { return(false); } if (url.PathContainsComponent("/videos/trending")) { return(false); } //if (url.PathStartsWith("/page/")) return true; if (url.PathContainsComponent("/memes/popular/")) { return(false); } if (url.PathContainsComponent("/deadpool/")) { return(false); } if (url.PathContainsComponent("/memes/researching/page/")) { return(false); } if (url.PathContainsComponent("/edits/")) { return(false); } if (url.PathContainsComponent("/sort/")) { return(false); } if (url.PathContainsComponent("/favorites/")) { return(false); } if (url.PathContainsComponent("/new/")) { return(false); } if (url.PathStartsWith("/memes/")) { return(true); } //if (url.PathContains("/photos/")) return false; //if (url.PathEndsWith("/children")) return false; return(false); }; scraper.ReconsiderForScraping("**"); //await scraper.ScrapeFailedImagesFromKnownHostsAsync(); await scraper.ScrapeAsync(); } #else Console.WriteLine("Bad command line."); #endif }