/// <summary>
/// Command-line entry point of the scraper.
/// </summary>
/// <remarks>
/// Steps, in the order the code performs them:
/// 1. Installs the TLD rules callback (reads "Awdee2.Declarative/effective_tld_names.dat") and the
///    HtmlDocument page-URL converter.
/// 2. If Configuration_MakeCdx is not null, calls WarcCdxItemRaw.GenerateCdx on that directory
///    (or the current directory when the value is empty) and returns.
/// 3. Reads the cookie header from Configuration_Cookies or, failing that, from the file named by
///    Configuration_CookieFile, and strips a leading "Cookie:" prefix.
/// 4. Reflects over every type in this assembly assignable to WebsiteScraper and maps each public,
///    declared, instance field/property accepted by IsMemberTypeOkForDynamicParameter to a switch named
///    "--" + dasherized type name (without the "Scraper" suffix; "Site" for WebsiteScraper itself)
///    + "-" + dasherized member name.
/// 5. Scans Environment.GetCommandLineArgs() for those switches. The declaring scraper type is
///    instantiated when no compatible scraper exists yet. Booleans may omit their value (treated as true);
///    string, comma-separated string[], double/float are parsed directly; every other type goes through
///    decimal.Parse followed by Convert.ChangeType.
/// 6. If a scraper was created: applies Configuration_Destination (a value containing '/' or '\' is a
///    directory, anything else a suggested name), queues Configuration_SiteUrl, initializes the scraper,
///    runs the FacebookScraper-specific update logic, reconsiders URLs according to
///    Configuration_ReconsiderAll / Configuration_RetryFailed, scrapes unless Configuration_FacebookMakeCsv
///    is set, and finally builds the post list for a FacebookScraper. Then returns.
/// 7. Otherwise: when SCRAPING_SANDBOX is defined, runs the hard-coded knowyourmeme.com sandbox crawl;
///    when it is not, prints "Bad command line.".
///
/// NOTE(review): in this copy of the file the text around the two "******" markers in the Facebook
/// update branch looks redacted or corrupted: the declaration of stopAt and the end of the
/// "Unknown facebook user" throw expression are not visible. Restore it from version control before
/// editing that branch.
/// NOTE(review): the Facebook update branch reads cookie files from a hard-coded absolute path under
/// C:\Users\Andrea\OneDrive and selects them by hard-coded user ids.
/// </remarks>
public static async Task MainAsync() { Shaman.Runtime.Tld.GetTldRulesCallback = () => File.ReadAllText(ConfigurationManager.CombineRepositoryOrEntrypointPath("Awdee2.Declarative/effective_tld_names.dat")); HtmlDocument.CustomPageUrlTypeConverter = x => ((LazyUri)x).Url; if (Configuration_MakeCdx != null) { WarcCdxItemRaw.GenerateCdx(string.IsNullOrEmpty(Configuration_MakeCdx) ? Directory.GetCurrentDirectory() : Configuration_MakeCdx); return; } var cookies = (Configuration_Cookies ?? (Configuration_CookieFile != null ? File.ReadAllText(Configuration_CookieFile) : null))?.Trim().TrimStart("Cookie:").Trim(); var allDynamicParameters = typeof(Program) .GetTypeInfo() .Assembly .GetTypes() .Where(x => typeof(WebsiteScraper).IsAssignableFrom(x)) .SelectMany(x => ((IEnumerable <MemberInfo>)x.GetFields(BindingFlags.Public | BindingFlags.Instance | BindingFlags.DeclaredOnly).Where(y => IsMemberTypeOkForDynamicParameter(y.FieldType))).Concat(x.GetProperties(BindingFlags.Public | BindingFlags.Instance | BindingFlags.DeclaredOnly).Where(y => IsMemberTypeOkForDynamicParameter(y.PropertyType)))); var dynamicParameters = new Dictionary <string, MemberInfo>(); foreach (var item in allDynamicParameters) { var name = "--" + Dasherize((item.DeclaringType == typeof(WebsiteScraper) ? 
"Site" : item.DeclaringType.Name.TrimEnd("Scraper"))) + "-" + Dasherize(item.Name); dynamicParameters.Add(name, item); } //File.WriteAllLines(@"c:\temp\-scraping-extra-parameters.txt", dynamicParameters.Keys.OrderBy(x => x), Encoding.UTF8); var positional = Environment.GetCommandLineArgs(); //Caching.EnableWebCache("/Awdee/Cache/DomEmulation"); WebsiteScraper commandLineScraper = null; if (Configuration_SiteUrl != null) { commandLineScraper = new WebsiteScraper(); Program.InitScraperDefaults(commandLineScraper); if (Configuration_SiteRules != null) { commandLineScraper.Rules = Configuration_SiteRules; } commandLineScraper.Cookies = cookies; } for (var i = 0; i < positional.Length; i++) { if (dynamicParameters.TryGetValue(positional[i], out var member)) { var t = member.DeclaringType; if (commandLineScraper == null || !(t.IsAssignableFrom(commandLineScraper.GetType()))) { commandLineScraper = (WebsiteScraper)Activator.CreateInstance(t); Program.InitScraperDefaults(commandLineScraper); commandLineScraper.Cookies = cookies; } var next = i + 1 < positional.Length ? positional[i + 1] : null; string val = string.Empty; var hasValue = false; if (next != null && !next.StartsWith("--")) { val = next; i++; hasValue = true; } object v = null; var mt = (member as PropertyInfo)?.PropertyType ?? ((FieldInfo)member).FieldType; mt = Nullable.GetUnderlyingType(mt) ?? 
mt; if (mt == typeof(bool)) { if (val.In(string.Empty, "1", "y", "yes", "true")) { v = true; } else if (val.In("0", "n", "no", "false")) { v = false; } else { throw new Exception("Cannot parse boolean: " + val); } } else { if (!hasValue) { throw new Exception("Missing value for parameter " + positional[i]); } if (mt == typeof(string)) { v = val; } else if (mt == typeof(string[])) { v = val.SplitFast(',', StringSplitOptions.RemoveEmptyEntries); } else if (mt == typeof(double) || mt == typeof(float)) { v = Convert.ChangeType(double.Parse(val), mt); } else { v = Convert.ChangeType(decimal.Parse(val), mt); } } if (member is PropertyInfo p) { p.SetValue(commandLineScraper, v); } else { ((FieldInfo)member).SetValue(commandLineScraper, v); } } } if (commandLineScraper != null) { using (commandLineScraper) { //commandLineScraper.DatabaseSaveInterval = TimeSpan.FromMinutes(10); if (Configuration_Destination != null) { if (Configuration_Destination.Contains("/") || Configuration_Destination.Contains("\\")) { commandLineScraper.DestinationDirectory = Path.GetFullPath(Configuration_Destination); } else { commandLineScraper.DestinationSuggestedName = Configuration_Destination; } } if (Configuration_SiteUrl != null) { commandLineScraper.AddToCrawl(Configuration_SiteUrl.AsUri()); } commandLineScraper.PerformInitialization(); if (commandLineScraper is FacebookScraper) { var postsCsv = Path.Combine(commandLineScraper.DestinationDirectory, "Posts.csv"); if ( !Configuration_RetryFailed && !Configuration_ReconsiderAll && !Configuration_FacebookUpdate && File.Exists(postsCsv)) { return; } if (Configuration_FacebookUpdate) { var candidates = WarcItem.ReadIndex(Path.Combine(commandLineScraper.DestinationDirectory, "index.cdx")); /*string user = null; * first.OpenStream((key, value) => { * if (key == "Cookie") user = value.TryCaptureBetween((Utf8String)"c_user="******";")?.ToString(); * }).Dispose();*/ // the id might also be in the redirected page var user = candidates.Take(2).Select(x 
=> x.ReadText().TryCaptureBetween("USER_ID\":\"", "\"")).FirstOrDefault(x => x != null); string filename = user == "100009411610822" ? "facebookcookiesanisea.txt" : user == "1485885333" ? "facebookcookies.txt" : user == "100021796298990" ? "facebookcookiesgloria.txt" : user == null || user == "0" ? (string)null : throw new ArgumentException("Unknown facebook user: "******","), "yyyy-MM-dd HH:mm:ss", CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal); var fb = (FacebookScraper)commandLineScraper; fb.UpdateUpTo = stopAt; if (File.Exists(Path.Combine(commandLineScraper.DestinationDirectory, "fbgroup-not-a-member"))) { return; } if (File.Exists(Path.Combine(commandLineScraper.DestinationDirectory, "fberror"))) { return; } fb.SetStatus(fb.Root, UrlStatus.ToCrawl); if (long.TryParse(fb.Page, out var id)) { var url = "https://www.facebook.com/profile.php?id=" + id; if (fb.GetStatus(url) != UrlStatus.UnknownUrl) { fb.SetStatus(url.AsUri(), UrlStatus.ToCrawl); } } if (filename != null) { filename = Path.Combine(@"C:\Users\Andrea\OneDrive\QwawaDesktop\Scraping", filename); commandLineScraper.Cookies = File.ReadAllText(filename).Trim().TrimStart("Cookie:").Trim(); } } } if (Configuration_ReconsiderAll) { commandLineScraper.ReconsiderForScraping("**"); } else { commandLineScraper.ReconsiderSkippedUrls(); } if (Configuration_RetryFailed) { commandLineScraper.ReconsiderFailedUrls(); } /* * foreach (var line in File.ReadAllLines(@"C:\Users\Andrea\Desktop\arcilesbica-robots.txt")) * { * var l = line.Trim(); * if (l.Length == 0) continue; * commandLineScraper.AddToCrawl("http://www.arcilesbica.it"+l); * } */ if (!Configuration_FacebookMakeCsv) { await commandLineScraper.ScrapeAsync(); } if (commandLineScraper is FacebookScraper f) { //f.AddImagesToCrawl(); //f.ReconsiderSkippedUrls(); //await f.ScrapeAsync(); Console.WriteLine("Creating post list"); f.CreatePostList(); } } return; } #if SCRAPING_SANDBOX using (var scraper = new 
WebsiteScraper()) { InitScraperDefaults(scraper); /* * scraper.DestinationSuggestedName = "fb-anisea-friends"; * scraper.Cookies = File.ReadAllText(@"C:\Repositories\Awdee\Awdee2.Declarative\phantomjs\facebookcookiesanisea.txt").Trim(); * scraper.ShouldScrape = (url, prereq) => * { * if (prereq) return true; * return false; * }; * scraper.ForceLinks = "a:link-has-host-path('m.facebook.com'; '/friends/center/friends/')"; * scraper.AddToCrawl("https://m.facebook.com/friends/center/friends/", true); */ /* * var friends = scraper.CdxIndex.Values.Where(x => x.Url.Contains("/friends/center/friends/")) * .SelectMany(x => * { * var zz = x.ReadHtml(); * return zz.FindAll("div.w.bk").Select(z=> { * var m = z.TryGetValue("div.bn.bo", pattern: @"^([,\d]+) mutual"); * return new FacebookFriend * { * Name = z.TryGetValue("img", "alt"), * MutualFriends = m != null ? int.Parse(m.Replace(",", string.Empty)) : -1, * Id = long.Parse(z.GetLinkUrl("a").GetQueryParameter("uid")) * }; * }); * }).ToList(); * JsonFile.Save(friends, Path.Combine(scraper.DestinationDirectory, "friends.json")); */ /* * var friends = JsonFile.Read<List<FacebookFriend>>(Path.Combine(scraper.DestinationDirectory, "friends.json")) * .OrderByDescending(x => x.MutualFriends); */ //friends.OpenExcel(); /* * await friends.ForEachThrottledAsync(async x => * { * Console.WriteLine(x.Name); * var p = await scraper.GetHtmlNodeAsync(("https://m.facebook.com/" + x.Id).AsUri()); * }, Debugger.IsAttached ? 
1 : scraper.Parallelism); */ /* * foreach (var item in scraper.CdxIndex.Values.Where(x=>x.Url.EndsWith("_rdr"))) * { * Console.WriteLine(item.Url); * scraper.CrawlLinks(item.ReadHtml()); * } * scraper.SaveDatabase(); * await scraper.ScrapeAsync(); */ /* * var actuallyExisting = new HashSet<string>(scraper.CdxIndex.Keys); * * foreach (var item in scraper.GetScrapedUrls(false)) * { * if (!actuallyExisting.Contains(item.AbsoluteUri)) * { * scraper.SetStatus(item, UrlStatus.ToCrawl); * } * } * return;*/ var resourcesOnly = true; scraper.AddToCrawl("http://knowyourmeme.com/"); if (!resourcesOnly) { scraper.Parallelism = 2; scraper.InterRequestDelay = TimeSpan.FromSeconds(0.4); } scraper.HtmlReceived += (a, b, page) => { if (page.FindSingle("h3:contains('include your ip address then there')") != null) { scraper.Dispose(); throw new Exception("IP banned."); } }; scraper.RewriteLink = (url) => { if (url.Query == "?fb") { return(url.GetLeftPart(UriPartial.Path).AsUri()); } return(url); }; scraper.ShouldScrape = (url, prereq) => { if (url.Contains("kym") && url.EndsWith(".jpg")) { } if (resourcesOnly && url.IsHostedOn("knowyourmeme.com")) { return(false); } if (!resourcesOnly && !url.IsHostedOn("knowyourmeme.com")) { return(false); } //if (url.IsHostedOn("kym-cdn.com")) return false; // todo if (url.IsHostedOn("imgur.com")) { return(false); // todo } if (url.IsHostedOn("meme.am")) { return(false); } //if (!url.IsHostedOn("kym-cdn.com")) return false; if (prereq) { return(true); } if (!scraper.IsSubfolderOfFirstUrl(url)) { return(false); } if (url.HasQueryParameters()) { return(false); } if (url.AbsolutePath == "/") { return(true); } if (url.PathEndsWith("/photos/page/1")) { return(false); } if (url.PathEndsWith("/videos/page/1")) { return(false); } if (url.PathEndsWith("/editorships")) { return(false); } if (url.PathEndsWith("/ask")) { return(false); } if (url.PathContainsComponent("/photos/trending")) { return(false); } if (url.PathContainsComponent("/videos/trending")) { 
return(false); } //if (url.PathStartsWith("/page/")) return true; if (url.PathContainsComponent("/memes/popular/")) { return(false); } if (url.PathContainsComponent("/deadpool/")) { return(false); } if (url.PathContainsComponent("/memes/researching/page/")) { return(false); } if (url.PathContainsComponent("/edits/")) { return(false); } if (url.PathContainsComponent("/sort/")) { return(false); } if (url.PathContainsComponent("/favorites/")) { return(false); } if (url.PathContainsComponent("/new/")) { return(false); } if (url.PathStartsWith("/memes/")) { return(true); } //if (url.PathContains("/photos/")) return false; //if (url.PathEndsWith("/children")) return false; return(false); }; scraper.ReconsiderForScraping("**"); //await scraper.ScrapeFailedImagesFromKnownHostsAsync(); await scraper.ScrapeAsync(); } #else Console.WriteLine("Bad command line."); #endif }
/// <summary>
/// Appends one WARC/1.0 record (request or response) containing the raw HTTP message in
/// <paramref name="req"/> to the current output, and for responses registers a <see cref="WarcItem"/>
/// describing it.
/// </summary>
/// <param name="url">Value written as WARC-Target-URI; also used to parse the HTTP response.</param>
/// <param name="isresponse">True to write a response record, false to write a request record.</param>
/// <param name="req">The raw HTTP message bytes. Its full content is written as the record body.</param>
/// <param name="date">Timestamp written as WARC-Date, truncated to seconds and suffixed with "Z".
/// NOTE(review): no time-zone conversion is done here, so the caller presumably passes UTC - confirm.</param>
/// <param name="ip">Value written as WARC-IP-Address.</param>
/// <param name="recordId">UUID written as WARC-Record-ID.</param>
/// <param name="concurrentTo">UUID written as WARC-Concurrent-To, or null to omit that header.</param>
/// <param name="shamanUrl">Optional URL; when its AbsoluteUri differs from <paramref name="url"/> it is
/// written as WARC-Shaman-URI and becomes the Url of the returned item.</param>
/// <returns>The index entry for a response record; null for a request record.</returns>
/// <exception cref="ArgumentNullException"><paramref name="req"/> is null.</exception>
public WarcItem WriteRecord(string url, bool isresponse, MemoryStream req, DateTime date, string ip, string recordId, string concurrentTo, LazyUri shamanUrl)
{
    // Fail before anything is written, instead of leaving a half-started record behind.
    if (req == null)
    {
        throw new ArgumentNullException(nameof(req));
    }

    var initialPosition = outstream.Position;
    StartRecord();

    currentRecord.WriteClrStringLine("WARC/1.0");
    if (isresponse)
    {
        currentRecord.WriteClrStringLine("WARC-Type: response");
        currentRecord.WriteClrStringLine("Content-Type: application/http;msgtype=response");
    }
    else
    {
        currentRecord.WriteClrStringLine("WARC-Type: request");
        currentRecord.WriteClrStringLine("Content-Type: application/http;msgtype=request");
    }

    currentRecord.WriteClrString("WARC-Date: ");
    // "o" yields yyyy-MM-ddTHH:mm:ss.fffffff...; the first 19 characters drop the fractional seconds.
    currentRecord.WriteClrString(date.ToString("o").Substring(0, 19));
    currentRecord.WriteClrStringLine("Z");

    currentRecord.WriteClrString("WARC-Record-ID: <urn:uuid:");
    currentRecord.WriteClrString(recordId);
    currentRecord.WriteClrStringLine(">");

    currentRecord.WriteClrString("WARC-Target-URI: ");
    currentRecord.WriteClrStringLine(url);

    if (shamanUrl != null)
    {
        var abs = shamanUrl.AbsoluteUri;
        if (abs != url)
        {
            currentRecord.WriteClrString("WARC-Shaman-URI: ");
            currentRecord.WriteClrStringLine(abs);
        }
    }

    currentRecord.WriteClrString("WARC-IP-Address: ");
    currentRecord.WriteClrStringLine(ip);

    if (concurrentTo != null)
    {
        currentRecord.WriteClrString("WARC-Concurrent-To: <urn:uuid:");
        currentRecord.WriteClrString(concurrentTo);
        currentRecord.WriteClrStringLine(">");
    }

    currentRecord.WriteClrString("Content-Length: ");
    currentRecord.Write(req.Length);
    currentRecord.WriteLine();

    currentRecord.WriteClrString("WARC-Warcinfo-ID: <urn:uuid:");
    currentRecord.WriteClrString(WarcInfoId);
    currentRecord.WriteClrStringLine(">");

    // Blank line separating the WARC headers from the record body.
    currentRecord.WriteLine();

    if (req.TryGetBuffer(out var buf))
    {
        currentRecord.Write(buf.Array.Slice(buf.Offset, (int)req.Length));
    }
    else
    {
        // The stream wraps a buffer that is not publicly visible: fall back to a copy
        // rather than dereferencing the null array of the default segment.
        var copy = req.ToArray();
        currentRecord.Write(copy.Slice(0, copy.Length));
    }

    EndRecord();

    if (!isresponse)
    {
        return(null);
    }

    // Re-read the message that was just written to extract the HTTP metadata for the index entry.
    req.Seek(0, SeekOrigin.Begin);
    scratchpad.Reset();
    using (var http = new Utf8StreamReader(req, true))
    using (var s = WarcItem.OpenHttp(http, scratchpad, url.AsUri(), req.Length, out var payloadLength, out var location, out var responseCode, out var contentType, out var lastModified, null))
    {
        if (payloadLength == -1)
        {
            // The length is not known up front: drain the payload stream and count the bytes.
            // A 64-bit counter is used so that very large payloads cannot wrap around.
            long total = 0;
            int read;
            while ((read = s.Read(lengthCalculationBuffer, 0, lengthCalculationBuffer.Length)) != 0)
            {
                total += read;
            }

            payloadLength = total;
        }

        var warcItem = new WarcItem()
        {
            Url = shamanUrl?.AbsoluteUri ?? url,
            Date = date,
            ContentType = contentType.ToStringCached(),
            LastModified = lastModified,
            PayloadLength = payloadLength,
            ResponseCode = (HttpStatusCode)responseCode,
            CompressedLength = outstream.Position - initialPosition,
            CompressedOffset = initialPosition,
            WarcFile = WarcName
        };
        recordedResponses.Add(warcItem);
        onNewWarcItem?.Invoke(warcItem);
        return(warcItem);
    }
}
/// <summary>
/// Builds the CDX index file <paramref name="cdx"/> by scanning every record of the given WARC files.
/// </summary>
/// <remarks>
/// Each record is read through its own <see cref="GZipStream"/> opened at the current position of the
/// WARC file. Only response records produce an index line, made of these space-separated fields:
/// URL (the Shaman URI when present, else the target URI), record start offset, record length in the
/// WARC file, 14-digit date or '-', WARC file name, HTTP response code or '-', content type,
/// payload length, last-modified date or '-'. Every line ends with a trailing space.
/// The index is written to "<paramref name="cdx"/>.tmp" first and then moved over <paramref name="cdx"/>.
/// A failure that leaves a WARC file positioned at its end is reported as a truncated file and scanning
/// continues with the next file; any other failure is rethrown.
/// </remarks>
/// <param name="cdx">Path of the index file to (re)create.</param>
/// <param name="warcs">Paths of the WARC files to index, processed in enumeration order.</param>
public static void GenerateCdx(string cdx, IEnumerable <string> warcs)
{
    const byte space = (byte)' ';
    const byte dash = (byte)'-';

    var pad = new Scratchpad();
    var readBuffer = new byte[16 * 1024];
    var temporaryCdx = cdx + ".tmp";

    using (var cdxStream = File.Open(temporaryCdx, FileMode.Create, FileAccess.Write, FileShare.Delete | FileShare.Read))
    using (var cdxWriter = new Utf8StreamWriter(cdxStream))
    {
        cdxWriter.WriteClrStringLine(WarcColumns);

        foreach (var warcPath in warcs)
        {
            Console.WriteLine(Path.GetFileName(warcPath));

            using (var warcStream = File.Open(warcPath, FileMode.Open, FileAccess.Read, FileShare.Delete | FileShare.Read))
            {
                try
                {
                    var warcFileName = new Utf8String(Path.GetFileName(warcPath));

                    while (warcStream.Position != warcStream.Length)
                    {
                        var recordStart = warcStream.Position;
                        long recordContentLength = -1;
                        long payloadLength = -1;
                        int responseCode = -1;
                        var contentType = Utf8String.Empty;
                        var timestamp = pad.Use(14);
                        timestamp[0] = 0; // stays 0 when the record carries no date header
                        Utf8String targetUrl = Utf8String.Empty;
                        Utf8String shamanUrl = Utf8String.Empty;
                        DateTime? lastModified = null;
                        bool isResponse = false;

                        using (var gz = new GZipStream(warcStream, CompressionMode.Decompress, true))
                        {
                            using (var reader = new Utf8StreamReader(gz, true))
                            {
                                // WARC headers, up to the first empty line.
                                for (;;)
                                {
                                    if (reader.IsCompleted)
                                    {
                                        throw new EndOfStreamException();
                                    }

                                    var headerLine = reader.ReadLine();
                                    if (headerLine.Length == 0)
                                    {
                                        break;
                                    }

                                    if (headerLine.Equals(Warc_Response))
                                    {
                                        isResponse = true;
                                    }

                                    if (headerLine.StartsWith(Warc_ContentLength))
                                    {
                                        recordContentLength = Utf8Utils.ParseInt64(WarcItem.GetHeaderValue(headerLine));
                                    }
                                    else if (headerLine.StartsWith(Warc_Date))
                                    {
                                        // Copy the digit groups at offsets 0, 5, 8, 11, 14 and 17 of the
                                        // header value into a contiguous 14-byte timestamp.
                                        var raw = WarcItem.GetHeaderValue(headerLine).Bytes;
                                        raw.Slice(0, 4).CopyTo(timestamp.Slice(0));
                                        raw.Slice(5, 2).CopyTo(timestamp.Slice(4));
                                        raw.Slice(8, 2).CopyTo(timestamp.Slice(6));
                                        raw.Slice(11, 2).CopyTo(timestamp.Slice(8));
                                        raw.Slice(14, 2).CopyTo(timestamp.Slice(10));
                                        raw.Slice(17, 2).CopyTo(timestamp.Slice(12));
                                    }
                                    else if (headerLine.StartsWith(Warc_URL))
                                    {
                                        targetUrl = pad.Copy(WarcItem.GetHeaderValue(headerLine));
                                    }

                                    if (headerLine.StartsWith(Warc_Shaman_URI))
                                    {
                                        shamanUrl = pad.Copy(WarcItem.GetHeaderValue(headerLine));
                                    }
                                }

                                if (recordContentLength == -1)
                                {
                                    throw new InvalidOperationException();
                                }

                                if (isResponse)
                                {
                                    using (var body = WarcItem.OpenHttp(reader, pad, new Uri(targetUrl.ToString()), recordContentLength, out var declaredLength, out var redirectLocation, out responseCode, out contentType, out lastModified, null))
                                    {
                                        // Drain the payload to measure it.
                                        long total = 0;
                                        int chunk;
                                        while ((chunk = body.Read(readBuffer, 0, readBuffer.Length)) != 0)
                                        {
                                            total += chunk;
                                        }

                                        payloadLength = total;
                                        if (declaredLength != -1 && declaredLength != payloadLength)
                                        {
                                            throw new Exception("Content-Length mismatch.");
                                        }
                                    }

                                    // The record body must be followed by exactly CR LF CR LF.
                                    for (var k = 0; k < 4; k++)
                                    {
                                        var expected = (k % 2 == 0) ? 13 : 10;
                                        if (reader.ReadByte() != expected)
                                        {
                                            throw new InvalidDataException();
                                        }
                                    }
                                }
                                else
                                {
                                    // Non-response record: skip its body without interpreting it.
                                    var toSkip = recordContentLength;
                                    while (toSkip != 0)
                                    {
                                        var skipped = reader.Read((int)Math.Min(toSkip, int.MaxValue));
                                        if (skipped.Count == 0)
                                        {
                                            throw new Exception();
                                        }

                                        toSkip -= skipped.Count;
                                    }

                                    // Two empty lines must follow, after which the reader must be exhausted.
                                    if (reader.ReadLine().Length != 0)
                                    {
                                        throw new InvalidDataException();
                                    }

                                    if (reader.ReadLine().Length != 0)
                                    {
                                        throw new InvalidDataException();
                                    }

                                    reader.ReadLine();
                                    if (!reader.IsCompleted)
                                    {
                                        throw new InvalidDataException();
                                    }
                                }

                                // Nothing may remain in this record's compressed stream.
                                if (reader.ReadByte() != -1)
                                {
                                    throw new InvalidDataException();
                                }
                            }

                            // Rewind the file by the amount GetRemainingUnusedBytes reports for this
                            // gzip stream, so the next iteration starts at the next record.
                            warcStream.Position -= GetRemainingUnusedBytes(gz);
                        }

                        if (isResponse)
                        {
                            cdxWriter.Write(shamanUrl.Length > 0 ? shamanUrl : targetUrl);
                            cdxWriter.Write(space);
                            cdxWriter.Write(recordStart);
                            cdxWriter.Write(space);
                            cdxWriter.Write(warcStream.Position - recordStart);
                            cdxWriter.Write(space);

                            if (timestamp[0] == 0)
                            {
                                cdxWriter.Write(dash);
                            }
                            else
                            {
                                cdxWriter.Write(timestamp);
                            }

                            cdxWriter.Write(space);
                            cdxWriter.Write(warcFileName);
                            cdxWriter.Write(space);

                            if (responseCode == -1)
                            {
                                cdxWriter.Write(dash);
                            }
                            else
                            {
                                cdxWriter.Write(responseCode);
                            }

                            cdxWriter.Write(space);
                            cdxWriter.Write(contentType);
                            cdxWriter.Write(space);
                            cdxWriter.Write(payloadLength);
                            cdxWriter.Write(space);

                            if (lastModified.HasValue)
                            {
                                WriteDate(cdxWriter, lastModified.Value);
                            }
                            else
                            {
                                cdxWriter.Write(dash);
                            }

                            cdxWriter.Write(space);
                            cdxWriter.WriteLine();
                        }

                        pad.Reset();
                    }
                }
                catch
                {
                    // Only a failure that leaves the file positioned at its end is tolerated.
                    if (warcStream.Position != warcStream.Length)
                    {
                        throw;
                    }

                    Console.WriteLine("WARNING: truncated WARC.");
                }
            }
        }
    }

    File.Delete(cdx);
    File.Move(temporaryCdx, cdx);
}