/// <summary>
/// Configures the crawl for the Facebook page, group or profile identified by <c>Page</c>.
/// </summary>
/// <remarks>
/// Steps, in order:
/// <list type="number">
/// <item>If <c>Page</c> is an http(s) URL it is reduced to a path (or to the <c>id</c> query
/// parameter for <c>/profile.php</c> URLs) and written back to <c>Page</c>.</item>
/// <item>A root URI on <c>m.facebook.com</c> is built from it, and
/// <c>DestinationSuggestedName</c> is derived from that root when it has not been set.</item>
/// <item>Depending on <c>ScrapeMobileVersion</c>, the desktop (<c>www.facebook.com</c>) or the
/// mobile handlers for <c>ShouldScrape</c> / <c>CollectAdditionalLinks</c> are installed and the
/// root is queued with <c>AddToCrawl</c>.</item>
/// <item><c>Root</c> is set to the final root URI.</item>
/// </list>
/// </remarks>
protected override void Initialize()
{
    // this.Parallelism = Math.Min(this.Parallelism, 6);
    var p = Page;
    if (p.StartsWith("http:", StringComparison.Ordinal) || p.StartsWith("https:", StringComparison.Ordinal))
    {
        // Page was given as a full URL: keep only the part that identifies the target.
        var uu = p.AsUri();
        p = uu.AbsolutePath.Trim('/');
        if (p.StartsWith("groups/", StringComparison.Ordinal))
        {
            p = "groups/" + uu.GetPathComponent(1);
        }
        else if (uu.AbsolutePath.StartsWith("/profile.php", StringComparison.Ordinal))
        {
            p = uu.GetQueryParameter("id");
        }

        Page = p;
    }

    // "groups-xyz" is accepted as an alternative spelling of "groups/xyz".
    if (p.StartsWith("groups-", StringComparison.Ordinal))
    {
        p = "groups/" + p.TrimStart("groups-");
    }

    var root = ("https://m.facebook.com/" + p.TrimStart("/")).AsUri();

    if (this.DestinationSuggestedName == null)
    {
        this.DestinationSuggestedName = "facebook-" + (root.GetQueryParameter("id") ?? root.AbsolutePath.TrimStart("/").TrimEnd("/").Replace("/", "-"));
        if (!ScrapeMobileVersion)
        {
            this.DestinationSuggestedName += "-desktop";
        }
    }

    if (!ScrapeMobileVersion)
    {
        // Shared by the handlers below to convert between DateTime and Unix seconds.
        var epoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);

        DatabaseInitialized += () =>
        {
            if (fburl != null && fburl.Contains("group/"))
            {
                InitGroup();
            }
        };

        // URLs containing "/ajax/" get a 1000-point lower value; otherwise the value is the URL length.
        UrlPriorityDelegate = x => (x.Contains("/ajax/") ? -1000 : 0) + x.Length;

        // The desktop crawl uses the same path and query, but on www.facebook.com.
        root = ("https://www.facebook.com" + root.PathAndQuery).AsUri();

        this.ShouldScrape = (url, prereq) =>
        {
            // Scripts and stylesheets served from fbcdn.net are never downloaded.
            if (url.IsHostedOn("fbcdn.net")
                && (url.AbsolutePath.EndsWith(".js", StringComparison.Ordinal) || url.AbsolutePath.EndsWith(".css", StringComparison.Ordinal)))
            {
                return false;
            }

            //if (url.IsHostedOn("fbcdn.net")) return false;
            if (prereq && DownloadThumbnailsAndCss)
            {
                return true;
            }

            if (Utils.Equals(url, root))
            {
                return true;
            }

            if (url.IsHostedOn("facebook.com") && url.HasExactlyQueryParameters("id") && url.GetQueryParameter("id") == Page)
            {
                return true;
            }

            if (url.Fragment.StartsWith("#$", StringComparison.Ordinal))
            {
                if (url.PathContains("GroupEntstreamPagelet"))
                {
                    return true;
                }

                if (url.PathContains("/ProfileTimelineSectionPagelet"))
                {
                    return true;
                }
            }

            if (url.PathStartsWith("/pages_reaction_units"))
            {
                return true;
            }

            return false;
        };

        this.AddToCrawl(root);

        NonHtmlReceived += (url, easy, body) =>
        {
            if (url.PathStartsWith("/pages_reaction_units"))
            {
                ProcessPageReactionUnitsRequest(url, body);
            }
        };

        // Handler 1: runs its logic only for the root page (or the equivalent "?id=<Page>" URL).
        // Reads the app deep link from the page, classifies the target as group / page / profile
        // and queues the first requests for that kind of target.
        CollectAdditionalLinks += (url, page) =>
        {
            if (Stop)
            {
                return null;
            }

            var isRootPage = HttpUtils.UrisEqual(url, root)
                || (url.IsHostedOn("facebook.com") && url.HasExactlyQueryParameters("id") && url.GetQueryParameter("id") == Page);
            if (!isRootPage)
            {
                return null;
            }

            fburl = page.TryGetValue(":property('al:android:url')");

            // An empty marker file in the destination directory records that the deep link was missing.
            var sentinel = Path.Combine(
                DestinationDirectory,
                url.AbsolutePath.StartsWith("/groups/", StringComparison.Ordinal) ? "fbgroup-not-a-member" : "fberror");
            if (fburl == null)
            {
                File.WriteAllText(sentinel, string.Empty, Encoding.ASCII);
                return Enumerable.Empty<(Uri, bool)>();
            }

            File.Delete(sentinel);
            fbNumericId = fburl.Capture(@"/(\d+)(\?.*)?$");

            if (fburl.Contains("group/"))
            {
                InitGroup();
            }
            else if (fburl.Contains("page/"))
            {
                var initialUrl = ReadFromExample("fbpage");
                //defaultParameters = initialUrl.FragmentParameters.ToDictionary();
                initialUrl.AppendQueryParameter("page_id", fbNumericId);
                if (UpdateUpTo != null)
                {
                    refreshId = DateTime.UtcNow.ToString("yyyyMMdd");
                    initialUrl.AppendQueryParameter("x-refresh", refreshId);
                }

                AddToCrawl(initialUrl.Url);
            }
            else if (fburl.Contains("profile/"))
            {
                AddToCrawl(root);

                // Queued in this exact order, each with the second AddToCrawl argument set to true.
                var profileSubpages = new[]
                {
                    "/about?section=overview",
                    "/about?section=education",
                    "/about?section=living",
                    "/about?section=contact-info",
                    "/about?section=relationship",
                    "/about?section=bio",
                    "/about?section=year-overviews",
                    "/likes",
                    "/photos",
                    "/friends",
                };
                foreach (var subpage in profileSubpages)
                {
                    AddToCrawl(root + subpage, true);
                }

                var pageletToken = page.TryGetValue(@"script:json-token('jscc_map:'):json-token('£pagelet_token')");
                if (pageletToken == null)
                {
                    pageletToken = page.GetValue("script:json-token('pagelet_token:')");
                }

                // The first timeline segment is requested with the current time in Unix seconds.
                AddFacebookProfileSegment((long)(DateTime.UtcNow - epoch).TotalSeconds, fbNumericId, pageletToken);
            }
            else
            {
                throw new NotSupportedException("Not supported: " + fburl);
            }

            return null;
        };

        // Handler 2: runs for every page. Collects outgoing links and article dates, reports
        // progress, and either queues the next profile segment or sets Stop once the limit date
        // has been passed.
        CollectAdditionalLinks += (url, page) =>
        {
            if (Stop)
            {
                return null;
            }

            // fburl is still null when the root page exposed no deep link (handler 1 returns
            // before classifying the target). Dereferencing it here used to throw a
            // NullReferenceException; an unknown target is now treated as "not a profile".
            // NOTE(review): the two checks look for different substrings ("profile/" vs
            // "/profile"); both are kept exactly as they were - confirm whether that is intended.
            var deepLink = fburl;
            var scanProfileArticles = deepLink != null && deepLink.Contains("profile/");
            var isProfile = deepLink != null && deepLink.Contains("/profile");

            //Console.WriteLine("Size: " + page.OuterHtml.Length);
            var html = page.OwnerDocument.IsJson()
                ? page.FindAll(Configuration_PageReactionUnitsSelector)
                : page.FindAll(Configuration_PageletExtractionSelector).Concat(new[] { RehydrateHtml(page) });

            var dates = new List<long>();
            var zzz = html.SelectMany(x =>
            {
                if (scanProfileArticles)
                {
                    foreach (var article in x.FindAll("[data-ftr]"))
                    {
                        var articleDate = GetRootArticleDate(article);
                        if (articleDate != null)
                        {
                            dates.Add(articleDate.Value);
                        }
                    }
                }

                var zz = x.DescendantsAndSelf().Select(y =>
                {
                    if (y.GetAttributeValue("data-ftr") != null)
                    {
                        var ut = GetRootArticleDate(y);
                        if (ut != null)
                        {
                            dates.Add(ut.Value);
                        }
                    }

                    var z = y.TryGetLinkUrl();
                    if (z != null && !z.Contains("pages_reaction_unit"))
                    {
                        // Second tuple element: true for img/script tags and rel="stylesheet" links.
                        return (Url: z, y.TagName.In("img", "script") || y.GetAttributeValue("rel")?.ToLowerFast() == "stylesheet");
                    }

                    return (null, false);
                }).Where(y => y.Url != null).ToList();
                return zz;
            }).ToList();

            // NOTE(review): with two or more dates the first one is ignored when taking the
            // minimum - presumably to skip an out-of-order leading item; confirm.
            var mindate = dates.Count == 0 ? (long?)null
                : dates.Count == 1 ? dates[0]
                : dates.Skip(1).Min();

            if (mindate != null)
            {
                SetMainProgressStatus(epoch.AddSeconds(mindate.Value).ToString("MMM d, yyyy H:mm"));
            }

            var mustStop = mindate != null && IsBeforeLimitDate(mindate.Value);
            if (isProfile)
            {
                if (!mustStop && mindate != null)
                {
                    var pageletToken = html
                        .Select(x => x.TryGetValue("script:json-token('£jscc_map'):json-token('£pagelet_token')"))
                        .FirstOrDefault(x => x != null);
                    if (pageletToken != null)
                    {
                        // The next segment starts one second before the oldest date seen here.
                        AddFacebookProfileSegment(mindate.Value - 1, fbNumericId, pageletToken);
                    }
                }
            }
            else if (mustStop)
            {
                Stop = true;
            }

            return zzz;
        };
    }
    else
    {
        // Mobile site: follow the section-loading / "See More Posts" links, rewriting the
        // listed query parameters to fixed values and appending srw=1.
        this.CollectAdditionalLinks += (url, page) =>
        {
            return page.FindAll("a[href*='sectionLoadingID='],a[href*='bacr']:text-is('See More Posts')")
                .Select(x => x.TryGetLinkUrl())
                // Anchors whose link cannot be resolved are skipped instead of dereferencing null.
                .Where(u => u != null)
                .Select(u =>
                {
                    var sb = ReseekableStringBuilder.AcquirePooledStringBuilder();
                    sb.Append(u.GetLeftPart(UriPartial.Path));
                    HttpUtils.AppendQueryParameters(u.GetQueryParameters().Select(y =>
                    {
                        var val = y.Value;
                        if (y.Key == "timecutoff")
                        {
                            val = "0";
                        }
                        else if (y.Key == "refid")
                        {
                            val = "0";
                        }
                        else if (y.Key == "sectionLoadingID")
                        {
                            val = "x";
                        }
                        else if (y.Key == "yearSectionsYears")
                        {
                            val = "";
                        }

                        return new KeyValuePair<string, string>(y.Key, val);
                    }), sb);
                    HttpUtils.AppendQueryParameter("srw", "1", sb);
                    var rewritten = ReseekableStringBuilder.GetValueAndRelease(sb);
                    return (rewritten.AsUri(), false);
                });
        };

        this.ShouldScrape = (url, isPrerequisite) =>
        {
            if (DownloadThumbnailsAndCss && isPrerequisite)
            {
                return true;
            }

            if (url == root)
            {
                return true;
            }

            if (url.IsHostedOn("facebook.com") && url.HasExactlyQueryParameters("id") && url.GetQueryParameter("id") == Page)
            {
                return true;
            }

            if (!url.IsHostedOn("m.facebook.com"))
            {
                return false;
            }

            // Links rewritten by the CollectAdditionalLinks handler above carry the "srw" marker.
            if (url.GetQueryParameter("srw") != null)
            {
                return true;
            }

            if ((DownloadComments || DownloadFullSizeImages) && UrlRuleMatcher.IsSubfolderOf(url, root))
            {
                if (url.HasNoQueryParameters() && string.IsNullOrEmpty(url.Fragment))
                {
                    return true;
                }

                // Reject this variant, but queue the same URL stripped to its path.
                AddToCrawl(url.GetLeftPart(UriPartial.Path));
                return false;
            }

            return false;
        };

        this.AddToCrawl(root);
    }

    this.Root = root;
}