예제 #1
0
        /// <summary>
        /// Prepares the crawl for the Facebook target named by <c>Page</c>.
        /// </summary>
        /// <remarks>
        /// <para>
        /// <c>Page</c> may be a full http/https URL, a <c>groups-NAME</c> shorthand or a bare path.
        /// A URL is reduced to its path (or, for <c>/profile.php</c>, to its <c>id</c> query
        /// parameter) and written back to <c>Page</c>.
        /// </para>
        /// <para>
        /// The method then derives the root URL, fills in <c>DestinationSuggestedName</c> when it
        /// is not already set, installs the <c>ShouldScrape</c> and <c>CollectAdditionalLinks</c>
        /// handlers for either the desktop (www.facebook.com) or the mobile (m.facebook.com)
        /// site depending on <c>ScrapeMobileVersion</c>, queues the root URL and stores it in
        /// <c>Root</c>.
        /// </para>
        /// </remarks>
        /// <exception cref="NotSupportedException">
        /// <c>Page</c> is a <c>/profile.php</c> URL that has no <c>id</c> query parameter.
        /// </exception>
        protected override void Initialize()
        {
            // Reference point for converting between DateTime and Unix-style second counts.
            var unixEpoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);

            var p = Page;

            // A full URL is reduced to the identifier used by the rest of the scraper.
            if (p.StartsWith("http:", StringComparison.Ordinal) || p.StartsWith("https:", StringComparison.Ordinal))
            {
                var uu = p.AsUri();
                p = uu.AbsolutePath.Trim('/');

                if (p.StartsWith("groups/", StringComparison.Ordinal))
                {
                    p = "groups/" + uu.GetPathComponent(1);
                }
                else if (uu.AbsolutePath.StartsWith("/profile.php", StringComparison.Ordinal))
                {
                    p = uu.GetQueryParameter("id");
                    if (p == null)
                    {
                        // Without the id there is nothing to identify the profile by; fail with a
                        // clear message instead of a NullReferenceException further down.
                        throw new NotSupportedException("Missing 'id' query parameter in profile URL: " + Page);
                    }
                }

                Page = p;
            }

            // "groups-NAME" is accepted as another spelling of "groups/NAME".
            if (p.StartsWith("groups-", StringComparison.Ordinal))
            {
                p = "groups/" + p.TrimStart("groups-");
            }

            var root = ("https://m.facebook.com/" + p.TrimStart("/")).AsUri();

            if (this.DestinationSuggestedName == null)
            {
                // Prefer the numeric/profile id when the root carries one, otherwise flatten the path.
                this.DestinationSuggestedName = "facebook-" + (root.GetQueryParameter("id") ?? root.AbsolutePath.TrimStart("/").TrimEnd("/").Replace("/", "-"));
                if (!ScrapeMobileVersion)
                {
                    this.DestinationSuggestedName += "-desktop";
                }
            }

            if (!ScrapeMobileVersion)
            {
                DatabaseInitialized += () =>
                {
                    if (fburl != null && fburl.Contains("group/"))
                    {
                        InitGroup();
                    }
                };

                // URLs containing "/ajax/" get a 1000-point lower value; otherwise the value is the URL length.
                UrlPriorityDelegate = x => (x.Contains("/ajax/") ? -1000 : 0) + x.Length;

                // Desktop mode crawls the same path and query on www.facebook.com.
                root = ("https://www.facebook.com" + root.PathAndQuery).AsUri();

                this.ShouldScrape = (url, prereq) =>
                {
                    // Scripts and stylesheets served from fbcdn.net are never downloaded.
                    if (url.IsHostedOn("fbcdn.net") &&
                        (url.AbsolutePath.EndsWith(".js", StringComparison.Ordinal) || url.AbsolutePath.EndsWith(".css", StringComparison.Ordinal)))
                    {
                        return false;
                    }

                    if (prereq && DownloadThumbnailsAndCss)
                    {
                        return true;
                    }
                    if (Utils.Equals(url, root))
                    {
                        return true;
                    }
                    if (url.IsHostedOn("facebook.com") && url.HasExactlyQueryParameters("id") && url.GetQueryParameter("id") == Page)
                    {
                        return true;
                    }
                    if (url.Fragment.StartsWith("#$", StringComparison.Ordinal))
                    {
                        if (url.PathContains("GroupEntstreamPagelet"))
                        {
                            return true;
                        }
                        if (url.PathContains("/ProfileTimelineSectionPagelet"))
                        {
                            return true;
                        }
                    }
                    if (url.PathStartsWith("/pages_reaction_units"))
                    {
                        return true;
                    }
                    return false;
                };

                this.AddToCrawl(root);

                NonHtmlReceived += (url, easy, body) =>
                {
                    if (url.PathStartsWith("/pages_reaction_units"))
                    {
                        ProcessPageReactionUnitsRequest(url, body);
                    }
                };

                // First handler: runs its logic only for the root page (or its "?id=" form). It reads
                // the page's al:android:url value into fburl and queues the initial URLs for the
                // group, page or profile that value names.
                CollectAdditionalLinks += (url, page) =>
                {
                    if (Stop)
                    {
                        return null;
                    }
                    if (HttpUtils.UrisEqual(url, root) || (url.IsHostedOn("facebook.com") && url.HasExactlyQueryParameters("id") && url.GetQueryParameter("id") == Page))
                    {
                        fburl = page.TryGetValue(":property('al:android:url')");

                        // Marker file written when the value is missing and removed when it is present.
                        var sentinel = Path.Combine(DestinationDirectory, url.AbsolutePath.StartsWith("/groups/", StringComparison.Ordinal) ? "fbgroup-not-a-member" : "fberror");
                        if (fburl == null)
                        {
                            File.WriteAllText(sentinel, string.Empty, Encoding.ASCII);
                            return Enumerable.Empty<(Uri, Boolean)>();
                        }
                        File.Delete(sentinel);

                        // Trailing run of digits in fburl (an optional query string may follow it).
                        fbNumericId = fburl.Capture(@"/(\d+)(\?.*)?$");

                        if (fburl.Contains("group/"))
                        {
                            InitGroup();
                        }
                        else if (fburl.Contains("page/"))
                        {
                            var initialUrl = ReadFromExample("fbpage");

                            initialUrl.AppendQueryParameter("page_id", fbNumericId);
                            if (UpdateUpTo != null)
                            {
                                // Machine-readable value placed in a URL: format it culture-independently.
                                refreshId = DateTime.UtcNow.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture);
                                initialUrl.AppendQueryParameter("x-refresh", refreshId);
                            }

                            AddToCrawl(initialUrl.Url);
                        }
                        else if (fburl.Contains("profile/"))
                        {
                            AddToCrawl(root);

                            AddToCrawl(root + "/about?section=overview", true);
                            AddToCrawl(root + "/about?section=education", true);
                            AddToCrawl(root + "/about?section=living", true);
                            AddToCrawl(root + "/about?section=contact-info", true);
                            AddToCrawl(root + "/about?section=relationship", true);
                            AddToCrawl(root + "/about?section=bio", true);
                            AddToCrawl(root + "/about?section=year-overviews", true);
                            AddToCrawl(root + "/likes", true);
                            AddToCrawl(root + "/photos", true);
                            AddToCrawl(root + "/friends", true);

                            // Try the jscc_map location first, then fall back to a plain pagelet_token entry.
                            var pageletToken = page.TryGetValue(@"script:json-token('jscc_map:'):json-token('£pagelet_token')");
                            if (pageletToken == null)
                            {
                                pageletToken = page.GetValue("script:json-token('pagelet_token:')");
                            }

                            // The first timeline segment starts at the current time.
                            var nowSeconds = (long)((DateTime.UtcNow - unixEpoch).TotalSeconds);
                            AddFacebookProfileSegment(nowSeconds, fbNumericId, pageletToken);
                        }
                        else
                        {
                            throw new NotSupportedException("Not supported: " + fburl);
                        }
                    }
                    return null;
                };

                // Second handler: extracts links from the pagelets of every page, tracks article
                // dates, and either queues the next profile segment or sets Stop once the limit
                // date is reached.
                CollectAdditionalLinks += (url, page) =>
                {
                    if (Stop)
                    {
                        return null;
                    }

                    // fburl is null when the root page did not expose al:android:url (the first
                    // handler records that case with a marker file). The page kind is unknown then,
                    // so contribute no links rather than fail with a NullReferenceException.
                    var currentFbUrl = fburl;
                    if (currentFbUrl == null)
                    {
                        return null;
                    }

                    var html =
                        page.OwnerDocument.IsJson() ?
                        page.FindAll(Configuration_PageReactionUnitsSelector) :
                        page.FindAll(Configuration_PageletExtractionSelector).Concat(new[] { RehydrateHtml(page) });

                    // Filled as a side effect while the link query below is materialized by ToList().
                    var dates = new List<long>();
                    var zzz = html.SelectMany(x =>
                    {
                        // NOTE(review): on profile pages the [data-ftr] dates appear to be collected
                        // both here and in the descendant walk below, so they end up in `dates`
                        // twice. That affects the Skip(1) further down; confirm it is intended.
                        if (currentFbUrl.Contains("profile/"))
                        {
                            var articles = x.FindAll("[data-ftr]");
                            foreach (var article in articles)
                            {
                                var ut = GetRootArticleDate(article);
                                if (ut != null)
                                {
                                    dates.Add(ut.Value);
                                }
                            }
                        }

                        var zz = x.DescendantsAndSelf().Select(y =>
                        {
                            if (y.GetAttributeValue("data-ftr") != null)
                            {
                                var ut = GetRootArticleDate(y);
                                if (ut != null)
                                {
                                    dates.Add(ut.Value);
                                }
                            }

                            // Links containing "pages_reaction_unit" are not returned from here.
                            var z = y.TryGetLinkUrl();
                            if (z != null && !z.Contains("pages_reaction_unit"))
                            {
                                // The flag is true for img/script elements and rel="stylesheet" links.
                                return (Url: z, y.TagName.In("img", "script") || y.GetAttributeValue("rel")?.ToLowerFast() == "stylesheet");
                            }
                            return (null, false);
                        }).Where(y => y.Url != null).ToList();

                        return zz;
                    }).ToList();

                    // With more than one date the first collected one is left out of the minimum.
                    // NOTE(review): presumably to ignore a pinned post; confirm.
                    var mindate = dates.Count == 0 ? (long?)null : dates.Count == 1 ? dates[0] : dates.Skip(1).Min();
                    if (mindate != null)
                    {
                        SetMainProgressStatus(unixEpoch.AddSeconds(mindate.Value).ToString("MMM d, yyyy H:mm"));
                    }

                    var mustStop = mindate != null && IsBeforeLimitDate(mindate.Value);
                    if (currentFbUrl.Contains("/profile"))
                    {
                        // Profiles are paged by time: queue the segment that ends just before the
                        // oldest date seen, unless the limit date has already been reached.
                        if (!mustStop && mindate != null)
                        {
                            var pageletToken = html.Select(x => x.TryGetValue("script:json-token('£jscc_map'):json-token('£pagelet_token')")).FirstOrDefault(x => x != null);
                            if (pageletToken != null)
                            {
                                AddFacebookProfileSegment(mindate.Value - 1, fbNumericId, pageletToken);
                            }
                        }
                    }
                    else if (mustStop)
                    {
                        Stop = true;
                    }

                    return zzz;
                };
            }
            else
            {
                // Mobile mode: follow the "load more" links after rewriting their query strings.
                this.CollectAdditionalLinks += (url, page) =>
                {
                    return page.FindAll("a[href*='sectionLoadingID='],a[href*='bacr']:text-is('See More Posts')")
                        .Select(x => x.TryGetLinkUrl())
                        // TryGetLinkUrl can return null; skip such anchors instead of dereferencing null.
                        .Where(u => u != null)
                        .Select(u =>
                        {
                            var sb = ReseekableStringBuilder.AcquirePooledStringBuilder();
                            sb.Append(u.GetLeftPart(UriPartial.Path));

                            // Selected parameters are overwritten with fixed values; all others are kept.
                            HttpUtils.AppendQueryParameters(u.GetQueryParameters().Select(y =>
                            {
                                var val = y.Value;
                                switch (y.Key)
                                {
                                    case "timecutoff":
                                    case "refid":
                                        val = "0";
                                        break;
                                    case "sectionLoadingID":
                                        val = "x";
                                        break;
                                    case "yearSectionsYears":
                                        val = "";
                                        break;
                                }
                                return new KeyValuePair<string, string>(y.Key, val);
                            }), sb);

                            // "srw" marks the rewritten URLs; ShouldScrape below accepts URLs carrying it.
                            HttpUtils.AppendQueryParameter("srw", "1", sb);
                            var rewritten = ReseekableStringBuilder.GetValueAndRelease(sb);
                            return (rewritten.AsUri(), false);
                        });
                };

                this.ShouldScrape = (url, isPrerequisite) =>
                {
                    if (DownloadThumbnailsAndCss && isPrerequisite)
                    {
                        return true;
                    }
                    if (url == root)
                    {
                        return true;
                    }
                    if (url.IsHostedOn("facebook.com") && url.HasExactlyQueryParameters("id") && url.GetQueryParameter("id") == Page)
                    {
                        return true;
                    }
                    if (!url.IsHostedOn("m.facebook.com"))
                    {
                        return false;
                    }
                    if (url.GetQueryParameter("srw") != null)
                    {
                        return true;
                    }

                    if (DownloadComments || DownloadFullSizeImages)
                    {
                        if (UrlRuleMatcher.IsSubfolderOf(url, root))
                        {
                            if (url.HasNoQueryParameters() && string.IsNullOrEmpty(url.Fragment))
                            {
                                return true;
                            }

                            // Reject the decorated URL but queue its bare-path form instead.
                            AddToCrawl(url.GetLeftPart(UriPartial.Path));
                            return false;
                        }
                    }

                    return false;
                };

                this.AddToCrawl(root);
            }

            this.Root = root;
        }