Example #1
        public static void Run(SavePageForm parentForm, string url)
        {
            Form = parentForm;
            Url  = url;

            c = new Crawler(new Uri(url),
                            new HtmlDocumentProcessor(), // Process html
                            new SaveFileStep());

            c.MaximumThreadCount = 1;
            c.MaximumCrawlDepth  = 1;
            c.ExcludeFilter      = CrawlUtil.ExtensionsToSkip;

            c.AdhereToRobotRules = false;
            c.CrawlFinished     += new EventHandler<NCrawler.Events.CrawlFinishedEventArgs>(c_CrawlFinished);

            string ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);

            // if there are no unblocked user agents left, reset the tracker and retry
            if (ua == null)
            {
                UserAgentTracker = CrawlUtil.InitUserAgentTracker();
                ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
            }
            c.UserAgent = ua;

            // Begin crawl
            c.Crawl();
        }
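
The c_CrawlFinished handler wired up above is not part of this listing. A minimal sketch of what it might look like, assuming it only needs to notify the parent form that the crawl is done (the body is illustrative, not from the source):

        static void c_CrawlFinished(object sender, NCrawler.Events.CrawlFinishedEventArgs e)
        {
            // Illustrative only: marshal back to the UI thread and report completion.
            // The real handler may also inspect e for errors or crawl statistics.
            if (Form != null && !Form.IsDisposed)
            {
                Form.BeginInvoke((Action)(() => Form.Text = "Crawl finished: " + Url));
            }
        }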
        public static void Run(MainForm parentForm, Book book, bool getRatingStats, bool getReviews)
        {
            form = parentForm;
            IsolatedStorageModule.Setup(false);

            GetRatingStats = getRatingStats;
            GetReviews     = getReviews;
            currentBook    = book;
            baseUri        = book.reviewPageURL;

            /*
             * Example review-page URLs (ISBN, title, URL):
             * 0140185852  We (Yevgeny Zamyatin)
             *     http://www.amazon.com/We-Yevgeny-Zamyatin/product-reviews/0140185852
             *     http://www.amazon.com/We-Yevgeny-Zamyatin/product-reviews/0140185852/ref=cm_cr_pr_btm_link_4?pageSize=50&pageNumber=4&sortBy=recent
             *
             * 0618260307  The Hobbit (J.R.R. Tolkien)
             *     http://www.amazon.com/The-Hobbit-J-R-R-Tolkien/product-reviews/0618260307
             */
            baseUri += "/ref=cm_cr_pr_btm_link_1?pageSize=50&pageNumber=1";

            if (!currentBook.reviewPageURL.Contains("/ref=cm_cr_pr_btm_link"))
            {
                currentBook.reviewPageURL = baseUri; // hack to make isFirstPage() work [2016-02-04]; see the isFirstPage sketch after this method
            }

            c = new Crawler(new Uri(baseUri),
                            new HtmlDocumentProcessor(), // Process html
                            new ReviewPageProcessStep(),
                            new SaveFileStep());

            // Keep the crawl shallow: a single thread, one level deep
            c.MaximumThreadCount = 1;
            c.MaximumCrawlDepth  = 1;
            c.ExcludeFilter      = CrawlUtil.ExtensionsToSkip;

            c.AdhereToRobotRules = false;
            c.BeforeDownload    += new EventHandler<NCrawler.Events.BeforeDownloadEventArgs>(c_BeforeDownload);

            string ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);

            // if there are no unblocked user agents left, reset the tracker and retry
            if (ua == null)
            {
                UserAgentTracker = CrawlUtil.InitUserAgentTracker();
                ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
            }
            c.UserAgent = ua;

            // Begin crawl
            c.Crawl();
        }
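
isFirstPage() is referenced in the hack comment above but not shown. One plausible reading, assuming it simply compares a page's URL against the book's stored review-page URL (a hypothetical helper; the URL rewrite in Run() is what keeps this comparison true for the pageNumber=1 form):

        // Hypothetical: true when the given URL is the book's first review page.
        static bool isFirstPage(string url)
        {
            return url == currentBook.reviewPageURL;
        }

The c_BeforeDownload handler is also defined elsewhere. A minimal sketch, assuming it only throttles requests between review pages; the one-second delay is an assumption, and no NCrawler.Events.BeforeDownloadEventArgs members are relied on:

        static void c_BeforeDownload(object sender, NCrawler.Events.BeforeDownloadEventArgs e)
        {
            // Illustrative only: pause briefly before each download so review
            // pages are fetched politely; the real handler may instead adjust
            // the pending request or skip certain crawl steps.
            System.Threading.Thread.Sleep(1000);
        }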
        public static void Run(MainForm parentForm, Book book, bool getDetailsAndAuthor, bool getRanks)
        {
            form = parentForm;
            IsolatedStorageModule.Setup(false);

            GetDetailsAndAuthor = getDetailsAndAuthor;
            GetRanks            = getRanks;
            currentBook         = book;
            baseUri             = book.detailPageURL;

            Uri u = new Uri(Uri.EscapeUriString(baseUri));

            c = new Crawler(u,
                            new HtmlDocumentProcessor(), // Process html
                            new DetailPageDumperStep());

            // Keep the crawl shallow: a single thread, one level deep
            c.MaximumThreadCount = 1;
            c.MaximumCrawlDepth  = 1;
            c.ExcludeFilter      = CrawlUtil.ExtensionsToSkip;

            c.AdhereToRobotRules = false;

            string ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);

            // if there are no unblocked user agents left, reset the tracker and retry
            if (ua == null)
            {
                UserAgentTracker = CrawlUtil.InitUserAgentTracker();
                ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
            }
            c.UserAgent = ua;

            // Begin crawl
            c.Crawl();
        }
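
All three overloads rotate user agents through the same CrawlUtil helpers. A minimal sketch of the two used above, assuming the tracker is a Dictionary<string, bool> that maps each user-agent string to a blocked flag; the dictionary shape and the agent strings are assumptions, and ExtensionsToSkip is omitted because its NCrawler filter type is not visible in this listing:

        // Hypothetical sketch of the CrawlUtil helpers used above
        // (requires using System, System.Collections.Generic and System.Linq).
        public static class CrawlUtil
        {
            static readonly Random rng = new Random();

            // Assumed shape: user-agent string -> has this agent been blocked?
            public static Dictionary<string, bool> InitUserAgentTracker()
            {
                return new Dictionary<string, bool>
                {
                    { "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36", false },
                    { "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75", false }
                };
            }

            // Returns a random agent not yet marked blocked, or null when every
            // agent has been blocked (the callers above then reset and retry).
            public static string GetRandomUnblockedUserAgent(Dictionary<string, bool> tracker)
            {
                List<string> unblocked = tracker.Where(kv => !kv.Value)
                                                .Select(kv => kv.Key)
                                                .ToList();
                return unblocked.Count == 0 ? null : unblocked[rng.Next(unblocked.Count)];
            }
        }

With this shape, whichever pipeline step detects a block page or CAPTCHA would set tracker[agent] = true; that is what makes the null check and reset in each Run() overload meaningful.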