예제 #1
0
        /// <summary>
        /// Crawls the Goodreads review-widget iframe for <paramref name="book"/> and
        /// dumps the discovered reviews via <see cref="ReviewIFrameDumperStep"/>.
        /// Loads the ids of already-stored reviews first so duplicates can be skipped.
        /// </summary>
        /// <param name="parentForm">Owning form, kept for progress reporting.</param>
        /// <param name="book">Book whose ISBN is embedded in the widget request URL.</param>
        public static void Run(frmMain parentForm, Book book)
        {
            form = parentForm;
            IsolatedStorageModule.Setup(false);

            currentBook = book;

            // Fix: getNewContext() hands back a fresh data context each call; the
            // original code never disposed it, leaking the underlying connection.
            // (Assumes the context is IDisposable, as EF-style contexts are — confirm.)
            using (var db = CrawlUtil.getNewContext())
            {
                existingReviewIds = db.Reviews.Where(r => r.bookId == currentBook.id).Select(r => r.id).ToList();
            }

            baseUri = "http://www.goodreads.com/api/reviews_widget_iframe?did=DEVELOPER_ID&format=html&isbn=" + book.isbn + "&links=660&min_rating=&review_back=fff&stars=000&text=000";

            c = new Crawler(new Uri(baseUri),
                            new HtmlDocumentProcessor(), // Process html
                            new ReviewIFrameDumperStep());

            // Custom step to visualize crawl
            c.MaximumThreadCount = 1; //** 2012-09-03 changed this from 2 to 1 in hopes that it'll fix the unknown (seemingly) random crashes.
            c.MaximumCrawlDepth  = 1;
            c.ExcludeFilter      = CrawlUtil.ExtensionsToSkip;

            c.AdhereToRobotRules = false;

            // Begin crawl
            c.Crawl();
        }
예제 #2
0
        /// <summary>
        /// Demo crawl of http://ncrawler.codeplex.com using isolated storage:
        /// two threads, maximum crawl depth 10, and a four-step pipeline
        /// (HTML parsing, PDF text extraction, language detection, console dump).
        /// </summary>
        public static void Run()
        {
            IsolatedStorageModule.Setup(false);
            Console.Out.WriteLine("Simple crawl demo using IsolatedStorage");

            // Pipeline:
            //  1. HtmlDocumentProcessor              - parses html; extracts links, text and more
            //  2. iTextSharpPdfProcessor             - extracts text from PDF files
            //  3. GoogleLanguageDetection            - guesses page language from the extracted text
            //  4. DumperStep                         - custom step that dumps results to the console
            var crawler = new Crawler(new Uri("http://ncrawler.codeplex.com"),
                                      new HtmlDocumentProcessor(),
                                      new iTextSharpPdfProcessor.iTextSharpPdfProcessor(),
                                      new GoogleLanguageDetection(),
                                      new DumperStep());

            using (crawler)
            {
                crawler.MaximumThreadCount = 2;
                crawler.MaximumCrawlDepth = 10;
                crawler.ExcludeFilter = Program.ExtensionsToSkip;

                // Begin crawl
                crawler.Crawl();
            }
        }
예제 #3
0
        /// <summary>
        /// Crawls the Amazon review pages for <paramref name="book"/>, collecting
        /// rating statistics and/or individual reviews depending on the flags.
        /// </summary>
        /// <param name="parentForm">Owning form, kept for progress reporting.</param>
        /// <param name="book">Book whose review pages are crawled.</param>
        /// <param name="getRatingStats">When true, rating statistics are gathered.</param>
        /// <param name="getReviews">When true, review text is gathered.</param>
        public static void Run(MainForm parentForm, Book book, bool getRatingStats, bool getReviews)
        {
            form = parentForm;
            IsolatedStorageModule.Setup(false);

            GetRatingStats = getRatingStats;
            GetReviews = getReviews;
            currentBook = book;
            baseUri = book.reviewPageURL;

            // Example review-page URLs:
            //   140185852  We
            //   http://www.amazon.com/We-Yevgeny-Zamyatin/product-reviews/0140185852
            //   http://www.amazon.com/We-Yevgeny-Zamyatin/product-reviews/0140185852/ref=cm_cr_pr_btm_link_4?pageSize=50&pageNumber=4&sortBy=recent
            //   618260307  The Hobbit  http://www.amazon.com/The-Hobbit-J-R-R-Tolkien/product-reviews/0618260307
            // Jump straight to a 50-reviews-per-page listing, page 1.
            baseUri += "/ref=cm_cr_pr_btm_link_1?pageSize=50&pageNumber=1";

            if (!currentBook.reviewPageURL.Contains("/ref=cm_cr_pr_btm_link"))
            {
                currentBook.reviewPageURL = baseUri; //hack to make isFirstPage() work [2016-02-04]
            }

            c = new Crawler(new Uri(baseUri),
                            new HtmlDocumentProcessor(), // Process html
                            new ReviewPageProcessStep(),
                            new SaveFileStep())
            {
                // Custom step to visualize crawl
                MaximumThreadCount = 1,
                MaximumCrawlDepth = 1,
                ExcludeFilter = CrawlUtil.ExtensionsToSkip,
                AdhereToRobotRules = false
            };

            c.BeforeDownload += c_BeforeDownload;

            // Pick a user agent that has not been blocked yet; if every agent is
            // blocked, reset the tracker and draw again.
            string userAgent = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
            if (userAgent == null)
            {
                UserAgentTracker = CrawlUtil.InitUserAgentTracker();
                userAgent = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
            }
            c.UserAgent = userAgent;

            // Begin crawl
            c.Crawl();
        }
예제 #4
0
        /// <summary>
        /// Crawls the Goodreads "Best Books Ever" list, starting at the page
        /// given by <paramref name="fromPage"/> (via the FromPage property).
        /// </summary>
        /// <param name="parentForm">Owning form, kept for progress reporting.</param>
        /// <param name="fromPage">First list page to crawl.</param>
        /// <param name="toPage">Last list page to crawl.</param>
        public static void Run(frmMain parentForm, int fromPage, int toPage)
        {
            form = parentForm;
            _fromPage = fromPage;
            _toPage = toPage;
            IsolatedStorageModule.Setup(false);

            var startUri = new Uri("http://www.goodreads.com/list/show/1.Best_Books_Ever?page=" + FromPage);

            c = new Crawler(startUri,
                            new HtmlDocumentProcessor(), // Process html
                            new DumperStep())            // custom step to visualize crawl
            {
                MaximumThreadCount = 2,
                MaximumCrawlDepth = 1,
                ExcludeFilter = CrawlUtil.ExtensionsToSkip,
                AdhereToRobotRules = false
            };

            // Begin crawl
            c.Crawl();
        }
예제 #5
0
        /// <summary>
        /// Crawls the detail page of <paramref name="book"/>, optionally collecting
        /// book details/author information and sales ranks.
        /// </summary>
        /// <param name="parentForm">Owning form, kept for progress reporting.</param>
        /// <param name="book">Book whose detail page URL is crawled.</param>
        /// <param name="getDetailsAndAuthor">When true, details and author data are gathered.</param>
        /// <param name="getRanks">When true, rank data is gathered.</param>
        public static void Run(MainForm parentForm, Book book, bool getDetailsAndAuthor, bool getRanks)
        {
            form = parentForm;
            IsolatedStorageModule.Setup(false);

            GetDetailsAndAuthor = getDetailsAndAuthor;
            GetRanks = getRanks;
            currentBook = book;
            baseUri = book.detailPageURL;

            // The stored URL may contain characters that need escaping before use.
            var startUri = new Uri(Uri.EscapeUriString(baseUri));

            c = new Crawler(startUri,
                            new HtmlDocumentProcessor(), // Process html
                            new DetailPageDumperStep())
            {
                // Custom step to visualize crawl
                MaximumThreadCount = 1,
                MaximumCrawlDepth = 1,
                ExcludeFilter = CrawlUtil.ExtensionsToSkip,
                AdhereToRobotRules = false
            };

            // Pick a user agent that has not been blocked yet; if every agent is
            // blocked, reset the tracker and draw again.
            string userAgent = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
            if (userAgent == null)
            {
                UserAgentTracker = CrawlUtil.InitUserAgentTracker();
                userAgent = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
            }
            c.UserAgent = userAgent;

            // Begin crawl
            c.Crawl();
        }
예제 #6
0
        /// <summary>
        /// Crawls the detail page of <paramref name="author"/> and dumps the result
        /// via <see cref="AuthorDetailPageDumperStep"/>.
        /// </summary>
        /// <param name="parentForm">Owning form, kept for progress reporting.</param>
        /// <param name="author">Author whose detail page URL is crawled.</param>
        public static void Run(MainForm parentForm, Author author)
        {
            form = parentForm;
            IsolatedStorageModule.Setup(false);

            currentAuthor = author;
            baseUri = author.detailPageURL;

            c = new Crawler(new Uri(baseUri),
                            new HtmlDocumentProcessor(), // Process html
                            new AuthorDetailPageDumperStep())
            {
                // Custom step to visualize crawl
                MaximumThreadCount = 1,
                MaximumCrawlDepth = 1,
                ExcludeFilter = CrawlUtil.ExtensionsToSkip,
                AdhereToRobotRules = false,
                // Fixed desktop-Chrome user agent string.
                UserAgent = @"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.4"
            };

            // Begin crawl
            c.Crawl();
        }