public static void Run(frmMain parentForm, Book book)
{
    form = parentForm;
    IsolatedStorageModule.Setup(false);

    currentBook = book;
    existingReviewIds = CrawlUtil.getNewContext().Reviews
        .Where(r => r.bookId == currentBook.id)
        .Select(r => r.id)
        .ToList();

    baseUri = "http://www.goodreads.com/api/reviews_widget_iframe?did=DEVELOPER_ID&format=html&isbn="
        + book.isbn
        + "&links=660&min_rating=&review_back=fff&stars=000&text=000";

    c = new Crawler(new Uri(baseUri),
        new HtmlDocumentProcessor(),    // Process html
        new ReviewIFrameDumperStep());  // Custom step to dump the review iframe pages

    // 2012-09-03: changed from 2 to 1 in the hope of fixing the (seemingly) random crashes.
    c.MaximumThreadCount = 1;
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsToSkip;
    c.AdhereToRobotRules = false;

    // Begin crawl
    c.Crawl();
}
public static void Run()
{
    IsolatedStorageModule.Setup(false);

    Console.Out.WriteLine("Simple crawl demo using IsolatedStorage");

    // Set up a crawler for http://ncrawler.codeplex.com
    // with 2 threads, adhering to robot rules, a maximum crawl depth
    // of 10, and 4 pipeline steps:
    //  * Step 1 - The HTML processor: parses and extracts links, text and more from HTML
    //  * Step 2 - Processes PDF files, extracting text
    //  * Step 3 - Tries to determine the page's language from the extracted text, using Google language detection
    //  * Step 4 - Dumps the information to the console; this is a custom step, see the DumperStep class
    using (var c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
        new HtmlDocumentProcessor(),                          // Process html
        new iTextSharpPdfProcessor.iTextSharpPdfProcessor(),  // Add PDF text extraction
        new GoogleLanguageDetection(),                        // Add language detection
        new DumperStep())                                     // Custom step to visualize crawl
    {
        MaximumThreadCount = 2,
        MaximumCrawlDepth = 10,
        ExcludeFilter = Program.ExtensionsToSkip,
    })
    {
        // Begin crawl
        c.Crawl();
    }
}
public static void Run(MainForm parentForm, Book book, bool getRatingStats, bool getReviews)
{
    form = parentForm;
    IsolatedStorageModule.Setup(false);

    GetRatingStats = getRatingStats;
    GetReviews = getReviews;

    currentBook = book;
    baseUri = book.reviewPageURL;

    /*
     * 140185852 We
     * http://www.amazon.com/We-Yevgeny-Zamyatin/product-reviews/0140185852
     * http://www.amazon.com/We-Yevgeny-Zamyatin/product-reviews/0140185852/ref=cm_cr_pr_btm_link_4?pageSize=50&pageNumber=4&sortBy=recent
     *
     * 618260307 The Hobbit
     * http://www.amazon.com/The-Hobbit-J-R-R-Tolkien/product-reviews/0618260307
     */
    baseUri += "/ref=cm_cr_pr_btm_link_1?pageSize=50&pageNumber=1";

    if (!currentBook.reviewPageURL.Contains("/ref=cm_cr_pr_btm_link"))
    {
        currentBook.reviewPageURL = baseUri; // hack to make isFirstPage() work [2016-02-04]
    }

    c = new Crawler(new Uri(baseUri),
        new HtmlDocumentProcessor(),  // Process html
        new ReviewPageProcessStep(),  // Custom step to process review pages
        new SaveFileStep());          // Custom step to save the crawled pages

    c.MaximumThreadCount = 1;
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsToSkip;
    c.AdhereToRobotRules = false;

    c.BeforeDownload += new EventHandler<NCrawler.Events.BeforeDownloadEventArgs>(c_BeforeDownload);

    string ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);

    // If there are no unblocked user agents left, reset the tracker and retry.
    if (ua == null)
    {
        UserAgentTracker = CrawlUtil.InitUserAgentTracker();
        ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
    }

    c.UserAgent = ua;

    // Begin crawl
    c.Crawl();
}
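// The user-agent fallback above (pick a random unblocked agent, reset the tracker once every agent
// is blocked) is repeated verbatim in the detail-page crawler below. A small helper along the lines
// of this hypothetical NextUserAgent() could consolidate it; the sketch assumes only the
// CrawlUtil.GetRandomUnblockedUserAgent and CrawlUtil.InitUserAgentTracker members already used here.
private static string NextUserAgent()
{
    string ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);

    if (ua == null)
    {
        // All agents are marked as blocked: start over with a fresh tracker and pick again.
        UserAgentTracker = CrawlUtil.InitUserAgentTracker();
        ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
    }

    return ua;
}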
public static void Run(frmMain parentForm, int fromPage, int toPage)
{
    form = parentForm;
    _fromPage = fromPage;
    _toPage = toPage;

    IsolatedStorageModule.Setup(false);

    c = new Crawler(new Uri("http://www.goodreads.com/list/show/1.Best_Books_Ever?page=" + FromPage),
        new HtmlDocumentProcessor(),  // Process html
        new DumperStep());            // Custom step to visualize crawl

    c.MaximumThreadCount = 2;
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsToSkip;
    c.AdhereToRobotRules = false;

    // Begin crawl
    c.Crawl();
}
public static void Run(MainForm parentForm, Book book, bool getDetailsAndAuthor, bool getRanks)
{
    form = parentForm;
    IsolatedStorageModule.Setup(false);

    GetDetailsAndAuthor = getDetailsAndAuthor;
    GetRanks = getRanks;

    currentBook = book;
    baseUri = book.detailPageURL;

    Uri u = new Uri(Uri.EscapeUriString(baseUri));

    c = new Crawler(u,
        new HtmlDocumentProcessor(),  // Process html
        new DetailPageDumperStep());  // Custom step to dump detail pages

    c.MaximumThreadCount = 1;
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsToSkip;
    c.AdhereToRobotRules = false;

    string ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);

    // If there are no unblocked user agents left, reset the tracker and retry.
    if (ua == null)
    {
        UserAgentTracker = CrawlUtil.InitUserAgentTracker();
        ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
    }

    c.UserAgent = ua;

    // Begin crawl
    c.Crawl();
}
public static void Run(MainForm parentForm, Author author)
{
    form = parentForm;
    IsolatedStorageModule.Setup(false);

    currentAuthor = author;
    baseUri = author.detailPageURL;

    c = new Crawler(new Uri(baseUri),
        new HtmlDocumentProcessor(),        // Process html
        new AuthorDetailPageDumperStep());  // Custom step to dump author detail pages

    c.MaximumThreadCount = 1;
    c.MaximumCrawlDepth = 1;
    c.ExcludeFilter = CrawlUtil.ExtensionsToSkip;
    c.AdhereToRobotRules = false;

    c.UserAgent = @"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/537.4";

    // Begin crawl
    c.Crawl();
}
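// The custom steps referenced throughout (DumperStep, ReviewIFrameDumperStep, ReviewPageProcessStep,
// SaveFileStep, DetailPageDumperStep, AuthorDetailPageDumperStep) are not shown in this section.
// As a rough sketch only: an NCrawler pipeline step implements IPipelineStep and receives each
// downloaded page via a PropertyBag. The class name below (ConsoleDumperStep) is hypothetical, and
// the members used (propertyBag.Step.Uri, propertyBag.Title, propertyBag.Text) are assumed from the
// NCrawler demo code; they may differ in the NCrawler version this project builds against.
using System;
using NCrawler;
using NCrawler.Interfaces;

public class ConsoleDumperStep : IPipelineStep
{
    public void Process(Crawler crawler, PropertyBag propertyBag)
    {
        // Called once per downloaded page, after earlier steps (e.g. HtmlDocumentProcessor)
        // have populated the property bag with the parsed title and text.
        Console.Out.WriteLine("Url:   {0}", propertyBag.Step.Uri);
        Console.Out.WriteLine("Title: {0}", propertyBag.Title);
        Console.Out.WriteLine("Text:  {0} characters", (propertyBag.Text ?? string.Empty).Length);
    }
}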