/// <summary>
/// Builds the exported pages and table of contents into the specified destination folder.
/// </summary>
/// <param name="destinationFolder">The destination folder.</param>
/// <param name="baseUrl">The base URL.</param>
public void Build(string destinationFolder, string baseUrl)
{
    generatedFiles.Clear();
    this.baseUrl = baseUrl;
    this.destinationFolder = destinationFolder;
    rootFilePath = Path.Combine(destinationFolder, DefaultPage);

    // Start from a clean output directory.
    DirectoryHelper.Delete(destinationFolder);
    Directory.CreateDirectory(destinationFolder);

    downloader = new PageDownloader(destinationFolder, new Uri(baseUrl));
    downloader.Processor = ClearHtml;

    // Render the root-level categories into the folder tree page.
    folderTree = new StringBuilder(defaultHeader);
    IList<Category> categories = categoryManager.GetRootLevel();
    foreach (var category in categories)
    {
        ExportCategory(tocFile.ChildNodes, category, 0);
    }
    folderTree.AppendLine(defaultFooter);

    File.WriteAllText(rootFilePath, folderTree.ToString());
    generatedFiles.Add(rootFilePath);
    CreateJavascripts();
}
public void Should_crawl_website()
{
    var configuration = new Configuration();
    var pageDownloader = new PageDownloader(configuration);
    var htmlParser = new HtmlParser();
    var pageCrawler = new SinglePageCrawler(htmlParser, pageDownloader);
    var documentStore = new DocumentStoreInitializer("http://localhost:8080", "NetCrawler").DocumentStore;
    var persister = new RavenDbCrawlPersister(documentStore);
    var urlHasher = new UrlHasher();
    var crawlUrlRepository = new InMemoryCrawlUrlRepository();
    var websiteCrawler = new WebsiteCrawler(new CrawlScheduler(urlHasher, configuration, pageCrawler, crawlUrlRepository), persister);

    var task = websiteCrawler.RunAsync(new Website { RootUrl = "http://www.karenmillen.com/", MaxConcurrentConnections = 100 });

    // task.Wait(new TimeSpan(0, 10, 0));
    // task.Wait(new TimeSpan(0, 2, 0));
    task.Wait();

    task.Status.ShouldBeEquivalentTo(TaskStatus.RanToCompletion);

    var result = task.Result;
    Console.WriteLine("Crawl completed: {0} urls crawled in {1}", result.NumberOfPagesCrawled, (result.CrawlEnded - result.CrawlStarted).ToString());
}
private async void GetPagesOnStart()
{
    // Download the initial set of pages for the links in the page library.
    var tmp = await PageDownloader.DownloadPagesAsync(PageLibraryManager.GetLinks());
    pages = tmp;
    Log("Initial page download completed.");
    screenSecondary.Visibility = Visibility.Collapsed;
}
public BlogPostRegionLocatorStrategy(IBlogClient blogClient, BlogAccount blogAccount, IBlogCredentialsAccessor credentials, string blogHomepageUrl, PageDownloader pageDownloader)
{
    _blogClient = blogClient;
    _blogAccount = blogAccount;
    _credentials = credentials;
    _blogHomepageUrl = blogHomepageUrl;
    _pageDownloader = pageDownloader;
}
public void Should_extract_links_from_page()
{
    var configuration = new Configuration();
    var pageDownloader = new PageDownloader(configuration);
    var htmlParser = new HtmlParser();
    var crawler = new SinglePageCrawler(htmlParser, pageDownloader);

    var result = crawler.Crawl(new Uri("http://vladpetroff.com"));
}
static void Main()
{
    XmlConfigurator.Configure();
    var log = LogManager.GetLogger(typeof(Program));

    var configuration = new Configuration();
    var pageDownloader = new PageDownloader(configuration);
    var htmlParser = new HtmlParser();
    var pageCrawler = new SinglePageCrawler(htmlParser, pageDownloader);
    var urlHasher = new UrlHasher();

    var documentStore = new DocumentStoreInitializer("http://localhost:8080", "NetCrawler2").DocumentStore;
    // var documentStore = new DocumentStoreInitializer("http://SLB-4B6WZN1:8080", "NetCrawler2").DocumentStore;
    var persister = new RavenDbCrawlPersister(documentStore);

    // var crawlUrlRepository = new InMemoryCrawlUrlRepository();
    var crawlUrlRepository = new RedisCrawlUrlRepository();

    var websiteCrawler = new WebsiteCrawler(new CrawlScheduler(urlHasher, configuration, pageCrawler, crawlUrlRepository), persister);

    var task = websiteCrawler.RunAsync(new[]
    {
        new Website { RootUrl = "http://www.karenmillen.com/", MaxConcurrentConnections = 25 },
        new Website { RootUrl = "http://uk.tommy.com/", MaxConcurrentConnections = 25 },
        new Website { RootUrl = "http://www.houseoffraser.co.uk/", MaxConcurrentConnections = 25 },
        new Website { RootUrl = "http://vladpetroff.com/", MaxConcurrentConnections = 25 },
    });

    var result = task.Result;
    log.InfoFormat("Crawl completed: {0} urls crawled in {1}", result.Sum(x => x.NumberOfPagesCrawled), (result.Max(x => x.CrawlEnded) - result.Min(x => x.CrawlStarted)).ToString());
}
public static void Main(string[] args)
{
    Console.WriteLine("Launch at " + DateTime.Now.ToString());
    Console.WriteLine("Version " + System.Reflection.Assembly.GetExecutingAssembly().GetName().Version.ToString());

    // Decide whether an update check needs to run at all.
    if (!hasToCheckUpdate())
    {
        return;
    }

    // URL bookkeeping.
    UrlManager manager = new UrlManager();

    // Expand the list of candidate URLs.
    UrlExtractor extractor = new UrlExtractor();
    List<string> urls = extractor.ExtractUrls();

    // Keep only the URLs we have not seen before.
    List<string> newUrls = manager.selectNonExists(urls);
    Console.WriteLine(" total: " + urls.Count + ", new: " + newUrls.Count);

    // Download each new URL, pausing between requests.
    PageDownloader downloader = new PageDownloader();
    foreach (string url in newUrls)
    {
        System.Threading.Thread.Sleep(1000);
        Console.WriteLine(" new address: " + url);
        string fileName = downloader.Download(url);
        if (fileName != null)
        {
            manager.addUrl(url);
        }
        else
        {
            Console.WriteLine(" ...fail!!");
        }
    }

    Console.WriteLine("Finish at " + DateTime.Now.ToString());
}
static void Main(string[] args)
{
    var path = Environment.GetFolderPath(Environment.SpecialFolder.Personal);
    var filename = Path.Combine(path, "change.csv");

    // Write the CSV header on first run. (Italian: "CAMBIO CHF/EUR" = CHF/EUR exchange rate,
    // "Giorno e ora" = date and time, "cambio" = rate.)
    if (!File.Exists(filename))
    {
        IEnumerable<string> lines = new List<string>() { "CAMBIO CHF/EUR", "\"Giorno e ora\";\"cambio\"" };
        File.AppendAllLines(filename, lines);
    }

    var _sut = new PageDownloader();
    var page = _sut.DownloadPage();
    var result = page.GetChangeValue();

    File.AppendAllLines(filename, new[] { $"\"{DateTime.Now.ToString("s", CultureInfo.InvariantCulture)}\";\"{result.ToString(CultureInfo.InvariantCulture)}\"" });
}
public TestsSample(PageDownloader sut)
{
    _sut = sut;
}
public TemporaryPostRegionLocatorStrategy(IBlogClient blogClient, BlogAccount blogAccount, IBlogCredentialsAccessor credentials, string blogHomepageUrl, PageDownloader pageDownloader, BlogPostRegionLocatorBooleanCallback promptForTempPost)
    : base(blogClient, blogAccount, credentials, blogHomepageUrl, pageDownloader)
{
    this.containsBlogPosts = promptForTempPost;
}
public RecentPostRegionLocatorStrategy(IBlogClient blogClient, BlogAccount blogAccount, IBlogCredentialsAccessor credentials, string blogHomepageUrl, PageDownloader pageDownloader)
    : base(blogClient, blogAccount, credentials, blogHomepageUrl, pageDownloader)
{
}
public WebBot(ILogger<WebBot> logger, PageDownloader pageDownloader)
{
    this.logger = logger;
    this.pageDownloader = pageDownloader;
}
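The snippets above come from different projects, so each PageDownloader is a different type with its own constructor and methods. As a rough point of reference only, here is a minimal sketch of the kind of surface these usages imply; the class name, constructor, and HttpClient-based body are assumptions for illustration and do not match any of the actual implementations.

using System;
using System.Net.Http;
using System.Threading.Tasks;

// Hypothetical sketch: a minimal page downloader inferred from the usages above.
// None of these members are guaranteed to exist on the real PageDownloader types.
public class SimplePageDownloader
{
    private readonly HttpClient httpClient = new HttpClient();

    // Downloads a single page and returns its HTML, or null on failure.
    public async Task<string> DownloadAsync(string url)
    {
        try
        {
            return await httpClient.GetStringAsync(url);
        }
        catch (HttpRequestException)
        {
            return null;
        }
    }
}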