/// <summary>
/// Creates a <see cref="SqlDocumentStateService"/> that keeps the supplied database
/// and document-state mapper, then calls <c>SetupOnce()</c> on the database.
/// </summary>
/// <param name="AsgardDatabase">Database stored in <c>_AsgardDatabase</c>; <c>SetupOnce()</c> is invoked on it.</param>
/// <param name="documentStateMapper">Mapper stored in <c>_documentStateMapper</c>.</param>
public SqlDocumentStateService(
    AsgardDatabase AsgardDatabase,
    IMapper<DocumentState, DocumentStateModel> documentStateMapper)
{
    this._documentStateMapper = documentStateMapper;
    this._AsgardDatabase = AsgardDatabase;

    // Setup is triggered on the incoming instance once both fields are populated.
    AsgardDatabase.SetupOnce();
}
/// <summary>
/// Creates a <see cref="SqlQueueService"/> that keeps the supplied database and
/// queue-item mapper, then calls <c>SetupOnce()</c> on the stored database.
/// </summary>
/// <param name="AsgardDatabase">Database stored in <c>_AsgardDatabase</c>.</param>
/// <param name="queueMapper">Mapper stored in <c>_queueMapper</c>.</param>
public SqlQueueService(
    AsgardDatabase AsgardDatabase,
    IMapper<QueueItem, QueueItemModel> queueMapper)
{
    this._queueMapper = queueMapper;
    this._AsgardDatabase = AsgardDatabase;

    // Setup runs through the stored field, after both fields are populated.
    this._AsgardDatabase.SetupOnce();
}
/// <summary>
/// Builds a <c>WebCrawlerConnector</c> from SQL-backed queue, document-state and state
/// services plus a <c>ParallelPageService</c>, then runs one <c>ExecuteFetch</c> for the
/// job "Test_cwablog" starting at http://blog.cwa.me.uk.
/// </summary>
/// <remarks>
/// NOTE(review): the value returned by <c>ExecuteFetch</c> is never asserted on, so this
/// only fails if something throws. It presumably also needs a reachable database and
/// network access - confirm before relying on it in CI.
/// </remarks>
public void Can_Crawl_Blog()
{
    // Persistence: the queue and document-state services share one AsgardDatabase.
    var sqlSettings = new SqlSettings();
    var asgardDb = new AsgardDatabase(sqlSettings);
    var queue = new SqlQueueService(asgardDb, new QueueItemMapper());
    var documentStates = new SqlDocumentStateService(asgardDb, new DocumentStateMapper());

    // Page pipeline: scrapers, converters, downloader and handlers.
    var scrapers = new ILinkScraper[]
    {
        new AgilityPackHrefLinkScraper()
    };
    var converters = new IPageConverter[]
    {
        new WebPageConverter()
    };
    var downloader = new HttpClientDownloadHandler();
    var handlers = new IPageHandler[]
    {
        new BinaryPageHandler(),
        new NotAuthorizedPageHandler(),
        new NotFoundPageHandler(),
        new WebPageHandler()
    };
    var pages = new ParallelPageService(scrapers, converters, downloader, handlers);

    // State storage uses the Bifrost connection taken from the same SQL settings.
    var commonSettings = new CommonDatabaseSettings
    {
        CommonConnection = sqlSettings.BifrostConnection
    };
    var stateDb = new StateDatabase(commonSettings);
    var states = new SqlStateService(stateDb);

    var crawler = new WebCrawlerConnector(queue, documentStates, pages, states);

    var jobConfiguration = new WebConnectorJobConfiguration
    {
        StartUrl = "http://blog.cwa.me.uk",
        JobName = "Test_cwablog",
        NumberOfPagesPerExecution = 10,
        Credential = null,
        DefaultVerifyFrequency = new Frequency { Days = 1 },
        Depth = 2,
        LinkFilter = new LinkFilter { StayOnHost = true },
        PageFilter = new PageFilter { ExcludeBinaryPages = true }
    };

    var fetchResult = crawler.ExecuteFetch(jobConfiguration);
}
/// <summary>
/// Creates a <see cref="SqlConsumerService"/> that keeps the supplied database and
/// calls <c>SetupOnce()</c> on it.
/// </summary>
/// <param name="AsgardDatabase">Database stored in <c>_AsgardDatabase</c>; <c>SetupOnce()</c> is invoked on it.</param>
public SqlConsumerService(AsgardDatabase AsgardDatabase)
{
    this._AsgardDatabase = AsgardDatabase;

    // Setup is triggered on the incoming instance after it has been stored.
    AsgardDatabase.SetupOnce();
}