public MvcContrib.Pagination.IPagination<CrawlerSessionViewModel> GetPagedList(int page, int size)
{
    CrawlerSession crawlerSessionAlias = null;
    CrawlerSettings settingsAlias = null;

    var query = Session.QueryOver<CrawlerSession>(() => crawlerSessionAlias)
        .JoinAlias(() => crawlerSessionAlias.Settings, () => settingsAlias, JoinType.LeftOuterJoin)
        .OrderBy(x => x.DateTime).Asc;

    // Issue the row-count query and the page query together via NHibernate Futures,
    // so both run in a single round trip when totalCount.Value is first accessed.
    var count = query.ToRowCountQuery();
    var totalCount = count.FutureValue<int>();

    var firstResult = (page - 1) * size;

    CrawlerSessionViewModel viewModel = null;
    var viewModels = query.SelectList(list => list
            .Select(x => x.Id).WithAlias(() => viewModel.Id)
            .Select(x => x.Title).WithAlias(() => viewModel.Title)
            .Select(x => x.DateTime).WithAlias(() => viewModel.DateTime)
            .Select(x => x.StartUrl).WithAlias(() => viewModel.StartUrl))
        .TransformUsing(Transformers.AliasToBean(typeof(CrawlerSessionViewModel)))
        .Skip(firstResult)
        .Take(size)
        .Future<CrawlerSessionViewModel>();

    return new CustomPagination<CrawlerSessionViewModel>(viewModels, page, size, totalCount.Value);
}
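The AliasToBean transformer above populates a flat view model by matching each WithAlias name to a settable property. The class itself isn't shown in the example, so this is a minimal sketch of the shape it assumes (the Id type is a guess):

// Assumed shape of the DTO targeted by the AliasToBean projection.
// Property names must match the WithAlias(() => viewModel.X) calls above.
public class CrawlerSessionViewModel
{
    public int Id { get; set; }          // key type assumed; could equally be Guid or long
    public string Title { get; set; }
    public DateTime DateTime { get; set; }
    public string StartUrl { get; set; }
}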
// constructor
public CrawlerEngine(CrawlerSettings cSetting, CrawlerInput cInput, CrawlerOutput cOutput)
{
    this.crawlerSettings_ = cSetting;
    this.crawlerInput_ = cInput;
    this.crawlerOutput_ = cOutput;
    queueUrls_ = new Queue();
    this.threadsRunning_ = new Thread[cSetting.maxThreadCount_];
}
static void Main(string[] args)
{
    CrawlerSettings settings = new CrawlerSettings()
    {
        Function = MyFunction,
        OutputPath = "Sample.txt",
        RespectRobots = true,
        Seeds = new string[]
        {
            @"http://5by5.tv/",
            @"http://maximumfun.org/",
            @"https://www.relay.fm/"
        },
        MaxDepth = 8,
        WorkerCount = 64
    };

    // File extensions the link extractor should skip (non-HTML content).
    IEnumerable<string> bannedExts = new string[]
    {
        // images
        ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp",
        ".tif", ".tiff", ".ai", ".drw", ".dxf", ".eps", ".ps", ".svg",
        // audio
        ".mp3", ".wma", ".ogg", ".wav", ".ra", ".aac", ".mid", ".au", ".aiff",
        // video
        ".3gp", ".asf", ".asx", ".avi", ".mov", ".mp4", ".mpg", ".qt", ".rm",
        ".swf", ".wmv", ".m4a",
        // other
        ".css", ".pdf", ".doc", ".exe", ".bin", ".rss", ".zip", ".rar"
    };

    // Hosts the crawler should never follow links into.
    IEnumerable<string> bannedUrls = new string[]
    {
        "twitter.com", "youtube.com", "reddit.com", "facebook.com", "amazon.com",
        "itunes.apple.com", "firstpost.com", "wikipedia.org", "play.google.com",
        "pinterest.com"
    };

    // s_extractor is a static LinkExtractor field declared elsewhere in this class.
    s_extractor = new LinkExtractor(bannedExts, bannedUrls);

    Crawler crawler = new Crawler(settings);
    crawler.Crawl();
}
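The example assigns MyFunction to settings.Function without showing its body, and the delegate type isn't visible here. A purely hypothetical shape, assuming the crawler calls back with each crawled page's URL and body:

// Hypothetical: the signature is assumed, since CrawlerSettings.Function's
// delegate type isn't shown in the example.
static void MyFunction(string url, string html)
{
    // e.g. inspect or persist each crawled page
    Console.WriteLine("{0}: {1} chars", url, html.Length);
}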
static void Main(string[] args)
{
    // Semicolon-separated list of MIME types the crawler is allowed to fetch
    // (used because allowAllMIMETypes_ is disabled below).
    string strMIMETypes = @"text/richtext[0,0];text/html[0,0];audio/x-aiff[0,0];";
    strMIMETypes += @"audio/basic[0,0];audio/wav[0,0];image/gif[0,0];image/jpeg[0,0];";
    strMIMETypes += @"image/pjpeg[0,0];image/tiff[0,0];image/x-png[0,0];image/x-xbitmap[0,0];";
    strMIMETypes += @"image/bmp[0,0];image/x-jg[0,0];image/x-emf[0,0];image/x-wmf[0,0];";
    strMIMETypes += @"video/avi[0,0];video/mpeg[0,0];application/postscript[0,0];application/base64[0,0];";
    strMIMETypes += @"application/macbinhex40[0,0];application/pdf[0,0];application/x-compressed[0,0];";
    strMIMETypes += @"application/x-zip-compressed[0,0];application/x-gzip-compressed[0,0];";
    strMIMETypes += @"application/java[0,0];application/x-msdownload[0,0];";

    CrawlerSettings settings = new CrawlerSettings();
    settings.allowAllMIMETypes_ = false;
    settings.downloadfolder_ = "downloadfolder1";
    settings.excludeFiles_ = new string[] { ".gif", ".jpg", ".css", ".zip", ".exe" };
    settings.excludeHosts_ = new string[] { "" };
    settings.excludeWords_ = new string[] { "" };
    settings.keepAlive_ = false;
    settings.keepSameServer_ = false;
    settings.lastRequestCount_ = 0;
    settings.allowedMIMETypes_ = strMIMETypes;
    settings.requestTimeout_ = 10;
    settings.sleepConnectTime_ = 0;
    settings.sleepFetchTime_ = 0;
    settings.threadsCount_ = 1;
    settings.maxThreadCount_ = 20;
    settings.maxDepth_ = 1;
    settings.filePath_ = "CrawlerConsoleSettings.txt";
    settings.lastModified_ = DateTime.Now;
    settings.version_ = 1;
    settings.dataTypeName_ = "CrawlerConsoleSettings";

    // Persist the settings, then read them back from the same file.
    settings.WriteToFile();
    settings.ReadFromFile("CrawlerConsoleSettings.txt");

    CrawlerInput input = new CrawlerInput();
    input.domain_ = "baidu.com";
    input.fullUrl_ = "www.baidu.com";

    CrawlerOutput output = new CrawlerOutput();

    CrawlerEngine engine = new CrawlerEngine(settings, input, output);
    engine.RunCrawling();
}
public void SmokeTest()
{
    var rootPath = @"C:\1\";
    var uri = new Uri("http://ya.ru");   // original had @"http:\\ya.ru"; forward slashes intended

    // Stub the downloader so the test never touches the network:
    // it returns a canned HTML payload for the seed URI.
    var dwnMock = new Moq.Mock<IDownloader>();
    dwnMock.Setup(i => i.Download(uri))
        .Returns(() =>
        {
            var result = new DownloaderResult(uri);
            result.SetResponseData(new WebPageContent()
            {
                Encoding = Encoding.UTF8,
                IsHtmlContent = true,
                Bytes = Encoding.UTF8.GetBytes(_yaSiteHtml)
            });
            return result;
        });

    // Note: this Setup only matches these exact arguments; with a loose mock
    // it is effectively optional for a void method like SavePage.
    var pageFileStorageMock = new Moq.Mock<IPageFileSystemStorage>();
    pageFileStorageMock.Setup(i => i.SavePage(null, rootPath, true));

    var downLoader = dwnMock.Object;
    var settings = new CrawlerSettings();
    var downloadManager = new DownloadManager(downLoader, new InMemoryLinkDataStorage(), settings);
    var webPageLinkManager = new WebPageLinkManager();
    var engine = new CrawlerEngine(downloadManager, webPageLinkManager, pageFileStorageMock.Object);

    var taskSettings = new CrawlerTaskSettings()
    {
        CrawlDepth = 1,
        IgnoreOtherDomains = false,
        ReplaceUrlToLocal = true
    };
    var task = new CrawlerTask(uri, rootPath, taskSettings);

    var page = engine.ProcessCrawlerTask(task).Result;

    Assert.AreEqual(uri, page.Uri);        // expected first, then actual
    Assert.IsTrue(page.IsHtml);
    Assert.AreEqual(_yaSiteHtml, page.Html);
}
CrawlerReport RunAnalysis(Uri startUrl)
{
    var settings = new CrawlerSettings(startUrl);
    settings.UseUserAgentForRobots = true;
    settings.ExternalLinkCriteria = ExternalLinkCriteria.SameFolderAndDeeper;

    // Generate a unique name
    settings.Name = "SEOREPORT" + DateTime.Now.ToString("yy-MM-dd hh-mm-ss");

    // Use the same directory as the default used by the UI
    var path = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments),
        "IIS SEO Reports");

    settings.IgnoreRobots = true;
    settings.IgnoreNoIndex = true;
    settings.IgnoreNoFollow = true;
    settings.Timeout = 200000;
    settings.MaximumLinkCount = MaxPages;
    settings.DirectoryCache = Path.Combine(path, settings.Name);

    // Create a new crawler and start running
    var crawler = new WebCrawler(settings);
    crawler.Start();

    // Poll until the crawl finishes, logging progress every two seconds:
    // URLs crawled, URLs remaining, and megabytes downloaded.
    while (crawler.IsRunning)
    {
        Thread.Sleep(2000);
        log.LogMessage("{0,9:N0} - {1,9:N0} - {2,9:N2} MB",
            crawler.Report.GetUrlCount(),
            crawler.RemainingUrls,
            crawler.BytesDownloaded / 1048576.0f);
    }

    // Save the report into the same "IIS SEO Reports" folder computed above.
    crawler.Report.Save(path);
    return crawler.Report;
}
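A call site for the method above might look like the following; the seed URL is a placeholder, and WebCrawler, CrawlerSettings, and CrawlerReport here appear to come from the IIS SEO Toolkit's crawler API:

// Hypothetical driver; the URL is a placeholder.
var report = RunAnalysis(new Uri("http://example.com/"));
Console.WriteLine("Crawled {0} URLs", report.GetUrlCount());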