Example 1: paging CrawlerSession records into view models with NHibernate QueryOver
        public MvcContrib.Pagination.IPagination<CrawlerSessionViewModel> GetPagedList(int page, int size)
        {
            CrawlerSession  crawlerSessionAlias = null;
            CrawlerSettings settingsAlias       = null;

            // Root query over CrawlerSession, left-joined to its Settings, ordered by date ascending.
            var query = Session.QueryOver<CrawlerSession>(() => crawlerSessionAlias)
                               .JoinAlias(() => crawlerSessionAlias.Settings, () => settingsAlias, JoinType.LeftOuterJoin)
                               .OrderBy(x => x.DateTime).Asc;

            // Total row count, deferred with FutureValue so it is sent in the same round trip as the page query below.
            var totalCount = query.ToRowCountQuery().FutureValue<int>();

            var firstResult = (page - 1) * size;

            CrawlerSessionViewModel viewModel = null;

            // Project the requested page of rows straight into CrawlerSessionViewModel instances.
            var viewModels =
                query.SelectList(list => list
                                 .Select(x => x.Id).WithAlias(() => viewModel.Id)
                                 .Select(x => x.Title).WithAlias(() => viewModel.Title)
                                 .Select(x => x.DateTime).WithAlias(() => viewModel.DateTime)
                                 .Select(x => x.StartUrl).WithAlias(() => viewModel.StartUrl))
                .TransformUsing(Transformers.AliasToBean(typeof(CrawlerSessionViewModel)))
                .Skip(firstResult)
                .Take(size)
                .Future<CrawlerSessionViewModel>();

            return new CustomPagination<CrawlerSessionViewModel>(viewModels, page, size, totalCount.Value);
        }
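MvcContrib's IPagination<CrawlerSessionViewModel> is enumerable, so callers can iterate the returned page directly. A minimal, hypothetical consumption sketch (the repository variable name is assumed and not part of the example):

        // Hypothetical caller: fetch page 2 with 25 rows per page and print each title.
        var pageOfSessions = repository.GetPagedList(page: 2, size: 25);
        foreach (var session in pageOfSessions)
        {
            Console.WriteLine("{0:u}  {1}", session.DateTime, session.Title);
        }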
Example 2: CrawlerEngine constructor wiring settings, input, and output
        // Wire the engine to its settings, input, and output, and pre-allocate the worker thread array.
        public CrawlerEngine(CrawlerSettings cSetting, CrawlerInput cInput, CrawlerOutput cOutput)
        {
            this.crawlerSettings_ = cSetting;
            this.crawlerInput_    = cInput;
            this.crawlerOutput_   = cOutput;

            queueUrls_           = new Queue();                          // pending URLs to crawl
            this.threadsRunning_ = new Thread[cSetting.maxThreadCount_]; // one slot per worker thread
        }
Example 3: configuring a crawler with seed URLs, banned extensions, and banned hosts
        static void Main(string[] args)
        {
            CrawlerSettings settings = new CrawlerSettings()
            {
                Function      = MyFunction,
                OutputPath    = "Sample.txt",
                RespectRobots = true,
                Seeds         = new string[] { @"http://5by5.tv/", @"http://maximumfun.org/", @"https://www.relay.fm/" },
                MaxDepth      = 8,
                WorkerCount   = 64
            };

            // File extensions the crawler should skip.
            IEnumerable<string> bannedExts = new string[]
            {
                // images
                ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif",
                ".tiff", ".ai", ".drw", ".dxf", ".eps", ".ps", ".svg",

                // audio
                ".mp3", ".wma", ".ogg", ".wav", ".ra", ".aac", ".mid", ".au", ".aiff",

                // video
                ".3gp", ".asf", ".asx", ".avi", ".mov", ".mp4", ".mpg", ".qt", ".rm", ".swf", ".wmv",
                ".m4a",

                // other
                ".css", ".pdf", ".doc", ".exe", ".bin", ".rss", ".zip", ".rar"
            };

            // Hosts whose links the crawler should not follow.
            IEnumerable<string> bannedUrls = new string[]
            {
                "twitter.com",
                "youtube.com",
                "reddit.com",
                "facebook.com",
                "amazon.com",
                "itunes.apple.com",
                "firstpost.com",
                "wikipedia.org",
                "play.google.com",
                "pinterest.com"
            };

            s_extractor = new LinkExtractor(bannedExts, bannedUrls);

            Crawler crawler = new Crawler(settings);

            crawler.Crawl();
        }
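LinkExtractor is project specific and its implementation is not shown here; the class below is only a rough sketch of the kind of filtering its two constructor arguments (banned extensions, banned hosts) suggest, built on plain System.Uri. The class and method names are assumptions, not the real API.

    // Hypothetical stand-in for the filtering that LinkExtractor presumably performs.
    // Requires: using System; using System.Collections.Generic; using System.Linq;
    class ExtensionHostFilter
    {
        private readonly HashSet<string> bannedExtensions_;
        private readonly string[]        bannedHosts_;

        public ExtensionHostFilter(IEnumerable<string> bannedExtensions, IEnumerable<string> bannedHosts)
        {
            bannedExtensions_ = new HashSet<string>(bannedExtensions, StringComparer.OrdinalIgnoreCase);
            bannedHosts_      = bannedHosts.ToArray();
        }

        public bool ShouldFollow(string url)
        {
            // Only absolute, parsable URLs are considered crawlable.
            if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
            {
                return false;
            }

            // Reject links whose path ends in a banned extension (".jpg", ".zip", ...).
            var ext = System.IO.Path.GetExtension(uri.AbsolutePath);
            if (!string.IsNullOrEmpty(ext) && bannedExtensions_.Contains(ext))
            {
                return false;
            }

            // Reject links on banned hosts, including their subdomains.
            return !bannedHosts_.Any(h => uri.Host.EndsWith(h, StringComparison.OrdinalIgnoreCase));
        }
    }

With the arrays from the example, new ExtensionHostFilter(bannedExts, bannedUrls).ShouldFollow("https://www.youtube.com/watch?v=abc") returns false, as does any link ending in one of the banned extensions.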
Example 4: populating CrawlerSettings fields, persisting them to file, and running CrawlerEngine
        static void Main(string[] args)
        {
            // Semicolon-separated allow-list of MIME types; each entry carries a "[0,0]" suffix in this format.
            string strMIMETypes = @"text/richtext[0,0];text/html[0,0];audio/x-aiff[0,0];";

            strMIMETypes += @"audio/basic[0,0];audio/wav[0,0];image/gif[0,0];image/jpeg[0,0];";
            strMIMETypes += @"image/pjpeg[0,0];image/tiff[0,0];image/x-png[0,0];image/x-xbitmap[0,0];";
            strMIMETypes += @"image/bmp[0,0];image/x-jg[0,0];image/x-emf[0,0];image/x-wmf[0,0];";
            strMIMETypes += @"video/avi[0,0];video/mpeg[0,0];application/postscript[0,0];application/base64[0,0];";
            strMIMETypes += @"application/macbinhex40[0,0];application/pdf[0,0];application/x-compressed[0,0];";
            strMIMETypes += @"application/x-zip-compressed[0,0];application/x-gzip-compressed[0,0];";
            strMIMETypes += @"application/java[0,0];application/x-msdownload[0,0];";
            // Configure the crawler: MIME filtering, excluded files/hosts/words, threading, depth, and the settings file.
            CrawlerSettings settings = new CrawlerSettings();

            settings.allowAllMIMETypes_ = false;
            settings.downloadfolder_    = "downloadfolder1";
            settings.excludeFiles_      = new string[] { ".gif", ".jpg", ".css", ".zip", ".exe" };
            settings.excludeHosts_      = new string[] { "" };
            settings.excludeWords_      = new string[] { "" };
            settings.keepAlive_         = false;
            settings.keepSameServer_    = false;
            settings.lastRequestCount_  = 0;
            settings.allowedMIMETypes_  = strMIMETypes;
            settings.requestTimeout_    = 10;
            settings.sleepConnectTime_  = 0;
            settings.sleepFetchTime_    = 0;
            settings.threadsCount_      = 1;
            settings.maxThreadCount_    = 20;
            settings.maxDepth_          = 1;
            settings.filePath_          = "CrawlerConsoleSettings.txt";
            settings.lastModified_      = DateTime.Now;
            settings.version_           = 1;
            settings.dataTypeName_      = "CrawlerConsoleSettings";
            // Persist the settings, then reload them from the file just written.
            settings.WriteToFile();
            settings.ReadFromFile("CrawlerConsoleSettings.txt");

            CrawlerInput input = new CrawlerInput();

            input.domain_  = "baidu.com";
            input.fullUrl_ = "www.baidu.com";

            CrawlerOutput output = new CrawlerOutput();
            CrawlerEngine engine = new CrawlerEngine(settings, input, output);

            engine.RunCrawling();
        }
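The allowedMIMETypes_ string above can also be assembled from an array; the helper below is only a readability sketch (the method name is made up) and assumes nothing beyond the "type[0,0];" pattern already used in the example. It needs using System.Linq.

        // Builds the same semicolon-separated "type[0,0];" allow-list shown above from an array.
        static string BuildAllowedMimeTypes()
        {
            string[] mimeTypes =
            {
                "text/richtext", "text/html", "audio/x-aiff", "audio/basic", "audio/wav",
                "image/gif", "image/jpeg", "image/pjpeg", "image/tiff", "image/x-png",
                "image/x-xbitmap", "image/bmp", "image/x-jg", "image/x-emf", "image/x-wmf",
                "video/avi", "video/mpeg", "application/postscript", "application/base64",
                "application/macbinhex40", "application/pdf", "application/x-compressed",
                "application/x-zip-compressed", "application/x-gzip-compressed",
                "application/java", "application/x-msdownload"
            };

            return string.Concat(mimeTypes.Select(m => m + "[0,0];"));
        }

settings.allowedMIMETypes_ = BuildAllowedMimeTypes(); would then replace the chain of += concatenations.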
Example 5: smoke-testing CrawlerEngine with Moq mocks, plus an IIS SEO Toolkit crawl-report helper
        public void SmokeTest()
        {
            var rootPath = @"C:\1\";
            var uri      = new Uri("http://ya.ru");
            var dwnMock  = new Moq.Mock<IDownloader>();

            // Stub the downloader to return canned HTML for the test URI.
            dwnMock.Setup(i => i.Download(uri))
            .Returns(() =>
            {
                var result = new DownloaderResult(uri);
                result.SetResponseData(new WebPageContent()
                {
                    Encoding      = Encoding.UTF8,
                    IsHtmlContent = true,
                    Bytes         = Encoding.UTF8.GetBytes(_yaSiteHtml)
                });
                return result;
            });

            // File-system storage is mocked so the test exercises the crawl pipeline without touching disk.
            var pageFileStorageMock = new Moq.Mock<IPageFileSystemStorage>();

            pageFileStorageMock.Setup(i => i.SavePage(null, rootPath, true));

            var downLoader         = dwnMock.Object;
            var settings           = new CrawlerSettings();
            var downloadManager    = new DownloadManager(downLoader, new InMemoryLinkDataStorage(), settings);
            var webPageLinkManager = new WebPageLinkManager();
            var engine             = new CrawlerEngine(downloadManager, webPageLinkManager, pageFileStorageMock.Object);

            var taskSettings = new CrawlerTaskSettings()
            {
                CrawlDepth         = 1,
                IgnoreOtherDomains = false,
                ReplaceUrlToLocal  = true
            };

            var task = new CrawlerTask(uri, rootPath, taskSettings);

            var page = engine.ProcessCrawlerTask(task).Result;

            Assert.AreEqual(uri, page.Uri);
            Assert.IsTrue(page.IsHtml);
            Assert.AreEqual(_yaSiteHtml, page.Html);
        }
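If the test should also assert the interaction, Moq's Verify can be appended at the end of SmokeTest; the line below only restates the setup already made on dwnMock and is a sketch rather than part of the original test.

            // Optional: assert the stubbed downloader was invoked exactly once for the test URI.
            dwnMock.Verify(i => i.Download(uri), Moq.Times.Once());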
        CrawlerReport RunAnalysis(Uri startUrl)
        {
            var settings = new CrawlerSettings(startUrl);
            settings.UseUserAgentForRobots = true;
            settings.ExternalLinkCriteria = ExternalLinkCriteria.SameFolderAndDeeper;
            // Generate a unique report name
            settings.Name = "SEOREPORT" + DateTime.Now.ToString("yy-MM-dd hh-mm-ss");

            // Use the same directory as the default used by the UI
            var path = Path.Combine(
                Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments),
                "IIS SEO Reports");
            settings.IgnoreRobots = true;
            settings.IgnoreNoIndex = true;
            settings.IgnoreNoFollow = true;
            settings.Timeout = 200000;
            settings.MaximumLinkCount = MaxPages;
            settings.DirectoryCache = Path.Combine(path, settings.Name);

            // Create a new crawler and start running
            var crawler = new WebCrawler(settings);

            crawler.Start();

            // Poll until the crawl completes, logging crawled URLs, remaining URLs, and MB downloaded.
            while (crawler.IsRunning)
            {
                Thread.Sleep(2000);
                log.LogMessage("{0,9:N0} - {1,9:N0} - {2,9:N2} MB",
                               crawler.Report.GetUrlCount(),
                               crawler.RemainingUrls,
                               crawler.BytesDownloaded/1048576.0f);
            }

            // Save the report into the same "IIS SEO Reports" directory computed above.
            crawler.Report.Save(path);

            return crawler.Report;
        }
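RunAnalysis depends on members not shown here (MaxPages and log); a hypothetical call site, assuming those exist on the surrounding class, looks like:

        // Hypothetical call site for the method above.
        var report = RunAnalysis(new Uri("http://www.contoso.com/"));
        Console.WriteLine("Crawled {0} URLs in total.", report.GetUrlCount());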