예제 #1
0
        /// <summary>
        /// Reacts to crawl reporting stage notifications; only <see cref="crawlReportingStageEnum.DLCPreinitiation"/> is handled, every other stage is ignored.
        /// </summary>
        /// <param name="stage">The reporting stage being signalled.</param>
        /// <param name="__parent">The reporter raising the event (not used here).</param>
        /// <param name="__task">The crawler domain task (not used here).</param>
        /// <param name="wRecord">The site record whose context, domain and domain info are used.</param>
        /// <remarks>
        /// On pre-initiation the method subscribes <c>onTargetPageAttached</c> to the record context, asks the active
        /// repository for the web site repository (passing <c>true</c> as the auto-create argument), remembers it in
        /// <c>webSiteReposByDomain</c> unless the domain is already registered (in which case a message is logged),
        /// and finally updates the site table and saves the site repository.
        /// </remarks>
        public override void eventUniversal(crawlReportingStageEnum stage, directReporterBase __parent, crawlerDomainTask __task, modelSpiderSiteRecord wRecord)
        {
            if (stage != crawlReportingStageEnum.DLCPreinitiation)
            {
                return;
            }

            wRecord.context.OnTargetPageAttached += onTargetPageAttached;

            imbMCRepository repository = mcm.activeRepository;
            imbMCWebSite    siteRepo   = repository.GetWebSite(wRecord.domainInfo, true, loger);

            if (webSiteReposByDomain.ContainsKey(wRecord.domain))
            {
                // The domain was already registered: keep the existing mapping and only report it.
                loger.log("DLC sent to CrawlToMC plugin second time: " + wRecord.domain);
            }
            else
            {
                webSiteReposByDomain.Add(wRecord.domain, siteRepo);
            }

            repository.siteTable.AddOrUpdate(siteRepo.entry);
            siteRepo.SaveDataStructure(repository.folder, loger);
        }
예제 #2
0
        /// <summary>
        /// Loads the web page repositories for the entries registered in the <see cref="imbMCWebSite.pageTable" /> of the given site.
        /// </summary>
        /// <param name="site">The site repository whose page table and folder are used.</param>
        /// <param name="output">The log output; when <c>null</c>, <c>aceLog.loger</c> is used instead.</param>
        /// <param name="takeSettings">Optional sampling settings; when specified, the page entries are wrapped in a <c>sampleTake</c> built from these settings before loading.</param>
        /// <returns>
        /// The page repositories that could be loaded; entries whose repository loads as <c>null</c> are skipped.
        /// </returns>
        public List <imbMCWebPage> GetAllWebPages(imbMCWebSite site, ILogBuilder output = null, samplingSettings takeSettings = null)
        {
            output = output ?? aceLog.loger;

            var entries = site.pageTable.GetList();
            if (takeSettings != null)
            {
                // Narrow the entry list according to the sampling settings.
                entries = new sampleTake <imbMCWebPageEntry>(entries, takeSettings);
            }

            var loadedPages = new List <imbMCWebPage>();

            foreach (var pageEntry in entries)
            {
                var pageRepo = pageEntry.HashCode.LoadDataStructure <imbMCWebPage>(site.folder, output);
                if (pageRepo == null)
                {
                    continue;
                }

                loadedPages.Add(pageRepo);
            }

            return loadedPages;
        }
예제 #3
0
        /// <summary>
        /// Gets the web page repository for a resolved URL, keyed by the MD5 hash of that URL.
        /// </summary>
        /// <param name="site">The site to query the page from.</param>
        /// <param name="url">The fully resolved URL.</param>
        /// <param name="autoCreate">If <c>false</c> and the page table has no entry for the URL hash, a message is logged and <c>null</c> is returned; if <c>true</c> the entry is obtained via <c>GetOrCreate</c>.</param>
        /// <param name="output">The log output; when <c>null</c>, <c>aceLog.loger</c> is used instead.</param>
        /// <returns>The loaded page repository with its entry deployed, or <c>null</c> when the page is not registered and <paramref name="autoCreate"/> is <c>false</c>.</returns>
        /// <exception cref="ArgumentException">Thrown when the URL does not contain the site's domain root name and <c>doThrowOnDomainMismatch</c> is set.</exception>
        public imbMCWebPage GetWebPage(imbMCWebSite site, string url, bool autoCreate = false, ILogBuilder output = null)
        {
            output = output ?? aceLog.loger;

            // The mismatch test is evaluated first, the throw switch second (same order as the nested form).
            bool domainMismatch = !url.Contains(site.domainInfo.domainRootName);
            if (domainMismatch && doThrowOnDomainMismatch)
            {
                throw new ArgumentException($"Url [{url}] must be in full and resolved form, and must come from the same root domain name [{site.entry.domain}] (different TLD is allowed)", nameof(url));
            }

            string pageKey = md5.GetMd5Hash(url);

            bool mayProceed = autoCreate || site.pageTable.ContainsKey(pageKey);
            if (!mayProceed)
            {
                output.log($"Page repository {pageKey} (for: {url}) not found at {site.folder.path}");
                return null;
            }

            var pageEntry = site.pageTable.GetOrCreate(pageKey);

            imbMCWebPage pageRepo = pageKey.LoadDataStructure <imbMCWebPage>(site.folder, output);
            pageRepo.deploy(pageEntry);

            site.pageTable.AddOrUpdate(pageEntry);

            return pageRepo;
        }
예제 #4
0
        /// <summary>
        /// Handles the DLC-finished notification: updates the site entry in the active repository's site table
        /// and saves the web site repository registered for the record's domain.
        /// </summary>
        /// <param name="__spider">The reporter raising the event (not used here).</param>
        /// <param name="__task">The crawler domain task (not used here).</param>
        /// <param name="__wRecord">The site record; its domain is the lookup key in <c>webSiteReposByDomain</c>.</param>
        public override void eventDLCFinished(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            imbMCRepository repository = mcm.activeRepository;
            imbMCWebSite    siteRepo   = webSiteReposByDomain[__wRecord.domain];

            repository.siteTable.AddOrUpdate(siteRepo.entry);

            siteRepo.SaveDataStructure(repository.folder, loger);
        }
예제 #5
0
        /// <summary>
        /// Event handler for an attached target page: builds the page repository for the target (when the active
        /// repository considers the target proper), fills in the index entry and source code, and saves it.
        /// </summary>
        /// <param name="__wRecord">The site record; its domain is the lookup key in <c>webSiteReposByDomain</c>.</param>
        /// <param name="__args">The event arguments carrying the target and its HTML/XML source.</param>
        private void onTargetPageAttached(modelSpiderSiteRecord __wRecord, modelSpiderSiteRecordEventArgs __args)
        {
            imbMCRepository repository = mcm.activeRepository;
            imbMCWebSite    siteRepo   = webSiteReposByDomain[__wRecord.domain];

            ISpiderTarget attached = __args.Target;

            if (!repository.isTargetProper(attached))
            {
                return;
            }

            imbMCWebPage pageRepo = repository.BuildWebPage(attached, siteRepo, loger);

            // Overwrite what BuildWebPage produced with the index entry and the sources supplied by the event.
            pageRepo.indexEntry     = imbWEMManager.index.pageIndexTable.GetPageForUrl(attached.url);
            pageRepo.HtmlSourceCode = __args.sourceHtml;
            pageRepo.XmlSourceCode  = __args.sourceXml;

            pageRepo.SaveDataStructure(siteRepo.folder, loger);
        }
예제 #6
0
        /// <summary>
        /// Builds the web page repository using <see cref="ISpiderTarget"/> crawl information
        /// </summary>
        /// <param name="target">Target information</param>
        /// <param name="site">The site to build page for</param>
        /// <param name="output">The output for console/log</param>
        /// <returns>Built or updated web page repository</returns>
        public imbMCWebPage BuildWebPage(ISpiderTarget target, imbMCWebSite site, ILogBuilder output = null)
        {
            // Default the log output here, as the sibling Get* methods do, so that a null output
            // is never handed on to GetCompiledTable and SaveDataStructure below.
            if (output == null)
            {
                output = aceLog.loger;
            }

            // autoCreate = true: the page entry is created when it is not yet in the page table.
            imbMCWebPage page  = GetWebPage(site, target.url, true, output);
            ISpiderPage  sPage = target.page;

            page.entry.AnchorTextAll       = sPage.captions.toCsvInLine(",");
            page.entry.ClickDepth          = sPage.iterationDiscovery;
            page.entry.ResolvedRelativeURL = site.domainInfo.GetURLWithoutDomainName(target.url);

            // Deploy again, now that the entry carries the crawl information set above.
            page.deploy(page.entry);

            page.indexEntry = target.GetIndexPage();

            page.TextContent = target.pageText;
            page.name        = target.page.name;

            // The HTML source is only set when the target provides a document.
            var htmlDoc = target.GetHtmlDocument();
            if (htmlDoc != null)
            {
                page.HtmlSourceCode = htmlDoc.DocumentNode.OuterHtml;
            }

            page.Blocks = new List <imbCommonModels.contentBlock.nodeBlock>();

            page.TermTable = target.tokens.GetCompiledTable(output);

            target.contentBlocks.ForEach(x => page.Blocks.Add(x));

            site.pageTable.AddOrUpdate(page.entry);

            page.SaveDataStructure(site.folder, output);

            return page;
        }
예제 #7
0
        /// <summary>
        /// Builds or updates the web site repository using crawling information.
        /// </summary>
        /// <param name="targetCollection">Collection of spider targets, populated by the DLC crawl; its loaded targets are processed.</param>
        /// <param name="domainInfo">DLC domain information.</param>
        /// <param name="output">The log output; when <c>null</c>, <c>aceLog.loger</c> is used. It is passed on to the site and page repository operations.</param>
        /// <returns>
        /// Reference to the created or updated web site repository.
        /// </returns>
        /// <remarks>
        /// This method uses completed DLC information to create the <see cref="imbMCWebSite" /> repository and an
        /// <see cref="imbMCWebPage" /> for every loaded target accepted by <c>isTargetProper</c>. Afterwards the site
        /// entry is updated in the site table and the site repository is saved.
        /// </remarks>
        public imbMCWebSite BuildWebSite(ISpiderTargetCollection targetCollection, domainAnalysis domainInfo, ILogBuilder output = null)
        {
            // Default the log output here, as the sibling Get* methods do, so every call below receives a usable log.
            if (output == null)
            {
                output = aceLog.loger;
            }

            // autoCreate = true: the site entry and repository are created when missing.
            imbMCWebSite repo = GetWebSite(domainInfo, true, output);

            // Page count before the build, used to report how many pages were added.
            int pageCount = repo.pageTable.Count;

            if (pageCount == 0)
            {
                loger.log("Web site repository created [" + domainInfo.domainName + "]");
            }

            List <ISpiderTarget> crawledTargets = targetCollection.GetLoaded();

            foreach (ISpiderTarget target in crawledTargets)
            {
                if (isTargetProper(target))
                {
                    // Forward the caller's log output; it was previously dropped for page building.
                    BuildWebPage(target, repo, output);
                }
            }

            int nPageCount = repo.pageTable.Count - pageCount;

            if (nPageCount > 0)
            {
                loger.log("Repository [" + domainInfo.domainName + "] expanded for [" + nPageCount + "] new pages, in total [" + (pageCount + nPageCount) + "] pages.");
            }

            siteTable.AddOrUpdate(repo.entry);

            repo.SaveDataStructure(folder, output);

            return repo;
        }
예제 #8
0
        /// <summary>
        /// Gets the web site repository for the given domain; the site table is keyed by <c>domainInfo.domainRootName</c>.
        /// </summary>
        /// <param name="domainInfo">The domain information.</param>
        /// <param name="autoCreate">If <c>false</c> and the site table has no entry for the domain root name, <c>null</c> is returned; if <c>true</c> the entry is obtained via <c>GetOrCreate</c>.</param>
        /// <param name="output">The log/diagnostic output; when <c>null</c>, <c>aceLog.loger</c> is used instead.</param>
        /// <returns>The loaded web site repository with its entry deployed, or <c>null</c> when the site is not registered and <paramref name="autoCreate"/> is <c>false</c>.</returns>
        public imbMCWebSite GetWebSite(domainAnalysis domainInfo, bool autoCreate = false, ILogBuilder output = null)
        {
            bool mayProceed = autoCreate || siteTable.ContainsKey(domainInfo.domainRootName);
            if (!mayProceed)
            {
                return null;
            }

            output = output ?? aceLog.loger;

            imbMCWebSiteEntry siteEntry = siteTable.GetOrCreate(domainInfo.domainRootName);
            siteEntry.domainProperUrl = domainInfo.urlProper;

            imbMCWebSite siteRepo = siteEntry.domain.LoadDataStructure <imbMCWebSite>(folder, output);
            siteRepo.domainInfo = domainInfo;
            siteRepo.deploy(siteEntry);

            // Assigned a second time, now read back through the repository after deploy.
            siteEntry.domainProperUrl = siteRepo.domainInfo.urlProper;

            siteTable.AddOrUpdate(siteEntry);

            if (siteRepo.folder == null && output != null)
            {
                output.log("Warning: folder instance is null in web site repo [" + siteRepo.name + "]");
            }

            return siteRepo;
        }