/// <summary>
/// Receives universal crawl reporting events; on <see cref="crawlReportingStageEnum.DLCPreinitiation"/>
/// it attaches the page-attached handler to the record's context and registers the
/// web site repository for the record's domain.
/// </summary>
/// <param name="stage">Crawl reporting stage being signalled.</param>
/// <param name="__parent">Parent reporter (unused here).</param>
/// <param name="__task">Crawler domain task (unused here).</param>
/// <param name="wRecord">Spider site record whose domain is being initiated.</param>
public override void eventUniversal(crawlReportingStageEnum stage, directReporterBase __parent, crawlerDomainTask __task, modelSpiderSiteRecord wRecord)
{
    // Only the DLC pre-initiation stage is handled by this reporter.
    if (stage != crawlReportingStageEnum.DLCPreinitiation) return;

    wRecord.context.OnTargetPageAttached += onTargetPageAttached;

    imbMCRepository repository = mcm.activeRepository;
    imbMCWebSite siteRepo = repository.GetWebSite(wRecord.domainInfo, true, loger);

    if (webSiteReposByDomain.ContainsKey(wRecord.domain))
    {
        // Domain already registered by an earlier DLC — keep the existing repo reference.
        loger.log("DLC sent to CrawlToMC plugin second time: " + wRecord.domain);
    }
    else
    {
        webSiteReposByDomain.Add(wRecord.domain, siteRepo);
    }

    repository.siteTable.AddOrUpdate(siteRepo.entry);
    siteRepo.SaveDataStructure(repository.folder, loger);
}
/// <summary>
/// Loads every web page registered in the site's <see cref="imbMCWebSite.pageTable"/>;
/// when <c>takeSettings</c> is specified only the sampled fraction (sampleTake) is loaded.
/// </summary>
/// <param name="site">The site repo to take pages for.</param>
/// <param name="output">The log output; defaults to <c>aceLog.loger</c> when null.</param>
/// <param name="takeSettings">If specified, only a fraction of pages is returned, according to the sampling settings.</param>
/// <returns>All web pages in the site repo, or the sample take when sampling settings are given.</returns>
public List<imbMCWebPage> GetAllWebPages(imbMCWebSite site, ILogBuilder output = null, samplingSettings takeSettings = null)
{
    if (output == null) output = aceLog.loger;

    var entries = site.pageTable.GetList();
    if (takeSettings != null)
    {
        entries = new sampleTake<imbMCWebPageEntry>(entries, takeSettings);
    }

    var pages = new List<imbMCWebPage>();
    foreach (var entry in entries)
    {
        // Entries whose on-disk page structure fails to load are silently skipped.
        var page = entry.HashCode.LoadDataStructure<imbMCWebPage>(site.folder, output);
        if (page != null)
        {
            pages.Add(page);
        }
    }

    return pages;
}
/// <summary>
/// Gets the web page repository by resolved URL.
/// </summary>
/// <param name="site">The site to query the page from.</param>
/// <param name="url">The fully resolved URL.</param>
/// <param name="autoCreate">if set to <c>true</c>, a missing page entry is created.</param>
/// <param name="output">The log output; defaults to <c>aceLog.loger</c> when null.</param>
/// <returns>The page repository, or null when not found and <c>autoCreate</c> is false.</returns>
/// <exception cref="ArgumentException">Url must be in full and resolved form, and must come from the same root domain name (different TLD is allowed) - url</exception>
public imbMCWebPage GetWebPage(imbMCWebSite site, string url, bool autoCreate = false, ILogBuilder output = null)
{
    if (output == null) output = aceLog.loger;

    // Reject URLs from a different root domain, unless mismatch throwing is disabled.
    if (!url.Contains(site.domainInfo.domainRootName) && doThrowOnDomainMismatch)
    {
        throw new ArgumentException($"Url [{url}] must be in full and resolved form, and must come from the same root domain name [{site.entry.domain}] (different TLD is allowed)", nameof(url));
    }

    // Pages are keyed by the MD5 hash of their resolved URL.
    string HashCode = md5.GetMd5Hash(url);

    if (!autoCreate && !site.pageTable.ContainsKey(HashCode))
    {
        output.log($"Page repository {HashCode} (for: {url}) not found at {site.folder.path}");
        return null;
    }

    var entry = site.pageTable.GetOrCreate(HashCode);
    imbMCWebPage page = HashCode.LoadDataStructure<imbMCWebPage>(site.folder, output);
    page.deploy(entry);
    site.pageTable.AddOrUpdate(entry);
    return page;
}
/// <summary>
/// Finalizes the DLC for a domain: persists the web site repository entry to the
/// repository site table and saves its data structure to disk.
/// </summary>
/// <param name="__spider">Reporting spider (unused here).</param>
/// <param name="__task">Crawler domain task (unused here).</param>
/// <param name="__wRecord">Spider site record whose domain has finished crawling.</param>
public override void eventDLCFinished(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    imbMCRepository mcRepo = mcm.activeRepository;

    // The repo is registered during DLCPreinitiation; a direct indexer lookup would
    // throw KeyNotFoundException if that stage was never observed for this domain.
    imbMCWebSite wRepo;
    if (!webSiteReposByDomain.TryGetValue(__wRecord.domain, out wRepo))
    {
        loger.log("CrawlToMC plugin: no web site repository registered for [" + __wRecord.domain + "] - DLCPreinitiation stage was not observed.");
        return;
    }

    mcRepo.siteTable.AddOrUpdate(wRepo.entry);
    wRepo.SaveDataStructure(mcRepo.folder, loger);
}
/// <summary>
/// Handler invoked when a crawled target page is attached to a spider site record.
/// Builds/updates the page repository for proper targets and saves it with its
/// HTML and XML source code.
/// </summary>
/// <param name="__wRecord">Spider site record the page belongs to.</param>
/// <param name="__args">Event arguments carrying the target and its source code.</param>
private void onTargetPageAttached(modelSpiderSiteRecord __wRecord, modelSpiderSiteRecordEventArgs __args)
{
    imbMCRepository mcRepo = mcm.activeRepository;

    // Guard against an unregistered domain; the direct indexer would throw
    // KeyNotFoundException if DLCPreinitiation never ran for this record.
    imbMCWebSite wRepo;
    if (!webSiteReposByDomain.TryGetValue(__wRecord.domain, out wRepo))
    {
        loger.log("CrawlToMC plugin: target page attached for unregistered domain [" + __wRecord.domain + "]");
        return;
    }

    ISpiderTarget target = __args.Target;

    // Only targets the repository considers proper are materialized as pages.
    if (!mcRepo.isTargetProper(target)) return;

    imbMCWebPage pRepo = mcRepo.BuildWebPage(target, wRepo, loger);
    pRepo.indexEntry = imbWEMManager.index.pageIndexTable.GetPageForUrl(target.url);
    pRepo.HtmlSourceCode = __args.sourceHtml;
    pRepo.XmlSourceCode = __args.sourceXml;
    pRepo.SaveDataStructure(wRepo.folder, loger);
}
/// <summary>
/// Builds the web page repository using <see cref="ISpiderTarget"/> crawl information.
/// </summary>
/// <param name="target">Target information.</param>
/// <param name="site">The site to build the page for.</param>
/// <param name="output">The output for console/log.</param>
/// <returns>Built or updated web page repository.</returns>
public imbMCWebPage BuildWebPage(ISpiderTarget target, imbMCWebSite site, ILogBuilder output = null)
{
    imbMCWebPage page = GetWebPage(site, target.url, true, output);
    ISpiderPage sPage = target.page;

    page.entry.AnchorTextAll = sPage.captions.toCsvInLine(",");
    page.entry.ClickDepth = sPage.iterationDiscovery;
    page.entry.ResolvedRelativeURL = site.domainInfo.GetURLWithoutDomainName(target.url);
    page.deploy(page.entry);

    page.indexEntry = target.GetIndexPage();
    page.TextContent = target.pageText;
    page.name = target.page.name;

    // HTML source is available only when the crawler retained the parsed document.
    var htmlDoc = target.GetHtmlDocument();
    if (htmlDoc != null)
    {
        page.HtmlSourceCode = htmlDoc.DocumentNode.OuterHtml;
    }

    page.Blocks = new List<imbCommonModels.contentBlock.nodeBlock>();
    page.TermTable = target.tokens.GetCompiledTable(output);
    target.contentBlocks.ForEach(x => page.Blocks.Add(x));

    site.pageTable.AddOrUpdate(page.entry);
    page.SaveDataStructure(site.folder, output);
    return page;
}
/// <summary>
/// Builds or updates a web site repositorium using crawling information.
/// </summary>
/// <param name="targetCollection">Collection of SpiderTargets, populated by DLC crawl.</param>
/// <param name="domainInfo">DLC domain information.</param>
/// <param name="output">The log output; defaults to <c>aceLog.loger</c> when null.</param>
/// <returns>Reference to the created or updated web site repository.</returns>
/// <remarks>
/// This method uses completed DLC information to create <see cref="imbMCWebSite"/> repository
/// and <see cref="imbMCWebPage"/> for all proper targets.
/// </remarks>
public imbMCWebSite BuildWebSite(ISpiderTargetCollection targetCollection, domainAnalysis domainInfo, ILogBuilder output = null)
{
    // Default and use the output parameter for logging, consistent with sibling methods
    // (the previous version ignored 'output' and always wrote to the 'loger' field).
    if (output == null) output = aceLog.loger;

    imbMCWebSite repo = GetWebSite(domainInfo, true, output);

    int pageCount = repo.pageTable.Count;
    if (pageCount == 0)
    {
        output.log("Web site repository created [" + domainInfo.domainName + "]");
    }

    List<ISpiderTarget> crawledTargets = targetCollection.GetLoaded();
    foreach (ISpiderTarget target in crawledTargets)
    {
        if (isTargetProper(target))
        {
            BuildWebPage(target, repo, output);
        }
    }

    int nPageCount = repo.pageTable.Count - pageCount;
    if (nPageCount > 0)
    {
        output.log("Repository [" + domainInfo.domainName + "] expanded for [" + nPageCount + "] new pages, in total [" + (pageCount + nPageCount) + "] pages.");
    }

    siteTable.AddOrUpdate(repo.entry);
    repo.SaveDataStructure(folder, output);
    return repo;
}
/// <summary>
/// Gets web site repositorium by clean domain name, like: "koplas.co.rs" for http://www.koplas.co.rs
/// </summary>
/// <param name="domainInfo">The domain information.</param>
/// <param name="autoCreate">if set to <c>true</c> it will automatically create new entry and new repository.</param>
/// <param name="output">The log/diagnostic output; defaults to <c>aceLog.loger</c> when null.</param>
/// <returns>The site repository, or null when not found and <c>autoCreate</c> is false.</returns>
public imbMCWebSite GetWebSite(domainAnalysis domainInfo, bool autoCreate = false, ILogBuilder output = null)
{
    // Without autoCreate, an unknown domain yields null rather than a fresh entry.
    if (!autoCreate && !siteTable.ContainsKey(domainInfo.domainRootName)) return null;

    if (output == null) output = aceLog.loger;

    imbMCWebSiteEntry entry = siteTable.GetOrCreate(domainInfo.domainRootName);
    entry.domainProperUrl = domainInfo.urlProper;

    imbMCWebSite repo = entry.domain.LoadDataStructure<imbMCWebSite>(folder, output);
    repo.domainInfo = domainInfo;
    repo.deploy(entry);

    // Re-sync the proper URL after deploy, in case deployment altered domainInfo.
    entry.domainProperUrl = repo.domainInfo.urlProper;
    siteTable.AddOrUpdate(entry);

    if (repo.folder == null && output != null)
    {
        output.log("Warning: folder instance is null in web site repo [" + repo.name + "]");
    }

    return repo;
}