/// <summary>
/// Handles a page target being attached to a site record: when the active repository accepts the target,
/// a web page repository is built for it, filled with the page source carried by the event arguments and saved.
/// </summary>
/// <param name="__wRecord">Site record whose <c>domain</c> selects the web site repository.</param>
/// <param name="__args">Event arguments providing the target and its HTML / XML source.</param>
private void onTargetPageAttached(modelSpiderSiteRecord __wRecord, modelSpiderSiteRecordEventArgs __args)
{
    imbMCRepository repository = mcm.activeRepository;

    // The site lookup is intentionally done before the target check, as in the original flow.
    imbMCWebSite siteRepo = webSiteReposByDomain[__wRecord.domain];
    ISpiderTarget target = __args.Target;

    if (!repository.isTargetProper(target))
    {
        // Targets rejected by the active repository are ignored.
        return;
    }

    imbMCWebPage pageRepo = repository.BuildWebPage(target, siteRepo, loger);

    // Replace the index entry with the one resolved by URL from the global page index table.
    pageRepo.indexEntry = imbWEMManager.index.pageIndexTable.GetPageForUrl(target.url);

    // Source code is taken from the event arguments, overriding whatever BuildWebPage assigned.
    pageRepo.HtmlSourceCode = __args.sourceHtml;
    pageRepo.XmlSourceCode = __args.sourceXml;

    // NOTE(review): BuildWebPage appears to save the page already; this second save persists the fields set above.
    pageRepo.SaveDataStructure(siteRepo.folder, loger);
}
/// <summary>
/// Builds the web page repository for a crawled page, using the crawl information exposed by <see cref="ISpiderTarget"/>.
/// </summary>
/// <remarks>
/// The page entry is registered in the site's page table and the page is saved into the site folder before returning.
/// </remarks>
/// <param name="target">Crawl target describing the page (URL, text, tokens, content blocks).</param>
/// <param name="site">The site repository the page belongs to.</param>
/// <param name="output">Optional console/log output.</param>
/// <returns>Built or updated web page repository.</returns>
public imbMCWebPage BuildWebPage(ISpiderTarget target, imbMCWebSite site, ILogBuilder output = null)
{
    // NOTE(review): the 'true' flag presumably asks GetWebPage to create the page when it is missing - confirm.
    imbMCWebPage result = GetWebPage(site, target.url, true, output);

    ISpiderPage spiderPage = target.page;

    // Fill the table entry first, then push it into the page via deploy().
    result.entry.AnchorTextAll = spiderPage.captions.toCsvInLine(",");
    result.entry.ClickDepth = spiderPage.iterationDiscovery;
    result.entry.ResolvedRelativeURL = site.domainInfo.GetURLWithoutDomainName(target.url);
    result.deploy(result.entry);

    result.indexEntry = target.GetIndexPage();
    result.TextContent = target.pageText;
    result.name = target.page.name;

    // HTML source is only assigned when the target can supply a parsed document.
    var document = target.GetHtmlDocument();
    if (document != null)
    {
        result.HtmlSourceCode = document.DocumentNode.OuterHtml;
    }

    // Blocks are reset to a fresh list and refilled from the target after the term table is compiled.
    result.Blocks = new List<imbCommonModels.contentBlock.nodeBlock>();
    result.TermTable = target.tokens.GetCompiledTable(output);
    target.contentBlocks.ForEach(block => result.Blocks.Add(block));

    site.pageTable.AddOrUpdate(result.entry);
    result.SaveDataStructure(site.folder, output);

    return result;
}