/// <summary>
/// Looks up the page repository of <paramref name="site"/> that is keyed by the MD5 hash of the resolved URL.
/// </summary>
/// <param name="site">The site whose page table and folder are used for the lookup.</param>
/// <param name="url">The fully resolved URL of the page.</param>
/// <param name="autoCreate">
/// When <c>false</c>, a URL whose hash is not in the site's page table yields <c>null</c>;
/// when <c>true</c>, the page table entry is obtained through <c>GetOrCreate</c> without that check.
/// </param>
/// <param name="output">Log output; <c>aceLog.loger</c> is used when <c>null</c> is passed.</param>
/// <returns>
/// The page repository loaded from the site's folder, or <c>null</c> when the page is not in the page table
/// and <paramref name="autoCreate"/> is <c>false</c>.
/// </returns>
/// <exception cref="ArgumentException">
/// The URL does not contain the site's root domain name and <c>doThrowOnDomainMismatch</c> is set.
/// </exception>
public imbMCWebPage GetWebPage(imbMCWebSite site, string url, bool autoCreate = false, ILogBuilder output = null)
{
    ILogBuilder log = output ?? aceLog.loger;

    // The containment test is evaluated first, exactly as before; the throw switch is only read on a mismatch.
    bool belongsToSite = url.Contains(site.domainInfo.domainRootName);
    if (!belongsToSite && doThrowOnDomainMismatch)
    {
        throw new ArgumentException($"Url [{url}] must be in full and resolved form, and must come from the same root domain name [{site.entry.domain}] (different TLD is allowed)", nameof(url));
    }

    string pageKey = md5.GetMd5Hash(url);

    // Without auto-create, an unknown page is reported and null is returned to the caller.
    if (!(autoCreate || site.pageTable.ContainsKey(pageKey)))
    {
        log.log($"Page repository {pageKey} (for: {url}) not found at {site.folder.path}");
        return null;
    }

    var tableEntry = site.pageTable.GetOrCreate(pageKey);

    imbMCWebPage repository = pageKey.LoadDataStructure<imbMCWebPage>(site.folder, log);
    repository.deploy(tableEntry);

    site.pageTable.AddOrUpdate(tableEntry);

    return repository;
}
/// <summary>
/// Creates or refreshes the web page repository of <paramref name="site"/> from the crawl data
/// carried by an <see cref="ISpiderTarget"/>, then saves it into the site's folder.
/// </summary>
/// <param name="target">Crawl target supplying the URL, spider page, text, tokens, HTML document and content blocks.</param>
/// <param name="site">The site that owns the page table and the folder the repository is saved to.</param>
/// <param name="output">Log output passed on to the lookup, term table compilation and save calls.</param>
/// <returns>The page repository after it has been populated and saved.</returns>
public imbMCWebPage BuildWebPage(ISpiderTarget target, imbMCWebSite site, ILogBuilder output = null)
{
    // autoCreate = true: the page table entry is obtained via GetOrCreate even for an unseen URL.
    imbMCWebPage repository = GetWebPage(site, target.url, true, output);
    ISpiderPage crawledPage = target.page;

    // Page table entry fields taken from the crawl.
    repository.entry.AnchorTextAll = crawledPage.captions.toCsvInLine(",");
    repository.entry.ClickDepth = crawledPage.iterationDiscovery;
    repository.entry.ResolvedRelativeURL = site.domainInfo.GetURLWithoutDomainName(target.url);
    repository.deploy(repository.entry);

    repository.indexEntry = target.GetIndexPage();
    repository.TextContent = target.pageText;
    repository.name = target.page.name;

    // The HTML source is stored only when the target provides a document.
    var document = target.GetHtmlDocument();
    if (document != null)
    {
        repository.HtmlSourceCode = document.DocumentNode.OuterHtml;
    }

    repository.Blocks = new List<imbCommonModels.contentBlock.nodeBlock>();
    repository.TermTable = target.tokens.GetCompiledTable(output);

    foreach (var block in target.contentBlocks)
    {
        repository.Blocks.Add(block);
    }

    site.pageTable.AddOrUpdate(repository.entry);
    repository.SaveDataStructure(site.folder, output);

    return repository;
}