Ejemplo n.º 1
0
        /// <summary>
        /// Gets the web page repository of <paramref name="site"/> for a fully resolved URL.
        /// </summary>
        /// <remarks>
        /// The MD5 hash of <paramref name="url"/> is used both as the key into the site's page table
        /// and as the name passed to <c>LoadDataStructure</c> together with the site folder.
        /// The domain check is a plain, case-sensitive substring match of the site's root domain name
        /// against <paramref name="url"/>; a mismatch is ignored unless <c>doThrowOnDomainMismatch</c> is set.
        /// </remarks>
        /// <param name="site">The site to query the page from.</param>
        /// <param name="url">The fully resolved URL.</param>
        /// <param name="autoCreate">
        /// If <c>true</c>, the page table entry is obtained through <c>GetOrCreate</c> even when it does not exist yet;
        /// if <c>false</c>, a missing entry is logged and <c>null</c> is returned.
        /// </param>
        /// <param name="output">Log output; when <c>null</c>, <c>aceLog.loger</c> is used.</param>
        /// <returns>
        /// The page repository with its page table entry deployed, or <c>null</c> when the page is not
        /// in the page table and <paramref name="autoCreate"/> is <c>false</c>.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="site"/> or <paramref name="url"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException">
        /// <c>doThrowOnDomainMismatch</c> is set and <paramref name="url"/> does not contain the site's root domain name.
        /// The URL must be in full and resolved form and must come from the same root domain name (a different TLD is allowed).
        /// </exception>
        public imbMCWebPage GetWebPage(imbMCWebSite site, string url, bool autoCreate = false, ILogBuilder output = null)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException further down.
            if (site == null)
            {
                throw new ArgumentNullException(nameof(site));
            }

            if (url == null)
            {
                throw new ArgumentNullException(nameof(url));
            }

            if (output == null)
            {
                output = aceLog.loger;
            }

            // The substring test is evaluated first, exactly as before; the flag only decides whether a mismatch is fatal.
            if (!url.Contains(site.domainInfo.domainRootName) && doThrowOnDomainMismatch)
            {
                throw new ArgumentException($"Url [{url}] must be in full and resolved form, and must come from the same root domain name [{site.entry.domain}] (different TLD is allowed)", nameof(url));
            }

            string hashCode = md5.GetMd5Hash(url);

            if (!autoCreate && !site.pageTable.ContainsKey(hashCode))
            {
                output.log($"Page repository {hashCode} (for: {url}) not found at {site.folder.path}");
                return null;
            }

            var entry = site.pageTable.GetOrCreate(hashCode);

            imbMCWebPage repo = hashCode.LoadDataStructure<imbMCWebPage>(site.folder, output);

            // Attach the table entry to the repository, then write the entry back to the page table.
            repo.deploy(entry);
            site.pageTable.AddOrUpdate(entry);

            return repo;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds the web page repository using <see cref="ISpiderTarget"/> crawl information.
        /// </summary>
        /// <remarks>
        /// The repository is obtained through <c>GetWebPage</c> with auto-creation enabled, filled with the
        /// entry fields, text, HTML source, content blocks and term table taken from <paramref name="target"/>,
        /// registered in the site's page table and finally written with <c>SaveDataStructure</c> to the site folder.
        /// </remarks>
        /// <param name="target">Target (crawl) information for the page.</param>
        /// <param name="site">The site to build the page for.</param>
        /// <param name="output">The output for console/log.</param>
        /// <returns>Built or updated web page repository.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="target"/> or <paramref name="site"/> is <c>null</c>.</exception>
        public imbMCWebPage BuildWebPage(ISpiderTarget target, imbMCWebSite site, ILogBuilder output = null)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException further down.
            if (target == null)
            {
                throw new ArgumentNullException(nameof(target));
            }

            if (site == null)
            {
                throw new ArgumentNullException(nameof(site));
            }

            // NOTE(review): unlike GetWebPage, a null output is not replaced with a default here and is
            // forwarded as-is to GetCompiledTable and SaveDataStructure - confirm that both accept null.
            imbMCWebPage page  = GetWebPage(site, target.url, true, output);
            ISpiderPage  sPage = target.page;

            page.entry.AnchorTextAll       = sPage.captions.toCsvInLine(",");
            page.entry.ClickDepth          = sPage.iterationDiscovery;
            page.entry.ResolvedRelativeURL = site.domainInfo.GetURLWithoutDomainName(target.url);

            // Deploy the entry again after modifying it - presumably so the repository picks up
            // the updated values; TODO confirm against imbMCWebPage.deploy.
            page.deploy(page.entry);

            page.indexEntry = target.GetIndexPage();

            page.TextContent = target.pageText;
            page.name        = sPage.name;

            // HtmlSourceCode is left untouched when the target has no HTML document.
            var htmlDoc = target.GetHtmlDocument();

            if (htmlDoc != null)
            {
                page.HtmlSourceCode = htmlDoc.DocumentNode.OuterHtml;
            }

            // Blocks are always rebuilt from scratch from the target's content blocks.
            page.Blocks = new List<imbCommonModels.contentBlock.nodeBlock>();

            page.TermTable = target.tokens.GetCompiledTable(output);

            target.contentBlocks.ForEach(block => page.Blocks.Add(block));

            site.pageTable.AddOrUpdate(page.entry);

            page.SaveDataStructure(site.folder, output);

            return page;
        }