/// <summary>
        /// Gets the web page repository of <paramref name="site"/> that corresponds to a resolved URL.
        /// </summary>
        /// <remarks>
        /// The page is identified by the MD5 hash of <paramref name="url"/>; that hash is used both as the key
        /// in the site's page table and as the name passed to <c>LoadDataStructure</c>.
        /// </remarks>
        /// <param name="site">The site to query page from.</param>
        /// <param name="url">The fully resolved URL.</param>
        /// <param name="autoCreate">
        /// When <c>false</c>, a URL whose hash is not in the site's page table is logged and <c>null</c> is returned.
        /// When <c>true</c>, the entry is obtained through <c>pageTable.GetOrCreate</c> regardless.
        /// </param>
        /// <param name="output">Log output; <c>aceLog.loger</c> is used when <c>null</c>.</param>
        /// <returns>
        /// The page repository with its table entry deployed, or <c>null</c> when the page is not registered
        /// and <paramref name="autoCreate"/> is <c>false</c>.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="site"/> or <paramref name="url"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException">
        /// <c>doThrowOnDomainMismatch</c> is set and <paramref name="url"/> does not contain the root domain name
        /// of <paramref name="site"/> (different TLD is allowed).
        /// </exception>
        public imbMCWebPage GetWebPage(imbMCWebSite site, string url, bool autoCreate = false, ILogBuilder output = null)
        {
            if (site == null)
            {
                throw new ArgumentNullException(nameof(site));
            }

            if (url == null)
            {
                throw new ArgumentNullException(nameof(url));
            }

            if (output == null)
            {
                output = aceLog.loger;
            }

            // Host names are case-insensitive, so the root-name match must not depend on letter case.
            bool urlMatchesDomain = url.IndexOf(site.domainInfo.domainRootName, StringComparison.OrdinalIgnoreCase) >= 0;

            if (!urlMatchesDomain && doThrowOnDomainMismatch)
            {
                throw new ArgumentException($"Url [{url}] must be in full and resolved form, and must come from the same root domain name [{site.entry.domain}] (different TLD is allowed)", nameof(url));
            }

            string hashCode = md5.GetMd5Hash(url);

            if (!autoCreate && !site.pageTable.ContainsKey(hashCode))
            {
                output.log($"Page repository {hashCode} (for: {url}) not found at {site.folder.path}");
                return null;
            }

            var entry = site.pageTable.GetOrCreate(hashCode);

            // NOTE(review): assumes LoadDataStructure never returns null for a missing file - confirm.
            imbMCWebPage repo = hashCode.LoadDataStructure<imbMCWebPage>(site.folder, output);

            repo.deploy(entry);

            // Write the entry back so the page table reflects the (possibly newly created) row.
            site.pageTable.AddOrUpdate(entry);

            return repo;
        }
        /// <summary>
        /// Builds and saves the page repository for the target carried by <paramref name="__args"/>,
        /// provided the active repository reports the target as proper.
        /// </summary>
        /// <param name="__wRecord">Site record; its <c>domain</c> selects the web site repository from <c>webSiteReposByDomain</c>.</param>
        /// <param name="__args">Arguments supplying the target and its HTML and XML source.</param>
        private void onTargetPageAttached(modelSpiderSiteRecord __wRecord, modelSpiderSiteRecordEventArgs __args)
        {
            imbMCRepository activeRepo = mcm.activeRepository;
            imbMCWebSite siteRepo = webSiteReposByDomain[__wRecord.domain];
            ISpiderTarget spiderTarget = __args.Target;

            if (!activeRepo.isTargetProper(spiderTarget))
            {
                return;
            }

            imbMCWebPage pageRepo = activeRepo.BuildWebPage(spiderTarget, siteRepo, loger);

            // Replace the index entry and source code set by BuildWebPage with the values for this event.
            pageRepo.indexEntry = imbWEMManager.index.pageIndexTable.GetPageForUrl(spiderTarget.url);
            pageRepo.HtmlSourceCode = __args.sourceHtml;
            pageRepo.XmlSourceCode = __args.sourceXml;

            // Save again so the overrides above are stored.
            pageRepo.SaveDataStructure(siteRepo.folder, loger);
        }
        /// <summary>
        /// Creates or updates the web page repository of <paramref name="site"/> from the crawl data held by an <see cref="ISpiderTarget"/>.
        /// </summary>
        /// <remarks>
        /// The repository is obtained through <c>GetWebPage</c> with auto-create enabled, filled from the target,
        /// registered in the site's page table and then saved into the site folder.
        /// </remarks>
        /// <param name="target">Crawl target supplying URL, page data, text, tokens and content blocks.</param>
        /// <param name="site">The site the page belongs to.</param>
        /// <param name="output">The output for console/log.</param>
        /// <returns>The built or updated web page repository.</returns>
        public imbMCWebPage BuildWebPage(ISpiderTarget target, imbMCWebSite site, ILogBuilder output = null)
        {
            var webPage = GetWebPage(site, target.url, true, output);
            var spiderPage = target.page;

            // Table entry fields derived from the crawl.
            webPage.entry.AnchorTextAll = spiderPage.captions.toCsvInLine(",");
            webPage.entry.ClickDepth = spiderPage.iterationDiscovery;
            webPage.entry.ResolvedRelativeURL = site.domainInfo.GetURLWithoutDomainName(target.url);
            webPage.deploy(webPage.entry);

            webPage.indexEntry = target.GetIndexPage();
            webPage.TextContent = target.pageText;
            webPage.name = target.page.name;

            // HTML source is only set when the target has a parsed document.
            var document = target.GetHtmlDocument();
            if (document != null)
            {
                webPage.HtmlSourceCode = document.DocumentNode.OuterHtml;
            }

            webPage.Blocks = new List<imbCommonModels.contentBlock.nodeBlock>();
            webPage.TermTable = target.tokens.GetCompiledTable(output);

            foreach (var block in target.contentBlocks)
            {
                webPage.Blocks.Add(block);
            }

            site.pageTable.AddOrUpdate(webPage.entry);
            webPage.SaveDataStructure(site.folder, output);

            return webPage;
        }
Exemple #4
0
 /// <summary>
 /// Compares two pages by the length of their text content, for use as a sort comparison.
 /// </summary>
 /// <param name="page1">The first page.</param>
 /// <param name="page2">The second page.</param>
 /// <returns>
 /// A negative value when <paramref name="page1"/> has less text than <paramref name="page2"/>,
 /// zero when the lengths are equal, and a positive value otherwise.
 /// </returns>
 public int SortByPageSize(imbMCWebPage page1, imbMCWebPage page2)
 {
     // A missing page or missing text counts as length 0, so one such page cannot abort a whole sort.
     int length1 = page1?.TextContent?.Length ?? 0;
     int length2 = page2?.TextContent?.Length ?? 0;

     return length1.CompareTo(length2);
 }
Exemple #5
0
        /// <summary>
        /// Gets the textual extract: the page's text content with line breaks and spaces removed, in lower case.
        /// </summary>
        /// <param name="source">The page whose <c>TextContent</c> is reduced.</param>
        /// <returns>The reduced text, or an empty string when the page has no text content.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="source"/> is <c>null</c>.</exception>
        public static String GetTextualExtract(this imbMCWebPage source)
        {
            if (source == null)
            {
                throw new ArgumentNullException(nameof(source));
            }

            String text = source.TextContent;
            if (String.IsNullOrEmpty(text))
            {
                return String.Empty;
            }

            // Strip CR and LF separately: the text's line endings need not match
            // Environment.NewLine, and the result must not vary with the operating system.
            // Invariant lower-casing keeps the result independent of the current culture.
            return text.Replace("\r", "")
                       .Replace("\n", "")
                       .Replace(" ", "")
                       .ToLowerInvariant();
        }