/// <summary>
            /// Extracts the links from the current response (<c>hr</c>) and adds them to a
            /// queue named "&lt;domain&gt;-&lt;depth2&gt;".
            /// Returns without queuing anything when <paramref name="depth2"/> is greater than
            /// MaxDownloadLinkDepth, or when the page count stored for the domain has already
            /// reached MaxPageCountPerSite (a limit of -1 or lower disables that cap).
            /// </summary>
            /// <param name="depth2">Depth assigned to every queued link; links at or beyond
            /// MaxDownloadLinkDepth are queued with <c>download: false</c>.</param>
            /// <param name="queue">Queue passed to
            /// <c>Session.SetInputItemQueuePositionAfterQueue</c> together with the new queue name
            /// when <paramref name="depth2"/> is greater than 1.</param>
            public void get_links(int depth2, string queue)
            {
                var depthLimit = BotWeb.Settings.Spider.MaxDownloadLinkDepth;
                if (depth2 > depthLimit)
                {
                    return;
                }

                string site = Spider.GetDomain(hr.ResponseUrl);
                var customSession = (CustomSession)Session;
                // NOTE(review): if domain2page_count is a plain Dictionary this read throws for a
                // domain that was never registered — confirm it is seeded elsewhere.
                int pagesSoFar = customSession.domain2page_count[site];

                var pageLimit = BotWeb.Settings.Spider.MaxPageCountPerSite;
                bool pageLimitEnabled = pageLimit > -1;
                if (pageLimitEnabled && pagesSoFar >= pageLimit)
                {
                    return;
                }

                // By default a queue name is the item type name, but it can be different if needed.
                string targetQueue = site + "-" + depth2.ToString();
                if (depth2 > 1)
                {
                    Session.SetInputItemQueuePositionAfterQueue(targetQueue, queue);
                }

                var wantedLinkTypes = WebLinkType.Anchor | WebLinkType.Area | WebLinkType.Form | WebLinkType.MetaTag | WebLinkType.Frame | WebLinkType.Image | WebLinkType.Javascript;
                AgileSpider parser = new AgileSpider(hr.ResponseUrl, hr.HtmlResult);
                List<WebLink> foundLinks = parser.GetWebLinks(wantedLinkTypes);
                List<WebLink> outsideLinks;
                List<WebLink> spiderableLinks = Spider.GetSpiderableLinks(parser.BaseUri, foundLinks, out outsideLinks);

                // Links sitting on the deepest allowed level are queued but flagged as not to be downloaded.
                bool shouldDownload = depth2 < depthLimit;

                foreach (WebLink candidate in spiderableLinks)
                {
                    Add(targetQueue, new Link(url: candidate.Url, depth: depth2, download: shouldDownload));
                    pagesSoFar++;
                    if (pageLimitEnabled && pagesSoFar >= pageLimit)
                    {
                        Log.Warning(site + " reached MaxPageCountPerSite: " + pageLimit.ToString());
                        break;
                    }
                }
                customSession.domain2page_count[site] = pagesSoFar;

                // Links returned through the out parameter are queued as well, never for download,
                // and are not counted towards the per-site page count.
                foreach (WebLink outside in outsideLinks)
                {
                    Add(targetQueue, new Link(url: outside.Url, depth: depth2, download: false));
                }
            }
Beispiel #2
0
        /// <summary>
        /// Extracts the links from the current response (<c>hr</c>) and hands them to
        /// <c>BotCycle.Add</c>.
        /// Returns without adding anything when <paramref name="depth2"/> is greater than
        /// MaxDownloadLinkDepth, or when the page count stored for the domain has already
        /// reached MaxPageCountPerSite (a limit of -1 or lower disables that cap).
        /// </summary>
        /// <param name="depth2">Depth assigned to every added link; links at or beyond
        /// MaxDownloadLinkDepth are added with <c>download: false</c>.</param>
        public void get_links(int depth2)
        {
            var depthLimit = Bot.Properties.Spider.Default.MaxDownloadLinkDepth;
            if (depth2 > depthLimit)
            {
                return;
            }

            string site = Spider.GetDomain(hr.ResponseUrl);
            // NOTE(review): if domain2page_count is a plain Dictionary this read throws for a
            // domain that was never registered — confirm it is seeded elsewhere.
            int pagesSoFar = domain2page_count[site];

            var pageLimit = Bot.Properties.Spider.Default.MaxPageCountPerSite;
            bool pageLimitEnabled = pageLimit > -1;
            if (pageLimitEnabled && pagesSoFar >= pageLimit)
            {
                return;
            }

            var wantedLinkTypes = WebLinkType.Anchor | WebLinkType.Area | WebLinkType.Form | WebLinkType.MetaTag | WebLinkType.Frame | WebLinkType.Image | WebLinkType.Javascript;
            AgileSpider parser = new AgileSpider(hr.ResponseUrl, hr.HtmlResult);
            List<WebLink> foundLinks = parser.GetWebLinks(wantedLinkTypes);
            List<WebLink> outsideLinks;
            List<WebLink> spiderableLinks = Spider.GetSpiderableLinks(parser.BaseUri, foundLinks, out outsideLinks);

            // Links sitting on the deepest allowed level are added but flagged as not to be downloaded.
            bool shouldDownload = depth2 < depthLimit;

            foreach (WebLink candidate in spiderableLinks)
            {
                BotCycle.Add(new Link(url: candidate.Url, depth: depth2, download: shouldDownload));
                pagesSoFar++;
                if (pageLimitEnabled && pagesSoFar >= pageLimit)
                {
                    Log.Warning(site + " reached MaxPageCountPerSite: " + pageLimit.ToString());
                    break;
                }
            }
            domain2page_count[site] = pagesSoFar;

            // Links returned through the out parameter are added as well, never for download,
            // and are not counted towards the per-site page count.
            foreach (WebLink outside in outsideLinks)
            {
                BotCycle.Add(new Link(url: outside.Url, depth: depth2, download: false));
            }
        }