Esempio n. 1
0
        //request.doContentCheck = false;
        //request.doCoolOff = false;
        //request.doRetryExecution = false;
        //request.doSubdomainVariations = false;
        //request.doTimeoutLimiter = true;

        //request.doLogCacheLoaded = imbWEMManager.settings.executionLog.doPageLoadedFromCache;
        //request.doLogNewLoad = imbWEMManager.settings.executionLog.doPageLoadedLog;
        //request.doLogRequestError = imbWEMManager.settings.executionLog.doPageErrorOrDuplicateLog;

        //request.htmlSettings.doTransliterateToLat = false;
        //request.htmlSettings.doRemoveHtmlEntities = true;
        //request.htmlSettings.doUpperCase = true;
        //request.htmlSettings.doAutocloseOnEnd = true;

        /// <summary>
        /// Loads the page behind <paramref name="url"/> and wraps the outcome into a <c>crawledPage</c>.
        /// </summary>
        /// <remarks>
        /// The URL is first replaced by whatever <c>controler.GetDuplicateUrl</c> returns for it.
        /// When <c>controler.CheckFail</c> returns true for the request URL, the request is not executed:
        /// it is only flagged as executed with status <c>ExpectationFailed</c>. Otherwise it is executed through
        /// <c>loaderSubsystem</c>, and any status other than <c>OK</c> is registered via <c>controler.SetFailUrl</c>.
        /// </remarks>
        /// <param name="url">The URL to load.</param>
        /// <param name="pRecord">The page record, forwarded to <c>makeCrawledPage</c>.</param>
        /// <returns>The page built by <c>makeCrawledPage</c> from the (executed or skipped) request.</returns>
        internal crawledPage doWebRequest(string url, modelSpiderPageRecord pRecord)
        {
            url = controler.GetDuplicateUrl(url);

            if (url.isNullOrEmpty())
            {
                // Only reported (log entry + console beep); the request below is still created and processed.
                imbWEMManager.log.log("EMPTY URL PASSED TO THE WEB LOADER");
                imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(2200);
            }

            loaderRequest request = new loaderRequest(url);

            if (!controler.CheckFail(request.url))
            {
                // ExecuteRequest returns the request instance to continue with.
                request = loaderSubsystem.ExecuteRequest(request);

                if (request.statusCode != System.Net.HttpStatusCode.OK)
                {
                    controler.SetFailUrl(request.url);
                }
            }
            else
            {
                // Skip the load and mark the request as finished with a failure status.
                request.executed = true;
                request.statusCode = System.Net.HttpStatusCode.ExpectationFailed;
            }

            // Traffic accounting is optional: dataLoad may be absent.
            if (dataLoad != null)
            {
                dataLoad.AddBytes(request.byteSize);
            }

            return makeCrawledPage(request, pRecord);
        }
Esempio n. 2
0
        //  public spiderTaskResultItem runSpiderTaskItem(spiderLink ln, crawlerAgentContext crawlerContext, Boolean __doTokenization, modelSpiderSiteRecord wRecord)

        /// <summary>
        /// Loads the page a spider link points to and records the outcome in a new task result item.
        /// </summary>
        /// <param name="ln">The link whose <c>url</c> is loaded.</param>
        /// <param name="__doTokenization">Not read by this method.</param>
        /// <param name="pRecord">The page record; it receives the page unless the page status is failed.</param>
        /// <returns>The result item, finished with the loaded page and the record's iteration.</returns>
        public spiderTaskResultItem runSpiderTaskItem(spiderLink ln, bool __doTokenization, modelSpiderPageRecord pRecord)
        {
            var taskItem = new spiderTaskResultItem(ln);

            crawledPage loadedPage = doWebRequest(ln.url, pRecord);

            taskItem.finish(loadedPage, pRecord.iteration);

            // A failed page is reported through the result item only; the record is left untouched.
            if (loadedPage.status != pageStatus.failed)
            {
                pRecord.acceptPage(loadedPage);
                pRecord.init(taskItem.sPage);
            }

            return taskItem;
        }
Esempio n. 3
0
        /// <summary>
        /// Builds a <c>crawledPage</c> from a web result: collects the anchor links, reads the
        /// keywords/description meta tags and the title, and deploys the link collection.
        /// </summary>
        /// <param name="result">The web result; its <c>responseUrl</c> and <c>HtmlDocument</c> are read.</param>
        /// <param name="pRecord">The page record; supplies the domain name through <c>wRecord.domainInfo</c>.</param>
        /// <returns>
        /// The populated page with status <c>loaded</c>, or a page with status <c>failed</c>
        /// when <paramref name="result"/> carries no HTML document.
        /// </returns>
        /// <exception cref="aceGeneralException">Error in link processing</exception>
        internal crawledPage makeCrawledPage(IWebResult result, modelSpiderPageRecord pRecord)
        {
            crawledPage page = new crawledPage(result.responseUrl, 0);

            page.result = result;
            page.domain = pRecord.wRecord.domainInfo.domainName;

            // A result without a document (e.g. a request that was skipped or failed) cannot be analysed.
            // Report it as failed instead of dereferencing null, so callers can test page.status.
            var document = result.HtmlDocument;
            if (document == null)
            {
                page.status = pageStatus.failed;
                return page;
            }

            var root = document.DocumentNode;

            foreach (HtmlNode hn in root.Descendants("a"))
            {
                try
                {
                    var  ndv = hn.CreateNavigator();
                    link l   = new link(ndv);
                    if (!l.isDefaultHomePage)
                    {
                        page.links.Add(l);
                    }
                }
                catch (Exception ex)
                {
                    throw new aceGeneralException(ex.Message, ex, page, "Error in link processing");
                }
            }

            foreach (HtmlNode hn in root.Descendants("meta"))
            {
                String name    = hn.GetAttributeValue("name", "none");
                String content = hn.GetAttributeValue("content", "");

                // Meta names are case-insensitive in HTML, so "Keywords" must match as well.
                switch (name.ToLowerInvariant())
                {
                case "keywords":
                    page.pageKeywords = content.SplitSmart(",", "", true, true);
                    break;

                case "description":
                    page.pageDescription = content;
                    break;
                }
            }

            var title = root.Descendants("title").FirstOrDefault();

            if (title != null)
            {
                page.pageCaption = title.InnerText;
            }

            page.links.deployCollection(page);

            page.isCrawled = true;
            page.status    = pageStatus.loaded;

            // NOTE(review): this lookup has no visible effect; it is kept only because the byScope
            // indexer is project code whose side effects cannot be confirmed from here.
            if (!page.links.byScope[imbCommonModels.enums.linkScope.inner].Any())
            {
            }

            return page;
        }
Esempio n. 4
0
        /// <summary>
        /// Runs every link of a spider task, in parallel or sequentially depending on
        /// <c>crawlerJobEngine.crawlerDoParallelTaskLoads</c>, and collects the per-link results.
        /// </summary>
        /// <remarks>
        /// Exceptions raised while processing links are logged and not rethrown; the result then contains
        /// only the items added before the failure. Once the accumulated load count exceeds
        /// <c>crawlerJobEngine.loadCountForGC</c>, a garbage collection is forced and the counter is reset.
        /// </remarks>
        /// <param name="sTask">The task whose links are loaded.</param>
        /// <param name="wRecord">The site record that provides the child page records and the target collection.</param>
        /// <returns>The finished task result.</returns>
        public spiderTaskResult runSpiderTask(spiderTask sTask, modelSpiderSiteRecord wRecord)
        {
            spiderTaskResult sResult = sTask.createResult();

            // Per-link work shared by the parallel and the sequential path.
            Action<spiderLink> processLink = taskLink =>
            {
                modelSpiderPageRecord pRecord = wRecord.getChildRecord(taskLink, taskLink.url);

                spiderTaskResultItem rItem = runSpiderTaskItem(taskLink, sTask.doTokenization, pRecord);

                if (rItem.status != pageStatus.failed)
                {
                    wRecord.context.targets.AttachPage(rItem, pRecord.logBuilder, blockCount);
                }

                sResult.AddResult(rItem);
            };

            try
            {
                if (imbWEMManager.settings.crawlerJobEngine.crawlerDoParallelTaskLoads)
                {
                    Parallel.ForEach(sTask, ln => processLink(ln));
                }
                else
                {
                    foreach (spiderLink ln in sTask)
                    {
                        processLink(ln);
                    }
                }
            }
            catch (AggregateException aggregate)
            {
                // Parallel.ForEach wraps worker failures; the wrapper's own message is generic,
                // so log every underlying exception instead.
                foreach (Exception inner in aggregate.Flatten().InnerExceptions)
                {
                    imbWEMManager.log.log("runSpiderTask exception: " + inner.Message);
                }
            }
            catch (Exception ex)
            {
                imbWEMManager.log.log("runSpiderTask exception: " + ex.Message);
            }

            loadIndex += sResult.Count();

            if (loadIndex > imbWEMManager.settings.crawlerJobEngine.loadCountForGC)
            {
                long mem = GC.GetTotalMemory(false);
                GC.Collect();
                // WaitForFullGCComplete only reports on a registered GC notification and does not wait
                // otherwise; waiting for the finalizer queue is the blocking call intended here.
                GC.WaitForPendingFinalizers();
                long dmem = GC.GetTotalMemory(false);

                aceLog.log("Memory allocation reduction [after " + loadIndex + " tasks]: " + (mem - dmem).getMByteCountFormated());
                loadIndex = 0;
            }

            sResult.finish();

            return sResult;
        }