Example #1
0
        /// <summary>
        /// Shared logic for building the next-iteration load task: runs control/stats,
        /// takes the first <c>n</c> active links (as sized by the record context) into a
        /// new <see cref="spiderTask"/>, and cycle-registers the links left behind.
        /// </summary>
        /// <param name="wRecord">Site record providing iteration state, context and web</param>
        /// <param name="activeLinks">Candidate links for the next iteration</param>
        /// <returns>New task for iteration <c>wRecord.iteration + 1</c> with the selected links</returns>
        protected spiderTask __operation_GetLoadTaskCommon(modelSpiderSiteRecord wRecord, IEnumerable <spiderLink> activeLinks)
        {
            operation_doControlAndStats(wRecord);

            // FIX: materialize once. The original enumerated activeLinks three times
            // (GetNextIterationLTSize, Take, foreach); a lazy or non-repeatable
            // sequence could repeat side effects or yield inconsistent link sets.
            List<spiderLink> links = activeLinks.ToList();

            int n = wRecord.context.GetNextIterationLTSize(links);

            spiderTask outputTask = new spiderTask(wRecord.iteration + 1, wRecord.web);

            outputTask.AddRange(links.Take(n));

            // Links that were NOT selected into the task get a cycle-registration
            // mark for the current iteration (so their age/score can evolve).
            foreach (var ali in links)
            {
                if (!outputTask.Contains(ali))
                {
                    ali.marks.cycleRegistration(wRecord.iteration);
                }
            }

            return outputTask;
        }
Example #2
0
        /// <summary>
        /// Creates single web loading task
        /// </summary>
        /// <param name="lnk">The LNK.</param>
        /// <param name="sReport">The s report.</param>
        /// <param name="iteration">The iteration.</param>
        /// <returns></returns>
        /// <summary>
        /// Builds a spider task that contains exactly one link to load.
        /// </summary>
        /// <param name="lnk">The link to load</param>
        /// <param name="sReport">Site record supplying the target web</param>
        /// <param name="iteration">Iteration index assigned to the new task</param>
        /// <returns>A single-entry task for the given iteration</returns>
        public virtual spiderTask getSpiderSingleTask(spiderLink lnk, modelSpiderSiteRecord sReport, int iteration)
        {
            var singleTask = new spiderTask(iteration, sReport.web);
            singleTask.Add(lnk);
            return singleTask;
        }
        /// <summary>
        /// Produces the load task for the next iteration via the common selection
        /// routine, then writes a per-link report row (age, url, score, selection
        /// flag) into the site record's log builder.
        /// </summary>
        /// <param name="wRecord">Site record whose active links are evaluated</param>
        /// <returns>Task holding the links selected for loading</returns>
        public override spiderTask operation_GetLoadTask(modelSpiderSiteRecord wRecord)
        {
            var activeLinks = wRecord.currentModuleData.active;

            spiderTask selectedTask = __operation_GetLoadTaskCommon(wRecord, activeLinks);

            int rowIndex = 0;
            foreach (var link in activeLinks)
            {
                string ageColumn   = link.linkAge.ToString("D2");
                string urlColumn   = link.url;
                string scoreColumn = link.marks.calculate(wRecord.iteration).ToString();

                // Row layout: index, age, url, score — selected links get a suffix marker
                string rowFormat = rowIndex.ToString("D2") + " {0,4} | {1, 30} | {2,6}";
                if (selectedTask.Contains(link))
                {
                    rowFormat += " (selected)";
                }

                wRecord.logBuilder.AppendLine(string.Format(rowFormat, ageColumn, urlColumn, scoreColumn));
                rowIndex++;
            }

            return selectedTask;
        }
Example #4
0
        /// <summary>
        /// Runs the spider task.
        /// </summary>
        /// <param name="sTask">The s task.</param>
        /// <param name="crawlerContext">The crawler context.</param>
        /// <returns></returns>
        /// <summary>
        /// Runs the spider task: loads every link (in parallel when enabled by
        /// crawler settings), attaches successfully loaded pages to the record's
        /// target registry, and collects per-link results. Periodically forces a
        /// GC pass once enough loads have accumulated.
        /// </summary>
        /// <param name="sTask">The task holding the links to load</param>
        /// <param name="wRecord">Site record receiving page records and attached pages</param>
        /// <returns>Finished result set with one item per processed link</returns>
        public spiderTaskResult runSpiderTask(spiderTask sTask, modelSpiderSiteRecord wRecord)
        {
            spiderTaskResult sResult = sTask.createResult();

            // Guards sResult and the shared target registry in the parallel branch.
            object resultGate = new object();

            try
            {
                if (imbWEMManager.settings.crawlerJobEngine.crawlerDoParallelTaskLoads)
                {
                    Parallel.ForEach(sTask, ln =>
                    {
                        modelSpiderPageRecord pRecord = wRecord.getChildRecord(ln, ln.url);

                        spiderTaskResultItem rItem = runSpiderTaskItem(ln, sTask.doTokenization, pRecord);

                        // FIX: AddResult and AttachPage mutate shared state; the original
                        // called them from concurrent lambda invocations without any
                        // synchronization — a data race on sResult (and likely on the
                        // target registry). Serialize both under one gate.
                        // NOTE(review): if AttachPage is proven thread-safe, the lock can
                        // be narrowed to AddResult only.
                        lock (resultGate)
                        {
                            if (rItem.status != pageStatus.failed)
                            {
                                wRecord.context.targets.AttachPage(rItem, pRecord.logBuilder, blockCount);
                            }

                            sResult.AddResult(rItem);
                        }
                    });
                }
                else
                {
                    foreach (spiderLink ln in sTask)
                    {
                        modelSpiderPageRecord pRecord = wRecord.getChildRecord(ln, ln.url);

                        spiderTaskResultItem rItem = runSpiderTaskItem(ln, sTask.doTokenization, pRecord);

                        if (rItem.status != pageStatus.failed)
                        {
                            wRecord.context.targets.AttachPage(rItem, pRecord.logBuilder, blockCount);
                        }

                        sResult.AddResult(rItem);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best-effort: a failed batch is logged but does not abort the crawl.
                imbWEMManager.log.log("runSpiderTask exception: " + ex.Message);
            }

            loadIndex = loadIndex + sResult.Count();

            // Deliberate memory-pressure relief for long crawls: force a collection
            // every loadCountForGC loads and report the reclaimed amount.
            if (loadIndex > imbWEMManager.settings.crawlerJobEngine.loadCountForGC)
            {
                long mem = GC.GetTotalMemory(false);
                GC.Collect();
                GC.WaitForFullGCComplete();
                long dmem = GC.GetTotalMemory(false);

                aceLog.log("Memory allocation reduction [after " + loadIndex + " tasks]: " + (mem - dmem).getMByteCountFormated());
                loadIndex = 0;
            }

            sResult.finish();

            return sResult;
        }
Example #5
0
        /// <summary>
        /// Decides whether the stage loop should terminate: returns <c>true</c> when
        /// the task is empty, the objective solution set is satisfied, or any
        /// iteration/page-load limit is hit; otherwise advances the stage iteration
        /// counter and returns <c>false</c>. Throws when the hard global iteration
        /// cap is exceeded.
        /// </summary>
        /// <param name="wRecord">Site record carrying logs, settings and loaded pages</param>
        /// <param name="oSet">Objective solution set evaluated for completion</param>
        /// <param name="task">The task planned for the next iteration</param>
        /// <returns><c>true</c> to leave the stage loop, <c>false</c> to continue</returns>
        public bool CheckStage(modelSpiderSiteRecord wRecord, spiderObjectiveSolutionSet oSet, spiderTask task)
        {
            // Nothing left to load -> leave the stage immediately.
            if (task.Count() == 0)
            {
                wRecord.logBuilder.log("> Spider task [i:" + task.iteration + "] have no tasks defined. Aborting the stage loop.");
                return true;
            }

            // Objective solution set may declare the stage complete on its own.
            if (operation_executeObjectiveSolutionSet(oSet, wRecord))
            {
                return true;
            }

            // Crawler-settings override: iteration limit.
            if (stageIteration > wRecord.tRecord.instance.settings.limitIterations)
            {
                wRecord.log("> Spider settings (limit iterations) trigered abort at [" + stageIteration + "] Aborting the stage loop.");
                return true;
            }

            // Crawler-settings override: total page-load limit.
            if (wRecord.web.webPages.Count() > wRecord.tRecord.instance.settings.limitTotalPageLoad)
            {
                wRecord.log("> Spider settings (limit pages load) trigered abort at [" + wRecord.web.webPages.Count() + "] Aborting the stage loop.");
                return true;
            }

            // Stage-local iteration limit.
            if (stageIteration > stageIterationLimit)
            {
                wRecord.logBuilder.log("> Stage [" + name + "] iteration limit reached [ " + stageIterationLimit + " ] -- aborting [" + objectives.Count + "] objectives and move on");
                return true;
            }

            // Hard safety cap: exceeding it is treated as a fault, not a normal stop.
            if (stageIteration > GLOBAL_stageIterationLimit)
            {
                throw new aceGeneralException("spiderStage [" + name + "] reached the " + nameof(GLOBAL_stageIterationLimit) + "(" + GLOBAL_stageIterationLimit.ToString() + ")");
            }

            stageIteration++;

            return false;
        }
Example #6
0
        /// <summary>
        /// Starts this instance.
        /// </summary>
        /// <summary>
        /// Starts this instance: loads the seed page (stage 1), runs the
        /// crawl-iteration loop until the stage control or a limit aborts it
        /// (stage 2), then evaluates the collected pages (stage 3). On a first
        /// crash the task re-initializes and restarts itself once; a second
        /// crash marks it aborted.
        /// </summary>
        public void start()
        {
            iterationStatus = crawlerDomainTaskIterationPhase.iterationProcessNotStarted;
            status          = crawlerDomainTaskStatusEnum.working;

            executionThread = Thread.CurrentThread;


            // NOTE(review): status was assigned 'working' two lines above, so this
            // branch can never be taken here — likely meant to be checked BEFORE
            // the assignment; confirm intent.
            if (status == crawlerDomainTaskStatusEnum.aborted)
            {
                aceLog.log("Aborted DomainTask --> start()");
                return;
            }



            lastIterationStart = DateTime.Now;
            startTime          = DateTime.Now;


            aceLog.consoleControl.setAsOutput(wRecord, "" + wProfile.domain);



            parent.parent.reportPlugins.eventUniversal(crawlReportingStageEnum.DLCPreinitiation, reporter, this, wRecord);



            try
            {
                iterationStatus = crawlerDomainTaskIterationPhase.loadingSeedPage;
                // <--- STAGE 1: load the seed (landing) page

                spiderWebLoader loader = new spiderWebLoader(parent.parent.dataLoadTaker);
                loader.controler = parent.parent.webLoaderControler;
                stageControl.prepare();


                // Single-link task for the seed page, iteration 1
                spiderTask sTask = evaluator.getSpiderSingleTask(web.seedLink, wRecord, 1);

                // Executes the load of the seed task
                spiderTaskResult sResult = loader.runSpiderTask(sTask, wRecord);

                // A domain whose landing page fails to load is written off entirely
                if (sResult.calculateSuccessRate() == 0)
                {
                    wRecord.log("Domain [" + wRecord.domain + "] is considered as failed since landing page load failed");
                    parent.parent.webLoaderControler.SetFailedDomain(wProfile, wRecord);
                }


                spiderObjectiveSolutionSet solSet = null;


                stageControl.stage.EnterStage(wRecord, evaluator);


                // Notify all plugin layers that the DLC (domain-level crawl) started
                parent.parent.plugins.eventDLCInitiated(parent.parent, this, wRecord);
                evaluator.plugins.eventDLCInitiated(evaluator as spiderEvaluatorBase, this, wRecord);
                imbWEMManager.index.plugins.eventDLCInitiated(imbWEMManager.index.experimentEntry, this, wRecord);
                parent.parent.reportPlugins.eventDLCInitiated(reporter, this, wRecord);


                int lastTermCount = 0;

                // <--- STAGE 2: iterate receive-result -> apply-rules -> get-task -> load
                do
                {
                    iterationStatus = crawlerDomainTaskIterationPhase.iterationStart;

                    lastIterationStart = DateTime.Now;

                    dataUnitSpiderIteration iDataUnit = wRecord.timeseries.CreateEntry(null, sTask.iteration);


                    iterationStatus = crawlerDomainTaskIterationPhase.receiveResult;

                    // Global emergency stop: drop pending work, force limits to 0 and close
                    if (imbWEMManager.MASTERKILL_SWITCH)
                    {
                        aceLog.log("MASTERKILL SWITCH ON :: crawlerDomainTask->" + iterationStatus.ToString());
                        isStageAborted = true;
                        sResult.items.Clear();
                        sResult.task.Clear();
                        evaluator.settings.limitIterations    = wRecord.iteration - 5;
                        evaluator.settings.limitTotalPageLoad = 0;


                        Closing();
                        return;
                    }


                    if (isStageAborted)
                    {
                        Closing();
                        return;
                    }



                    evaluator.plugins.processLoaderResult(sResult, wRecord, this);

                    // Snapshot the term count before result processing, to report the delta later
                    var iter = wRecord.iterationTableRecord.GetLastEntryTouched();
                    if (iter != null)
                    {
                        lastTermCount = iter.terms_all;
                    }

                    evaluator.operation_receiveResult(sResult, wRecord);



                    // Abort flag is re-checked after every phase that may set it
                    if (isStageAborted)
                    {
                        Closing();
                        return;
                    }

                    iterationStatus = crawlerDomainTaskIterationPhase.applyLinkRules;
                    evaluator.plugins.processAfterResultReceived(wRecord, this);
                    solSet = evaluator.operation_applyLinkRules(wRecord);

                    if (isStageAborted)
                    {
                        Closing();
                        return;
                    }

                    iterationStatus = crawlerDomainTaskIterationPhase.getLoadTask;
                    sTask           = evaluator.operation_GetLoadTask(wRecord);

                    if (isStageAborted)
                    {
                        Closing();
                        return;
                    }

                    iterationStatus = crawlerDomainTaskIterationPhase.loadingTask;
                    if (isLoaderDisabled)
                    {
                        // Loader disabled: substitute an empty result so the loop logic still runs
                        wRecord.log("-- Loader component is disabled for this [" + wRecord.domain + "] task.");
                        sResult = new spiderTaskResult();
                    }
                    else
                    {
                        sResult = loader.runSpiderTask(sTask, wRecord);
                    }
                    if (isStageAborted)
                    {
                        Closing();
                        return;
                    }


                    parent.parent.dataLoadTaker.AddIteration();

                    iterationStatus = crawlerDomainTaskIterationPhase.updatingData;

                    if (evaluator.settings.doEnableCrossLinkDetection)
                    {
                        evaluator.operation_detectCrossLinks(wRecord);
                    }

                    iDataUnit.checkData();

                    targetLoaded   = iDataUnit.tc_loaded_p;
                    targetDetected = iDataUnit.tc_detected_p;



                    if (reporter != null)
                    {
                        try {
                            // Term-count delta for this iteration (vs. the pre-processing snapshot)
                            int lTC   = 0;
                            var iter2 = wRecord.iterationTableRecord.GetLastEntryTouched();
                            if (iter2 != null)
                            {
                                lTC = iter2.terms_all - lastTermCount;
                            }

                            // A new iteration report entry is created here
                            reporter.reportIteration(iDataUnit, wRecord, evaluator);
                            imbWEMManager.index.plugins.eventIteration(imbWEMManager.index.experimentEntry, this, wRecord);


                            parent.parent.dataLoadTaker.AddContentPage(lTC, sResult.Count);
                        }
                        catch (Exception ex)
                        {
                            // Reporting failure is escalated — it aborts the whole task
                            throw new aceGeneralException(ex.Message, ex, reporter, "Reporter.reportIteration() exception");
                        }
                    }

                    parent.parent.reportPlugins.eventIteration(evaluator, this, wRecord);


                    iterationStatus = crawlerDomainTaskIterationPhase.checkingRules;

                    // Limit checks: page-load cap, iteration cap, wall-clock timeout
                    if (targetLoaded >= evaluator.settings.limitTotalPageLoad)
                    {
                        isStageAborted = true;
                        wRecord.log("--- Loaded pages count meet limit [" + targetLoaded + "] on iteration [" + iDataUnit.iteration + "].");
                    }

                    if (iDataUnit.iteration >= evaluator.settings.limitIterations)
                    {
                        isStageAborted = true;
                        wRecord.log("--- Iteration limit reached [" + iDataUnit.iteration + "].");
                    }


                    if (DateTime.Now.Subtract(startTime).TotalMinutes >= parent.parent._timeLimitForDLC)
                    {
                        isStageAborted = true;
                        wRecord.log("--- Timeout : crawler domain task [" + wRecord.web.seedLink.url + "] aborted after [" + DateTime.Now.Subtract(startTime).TotalMinutes + "] minutes.");
                    }

                    if (isStageAborted)
                    {
                        break;
                    }
                } while ((!stageControl.stage.CheckStage(wRecord, solSet, sTask)) && !isStageAborted);
                iterationStatus = crawlerDomainTaskIterationPhase.pageEvaluation;

                // <---- STAGE 3: evaluate the pages collected during the crawl
                wRecord.resultPageSet = evaluator.operation_evaluatePages(wRecord);

                Closing();
            } catch (Exception ex)
            {
                // Map the phase the task was in onto a crawler error category
                crawlerErrorEnum errorType = crawlerErrorEnum.domainTaskError;

                switch (iterationStatus)
                {
                case crawlerDomainTaskIterationPhase.applyLinkRules:
                    errorType = crawlerErrorEnum.spiderModuleError;
                    break;

                case crawlerDomainTaskIterationPhase.getLoadTask:
                    errorType = crawlerErrorEnum.spiderGetTaskError;
                    break;

                case crawlerDomainTaskIterationPhase.loadingTask:
                    errorType = crawlerErrorEnum.spiderLoadingError;
                    break;

                case crawlerDomainTaskIterationPhase.pageEvaluation:
                    errorType = crawlerErrorEnum.spiderModuleError;
                    break;
                }

                string domainName = wRecord.domainInfo.domainName;

                // First crash: record the domain, re-initialize and retry once
                // (recursive restart; a second crash falls into the else branch).
                if (!tRecord.crashedDomains.Contains(domainName))
                {
                    wRecord.log("Domain crashed first time: " + ex.Message);
                    aceLog.log("Domain [" + domainName + "] crashed first time: " + ex.Message);
                    aceLog.log("Domain [" + domainName + "] is restarting... ");
                    status = crawlerDomainTaskStatusEnum.waiting;
                    tRecord.crashedDomains.Add(wRecord.domainInfo.domainName);
                    reInitialization();
                    start();
                }
                else
                {
                    status = crawlerDomainTaskStatusEnum.aborted;

                    wRecord.log("Aborted by execution exception: " + ex.Message);
                }

                var clog = reporter.CreateAndSaveError(ex, wRecord, this, errorType);
                wRecord.log(clog.Message);
            } finally
            {
                // NOTE(review): empty finally block — either remove it or move the
                // console-output cleanup below into it so cleanup runs on all paths.
            }

            aceLog.consoleControl.removeFromOutput(wRecord); //, "sp:" + tRecord.instance.name);
        }