/// <summary>
        /// E2: Applies the passive link rules to active links that were not passively evaluated yet, then runs the
        /// active link rules over all active links, re-ranks the links by score and evaluates the objective control rules.
        /// </summary>
        /// <param name="wRecord">Site record whose active links are evaluated and ranked.</param>
        /// <returns>Solution set that listened to the result of every objective control rule.</returns>
        public override spiderObjectiveSolutionSet operation_applyLinkRules(modelSpiderSiteRecord wRecord)
        {
            // --- Passive rules: applied only to links not yet flagged as passiveEvaluated
            int passivelyEvaluated = 0;

            foreach (spiderLink candidate in wRecord.web.webActiveLinks)
            {
                if (candidate.flags.HasFlag(spiderLinkFlags.passiveEvaluated))
                {
                    continue;
                }

                foreach (spiderEvalRuleForLinkBase passiveRule in linkPassiveRules)
                {
                    candidate.marks.deploy(passiveRule.evaluate(candidate));
                }

                passivelyEvaluated++;
                candidate.flags |= spiderLinkFlags.passiveEvaluated;
            }

            if (passivelyEvaluated > 0)
            {
                wRecord.log("Passive evaluation of [" + passivelyEvaluated + "] new links");
            }

            // --- Active rules, step 1: start a new iteration on every rule
            foreach (ruleActiveBase activeRule in linkActiveRules)
            {
                activeRule.startIteration(wRecord.iteration, wRecord);
            }

            // --- Active rules, step 2: every rule learns from every link before any evaluation happens;
            //     each link also gets one age point per call
            foreach (spiderLink activeLink in wRecord.web.webActiveLinks)
            {
                activeLink.linkAge++;

                foreach (ruleActiveBase activeRule in linkActiveRules)
                {
                    activeRule.learn(activeLink);
                }
            }

            // --- Active rules, step 3: deploy the rule results and recalculate the marks of each link
            foreach (spiderLink activeLink in wRecord.web.webActiveLinks)
            {
                foreach (ruleActiveBase activeRule in linkActiveRules)
                {
                    activeLink.marks.deploy(activeRule.evaluate(activeLink));
                }

                activeLink.marks.calculate(wRecord.iteration);
            }

            // --- Ranking: highest score first
            wRecord.web.webActiveLinks.items.Sort((first, second) => second.marks.score.CompareTo(first.marks.score));

            // --- Objective control rules
            spiderObjectiveSolutionSet solutions = new spiderObjectiveSolutionSet();

            foreach (controlObjectiveRuleBase controlRule in controlRules)
            {
                controlRule.startIteration(wRecord.iteration, wRecord);
                solutions.listen(controlRule.evaluate(wRecord));
            }

            return solutions;
        }
Esempio n. 2
0
        /// <summary>
        /// Records score statistics of the active links, applies the link control rules and removes the links
        /// those rules (and the optional below-zero trim) rule out.
        /// </summary>
        /// <param name="wRecord">Site record holding the active links and the per-iteration time series.</param>
        protected virtual void operation_doControlAndStats(modelSpiderSiteRecord wRecord)
        {
            // Average score before any link is dropped
            var scoreBeforeDrop = wRecord.web.webActiveLinks.calculateTotalAndAvgScore();
            wRecord.timeseries[wRecord.iteration].avg_score_l = scoreBeforeDrop.Item2;
            //   wRecord.timeseries[wRecord.iteration].tc_detected_l = stats.Item1;

            spiderObjectiveSolutionSet ruledOut = new spiderObjectiveSolutionSet();

            // Step 1: start a new iteration on every link control rule
            foreach (controlLinkRuleBase linkRule in controlLinkRules)
            {
                linkRule.startIteration(wRecord.iteration, wRecord);
            }

            // Step 2: every rule learns from every active link
            foreach (spiderLink activeLink in wRecord.web.webActiveLinks)
            {
                foreach (controlLinkRuleBase linkRule in controlLinkRules)
                {
                    linkRule.learn(activeLink, wRecord);
                }
            }

            // Step 3 (optional): drop links with a negative score; iterate over a snapshot because the
            // collection is modified inside the loop
            bool trimBelowZero = settings.FRONTIER_PullDecayModes.HasFlag(spiderPullDecayModes.belowZeroScoreRemoval);
            if (trimBelowZero)
            {
                foreach (spiderLink activeLink in wRecord.web.webActiveLinks.ToList())
                {
                    if (activeLink.marks.score < 0)
                    {
                        wRecord.web.webActiveLinks.Remove(activeLink);
                        wRecord.log("Link [" + activeLink.url + "] had score below zero");
                    }
                }
            }

            // Step 4: collect the rule verdicts for the links that are still active
            foreach (spiderLink activeLink in wRecord.web.webActiveLinks)
            {
                foreach (controlLinkRuleBase linkRule in controlLinkRules)
                {
                    ruledOut.listen(linkRule.evaluate(activeLink, wRecord));
                }
            }

            // Step 5: remove every link present in the control solution set
            int removedCount = 0;
            foreach (var ruledOutLink in ruledOut.links)
            {
                wRecord.web.webActiveLinks.Remove(ruledOutLink);
                removedCount++;
            }

            if (removedCount > 0)
            {
                wRecord.log("Control rules removed: " + removedCount.ToString() + " links from active links collection");
            }

            //wRecord.logBuilder.log("Link drop-out:" + output.links.Count + ". Now have " + wRecord.web.webActiveLinks.Count() + " links waiting.");

            // Statistics after drop-out, written to the "ADO" time series fields
            var scoreAfterDrop = wRecord.web.webActiveLinks.items.calculateTotalAndAvgScore();
            var iterationEntry = wRecord.timeseries[wRecord.iteration];
            iterationEntry.avg_scoreADO_l = scoreAfterDrop.Item2;
            iterationEntry.tc_scoreADO_l  = scoreAfterDrop.Item1;
            iterationEntry.nw_ruledout_l  = ruledOut.links.Count;
        }
Esempio n. 3
0
        /// <summary>
        /// Runs the domain level crawl on the calling thread: loads the seed page (stage 1), iterates the
        /// receive-result / apply-link-rules / get-load-task / load cycle until the stage control or a limit
        /// stops it (stage 2), then evaluates the loaded pages (stage 3).
        /// </summary>
        /// <remarks>
        /// On the first exception for a domain the task is re-initialized and <see cref="start"/> is called again;
        /// on a repeated exception the task status is set to aborted. The error is reported through the reporter
        /// when one is assigned.
        /// </remarks>
        public void start()
        {
            iterationStatus = crawlerDomainTaskIterationPhase.iterationProcessNotStarted;
            status          = crawlerDomainTaskStatusEnum.working;

            executionThread = Thread.CurrentThread;

            // NOTE(review): status was just set to "working" above, so this guard cannot trigger as written.
            // If an already aborted task must not start, the check has to run before the assignment - confirm
            // the intended status contract before changing it.
            if (status == crawlerDomainTaskStatusEnum.aborted)
            {
                aceLog.log("Aborted DomainTask --> start()");
                return;
            }

            lastIterationStart = DateTime.Now;
            startTime          = DateTime.Now;

            aceLog.consoleControl.setAsOutput(wRecord, "" + wProfile.domain);

            parent.parent.reportPlugins.eventUniversal(crawlReportingStageEnum.DLCPreinitiation, reporter, this, wRecord);

            try
            {
                iterationStatus = crawlerDomainTaskIterationPhase.loadingSeedPage;
                // <--- STAGE 1

                spiderWebLoader loader = new spiderWebLoader(parent.parent.dataLoadTaker);
                loader.controler = parent.parent.webLoaderControler;
                stageControl.prepare();

                spiderTask sTask = evaluator.getSpiderSingleTask(web.seedLink, wRecord, 1);

                spiderTaskResult sResult = loader.runSpiderTask(sTask, wRecord); // executes the seed page load

                if (sResult.calculateSuccessRate() == 0)
                {
                    wRecord.log("Domain [" + wRecord.domain + "] is considered as failed since landing page load failed");
                    parent.parent.webLoaderControler.SetFailedDomain(wProfile, wRecord);
                }

                spiderObjectiveSolutionSet solSet = null;

                stageControl.stage.EnterStage(wRecord, evaluator);

                parent.parent.plugins.eventDLCInitiated(parent.parent, this, wRecord);
                evaluator.plugins.eventDLCInitiated(evaluator as spiderEvaluatorBase, this, wRecord);
                imbWEMManager.index.plugins.eventDLCInitiated(imbWEMManager.index.experimentEntry, this, wRecord);
                parent.parent.reportPlugins.eventDLCInitiated(reporter, this, wRecord);

                // terms_all value read before the result of the current iteration is received
                int lastTermCount = 0;

                // <--- STAGE 2
                do
                {
                    iterationStatus = crawlerDomainTaskIterationPhase.iterationStart;

                    lastIterationStart = DateTime.Now;

                    dataUnitSpiderIteration iDataUnit = wRecord.timeseries.CreateEntry(null, sTask.iteration);

                    iterationStatus = crawlerDomainTaskIterationPhase.receiveResult;

                    if (imbWEMManager.MASTERKILL_SWITCH)
                    {
                        aceLog.log("MASTERKILL SWITCH ON :: crawlerDomainTask->" + iterationStatus.ToString());
                        isStageAborted = true;
                        sResult.items.Clear();
                        sResult.task.Clear();
                        evaluator.settings.limitIterations    = wRecord.iteration - 5;
                        evaluator.settings.limitTotalPageLoad = 0;

                        Closing();
                        return;
                    }

                    if (isStageAborted)
                    {
                        Closing();
                        return;
                    }

                    evaluator.plugins.processLoaderResult(sResult, wRecord, this);

                    var iter = wRecord.iterationTableRecord.GetLastEntryTouched();
                    if (iter != null)
                    {
                        lastTermCount = iter.terms_all;
                    }

                    evaluator.operation_receiveResult(sResult, wRecord);

                    if (isStageAborted)
                    {
                        Closing();
                        return;
                    }

                    iterationStatus = crawlerDomainTaskIterationPhase.applyLinkRules;
                    evaluator.plugins.processAfterResultReceived(wRecord, this);
                    solSet = evaluator.operation_applyLinkRules(wRecord);

                    if (isStageAborted)
                    {
                        Closing();
                        return;
                    }

                    iterationStatus = crawlerDomainTaskIterationPhase.getLoadTask;
                    sTask           = evaluator.operation_GetLoadTask(wRecord);

                    if (isStageAborted)
                    {
                        Closing();
                        return;
                    }

                    iterationStatus = crawlerDomainTaskIterationPhase.loadingTask;
                    if (isLoaderDisabled)
                    {
                        wRecord.log("-- Loader component is disabled for this [" + wRecord.domain + "] task.");
                        sResult = new spiderTaskResult();
                    }
                    else
                    {
                        sResult = loader.runSpiderTask(sTask, wRecord);
                    }
                    if (isStageAborted)
                    {
                        Closing();
                        return;
                    }

                    parent.parent.dataLoadTaker.AddIteration();

                    iterationStatus = crawlerDomainTaskIterationPhase.updatingData;

                    if (evaluator.settings.doEnableCrossLinkDetection)
                    {
                        evaluator.operation_detectCrossLinks(wRecord);
                    }

                    iDataUnit.checkData();

                    targetLoaded   = iDataUnit.tc_loaded_p;
                    targetDetected = iDataUnit.tc_detected_p;

                    if (reporter != null)
                    {
                        try
                        {
                            // difference in terms_all since the value captured earlier in this iteration
                            int lTC   = 0;
                            var iter2 = wRecord.iterationTableRecord.GetLastEntryTouched();
                            if (iter2 != null)
                            {
                                lTC = iter2.terms_all - lastTermCount;
                            }

                            reporter.reportIteration(iDataUnit, wRecord, evaluator); // the new iteration is created here
                            imbWEMManager.index.plugins.eventIteration(imbWEMManager.index.experimentEntry, this, wRecord);

                            parent.parent.dataLoadTaker.AddContentPage(lTC, sResult.Count);
                        }
                        catch (Exception ex)
                        {
                            throw new aceGeneralException(ex.Message, ex, reporter, "Reporter.reportIteration() exception");
                        }
                    }

                    parent.parent.reportPlugins.eventIteration(evaluator, this, wRecord);

                    iterationStatus = crawlerDomainTaskIterationPhase.checkingRules;

                    if (targetLoaded >= evaluator.settings.limitTotalPageLoad)
                    {
                        isStageAborted = true;
                        wRecord.log("--- Loaded pages count meet limit [" + targetLoaded + "] on iteration [" + iDataUnit.iteration + "].");
                    }

                    if (iDataUnit.iteration >= evaluator.settings.limitIterations)
                    {
                        isStageAborted = true;
                        wRecord.log("--- Iteration limit reached [" + iDataUnit.iteration + "].");
                    }

                    if (DateTime.Now.Subtract(startTime).TotalMinutes >= parent.parent._timeLimitForDLC)
                    {
                        isStageAborted = true;
                        wRecord.log("--- Timeout : crawler domain task [" + wRecord.web.seedLink.url + "] aborted after [" + DateTime.Now.Subtract(startTime).TotalMinutes + "] minutes.");
                    }

                    if (isStageAborted)
                    {
                        break;
                    }
                } while ((!stageControl.stage.CheckStage(wRecord, solSet, sTask)) && !isStageAborted);
                iterationStatus = crawlerDomainTaskIterationPhase.pageEvaluation;

                // <---- STAGE 3
                wRecord.resultPageSet = evaluator.operation_evaluatePages(wRecord);

                Closing();
            }
            catch (Exception ex)
            {
                // Map the phase in which the exception happened to an error category
                crawlerErrorEnum errorType = crawlerErrorEnum.domainTaskError;

                switch (iterationStatus)
                {
                case crawlerDomainTaskIterationPhase.applyLinkRules:
                    errorType = crawlerErrorEnum.spiderModuleError;
                    break;

                case crawlerDomainTaskIterationPhase.getLoadTask:
                    errorType = crawlerErrorEnum.spiderGetTaskError;
                    break;

                case crawlerDomainTaskIterationPhase.loadingTask:
                    errorType = crawlerErrorEnum.spiderLoadingError;
                    break;

                case crawlerDomainTaskIterationPhase.pageEvaluation:
                    errorType = crawlerErrorEnum.spiderModuleError;
                    break;
                }

                string domainName = wRecord.domainInfo.domainName;

                if (!tRecord.crashedDomains.Contains(domainName))
                {
                    // First crash of this domain: remember it and restart the task once
                    wRecord.log("Domain crashed first time: " + ex.Message);
                    aceLog.log("Domain [" + domainName + "] crashed first time: " + ex.Message);
                    aceLog.log("Domain [" + domainName + "] is restarting... ");
                    status = crawlerDomainTaskStatusEnum.waiting;
                    tRecord.crashedDomains.Add(domainName);
                    reInitialization();
                    start();
                }
                else
                {
                    status = crawlerDomainTaskStatusEnum.aborted;

                    wRecord.log("Aborted by execution exception: " + ex.Message);
                }

                // The reporter is optional (it is null-checked inside the iteration loop), so it must not be
                // dereferenced unconditionally here - that would throw a second exception from the handler.
                if (reporter != null)
                {
                    var clog = reporter.CreateAndSaveError(ex, wRecord, this, errorType);
                    wRecord.log(clog.Message);
                }
            }

            aceLog.consoleControl.removeFromOutput(wRecord);
        }
Esempio n. 4
0
        //public abstract spiderTask operation_GetLoadTask(modelSpiderSiteRecord wRecord);

        /// <summary>
        /// E3: Scores the loaded pages with the page score rules, applies the page control rules and stores the
        /// resulting page set in the site record.
        /// </summary>
        /// <param name="wRecord">Site record whose loaded pages are evaluated; its resultPageSet is overwritten.</param>
        /// <returns>
        /// The pages kept after scoring, control rules and the optional final trim, ordered by score from the
        /// highest to the lowest.
        /// </returns>
        public virtual List <spiderPage> operation_evaluatePages(modelSpiderSiteRecord wRecord)
        {
            pageScoreRules.prepare();
            List <spiderPage> output = new List <spiderPage>();

            // Pass 1: every score rule learns from every loaded page before any page is evaluated
            foreach (spiderPage pg in wRecord.web.webPages.items.Values)
            {
                if (pg.webpage.status == pageStatus.loaded)
                {
                    foreach (spiderEvalRuleForPageBase ruleForPage in pageScoreRules)
                    {
                        ruleForPage.learn(pg);
                    }
                    output.Add(pg);
                }
            }

            // Pass 2: deploy the rule results, calculate the score and keep pages with a non-negative score
            List <spiderPage> outputTwo = new List <spiderPage>();

            foreach (spiderPage pg in output)
            {
                foreach (spiderEvalRuleForPageBase ruleForPage in pageScoreRules)
                {
                    spiderEvalRuleResult ruleResult = ruleForPage.evaluate(pg);
                    pg.marks.deploy(ruleResult);
                }
                int score = pg.marks.calculate(wRecord.iteration);

                if (score > -1)
                {
                    outputTwo.Add(pg);
                }
            }

            // Page control rules are evaluated against all loaded pages, not only the ones kept above
            spiderObjectiveSolutionSet obSet = new spiderObjectiveSolutionSet();

            foreach (controlPageRuleBase aRule in controlPageRules)
            {
                aRule.startIteration(wRecord.iteration, wRecord);
                foreach (spiderPage pg in output)
                {
                    obSet.listen(aRule.evaluate(pg, wRecord));
                }
            }

            // Ruled-out pages are removed only while the set is larger than the configured primary set size
            // NOTE(review): the element type of obSet.links is not visible here; the foreach casts each item
            // to spiderPage - confirm this collection really holds pages.
            foreach (spiderPage page in obSet.links)
            {
                if (outputTwo.Count > settings.primaryPageSetSize)
                {
                    outputTwo.Remove(page);
                }
                else
                {
                    break;
                }
            }

            // Highest score first, so the trim below keeps the best pages. The previous ascending order made
            // Take() keep the lowest scoring pages.
            outputTwo.Sort((x, y) => y.marks.score.CompareTo(x.marks.score));

            if (settings.flags.HasFlag(spiderEvaluatorExecutionFlags.doTrimPrimaryOutput)) // final trim, if enabled
            {
                int tkc = Math.Min(settings.primaryPageSetSize, outputTwo.Count);
                outputTwo = outputTwo.Take(tkc).ToList();
            }

            wRecord.resultPageSet = outputTwo; // transfers the final set to the record

            foreach (spiderPage pg in outputTwo)
            {
                var pRecord = wRecord.children.GetRecord(pg.spiderResult.target);

                pRecord.recordFinish(wRecord.resultPageSet); // calls record finish for page records
            }

            return outputTwo;
        }
        /// <summary>
        /// Passes the active links through the ranking modules in sequence, stores the final module data in the
        /// site record and evaluates the objective control rules.
        /// </summary>
        /// <param name="wRecord">Site record providing the active links; its currentModuleData is overwritten.</param>
        /// <returns>Solution set that listened to the result of every objective control rule.</returns>
        /// <exception cref="System.InvalidOperationException">
        /// A module returned a result that is not a spiderModuleData of spiderLink.
        /// </exception>
        public override spiderObjectiveSolutionSet operation_applyLinkRules(modelSpiderSiteRecord wRecord)
        {
            // Read the reporting switch once: the per-module record created before evaluate() is used again
            // after it, so the value must not change in the middle of this call.
            bool reportModules = imbWEMManager.settings.directReportEngine.DR_ReportModules;

            spiderModuleData <spiderLink> dataInput = new spiderModuleData <spiderLink>();

            dataInput.iteration = wRecord.iteration;
            dataInput.active.AddRange(wRecord.web.webActiveLinks);

            frontierRankingAlgorithmIterationRecord frontierReportEntry = null;

            if (reportModules)
            {
                frontierReportEntry = wRecord.frontierDLC.reportStartOfFRA(wRecord.iteration, wRecord, dataInput); // reporting on module activity -- START
            }

            foreach (ISpiderModuleBase module in modules)
            {
                module.startIteration(wRecord.iteration, wRecord);
            }

            // Once a module leaves a single link, the remaining modules are not evaluated; the loop still
            // visits them so that their report records are written when reporting is enabled.
            bool breakExecution = false;

            foreach (ISpiderModuleBase module in modules)
            {
                if (reportModules)
                {
                    dataInput.moduleDLC = wRecord.frontierDLC.modRecords[module.GetType().Name];
                    dataInput.moduleDLCRecordTableEntry = dataInput.moduleDLC.StartNewRecord(wRecord.iteration);
                }

                spiderModuleData <spiderLink> dataOutput = null;
                if (!breakExecution)
                {
                    dataOutput = module.evaluate(dataInput, wRecord) as spiderModuleData <spiderLink>;
                }

                if (reportModules)
                {
                    dataInput.moduleDLC.AddOrUpdate(dataInput.moduleDLCRecordTableEntry);
                    dataInput.moduleDLCRecordTableEntry.disposeResources();
                }

                if (!breakExecution)
                {
                    if (dataOutput == null)
                    {
                        // The "as" cast yields null for a missing or differently typed result; fail with a
                        // message that names the module instead of a bare NullReferenceException.
                        throw new System.InvalidOperationException("Module " + module.name + " did not return spiderModuleData<spiderLink> from evaluate()");
                    }

                    dataInput = dataOutput.CreateNext();

                    if (dataInput.active.Count == 1)
                    {
                        wRecord.log("Module " + module.name + " returned single link instance -- skipping other modules");
                        breakExecution = true;
                    }
                }
            }

            if (reportModules)
            {
                frontierReportEntry = wRecord.frontierDLC.reportEndOfFRA(wRecord, frontierReportEntry, dataInput); // reporting on module activity -- END
            }
            wRecord.currentModuleData = dataInput;

            // <------------------ Objective control rules

            spiderObjectiveSolutionSet output = new spiderObjectiveSolutionSet();

            foreach (controlObjectiveRuleBase aRule in controlRules)
            {
                aRule.startIteration(wRecord.iteration, wRecord);
                output.listen(aRule.evaluate(wRecord));
            }

            return output;
        }