/// <summary>
/// E2: Applies the passive link rules to active links that were not passively evaluated yet,
/// lets the active link rules learn from and score every active link, sorts the active links
/// by score (highest first) and finally collects the results of the objective control rules.
/// </summary>
/// <param name="wRecord">The site record whose active links are evaluated.</param>
/// <returns>Solution set that listened to the result of every objective control rule.</returns>
public override spiderObjectiveSolutionSet operation_applyLinkRules(modelSpiderSiteRecord wRecord)
{
    var activeLinks = wRecord.web.webActiveLinks;

    // Passive rules run only once per link; the flag marks links that are already done.
    int passivelyEvaluated = 0;
    foreach (spiderLink link in activeLinks)
    {
        if (link.flags.HasFlag(spiderLinkFlags.passiveEvaluated))
        {
            continue;
        }

        foreach (spiderEvalRuleForLinkBase passiveRule in linkPassiveRules)
        {
            link.marks.deploy(passiveRule.evaluate(link));
        }

        passivelyEvaluated++;
        link.flags |= spiderLinkFlags.passiveEvaluated;
    }

    if (passivelyEvaluated > 0)
    {
        wRecord.log("Passive evaluation of [" + passivelyEvaluated + "] new links");
    }

    // Start a new iteration on every active rule before it sees any link.
    foreach (ruleActiveBase activeRule in linkActiveRules)
    {
        activeRule.startIteration(wRecord.iteration, wRecord);
    }

    // Learning pass: every active link gets one iteration older and is shown to each active rule.
    foreach (spiderLink link in activeLinks)
    {
        link.linkAge++;

        foreach (ruleActiveBase activeRule in linkActiveRules)
        {
            activeRule.learn(link);
        }
    }

    // Scoring pass: deploy the active rule results and recalculate the marks of each link.
    foreach (spiderLink link in activeLinks)
    {
        foreach (ruleActiveBase activeRule in linkActiveRules)
        {
            link.marks.deploy(activeRule.evaluate(link));
        }

        link.marks.calculate(wRecord.iteration);
    }

    // Descending order: the link with the highest score comes first.
    activeLinks.items.Sort((left, right) => right.marks.score.CompareTo(left.marks.score));

    spiderObjectiveSolutionSet solutions = new spiderObjectiveSolutionSet();
    foreach (controlObjectiveRuleBase controlRule in controlRules)
    {
        controlRule.startIteration(wRecord.iteration, wRecord);
        solutions.listen(controlRule.evaluate(wRecord));
    }

    return solutions;
}
/// <summary>
/// Records the average score of the active links, runs the link control rules over them,
/// removes the links collected in the control solution set and records the score statistics
/// of the remaining links.
/// </summary>
/// <param name="wRecord">The site record whose active links and time series entry are updated.</param>
protected virtual void operation_doControlAndStats(modelSpiderSiteRecord wRecord)
{
    var activeLinks = wRecord.web.webActiveLinks;

    // Statistics before the control rules; only the average (Item2) is stored at this point.
    var statsBefore = activeLinks.calculateTotalAndAvgScore();
    wRecord.timeseries[wRecord.iteration].avg_score_l = statsBefore.Item2;
    // wRecord.timeseries[wRecord.iteration].tc_detected_l = stats.Item1;

    // Start a new iteration on every control rule before it sees any link.
    foreach (controlLinkRuleBase controlRule in controlLinkRules)
    {
        controlRule.startIteration(wRecord.iteration, wRecord);
    }

    // Learning pass over the current active links.
    foreach (spiderLink link in activeLinks)
    {
        foreach (controlLinkRuleBase controlRule in controlLinkRules)
        {
            controlRule.learn(link, wRecord);
        }
    }

    // Optional trim of links whose score dropped below zero.
    if (settings.FRONTIER_PullDecayModes.HasFlag(spiderPullDecayModes.belowZeroScoreRemoval))
    {
        // Iterate over a copy because the collection is modified inside the loop.
        foreach (spiderLink link in activeLinks.ToList())
        {
            if (!(link.marks.score < 0))
            {
                continue;
            }

            activeLinks.Remove(link);
            wRecord.log("Link [" + link.url + "] had score below zero");
        }
    }

    // Evaluation pass: the solution set listens to every rule result.
    spiderObjectiveSolutionSet output = new spiderObjectiveSolutionSet();
    foreach (spiderLink link in activeLinks)
    {
        foreach (controlLinkRuleBase controlRule in controlLinkRules)
        {
            output.listen(controlRule.evaluate(link, wRecord));
        }
    }

    // Every link found in the control solution set is removed from the active links.
    int droppedCount = 0;
    foreach (var dropped in output.links)
    {
        activeLinks.Remove(dropped);
        droppedCount++;
    }

    if (droppedCount > 0)
    {
        wRecord.log("Control rules removed: " + droppedCount.ToString() + " links from active links collection");
    }

    //wRecord.logBuilder.log("Link drop-out:" + output.links.Count + ". Now have " + wRecord.web.webActiveLinks.Count() + " links waiting.");

    // Statistics after drop-out (ADO) of the removed links.
    var statsAfter = activeLinks.items.calculateTotalAndAvgScore();
    wRecord.timeseries[wRecord.iteration].avg_scoreADO_l = statsAfter.Item2;
    wRecord.timeseries[wRecord.iteration].tc_scoreADO_l = statsAfter.Item1;
    wRecord.timeseries[wRecord.iteration].nw_ruledout_l = output.links.Count;
}
/// <summary>
/// Starts this domain task: loads the seed page (stage 1), runs the crawl iteration loop
/// until the stage is finished or aborted (stage 2) and evaluates the loaded pages (stage 3).
/// If an exception escapes the crawl, the domain is restarted once; a second crash of the same
/// domain sets the task status to aborted.
/// </summary>
public void start()
{
    iterationStatus = crawlerDomainTaskIterationPhase.iterationProcessNotStarted;
    status = crawlerDomainTaskStatusEnum.working;
    executionThread = Thread.CurrentThread;

    // NOTE(review): status was assigned 'working' two statements above, so this check can only
    // fire if another thread changes it in between - confirm whether the check was meant to
    // run before the assignment.
    if (status == crawlerDomainTaskStatusEnum.aborted)
    {
        aceLog.log("Aborted DomainTask --> start()");
        return;
    }

    lastIterationStart = DateTime.Now;
    startTime = DateTime.Now;

    aceLog.consoleControl.setAsOutput(wRecord, "" + wProfile.domain);

    parent.parent.reportPlugins.eventUniversal(crawlReportingStageEnum.DLCPreinitiation, reporter, this, wRecord);

    try
    {
        // <--- STAGE 1: loading the seed page
        iterationStatus = crawlerDomainTaskIterationPhase.loadingSeedPage;

        spiderWebLoader loader = new spiderWebLoader(parent.parent.dataLoadTaker);
        loader.controler = parent.parent.webLoaderControler;

        stageControl.prepare();

        spiderTask sTask = evaluator.getSpiderSingleTask(web.seedLink, wRecord, 1);
        spiderTaskResult sResult = loader.runSpiderTask(sTask, wRecord); // executes the seed task

        if (sResult.calculateSuccessRate() == 0)
        {
            wRecord.log("Domain [" + wRecord.domain + "] is considered as failed since landing page load failed");
            parent.parent.webLoaderControler.SetFailedDomain(wProfile, wRecord);
        }

        spiderObjectiveSolutionSet solSet = null;

        stageControl.stage.EnterStage(wRecord, evaluator);

        parent.parent.plugins.eventDLCInitiated(parent.parent, this, wRecord);
        evaluator.plugins.eventDLCInitiated(evaluator as spiderEvaluatorBase, this, wRecord);
        imbWEMManager.index.plugins.eventDLCInitiated(imbWEMManager.index.experimentEntry, this, wRecord);
        parent.parent.reportPlugins.eventDLCInitiated(reporter, this, wRecord);

        int lastTermCount = 0;

        // <--- STAGE 2: iteration loop
        do
        {
            iterationStatus = crawlerDomainTaskIterationPhase.iterationStart;
            lastIterationStart = DateTime.Now;

            dataUnitSpiderIteration iDataUnit = wRecord.timeseries.CreateEntry(null, sTask.iteration);

            iterationStatus = crawlerDomainTaskIterationPhase.receiveResult;

            if (imbWEMManager.MASTERKILL_SWITCH)
            {
                aceLog.log("MASTERKILL SWITCH ON :: crawlerDomainTask->" + iterationStatus.ToString());
                isStageAborted = true;
                sResult.items.Clear();
                sResult.task.Clear();
                evaluator.settings.limitIterations = wRecord.iteration - 5;
                evaluator.settings.limitTotalPageLoad = 0;

                Closing();
                return;
            }

            if (isStageAborted)
            {
                Closing();
                return;
            }

            evaluator.plugins.processLoaderResult(sResult, wRecord, this);

            // Remember the term count before the result is received, to report the delta later.
            var iter = wRecord.iterationTableRecord.GetLastEntryTouched();
            if (iter != null)
            {
                lastTermCount = iter.terms_all;
            }

            evaluator.operation_receiveResult(sResult, wRecord);

            if (isStageAborted)
            {
                Closing();
                return;
            }

            iterationStatus = crawlerDomainTaskIterationPhase.applyLinkRules;
            evaluator.plugins.processAfterResultReceived(wRecord, this);
            solSet = evaluator.operation_applyLinkRules(wRecord);

            if (isStageAborted)
            {
                Closing();
                return;
            }

            iterationStatus = crawlerDomainTaskIterationPhase.getLoadTask;
            sTask = evaluator.operation_GetLoadTask(wRecord);

            if (isStageAborted)
            {
                Closing();
                return;
            }

            iterationStatus = crawlerDomainTaskIterationPhase.loadingTask;
            if (isLoaderDisabled)
            {
                wRecord.log("-- Loader component is disabled for this [" + wRecord.domain + "] task.");
                sResult = new spiderTaskResult();
            }
            else
            {
                sResult = loader.runSpiderTask(sTask, wRecord);
            }

            if (isStageAborted)
            {
                Closing();
                return;
            }

            parent.parent.dataLoadTaker.AddIteration();

            iterationStatus = crawlerDomainTaskIterationPhase.updatingData;

            if (evaluator.settings.doEnableCrossLinkDetection)
            {
                evaluator.operation_detectCrossLinks(wRecord);
            }

            iDataUnit.checkData();

            targetLoaded = iDataUnit.tc_loaded_p;
            targetDetected = iDataUnit.tc_detected_p;

            if (reporter != null)
            {
                try
                {
                    int lTC = 0;
                    var iter2 = wRecord.iterationTableRecord.GetLastEntryTouched();
                    if (iter2 != null)
                    {
                        lTC = iter2.terms_all - lastTermCount;
                    }

                    reporter.reportIteration(iDataUnit, wRecord, evaluator); // the new iteration is created here

                    imbWEMManager.index.plugins.eventIteration(imbWEMManager.index.experimentEntry, this, wRecord);

                    parent.parent.dataLoadTaker.AddContentPage(lTC, sResult.Count);
                }
                catch (Exception ex)
                {
                    throw new aceGeneralException(ex.Message, ex, reporter, "Reporter.reportIteration() exception");
                }
            }

            parent.parent.reportPlugins.eventIteration(evaluator, this, wRecord);

            iterationStatus = crawlerDomainTaskIterationPhase.checkingRules;

            if (targetLoaded >= evaluator.settings.limitTotalPageLoad)
            {
                isStageAborted = true;
                wRecord.log("--- Loaded pages count meet limit [" + targetLoaded + "] on iteration [" + iDataUnit.iteration + "].");
            }

            if (iDataUnit.iteration >= evaluator.settings.limitIterations)
            {
                isStageAborted = true;
                wRecord.log("--- Iteration limit reached [" + iDataUnit.iteration + "].");
            }

            if (DateTime.Now.Subtract(startTime).TotalMinutes >= parent.parent._timeLimitForDLC)
            {
                isStageAborted = true;
                wRecord.log("--- Timeout : crawler domain task [" + wRecord.web.seedLink.url + "] aborted after [" + DateTime.Now.Subtract(startTime).TotalMinutes + "] minutes.");
            }

            if (isStageAborted)
            {
                break;
            }
        } while ((!stageControl.stage.CheckStage(wRecord, solSet, sTask)) && !isStageAborted);

        // <---- STAGE 3: page evaluation
        iterationStatus = crawlerDomainTaskIterationPhase.pageEvaluation;

        wRecord.resultPageSet = evaluator.operation_evaluatePages(wRecord);

        Closing();
    }
    catch (Exception ex)
    {
        // The phase in which the crawl failed decides the reported error category.
        crawlerErrorEnum errorType = crawlerErrorEnum.domainTaskError;

        switch (iterationStatus)
        {
            case crawlerDomainTaskIterationPhase.applyLinkRules:
                errorType = crawlerErrorEnum.spiderModuleError;
                break;
            case crawlerDomainTaskIterationPhase.getLoadTask:
                errorType = crawlerErrorEnum.spiderGetTaskError;
                break;
            case crawlerDomainTaskIterationPhase.loadingTask:
                errorType = crawlerErrorEnum.spiderLoadingError;
                break;
            case crawlerDomainTaskIterationPhase.pageEvaluation:
                errorType = crawlerErrorEnum.spiderModuleError;
                break;
        }

        string domainName = wRecord.domainInfo.domainName;
        if (!tRecord.crashedDomains.Contains(domainName))
        {
            // First crash of this domain: register it and restart the task once.
            wRecord.log("Domain crashed first time: " + ex.Message);
            aceLog.log("Domain [" + domainName + "] crashed first time: " + ex.Message);
            aceLog.log("Domain [" + domainName + "] is restarting... ");
            status = crawlerDomainTaskStatusEnum.waiting;
            tRecord.crashedDomains.Add(domainName);
            reInitialization();
            start();
        }
        else
        {
            status = crawlerDomainTaskStatusEnum.aborted;
            wRecord.log("Aborted by execution exception: " + ex.Message);
        }

        // The iteration loop treats reporter as optional, so it must not be dereferenced
        // unconditionally here: a null reporter would replace the handled failure with a
        // NullReferenceException thrown out of this catch block.
        if (reporter != null)
        {
            var clog = reporter.CreateAndSaveError(ex, wRecord, this, errorType);
            wRecord.log(clog.Message);
        }
    }

    // NOTE(review): the early 'return' paths inside the try block skip this call - confirm
    // whether Closing() already detaches the record from the console output.
    aceLog.consoleControl.removeFromOutput(wRecord);
}
//public abstract spiderTask operation_GetLoadTask(modelSpiderSiteRecord wRecord);

/// <summary>
/// E3: Scores the loaded pages with the page score rules, applies the page control rules,
/// sorts and optionally trims the surviving pages, stores them as the result page set of the
/// site record and calls recordFinish on the page record of every selected page.
/// </summary>
/// <param name="wRecord">The site record whose pages are evaluated; its resultPageSet is assigned the returned list.</param>
/// <returns>The final page set: pages that scored above -1, after control rule removal, sorting and optional trim.</returns>
public virtual List<spiderPage> operation_evaluatePages(modelSpiderSiteRecord wRecord)
{
    pageScoreRules.prepare();

    // Learning pass: only pages with status 'loaded' take part in the evaluation.
    List<spiderPage> output = new List<spiderPage>();

    foreach (spiderPage pg in wRecord.web.webPages.items.Values)
    {
        if (pg.webpage.status == pageStatus.loaded)
        {
            foreach (spiderEvalRuleForPageBase ruleForPage in pageScoreRules)
            {
                ruleForPage.learn(pg);
            }
            output.Add(pg);
        }
    }

    // Scoring pass: pages whose calculated score is -1 or lower are left out.
    List<spiderPage> outputTwo = new List<spiderPage>();

    foreach (spiderPage pg in output)
    {
        foreach (spiderEvalRuleForPageBase ruleForPage in pageScoreRules)
        {
            spiderEvalRuleResult ruleResult = ruleForPage.evaluate(pg);
            pg.marks.deploy(ruleResult);
        }

        int score = pg.marks.calculate(wRecord.iteration);
        if (score > -1)
        {
            outputTwo.Add(pg);
        }
    }

    // Application of page control rules (evaluated over all loaded pages, not only the scored set).
    spiderObjectiveSolutionSet obSet = new spiderObjectiveSolutionSet();

    foreach (controlPageRuleBase aRule in controlPageRules)
    {
        aRule.startIteration(wRecord.iteration, wRecord);
        foreach (spiderPage pg in output)
        {
            obSet.listen(aRule.evaluate(pg, wRecord));
        }
    }

    // Pages named by the control rules are removed only while the set is larger than the
    // configured primary page set size.
    // NOTE(review): obSet.links is enumerated as spiderPage here - confirm that the solution
    // set stores pages in this collection.
    foreach (spiderPage page in obSet.links)
    {
        if (outputTwo.Count > settings.primaryPageSetSize)
        {
            outputTwo.Remove(page);
        }
        else
        {
            break;
        }
    }

    // Sorts the pages after the cut.
    // NOTE(review): the order is ascending, so the trim below keeps the lowest-scored pages -
    // confirm this is intended.
    outputTwo.Sort((x, y) => x.marks.score.CompareTo(y.marks.score));

    // Final trim to the primary page set size, if it is turned on.
    if (settings.flags.HasFlag(spiderEvaluatorExecutionFlags.doTrimPrimaryOutput))
    {
        int tkc = Math.Min(settings.primaryPageSetSize, outputTwo.Count);
        outputTwo = outputTwo.Take(tkc).ToList();
    }

    // Transfers the final set to the record.
    wRecord.resultPageSet = outputTwo;

    // Calls record finish for the page record of every selected page.
    foreach (spiderPage pg in outputTwo)
    {
        var pRecord = wRecord.children.GetRecord(pg.spiderResult.target);
        pRecord.recordFinish(wRecord.resultPageSet);
    }

    return outputTwo;
}
/// <summary>
/// Passes the active links through the chain of frontier ranking modules, stores the data
/// produced by the chain as the current module data of the site record and then collects the
/// results of the objective control rules.
/// </summary>
/// <param name="wRecord">The site record whose active links are ranked.</param>
/// <returns>Solution set that listened to the result of every objective control rule.</returns>
/// <exception cref="System.InvalidOperationException">
/// A module's evaluate() call did not return a spiderModuleData&lt;spiderLink&gt; instance.
/// </exception>
public override spiderObjectiveSolutionSet operation_applyLinkRules(modelSpiderSiteRecord wRecord)
{
    spiderModuleData<spiderLink> dataInput = new spiderModuleData<spiderLink>();
    dataInput.iteration = wRecord.iteration;
    dataInput.active.AddRange(wRecord.web.webActiveLinks);

    // Read the reporting switch once so that start/end report calls stay paired for this run.
    bool reportModules = imbWEMManager.settings.directReportEngine.DR_ReportModules;

    frontierRankingAlgorithmIterationRecord frontierReportEntry = null;

    if (reportModules)
    {
        // Reporting on module activity -- START
        frontierReportEntry = wRecord.frontierDLC.reportStartOfFRA(wRecord.iteration, wRecord, dataInput);
    }

    foreach (ISpiderModuleBase module in modules)
    {
        module.startIteration(wRecord.iteration, wRecord);
    }

    bool breakExecution = false;

    foreach (ISpiderModuleBase module in modules)
    {
        // The report record is opened and closed for every module, including the modules
        // that are skipped after breakExecution was set.
        if (reportModules)
        {
            dataInput.moduleDLC = wRecord.frontierDLC.modRecords[module.GetType().Name];
            dataInput.moduleDLCRecordTableEntry = dataInput.moduleDLC.StartNewRecord(wRecord.iteration);
        }

        spiderModuleData<spiderLink> dataOutput = null;
        if (!breakExecution)
        {
            dataOutput = module.evaluate(dataInput, wRecord) as spiderModuleData<spiderLink>;
        }

        if (reportModules)
        {
            dataInput.moduleDLC.AddOrUpdate(dataInput.moduleDLCRecordTableEntry);
            dataInput.moduleDLCRecordTableEntry.disposeResources();
        }

        if (!breakExecution)
        {
            // 'as' yields null when evaluate() returns null or another type; fail with a
            // descriptive message instead of a NullReferenceException on the next line.
            if (dataOutput == null)
            {
                throw new System.InvalidOperationException("Module " + module.name + " did not return spiderModuleData<spiderLink> from evaluate()");
            }

            dataInput = dataOutput.CreateNext();
            if (dataInput.active.Count == 1)
            {
                wRecord.log("Module " + module.name + " returned single link instance -- skipping other modules");
                breakExecution = true;
            }
        }
    }

    if (reportModules)
    {
        // Reporting on module activity -- END
        frontierReportEntry = wRecord.frontierDLC.reportEndOfFRA(wRecord, frontierReportEntry, dataInput);
    }

    wRecord.currentModuleData = dataInput;

    // Objective control rules
    spiderObjectiveSolutionSet output = new spiderObjectiveSolutionSet();

    foreach (controlObjectiveRuleBase aRule in controlRules)
    {
        aRule.startIteration(wRecord.iteration, wRecord);
        output.listen(aRule.evaluate(wRecord));
    }

    return output;
}