/// <summary>
/// Ending procedure for the domain-level crawl task. Only the first call does any work; later calls return immediately.
/// Finishes the record, optionally deploys it to the index, reports the finished domain, fires the
/// "DLC finished" plugin events, sets the final <c>status</c> and optionally detaches the records.
/// </summary>
/// <exception cref="aceGeneralException">Reporter.reportDomainFinished() exception</exception>
public void Closing()
{
    // Run the closing sequence at most once per task.
    if (closeCalled)
    {
        return;
    }

    closeCalled = true;

    wRecord.recordFinish();

    bool updateIndexOnFinish = imbWEMManager.settings.indexEngine.doIndexUpdateOnDLC;
    if (updateIndexOnFinish)
    {
        imbWEMManager.index.deployWRecord(wRecord);
    }

    if (reporter != null)
    {
        try
        {
            reporter.reportDomainFinished(wRecord);
        }
        catch (Exception reportingFault)
        {
            // Build the domain-specific exception from the original fault, persist it through the reporter, then propagate it.
            var wrappedFault = new aceGeneralException(reportingFault.Message, reportingFault, reporter, "Reporter.reportDomainFinished() exception");
            var errorEntry = reporter.CreateAndSaveError(wrappedFault, wRecord, this, crawlerErrorEnum.DReportError);
            wRecord.log(errorEntry.Message);
            throw wrappedFault;
        }
    }

    // Notify every plugin collection that this domain-level crawl has finished.
    parent.parent.plugins.eventDLCFinished(parent.parent, this, wRecord);
    evaluator.plugins.eventDLCFinished(evaluator as spiderEvaluatorBase, this, wRecord);
    imbWEMManager.index.plugins.eventDLCFinished(imbWEMManager.index.experimentEntry, this, wRecord);
    parent.parent.reportPlugins.eventDLCFinished(evaluator, this, wRecord);

    status = isStageAborted
        ? crawlerDomainTaskStatusEnum.aborted
        : crawlerDomainTaskStatusEnum.done;

    bool detachRecords = imbWEMManager.settings.executionLog.doRemoveWRecordOnFinished;
    if (detachRecords)
    {
        tRecord.children.Remove(wRecord);
        tRecord.tGeneralRecord.children.Remove(wGeneralRecord);
    }

    // A crawl that made fewer than two iterations is registered as a failed domain.
    if (wRecord.iteration < 2)
    {
        string failureNote = "Domain [" + wRecord.domain + "] considered as failed since less then two iterations were made on it";
        wRecord.log(failureNote);
        parent.parent.webLoaderControler.SetFailedDomain(wRecord.wProfile, wRecord);
    }

    iterationStatus = crawlerDomainTaskIterationPhase.iterationProcessFinished;
}
/// <summary>
/// Abstract generic event hook; the behavior is defined entirely by the implementing type.
/// </summary>
/// <typeparam name="TFirst">Type of the <paramref name="__task"/> argument.</typeparam>
/// <typeparam name="TSecond">Type of the <paramref name="__resource"/> argument.</typeparam>
/// <param name="stage">Iteration phase value supplied by the caller (presumably the phase being signalled - TODO confirm against implementations).</param>
/// <param name="__parent">Spider evaluator supplied by the caller.</param>
/// <param name="__task">Task object supplied by the caller.</param>
/// <param name="__resource">Resource object supplied by the caller.</param>
public abstract void eventUniversal <TFirst, TSecond>(crawlerDomainTaskIterationPhase stage, ISpiderEvaluatorBase __parent, TFirst __task, TSecond __resource);
/// <summary>
/// Starts this instance: loads the seed page, runs the crawl iteration loop until the stage check passes
/// or the task is aborted, evaluates the loaded pages and calls <see cref="Closing"/>.
/// </summary>
/// <remarks>
/// Any exception raised inside the crawl is caught here. The first crash of a domain triggers
/// <c>reInitialization()</c> and one recursive restart; a repeated crash marks the task as aborted.
/// The record is removed from the console output on every exit path of the guarded section,
/// including the early <c>return</c>s taken when the task is aborted.
/// </remarks>
public void start()
{
    iterationStatus = crawlerDomainTaskIterationPhase.iterationProcessNotStarted;
    status = crawlerDomainTaskStatusEnum.working;
    executionThread = Thread.CurrentThread;

    // NOTE(review): status was assigned `working` just above, so this guard cannot fire unless status is
    // changed concurrently. Kept as-is; confirm the intended ordering against callers before moving it.
    if (status == crawlerDomainTaskStatusEnum.aborted)
    {
        aceLog.log("Aborted DomainTask --> start()");
        return;
    }

    lastIterationStart = DateTime.Now;
    startTime = DateTime.Now;

    aceLog.consoleControl.setAsOutput(wRecord, "" + wProfile.domain);

    parent.parent.reportPlugins.eventUniversal(crawlReportingStageEnum.DLCPreinitiation, reporter, this, wRecord);

    try
    {
        // ---- STAGE 1: load the seed page
        iterationStatus = crawlerDomainTaskIterationPhase.loadingSeedPage;

        spiderWebLoader loader = new spiderWebLoader(parent.parent.dataLoadTaker);
        loader.controler = parent.parent.webLoaderControler;

        stageControl.prepare();

        spiderTask sTask = evaluator.getSpiderSingleTask(web.seedLink, wRecord, 1);
        spiderTaskResult sResult = loader.runSpiderTask(sTask, wRecord); // executes the seed-page load

        if (sResult.calculateSuccessRate() == 0)
        {
            wRecord.log("Domain [" + wRecord.domain + "] is considered as failed since landing page load failed");
            parent.parent.webLoaderControler.SetFailedDomain(wProfile, wRecord);
        }

        spiderObjectiveSolutionSet solSet = null;

        stageControl.stage.EnterStage(wRecord, evaluator);

        parent.parent.plugins.eventDLCInitiated(parent.parent, this, wRecord);
        evaluator.plugins.eventDLCInitiated(evaluator as spiderEvaluatorBase, this, wRecord);
        imbWEMManager.index.plugins.eventDLCInitiated(imbWEMManager.index.experimentEntry, this, wRecord);
        parent.parent.reportPlugins.eventDLCInitiated(reporter, this, wRecord);

        int lastTermCount = 0;

        // ---- STAGE 2: iteration loop
        do
        {
            iterationStatus = crawlerDomainTaskIterationPhase.iterationStart;
            lastIterationStart = DateTime.Now;

            dataUnitSpiderIteration iDataUnit = wRecord.timeseries.CreateEntry(null, sTask.iteration);

            iterationStatus = crawlerDomainTaskIterationPhase.receiveResult;

            if (imbWEMManager.MASTERKILL_SWITCH)
            {
                aceLog.log("MASTERKILL SWITCH ON :: crawlerDomainTask->" + iterationStatus.ToString());
                isStageAborted = true;
                sResult.items.Clear();
                sResult.task.Clear();
                evaluator.settings.limitIterations = wRecord.iteration - 5;
                evaluator.settings.limitTotalPageLoad = 0;

                Closing();
                return;
            }

            if (isStageAborted)
            {
                Closing();
                return;
            }

            evaluator.plugins.processLoaderResult(sResult, wRecord, this);

            // Remember the term count before the result is processed, to compute the per-iteration delta below.
            var iter = wRecord.iterationTableRecord.GetLastEntryTouched();
            if (iter != null)
            {
                lastTermCount = iter.terms_all;
            }

            evaluator.operation_receiveResult(sResult, wRecord);

            if (isStageAborted)
            {
                Closing();
                return;
            }

            iterationStatus = crawlerDomainTaskIterationPhase.applyLinkRules;
            evaluator.plugins.processAfterResultReceived(wRecord, this);
            solSet = evaluator.operation_applyLinkRules(wRecord);

            if (isStageAborted)
            {
                Closing();
                return;
            }

            iterationStatus = crawlerDomainTaskIterationPhase.getLoadTask;
            sTask = evaluator.operation_GetLoadTask(wRecord);

            if (isStageAborted)
            {
                Closing();
                return;
            }

            iterationStatus = crawlerDomainTaskIterationPhase.loadingTask;
            if (isLoaderDisabled)
            {
                wRecord.log("-- Loader component is disabled for this [" + wRecord.domain + "] task.");
                sResult = new spiderTaskResult();
            }
            else
            {
                sResult = loader.runSpiderTask(sTask, wRecord);
            }

            if (isStageAborted)
            {
                Closing();
                return;
            }

            parent.parent.dataLoadTaker.AddIteration();

            iterationStatus = crawlerDomainTaskIterationPhase.updatingData;
            if (evaluator.settings.doEnableCrossLinkDetection)
            {
                evaluator.operation_detectCrossLinks(wRecord);
            }

            iDataUnit.checkData();

            targetLoaded = iDataUnit.tc_loaded_p;
            targetDetected = iDataUnit.tc_detected_p;

            if (reporter != null)
            {
                try
                {
                    int lTC = 0;
                    var iter2 = wRecord.iterationTableRecord.GetLastEntryTouched();
                    if (iter2 != null)
                    {
                        lTC = iter2.terms_all - lastTermCount;
                    }

                    reporter.reportIteration(iDataUnit, wRecord, evaluator); // a new iteration is created here

                    imbWEMManager.index.plugins.eventIteration(imbWEMManager.index.experimentEntry, this, wRecord);

                    parent.parent.dataLoadTaker.AddContentPage(lTC, sResult.Count);
                }
                catch (Exception ex)
                {
                    throw new aceGeneralException(ex.Message, ex, reporter, "Reporter.reportIteration() exception");
                }
            }

            parent.parent.reportPlugins.eventIteration(evaluator, this, wRecord);

            iterationStatus = crawlerDomainTaskIterationPhase.checkingRules;

            if (targetLoaded >= evaluator.settings.limitTotalPageLoad)
            {
                isStageAborted = true;
                wRecord.log("--- Loaded pages count meet limit [" + targetLoaded + "] on iteration [" + iDataUnit.iteration + "].");
            }

            if (iDataUnit.iteration >= evaluator.settings.limitIterations)
            {
                isStageAborted = true;
                wRecord.log("--- Iteration limit reached [" + iDataUnit.iteration + "].");
            }

            // Computed once so the logged value is exactly the value that was compared against the limit.
            double elapsedMinutes = DateTime.Now.Subtract(startTime).TotalMinutes;
            if (elapsedMinutes >= parent.parent._timeLimitForDLC)
            {
                isStageAborted = true;
                wRecord.log("--- Timeout : crawler domain task [" + wRecord.web.seedLink.url + "] aborted after [" + elapsedMinutes + "] minutes.");
            }

            if (isStageAborted)
            {
                break;
            }
        } while ((!stageControl.stage.CheckStage(wRecord, solSet, sTask)) && !isStageAborted);

        // ---- STAGE 3: page evaluation
        iterationStatus = crawlerDomainTaskIterationPhase.pageEvaluation;

        wRecord.resultPageSet = evaluator.operation_evaluatePages(wRecord);

        Closing();
    }
    catch (Exception ex)
    {
        // Classify the error by the phase the task was in when the exception surfaced.
        crawlerErrorEnum errorType = crawlerErrorEnum.domainTaskError;

        switch (iterationStatus)
        {
            case crawlerDomainTaskIterationPhase.applyLinkRules:
                errorType = crawlerErrorEnum.spiderModuleError;
                break;

            case crawlerDomainTaskIterationPhase.getLoadTask:
                errorType = crawlerErrorEnum.spiderGetTaskError;
                break;

            case crawlerDomainTaskIterationPhase.loadingTask:
                errorType = crawlerErrorEnum.spiderLoadingError;
                break;

            case crawlerDomainTaskIterationPhase.pageEvaluation:
                errorType = crawlerErrorEnum.spiderModuleError;
                break;
        }

        string domainName = wRecord.domainInfo.domainName;

        if (!tRecord.crashedDomains.Contains(domainName))
        {
            // First crash of this domain: register it and restart the task once.
            wRecord.log("Domain crashed first time: " + ex.Message);
            aceLog.log("Domain [" + domainName + "] crashed first time: " + ex.Message);
            aceLog.log("Domain [" + domainName + "] is restarting... ");
            status = crawlerDomainTaskStatusEnum.waiting;
            tRecord.crashedDomains.Add(wRecord.domainInfo.domainName);
            reInitialization();
            start();
        }
        else
        {
            status = crawlerDomainTaskStatusEnum.aborted;
            wRecord.log("Aborted by execution exception: " + ex.Message);
        }

        // The reporter is optional elsewhere in this method; guard it here too so a missing reporter
        // does not raise a NullReferenceException from inside the handler and hide the real error.
        if (reporter != null)
        {
            var clog = reporter.CreateAndSaveError(ex, wRecord, this, errorType);
            wRecord.log(clog.Message);
        }
    }
    finally
    {
        // Runs on every exit path of the try/catch, including the early returns taken on abort,
        // which previously skipped this cleanup because it was placed after the try statement.
        aceLog.consoleControl.removeFromOutput(wRecord);
    }
}