/// <summary>
/// Common load-task construction: selects up to n active links into a new spiderTask for the next iteration
/// and cycle-registers the links that were left out
/// </summary>
protected spiderTask __operation_GetLoadTaskCommon(modelSpiderSiteRecord wRecord, IEnumerable<spiderLink> activeLinks)
{
    operation_doControlAndStats(wRecord);

    // number of links allowed into the next iteration's load task
    int n = wRecord.context.GetNextIterationLTSize(activeLinks);

    // wRecord.logBuilder.log("Creating new spiderTask for iteration " + wRecord.iteration + " with " + n + " links to load. To Limit: " + toLimit);

    spiderTask outputTask = new spiderTask(wRecord.iteration + 1, wRecord.web);
    outputTask.AddRange(activeLinks.Take(n));

    // links that did not make it into the task are cycle-registered for this iteration
    foreach (var ali in activeLinks)
    {
        if (!outputTask.Contains(ali))
        {
            ali.marks.cycleRegistration(wRecord.iteration);
        }
    }

    return outputTask;
}
/// <summary> /// Creates single web loading task /// </summary> /// <param name="lnk">The LNK.</param> /// <param name="sReport">The s report.</param> /// <param name="iteration">The iteration.</param> /// <returns></returns> public virtual spiderTask getSpiderSingleTask(spiderLink lnk, modelSpiderSiteRecord sReport, int iteration) { spiderTask output = new spiderTask(iteration, sReport.web); // output.doTokenization = flags.HasFlag(spiderEvaluatorExecutionFlags.doTokenization); output.Add(lnk); return(output); }
public override spiderTask operation_GetLoadTask(modelSpiderSiteRecord wRecord)
{
    //base.operation_GetLoadTask(wRecord);
    //operation_doControlAndStats(wRecord);
    //Int32 toLimit = settings.limitTotalPageLoad - (wRecord.context.targets.GetLoaded().Count - wRecord.duplicateCount);
    //Int32 n = Math.Min(wRecord.currentModuleData.active.Count, settings.limitIterationNewLinks); //, untillLimit);
    //n = Math.Min(n, toLimit);
    //wRecord.logBuilder.log("Creating new spiderTask for iteration " + wRecord.iteration + " with " + n + " links to load. Pageloads until limit: " + toLimit);

    spiderTask outputTask = __operation_GetLoadTaskCommon(wRecord, wRecord.currentModuleData.active);

    int c = 0;
    foreach (var task in wRecord.currentModuleData.active)
    {
        string lAge = task.linkAge.ToString("D2");
        string lUrl = task.url;
        string lScore = task.marks.calculate(wRecord.iteration).ToString(); //.score.ToString();

        string lineFormat = c.ToString("D2") + " {0,4} | {1,30} | {2,6}";
        if (outputTask.Contains(task))
        {
            lineFormat += " (selected)";
        }

        wRecord.logBuilder.AppendLine(string.Format(lineFormat, lAge, lUrl, lScore));
        c++;
    }

    return outputTask;
}
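// Illustrative output of the per-link log table built in operation_GetLoadTask above.
// The values are made up for illustration and are not from a real crawl; the column order is
// ordinal + link age | url | score, with "(selected)" marking links taken into the new spiderTask:
//
//   00   02 |    http://example.com/products |   1.40 (selected)
//   01   05 |     http://example.com/contact |   0.25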
/// <summary> /// Runs the spider task. /// </summary> /// <param name="sTask">The s task.</param> /// <param name="crawlerContext">The crawler context.</param> /// <returns></returns> public spiderTaskResult runSpiderTask(spiderTask sTask, modelSpiderSiteRecord wRecord) { spiderTaskResult sResult = sTask.createResult(); try { if (imbWEMManager.settings.crawlerJobEngine.crawlerDoParallelTaskLoads) { Parallel.ForEach(sTask, ln => { modelSpiderPageRecord pRecord = wRecord.getChildRecord(ln, ln.url); //.startChildRecord(ln, ln.url); spiderTaskResultItem rItem = runSpiderTaskItem(ln, sTask.doTokenization, pRecord); if (rItem.status != pageStatus.failed) { wRecord.context.targets.AttachPage(rItem, pRecord.logBuilder, blockCount); // <-------------------------------- [ STIZE } sResult.AddResult(rItem); }); } else { foreach (spiderLink ln in sTask) { modelSpiderPageRecord pRecord = wRecord.getChildRecord(ln, ln.url); //.startChildRecord(ln, ln.url); spiderTaskResultItem rItem = runSpiderTaskItem(ln, sTask.doTokenization, pRecord); if (rItem.status != pageStatus.failed) { wRecord.context.targets.AttachPage(rItem, pRecord.logBuilder, blockCount); } sResult.AddResult(rItem); } } } catch (Exception ex) { imbWEMManager.log.log("runSpiderTask exception: " + ex.Message); } loadIndex = loadIndex + sResult.Count(); if (loadIndex > imbWEMManager.settings.crawlerJobEngine.loadCountForGC) { long mem = GC.GetTotalMemory(false); GC.Collect(); GC.WaitForFullGCComplete(); long dmem = GC.GetTotalMemory(false); aceLog.log("Memory allocation reduction [after " + loadIndex + " tasks]: " + (mem - dmem).getMByteCountFormated()); loadIndex = 0; } sResult.finish(); return(sResult); }
public bool CheckStage(modelSpiderSiteRecord wRecord, spiderObjectiveSolutionSet oSet, spiderTask task)
{
    bool okToLeave = false;

    if (task.Count() == 0)
    {
        wRecord.logBuilder.log("> Spider task [i:" + task.iteration + "] has no tasks defined. Aborting the stage loop.");
        okToLeave = true;
        return okToLeave;
    }

    // <----------------------------- OBJECTIVE SOLUTION SET
    okToLeave = operation_executeObjectiveSolutionSet(oSet, wRecord);
    if (okToLeave) return okToLeave;

    // <----------------------------- SPIDER LIMITS OVERRIDERS ---------------|
    if (stageIteration > wRecord.tRecord.instance.settings.limitIterations)
    {
        wRecord.log("> Spider settings (limit iterations) triggered abort at [" + stageIteration + "] Aborting the stage loop.");
        okToLeave = true;
        return okToLeave;
    }

    if (wRecord.web.webPages.Count() > wRecord.tRecord.instance.settings.limitTotalPageLoad)
    {
        wRecord.log("> Spider settings (limit pages load) triggered abort at [" + wRecord.web.webPages.Count() + "] Aborting the stage loop.");
        okToLeave = true;
        return okToLeave;
    }
    // <----------------------------------------------------------------------|

    if (stageIteration > stageIterationLimit)
    {
        wRecord.logBuilder.log("> Stage [" + name + "] iteration limit reached [ " + stageIterationLimit + " ] -- aborting [" + objectives.Count + "] objectives and moving on");
        okToLeave = true;
        return okToLeave;
    }

    if (stageIteration > GLOBAL_stageIterationLimit)
    {
        throw new aceGeneralException("spiderStage [" + name + "] reached the " + nameof(GLOBAL_stageIterationLimit) + " (" + GLOBAL_stageIterationLimit.ToString() + ")");
    }

    stageIteration++;
    return okToLeave;
}
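// Orientation sketch (not in the original source): CheckStage is the loop guard of the crawl
// iteration cycle driven in crawlerDomainTask.start() below, roughly of the shape:
//
//   do
//   {
//       // receive loader results, apply link rules, build the next spiderTask, load it
//   }
//   while (!stageControl.stage.CheckStage(wRecord, solSet, sTask) && !isStageAborted);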
/// <summary> /// Starts this instance. /// </summary> public void start() { iterationStatus = crawlerDomainTaskIterationPhase.iterationProcessNotStarted; status = crawlerDomainTaskStatusEnum.working; executionThread = Thread.CurrentThread; if (status == crawlerDomainTaskStatusEnum.aborted) { aceLog.log("Aborted DomainTask --> start()"); return; } lastIterationStart = DateTime.Now; startTime = DateTime.Now; aceLog.consoleControl.setAsOutput(wRecord, "" + wProfile.domain); parent.parent.reportPlugins.eventUniversal(crawlReportingStageEnum.DLCPreinitiation, reporter, this, wRecord); try { iterationStatus = crawlerDomainTaskIterationPhase.loadingSeedPage; // <--- STAGE 1 spiderWebLoader loader = new spiderWebLoader(parent.parent.dataLoadTaker); loader.controler = parent.parent.webLoaderControler; stageControl.prepare(); spiderTask sTask = evaluator.getSpiderSingleTask(web.seedLink, wRecord, 1); // <-------- spiderTaskResult sResult = loader.runSpiderTask(sTask, wRecord); // <--------------------------------------------------------[ izvršava if (sResult.calculateSuccessRate() == 0) { wRecord.log("Domain [" + wRecord.domain + "] is considered as failed since landing page load failed"); parent.parent.webLoaderControler.SetFailedDomain(wProfile, wRecord); } spiderObjectiveSolutionSet solSet = null; stageControl.stage.EnterStage(wRecord, evaluator); parent.parent.plugins.eventDLCInitiated(parent.parent, this, wRecord); //.eventDLCFinished(parent.parent, this, wRecord); evaluator.plugins.eventDLCInitiated(evaluator as spiderEvaluatorBase, this, wRecord); imbWEMManager.index.plugins.eventDLCInitiated(imbWEMManager.index.experimentEntry, this, wRecord); parent.parent.reportPlugins.eventDLCInitiated(reporter, this, wRecord); int lastTermCount = 0; // <--- STAGE 2 do { iterationStatus = crawlerDomainTaskIterationPhase.iterationStart; lastIterationStart = DateTime.Now; dataUnitSpiderIteration iDataUnit = wRecord.timeseries.CreateEntry(null, sTask.iteration); iterationStatus = crawlerDomainTaskIterationPhase.receiveResult; if (imbWEMManager.MASTERKILL_SWITCH) { aceLog.log("MASTERKILL SWITCH ON :: crawlerDomainTask->" + iterationStatus.ToString()); isStageAborted = true; sResult.items.Clear(); sResult.task.Clear(); evaluator.settings.limitIterations = wRecord.iteration - 5; evaluator.settings.limitTotalPageLoad = 0; Closing(); return; } if (isStageAborted) { Closing(); return; } evaluator.plugins.processLoaderResult(sResult, wRecord, this); // wRecord.context.targets.termsAll.Count(); var iter = wRecord.iterationTableRecord.GetLastEntryTouched(); if (iter != null) { lastTermCount = iter.terms_all; } evaluator.operation_receiveResult(sResult, wRecord); // __tc = wRecord.context.targets.termsAll.Count() - __tc; if (isStageAborted) { Closing(); return; } iterationStatus = crawlerDomainTaskIterationPhase.applyLinkRules; evaluator.plugins.processAfterResultReceived(wRecord, this); solSet = evaluator.operation_applyLinkRules(wRecord); if (isStageAborted) { Closing(); return; } iterationStatus = crawlerDomainTaskIterationPhase.getLoadTask; sTask = evaluator.operation_GetLoadTask(wRecord); if (isStageAborted) { Closing(); return; } iterationStatus = crawlerDomainTaskIterationPhase.loadingTask; if (isLoaderDisabled) { wRecord.log("-- Loader component is disabled for this [" + wRecord.domain + "] task."); sResult = new spiderTaskResult(); } else { sResult = loader.runSpiderTask(sTask, wRecord); } if (isStageAborted) { Closing(); return; } parent.parent.dataLoadTaker.AddIteration(); iterationStatus = 
crawlerDomainTaskIterationPhase.updatingData; if (evaluator.settings.doEnableCrossLinkDetection) { evaluator.operation_detectCrossLinks(wRecord); } iDataUnit.checkData(); targetLoaded = iDataUnit.tc_loaded_p; targetDetected = iDataUnit.tc_detected_p; if (reporter != null) { try { int lTC = 0; var iter2 = wRecord.iterationTableRecord.GetLastEntryTouched(); if (iter2 != null) { lTC = iter2.terms_all - lastTermCount; } reporter.reportIteration(iDataUnit, wRecord, evaluator); // <------ ovde se kreira nova iteracija imbWEMManager.index.plugins.eventIteration(imbWEMManager.index.experimentEntry, this, wRecord); parent.parent.dataLoadTaker.AddContentPage(lTC, sResult.Count); } catch (Exception ex) { throw new aceGeneralException(ex.Message, ex, reporter, "Reporter.reportIteration() exception"); } } parent.parent.reportPlugins.eventIteration(evaluator, this, wRecord); iterationStatus = crawlerDomainTaskIterationPhase.checkingRules; if (targetLoaded >= evaluator.settings.limitTotalPageLoad) { isStageAborted = true; wRecord.log("--- Loaded pages count meet limit [" + targetLoaded + "] on iteration [" + iDataUnit.iteration + "]."); } if (iDataUnit.iteration >= evaluator.settings.limitIterations) { isStageAborted = true; wRecord.log("--- Iteration limit reached [" + iDataUnit.iteration + "]."); } if (DateTime.Now.Subtract(startTime).TotalMinutes >= parent.parent._timeLimitForDLC) { isStageAborted = true; wRecord.log("--- Timeout : crawler domain task [" + wRecord.web.seedLink.url + "] aborted after [" + DateTime.Now.Subtract(startTime).TotalMinutes + "] minutes."); } if (isStageAborted) { break; } } while ((!stageControl.stage.CheckStage(wRecord, solSet, sTask)) && !isStageAborted); iterationStatus = crawlerDomainTaskIterationPhase.pageEvaluation; // <---- STAGE 3 wRecord.resultPageSet = evaluator.operation_evaluatePages(wRecord); Closing(); } catch (Exception ex) { crawlerErrorEnum errorType = crawlerErrorEnum.domainTaskError; switch (iterationStatus) { case crawlerDomainTaskIterationPhase.applyLinkRules: errorType = crawlerErrorEnum.spiderModuleError; break; case crawlerDomainTaskIterationPhase.getLoadTask: errorType = crawlerErrorEnum.spiderGetTaskError; break; case crawlerDomainTaskIterationPhase.loadingTask: errorType = crawlerErrorEnum.spiderLoadingError; break; case crawlerDomainTaskIterationPhase.pageEvaluation: errorType = crawlerErrorEnum.spiderModuleError; break; } string domainName = wRecord.domainInfo.domainName; if (!tRecord.crashedDomains.Contains(domainName)) { wRecord.log("Domain crashed first time: " + ex.Message); aceLog.log("Domain [" + domainName + "] crashed first time: " + ex.Message); aceLog.log("Domain [" + domainName + "] is restarting... "); status = crawlerDomainTaskStatusEnum.waiting; tRecord.crashedDomains.Add(wRecord.domainInfo.domainName); reInitialization(); start(); } else { status = crawlerDomainTaskStatusEnum.aborted; wRecord.log("Aborted by execution exception: " + ex.Message); } var clog = reporter.CreateAndSaveError(ex, wRecord, this, errorType); wRecord.log(clog.Message); // crawlerErrorLog cel = new crawlerErrorLog(ex, wRecord, this, errorType); } finally { } aceLog.consoleControl.removeFromOutput(wRecord); //, "sp:" + tRecord.instance.name); }