public void eventDLCFinished(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    foreach (IPlugInCommonBase<indexMaintenanceStageEnum, experimentSessionEntry> plug in allPlugins)
    {
        plug.eventDLCFinished(__parent as experimentSessionEntry, __task, __wRecord); /// aJob, __machine, __tRecord);
    }
}
public override void eventDLCInitiated(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    if (!__wRecord.tRecord.instance.settings.FRONTIER_doLinkHarvest)
    {
        indexDomain idomain = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
        List<indexPage> pages = imbWEMManager.index.pageIndexTable.GetPagesForDomain(__wRecord.domainInfo.domainName);

        var seedTarget = __wRecord.context.targets.GetLoaded().FirstOrDefault();
        var spage = seedTarget?.page;
        if (spage != null)
        {
            loger.AppendLine(__wRecord.domain + " seed page selected -> " + spage.url);
        }

        // FileInfo dlcFile = __session.GetTFIDF_DLC_File(idomain);

        foreach (indexPage p in pages)
        {
            link l = new link(p.url);
            // if (__wRecord.web.webActiveLinks.Contains())
            __wRecord.context.processLink(l, spage, false);
        }
    }
}
public void eventDLCFinished(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    foreach (IPlugInCommonBase<crawlerDomainTaskIterationPhase, spiderEvaluatorBase> plug in allPlugins)
    {
        plug.eventDLCFinished(__parent as spiderEvaluatorBase, __task, __wRecord); /// aJob, __machine, __tRecord);
    }
}
public void eventDLCFinished(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    foreach (IPlugInCommonBase<crawlReportingStageEnum, directReporterBase> plug in allPlugins)
    {
        plug.eventDLCFinished(__parent as directReporterBase, __task, __wRecord); /// aJob, __machine, __tRecord);
    }
}
public void eventIteration(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    foreach (indexPlugIn_base plug in allPlugins)
    {
        plug.eventIteration(__parent as experimentSessionEntry, __task, __wRecord); /// aJob, __machine, __tRecord);
    }
}
public void eventDLCInitiated(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    foreach (IPlugInCommonBase<crawlReportingStageEnum, directReporterBase> plug in allPlugins)
    {
        plug.eventDLCInitiated(null, __task, __wRecord); /// aJob, __machine, __tRecord);
    }
}
public void eventDLCInitiated(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    foreach (IPlugInCommonBase<crawlJobEngineStageEnum, crawlerDomainTaskMachine> plug in allPlugins)
    {
        plug.eventDLCInitiated(__parent as crawlerDomainTaskMachine, __task, __wRecord); /// aJob, __machine, __tRecord);
    }
}
public override void eventUniversal<TTask, TBase>(crawlJobEngineStageEnum stage, crawlerDomainTaskMachine __machine, TTask __task, TBase __spider)
{
    if (!IsEnabled)
    {
        return;
    }

    if (plugins[stage].Any(x => x.IsEnabled))
    {
        crawlerDomainTask tsk = __task as crawlerDomainTask;

        foreach (IPlugInCommonBase<crawlJobEngineStageEnum, crawlerDomainTaskMachine> plug in plugins[stage])
        {
            try
            {
                plug.eventUniversal(stage, __machine, __task, __spider);
                // if (plug is ISpiderPlugInForContent) ((ISpiderPlugInForContent)plug).processAfterResultReceived(wRecord, wTask);
            }
            catch (Exception ex)
            {
                aceLog.log("Engine Plugin [" + plug.name + "]:" + plug.GetType().Name + " at " + stage.ToString() + " execution crashed: " + ex.Message);
                crawlerErrorLog cel = new crawlerErrorLog(ex, null, tsk, crawlerErrorEnum.enginePlugin);
                cel.SaveXML();
            }
        }
    }
}
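// The dispatcher above isolates plugin failures: one crashing plugin is logged and skipped so the
// crawl stage and the remaining plugins keep running. Below is a minimal, self-contained sketch of
// that same fault-isolation pattern; IStagePlugin, StageDispatcher and the Console logging are
// hypothetical illustrations introduced here, not part of the imbWEM API.
using System;
using System.Collections.Generic;

public interface IStagePlugin
{
    bool IsEnabled { get; }
    string Name { get; }
    void OnStage(string stage);
}

public class StageDispatcher
{
    private readonly List<IStagePlugin> plugins = new List<IStagePlugin>();

    public void Register(IStagePlugin plugin) => plugins.Add(plugin);

    // Invokes every enabled plugin; exceptions are caught per plugin and reported,
    // mirroring the try/catch-per-plugin loop in eventUniversal above.
    public void Dispatch(string stage)
    {
        foreach (IStagePlugin plug in plugins)
        {
            if (!plug.IsEnabled) continue;
            try
            {
                plug.OnStage(stage);
            }
            catch (Exception ex)
            {
                Console.WriteLine("Plugin [" + plug.Name + "] crashed at " + stage + ": " + ex.Message);
            }
        }
    }
}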
public override void eventDLCFinished(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    imbMCRepository mcRepo = mcm.activeRepository;
    imbMCWebSite wRepo = webSiteReposByDomain[__wRecord.domain];

    mcRepo.siteTable.AddOrUpdate(wRepo.entry);
    wRepo.SaveDataStructure(mcRepo.folder, loger);
}
public override void eventDLCInitiated(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    if (imbWEMManager.settings.directReportEngine.doDomainReport)
    {
        string dlc_config = imbWEMManager.index.experimentEntry.sessionCrawlerFolder["sites"].pathFor("dlc_config_" + __wRecord.domainInfo.domainRootName.getFilename(".txt"));

        builderForMarkdown builder = new builderForMarkdown();
        spiderTools.Describe(__task.evaluator, builder);
        builder.ToString().saveStringToFile(dlc_config);
    }
}
public override void eventDLCInitiated(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    // imbWEMManager.index.domainIndexTable
    var state = __session.state;

    indexDomain idomain = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
    List<indexPage> pages = imbWEMManager.index.pageIndexTable.GetPagesForDomain(__wRecord.domainInfo.domainName);

    /*
    __session.state.crawler.settings.FRONTIER_doLinkHarvest = false;
    __session.state.crawler.settings.FRONTIER_doLinkResolver = false;
    */

    var seedTarget = __wRecord.context.targets.GetLoaded().FirstOrDefault(); // .webPages.items.Values.First();
    var spage = seedTarget?.page;
    if (spage != null)
    {
        loger.AppendLine(__wRecord.domain + " seed page selected -> " + spage.url);
    }

    FileInfo dlcFile = __session.GetTFIDF_DLC_File(idomain);
    if ((!dlcFile.Exists) || imbWEMManager.settings.TFIDF.doSchedulePagesWithDLCTable)
    {
        foreach (indexPage p in pages)
        {
            link l = new link(p.url);
            if (!p.url.Contains(__wRecord.domainInfo.domainRootName))
            {
                loger.AppendLine(__wRecord.domain + " -X-> " + p.url + " Wrong link association?");
                aceTerminalInput.doBeepViaConsole(1600, 200, 3);
            }
            __wRecord.context.processLink(l, spage, false);
        }
        loger.AppendLine(__wRecord.domain + " -> " + __wRecord.web.webActiveLinks.Count + " targets set for load");
    }
    else
    {
        loger.AppendLine(__wRecord.domain + " -> DLC cache found: " + dlcFile.FullName);
    }
}
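// eventDLCInitiated above re-seeds the crawl frontier from the previously built page index whenever
// no per-domain DLC cache exists, warning about index entries whose URL does not contain the domain
// root but still scheduling them. A minimal sketch of that filtering step, assuming hypothetical
// IndexedPage/FrontierLink types that stand in for indexPage and link:
using System;
using System.Collections.Generic;

public class IndexedPage { public string Url; }
public class FrontierLink { public string Url; public FrontierLink(string url) { Url = url; } }

public static class FrontierSeeder
{
    // Converts indexed pages into frontier links, reporting any URL outside the domain root.
    public static List<FrontierLink> Seed(IEnumerable<IndexedPage> pages, string domainRoot, Action<string> warn)
    {
        var links = new List<FrontierLink>();
        foreach (IndexedPage p in pages)
        {
            if (!p.Url.Contains(domainRoot))
            {
                warn(domainRoot + " -X-> " + p.Url + " Wrong link association?");
            }
            links.Add(new FrontierLink(p.Url));
        }
        return links;
    }
}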
public override void eventDLCFinished(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    if (__task.status == crawlerDomainTaskStatusEnum.aborted)
    {
        return;
    }
    if (__wRecord.iterationTableRecord == null)
    {
        return;
    }
    if (__wRecord.iterationTableRecord.Count == 0)
    {
        return;
    }

    indexDomain iDomain = records.GetOrCreate(__wRecord.instanceID);
    iDomain.url = __wRecord.domainInfo.urlProper;
    iDomain.domain = __wRecord.domain;

    var lastRec = __wRecord.iterationTableRecord.LastOrDefault();
    var firstRec = __wRecord.iterationTableRecord.FirstOrDefault();

    iDomain.relevantPages = lastRec.relevantPageCount;
    iDomain.notRelevantPages = lastRec.irrelevantPageCount;
    iDomain.detected = __wRecord.web.webActiveLinks.Count();
    iDomain.Words = __wRecord.context.targets.termsAll.Count();
    iDomain.LandingLanguage = firstRec.targetLanguage;
    iDomain.LandingRelevant = firstRec.relevantPageCount > 0;

    records.AddOrUpdate(iDomain, objectTableUpdatePolicy.updateIfHigher);

    DLCCount++;
    if (DLCCount >= imbWEMManager.settings.supportEngine.reportPlugIn_sideIndexer_DLCToSave)
    {
        DLCCount = 0;
        SaveAll();
        output.log("Side Index save and publish triggered on [" + __task.parent.parent.taskDone + "] DLC completed");
    }
}
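// The method above flushes the side index only after every N completed DLC tasks, trading some
// durability for fewer disk writes. A minimal sketch of that batching idea follows; the
// BatchedFlushCounter type and its Action callback are hypothetical illustrations, not the imbWEM API.
using System;

public class BatchedFlushCounter
{
    private readonly int flushEvery;
    private readonly Action flush;
    private int completed;

    public BatchedFlushCounter(int flushEvery, Action flush)
    {
        this.flushEvery = flushEvery;
        this.flush = flush;
    }

    // Call once per finished unit of work; the flush callback fires on every N-th call,
    // mirroring the DLCCount counter and SaveAll() trigger above.
    public void OnCompleted()
    {
        completed++;
        if (completed >= flushEvery)
        {
            completed = 0;
            flush();
        }
    }
}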
public void eventIteration(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    if (!IsEnabled)
    {
        return;
    }

    foreach (reportPlugIn_base plug in allPlugins)
    {
        try
        {
            plug.eventIteration(__spider, __task, __wRecord);
        }
        catch (Exception ex)
        {
            aceLog.log("Reporting Plugin [" + plug.name + "]:" + plug.GetType().Name + " at status report execution crashed: " + ex.Message);
            crawlerErrorLog cel = new crawlerErrorLog(ex, null, null, crawlerErrorEnum.indexPlugin);
            cel.SaveXML();
        }
    }
}
public override void eventDLCFinished(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    if (settings.plugIn_indexDBUpdater_TFIDF_per_DLC)
    {
        FileInfo master_file = session.GetTFIDF_Master_File();
        session.doDomainEvaluation(settings, loger, __wRecord, evaluator, new weightTableCompiled(master_file.FullName, true, session.SessionID));
    }
    else
    {
        session.doDomainEvaluation(settings, loger, __wRecord, evaluator, session.GetTFIDF_Master(loger, true, false));
    }

    loger.AppendLine("Last index save: " + imbWEMManager.index.lastIndexSave.ToShortTimeString() + " [" + imbWEMManager.index.wRecordsDeployed + " / " + settings.doIndexAutoSaveOnDLCs + " ] ");
    if (imbWEMManager.index.wRecordsDeployed >= settings.doIndexAutoSaveOnDLCs)
    {
        imbWEMManager.index.Save();
    }
}
public override void eventUniversal(crawlReportingStageEnum stage, directReporterBase __parent, crawlerDomainTask __task, modelSpiderSiteRecord wRecord)
{
    switch (stage)
    {
        case crawlReportingStageEnum.DLCPreinitiation:
            wRecord.context.OnTargetPageAttached += new modelSpiderSiteRecordEvent(onTargetPageAttached);

            imbMCRepository mcRepo = mcm.activeRepository;
            imbMCWebSite wRepo = mcRepo.GetWebSite(wRecord.domainInfo, true, loger);

            if (!webSiteReposByDomain.ContainsKey(wRecord.domain))
            {
                webSiteReposByDomain.Add(wRecord.domain, wRepo);
            }
            else
            {
                loger.log("DLC sent to CrawlToMC plugin second time: " + wRecord.domain);
            }

            mcRepo.siteTable.AddOrUpdate(wRepo.entry);
            wRepo.SaveDataStructure(mcRepo.folder, loger);
            break;
    }
}
/// <summary>
/// Called just after a new DLC thread has been prepared to run
/// </summary>
/// <param name="__spider">The spider.</param>
/// <param name="__task">The task.</param>
/// <param name="__wRecord">The w record.</param>
public abstract void eventDLCInitiated(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord);
public override void eventIteration(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord) { }
public abstract void eventDLCFinished(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord);
internal crawlerErrorLog CreateAndSaveError(Exception ex, modelSpiderSiteRecord wRecord, crawlerDomainTask crawlerDomainTask, crawlerErrorEnum errorType)
{
    crawlerErrorLog clog = crawlerErrorLog.CreateAndSave(ex, wRecord, crawlerDomainTask, errorType);
    clog.SaveXML(folder[DRFolderEnum.logs].pathFor("DLC_crash_" + wRecord.domainInfo.domainRootName.getFilename()));
    return clog;
}
public abstract void eventIteration(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord);
// public abstract void eventPluginInstalled(directReporterBase __spider);

public abstract void eventUniversal(crawlReportingStageEnum stage, directReporterBase __parent, crawlerDomainTask __task, modelSpiderSiteRecord wRecord);
public override void eventDLCInitiated <TParent>(TParent __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord) => eventDLCInitiated(__parent as crawlerDomainTaskMachine, __task, __wRecord);
public abstract void eventDLCInitiated <TParent>(TParent __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord);
public void eventDLCInitiated(crawlerDomainTaskMachine __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord) { }
public override void eventUniversal<TFirst, TSecond>(crawlJobEngineStageEnum stage, crawlerDomainTaskMachine __machine, TFirst __task, TSecond __resource)
{
    switch (stage)
    {
        case crawlJobEngineStageEnum.statusReport:

            //var tMemory = __machine.measureTaker.GetTrend(trendMemory);
            //var tCPU = __machine.measureTaker.GetTrend(trendCPU);
            //var tDataLoad = __machine.measureTaker.GetTrend(trendDataLoad);
            //var tContentPages = __machine.dataLoadTaker.GetTrend(trendContentPages);
            //var tContentTerms = __machine.dataLoadTaker.GetTrend(trendContentTerms);
            //var tIterations = __machine.dataLoadTaker.GetTrend(trendIterations);
            //loger.AppendLine(String.Format(TREND_LINE, tMemory.GetTrendInline(), tCPU.GetTrendInline(), tDataLoad.GetTrendInline()));
            //loger.AppendLine(String.Format(TREND_LINE, tContentPages.GetTrendInline(), tContentTerms.GetTrendInline(), tIterations.GetTrendInline()));

            int change = 0;
            int newTC = __machine.maxThreads;
            double maxLatencyToLimit = 0;
            double maxLatency = 0;
            bool doBoost = false;
            string domainThatLates = "";
            string threadId = "";
            Thread criticalThread = null;
            double average = CPUAverageLast;
            double avgChange = average - CPUAverageLast;
            double maxAge = 0;
            crawlerDomainTask taskOldest = null;

            var tasks = __machine.task_running.ToList();

            // find the task with the highest iteration latency and the oldest running DLC task
            foreach (Task task in tasks)
            {
                crawlerDomainTask taskInRun = task.AsyncState as crawlerDomainTask;
                double since = taskInRun.sinceLastIterationStart;
                double tage = DateTime.Now.Subtract(taskInRun.startTime).TotalMinutes;

                maxLatency = Math.Max(maxLatency, since);

                if (maxAge <= tage)
                {
                    maxAge = tage;
                    taskOldest = taskInRun;
                }

                if (maxLatency <= since)
                {
                    domainThatLates = taskInRun.wRecord.domain;
                    if (taskInRun?.executionThread != null)
                    {
                        threadId = taskInRun.executionThread.ManagedThreadId.ToString() + " [" + taskInRun.executionThread.Priority.ToString() + "]";
                    }
                    criticalThread = taskInRun.executionThread;
                }
            }

            maxLatencyToLimit = maxLatency.GetRatio(__machine.TimeLimitForTask);
            double maxAgeLimit = maxAge.GetRatio(__machine._timeLimitForDLC);
            double totalAgeLimit = DateTime.Now.Subtract(__machine.startTime).TotalMinutes.GetRatio(__machine.TimeLimitForCompleteJob);

            loger.log("Max. latency: [" + maxLatency.ToString("F2") + " min][" + maxLatencyToLimit.ToString("P2") + "] " + domainThatLates + " Thread: " + threadId);
            if (taskOldest != null)
            {
                loger.log("Oldest DLC: [" + maxAge.ToString("F2") + " min][" + maxAgeLimit.ToString("P2") + "] " + taskOldest.wRecord.domain + " Thread: " + taskOldest.executionThread.ManagedThreadId.ToString() + " [" + taskOldest.executionThread.Priority.ToString() + "]");
            }

            #region TIMEOUT PREVENTION -----------------------------------------

            if (imbWEMManager.settings.crawlerJobEngine.doTaskTimeOutPrevention)
            {
                if (totalAgeLimit > 0.9)
                {
                    bool newDisable = false;
                    foreach (Task task in tasks)
                    {
                        crawlerDomainTask t = task.AsyncState as crawlerDomainTask;
                        if (!t.isLoaderDisabled)
                        {
                            t.isLoaderDisabled = true;
                            newDisable = true;
                            loger.log("Time Limit Critical: loader is disabled for: " + t.wRecord.domain + " due to execution time limit for Thread: " + t.executionThread.ManagedThreadId.ToString());
                        }
                    }
                    if (newDisable)
                    {
                        aceTerminalInput.doBeepViaConsole(1200, 250, 5);
                    }
                }

                if (maxAgeLimit > 0.9)
                {
                    if (!taskOldest.isLoaderDisabled)
                    {
                        taskOldest.isLoaderDisabled = true;
                        loger.consoleAltColorToggle();
                        loger.log("DLC Time Limit Critical: loader is disabled for: " + taskOldest.wRecord.domain + " due to execution time limit for Thread: " + taskOldest.executionThread.ManagedThreadId.ToString());
                        loger.consoleAltColorToggle();
                        aceTerminalInput.doBeepViaConsole();
                    }
                }

                doBoost = false;

                // thresholds are checked from the most critical down, so the higher limits stay reachable
                if (maxLatencyToLimit > 0.90)
                {
                    loger.log("Max. latency critical :: REDUCING TO SINGLE THREAD : ");
                    foreach (Task task in tasks)
                    {
                        crawlerDomainTask taskInRun = task.AsyncState as crawlerDomainTask;
                        if (taskInRun?.executionThread != null)
                        {
                            taskInRun.executionThread.Priority = ThreadPriority.BelowNormal;
                        }
                    }
                    if (criticalThread != null)
                    {
                        criticalThread.Priority = ThreadPriority.Highest;
                    }
                    newTC = 1;
                }
                else if (maxLatencyToLimit > 0.70)
                {
                    if (criticalThread != null)
                    {
                        criticalThread.Priority = ThreadPriority.Highest;
                    }
                    change = -4;
                }
                else if (maxLatencyToLimit > 0.50)
                {
                    if (criticalThread != null)
                    {
                        criticalThread.Priority = ThreadPriority.AboveNormal;
                    }
                    change = -2;
                }
                else
                {
                    foreach (Task task in tasks)
                    {
                        crawlerDomainTask taskInRun = task.AsyncState as crawlerDomainTask;
                        if (taskOldest == taskInRun)
                        {
                            if (taskInRun?.executionThread != null)
                            {
                                taskInRun.executionThread.Priority = ThreadPriority.AboveNormal;
                            }
                        }
                        else
                        {
                            if (taskInRun?.executionThread != null)
                            {
                                taskInRun.executionThread.Priority = ThreadPriority.Normal;
                            }
                        }
                    }
                    doBoost = true;
                }
            }

            if (imbWEMManager.settings.crawlerJobEngine.doAutoAdjustTC)
            {
                #endregion --------------------------- ^ timeout prevention ^^

                if (doBoost) // <------ TC adjust
                {
                    var takes = __machine.cpuTaker.GetLastSamples(imbWEMManager.settings.crawlerJobEngine.CPUSampleForAutoAdjustMax);
                    if (takes.Count < imbWEMManager.settings.crawlerJobEngine.CPUSampleForAutoAdjust)
                    {
                        return;
                    }

                    average = (takes.Average(x => x.reading) / 100);
                    avgChange = average - CPUAverageLast;
                    double CPUMargin = imbWEMManager.settings.crawlerJobEngine.CPUMargin;
                    int dlc = __machine.taskRunning;

                    CPUAverageDefendLine = Math.Max(average, CPUAverageLast);

                    if (dlc < (__machine.maxThreads - 1))
                    {
                        return;
                    }

                    if (average < imbWEMManager.settings.crawlerJobEngine.CPUTarget)
                    {
                        if (average < (CPUAverageDefendLine - CPUMargin))
                        {
                            change = -1;
                        }
                        else
                        {
                            change = 1;
                        }
                    }
                    else if (average > imbWEMManager.settings.crawlerJobEngine.CPULimit)
                    {
                        change = -1;
                    }

                    newTC = Math.Min(__machine.maxThreads + change, imbWEMManager.settings.crawlerJobEngine.TCAutoLimit);
                    if (newTC < 0)
                    {
                        newTC = 1;
                    }

                    CPUAverageLast = average;
                }
                else
                {
                    if (change != 0)
                    {
                        newTC = Math.Min(__machine.maxThreads + change, imbWEMManager.settings.crawlerJobEngine.TCAutoLimit);
                    }
                    if (newTC < 0)
                    {
                        newTC = 1;
                    }
                }
            }

            int e_change = newTC - __machine.maxThreads;
            __machine.maxThreads = newTC;

            loger.log("CPU average [" + average.ToString("P2") + "][" + avgChange.ToString("P2") + "] - (change: " + e_change + ") TC: " + __machine.maxThreads.ToString("D3") + " DLC:[" + __machine.taskRunning.ToString("D3") + "]");

            /*
            if (average < imbWEMManager.settings.crawlerJobEngine.CPUTarget)
            {
                ;
            }
            else if (average > imbWEMManager.settings.crawlerJobEngine.CPULimit)
            {
                __machine.maxThreads = Math.Min(__machine.maxThreads - 1, imbWEMManager.settings.crawlerJobEngine.TCAutoLimit);
                loger.log("CPU average [" + average.ToString("P2") + "] > reducing TC to: " + __machine.maxThreads.ToString("D3") + " DLC:[" + __machine.taskRunning.ToString("D3") + "]");
            }
            else
            {
                loger.log("CPU average [" + average.ToString("P2") + "] ---------- TC_max: " + __machine.maxThreads.ToString("D3") + " DLC:[" + __machine.taskRunning.ToString("D3") + "]");
            }
            */

            break;
    }
}
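// The statusReport branch above combines two policies: demote or boost thread priorities as a DLC
// approaches its time limit, and grow or shrink the worker-thread cap toward a CPU utilisation
// target. Below is a minimal, self-contained sketch of the thread-cap policy alone; ThreadCapTuner,
// its parameter names and the comments' sample values are hypothetical illustrations, not the
// imbWEM API or its exact tuning rules.
using System;

public class ThreadCapTuner
{
    private readonly double cpuTarget; // e.g. 0.75 = aim for 75% average CPU
    private readonly double cpuLimit;  // e.g. 0.90 = shrink the cap above this
    private readonly int maxCap;       // hard upper bound on concurrent workers

    public ThreadCapTuner(double cpuTarget, double cpuLimit, int maxCap)
    {
        this.cpuTarget = cpuTarget;
        this.cpuLimit = cpuLimit;
        this.maxCap = maxCap;
    }

    // Returns the new thread cap given the current cap and an averaged CPU reading in the 0..1 range.
    public int Adjust(int currentCap, double cpuAverage)
    {
        int change = 0;
        if (cpuAverage < cpuTarget)
        {
            change = 1;  // headroom available: allow one more concurrent worker
        }
        else if (cpuAverage > cpuLimit)
        {
            change = -1; // saturated: back off by one
        }

        int newCap = Math.Min(currentCap + change, maxCap);
        return Math.Max(newCap, 1); // never drop below a single worker
    }
}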
public override void eventDLCInitiated <TParent>(TParent __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord) => eventDLCInitiated(__parent as directReporterBase, __task, __wRecord);
public abstract void eventDLCFinished(experimentSessionEntry __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord);
public override void eventDLCInitiated(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord) { }
// public override void eventCrawlJobFinished(analyticJob aJob, crawlerDomainTaskMachine __machine, modelSpiderTestRecord __tRecord)
// {
//     throw new NotImplementedException();
// }

public override void eventDLCInitiated<TParent>(TParent __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord) => eventDLCInitiated(__parent as experimentSessionEntry, __task, __wRecord);
public override void eventDLCFinished(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    plugin_state.doCheckCriteria(__task.parent.parent, __wRecord.tRecord, this, imbWEMManager.index.experimentEntry);
}