/// <summary>
/// Forwards the iteration event to every plug-in in <c>allPlugins</c>.
/// </summary>
/// <param name="__parent">Event source; handed to the plug-ins as <c>experimentSessionEntry</c> (null when it is of another type).</param>
/// <param name="__task">Domain task passed through to each plug-in.</param>
/// <param name="__wRecord">Site record passed through to each plug-in.</param>
public void eventIteration(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    // The cast does not depend on the plug-in, so it is resolved once up front.
    experimentSessionEntry session = __parent as experimentSessionEntry;

    foreach (indexPlugIn_base indexPlugin in allPlugins)
    {
        indexPlugin.eventIteration(session, __task, __wRecord);
    }
}
/// <summary>
/// Relays the domain-finished report call to each module in <c>modules</c>.
/// </summary>
/// <param name="reporter">Reporter passed through to each module.</param>
/// <param name="wRecord">Site record passed through to each module.</param>
public override void reportDomainFinished(directAnalyticReporter reporter, modelSpiderSiteRecord wRecord)
{
    foreach (ISpiderModuleBase spiderModule in modules)
    {
        spiderModule.reportDomainFinished(reporter, wRecord);
    }
}
/// <summary>
/// Forwards the DLC-finished event to every reporting plug-in in <c>allPlugins</c>.
/// </summary>
/// <param name="__parent">Event source; handed to the plug-ins as <c>directReporterBase</c> (null when it is of another type).</param>
/// <param name="__task">Domain task passed through to each plug-in.</param>
/// <param name="__wRecord">Site record passed through to each plug-in.</param>
public void eventDLCFinished(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    // The cast does not depend on the plug-in, so it is resolved once up front.
    directReporterBase reporter = __parent as directReporterBase;

    foreach (IPlugInCommonBase<crawlReportingStageEnum, directReporterBase> reportPlugin in allPlugins)
    {
        reportPlugin.eventDLCFinished(reporter, __task, __wRecord);
    }
}
/// <summary>
/// Relays the per-iteration report call to each module in <c>modules</c>.
/// </summary>
/// <param name="reporter">Reporter passed through to each module.</param>
/// <param name="wRecord">Site record passed through to each module.</param>
public override void reportIteration(directAnalyticReporter reporter, modelSpiderSiteRecord wRecord)
{
    foreach (ISpiderModuleBase spiderModule in modules)
    {
        spiderModule.reportIteration(reporter, wRecord);
    }
}
/// <summary>
/// Forwards the DLC-initiated event to every crawl-job plug-in in <c>allPlugins</c>.
/// </summary>
/// <param name="__parent">Event source; handed to the plug-ins as <c>crawlerDomainTaskMachine</c> (null when it is of another type).</param>
/// <param name="__task">Domain task passed through to each plug-in.</param>
/// <param name="__wRecord">Site record passed through to each plug-in.</param>
public void eventDLCInitiated(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    // The cast does not depend on the plug-in, so it is resolved once up front.
    crawlerDomainTaskMachine machine = __parent as crawlerDomainTaskMachine;

    foreach (IPlugInCommonBase<crawlJobEngineStageEnum, crawlerDomainTaskMachine> enginePlugin in allPlugins)
    {
        enginePlugin.eventDLCInitiated(machine, __task, __wRecord);
    }
}
/// <summary>
/// Forwards the DLC-finished event to every evaluator plug-in in <c>allPlugins</c>.
/// </summary>
/// <param name="__parent">Event source; handed to the plug-ins as <c>spiderEvaluatorBase</c> (null when it is of another type).</param>
/// <param name="__task">Domain task passed through to each plug-in.</param>
/// <param name="__wRecord">Site record passed through to each plug-in.</param>
public void eventDLCFinished(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    // The cast does not depend on the plug-in, so it is resolved once up front.
    spiderEvaluatorBase evaluator = __parent as spiderEvaluatorBase;

    foreach (IPlugInCommonBase<crawlerDomainTaskIterationPhase, spiderEvaluatorBase> evaluatorPlugin in allPlugins)
    {
        evaluatorPlugin.eventDLCFinished(evaluator, __task, __wRecord);
    }
}
/// <summary>
/// Forwards the DLC-finished event to every index-maintenance plug-in in <c>allPlugins</c>.
/// </summary>
/// <param name="__parent">Event source; handed to the plug-ins as <c>experimentSessionEntry</c> (null when it is of another type).</param>
/// <param name="__task">Domain task passed through to each plug-in.</param>
/// <param name="__wRecord">Site record passed through to each plug-in.</param>
public void eventDLCFinished(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    // The cast does not depend on the plug-in, so it is resolved once up front.
    experimentSessionEntry session = __parent as experimentSessionEntry;

    foreach (IPlugInCommonBase<indexMaintenanceStageEnum, experimentSessionEntry> indexPlugin in allPlugins)
    {
        indexPlugin.eventDLCFinished(session, __task, __wRecord);
    }
}
/// <summary>
/// Registers a domain as failed; each domain name is recorded only once.
/// </summary>
/// <param name="wProfile">Profile of the failed web site; stored in <c>failedSample</c> under the domain name. May be null when <paramref name="wRecord"/> is supplied.</param>
/// <param name="wRecord">Optional site record; when supplied, its domain takes precedence over the profile's domain.</param>
/// <exception cref="aceGeneralException">Both <paramref name="wProfile"/> and <paramref name="wRecord"/> are null.</exception>
public void SetFailedDomain(webSiteProfile wProfile, modelSpiderSiteRecord wRecord = null)
{
    if ((wRecord == null) && (wProfile == null))
    {
        throw new aceGeneralException("Supplied wProfile and wRecord are null - this task should never run on the first place", null, this, "SetFailedDomain(webSiteProfile null)");
    }

    // Profile domain first, then the record's domain overrides it when a record is given.
    string failedDomain = (wProfile != null) ? wProfile.domain : "[unknown]";
    if (wRecord != null)
    {
        failedDomain = wRecord.domain;
    }

    if (failedSample.ContainsKey(failedDomain))
    {
        return;
    }

    failedSample.Add(failedDomain, wProfile);
    domainFailList.Append(failedDomain, true);
}
/// <summary>
/// Feeds the URL of an index page into the crawl context of the site record as a new link.
/// </summary>
/// <param name="wRecord">Site record whose context processes the link.</param>
/// <param name="page">Index page whose <c>url</c> becomes the link target.</param>
public void SetTarget(modelSpiderSiteRecord wRecord, indexPage page)
{
    link target = new link(page.url);

    // The first entry of the loaded page set is passed along with the link
    // (its Value is the default when the set is empty).
    var firstLoadedPage = wRecord.web.webPages.items.FirstOrDefault().Value;

    wRecord.context.processLink(target, firstLoadedPage, false);
}
/// <summary>
/// Evaluation procedure -- implementation for modules without layers.
/// </summary>
/// <param name="input">Module input; it is cast to the <c>spiderLink</c> flavour of <c>spiderModuleData</c>.</param>
/// <param name="wRecord">Site record supplying the current iteration.</param>
/// <returns>New module data whose active list holds the links returned by <c>rankLinks</c>.</returns>
public override ISpiderModuleData evaluate(ISpiderModuleData input, modelSpiderSiteRecord wRecord)
{
    // Single cast, reused below (the original repeated it three times and kept an unused moduleDLC local).
    spiderModuleData<spiderLink> typedInput = (spiderModuleData<spiderLink>)input;
    moduleIterationRecord iterationRecord = typedInput.moduleDLCRecordTableEntry;

    List<spiderLink> output = new List<spiderLink>();
    spiderModuleData<spiderLink> outdata = new spiderModuleData<spiderLink>();

    if (imbWEMManager.settings.directReportEngine.DR_ReportModules)
    {
        iterationRecord.reportEvaluateStart(typedInput, wRecord, this); // module level report: evaluation start
    }

    // This is the part where the layer modules are emulated: every active input element is passed through.
    input.active.ForEach(x => output.Add(x as spiderLink));

    if (imbWEMManager.settings.directReportEngine.DR_ReportModules)
    {
        iterationRecord.reportEvaluateEnd(output, wRecord, this); // module level report: evaluation end
    }

    outdata.active.AddRange(rankLinks(output, wRecord.iteration));

    if (imbWEMManager.settings.directReportEngine.DR_ReportModules)
    {
        iterationRecord.reportEvaluateAlterRanking(outdata.active, wRecord, this); // module level report: after ranking
    }

    return outdata;
}
/// <summary>
/// Builds the load task for the next iteration from the supplied active links.
/// </summary>
/// <param name="wRecord">Site record providing the context, the current iteration and the web model.</param>
/// <param name="activeLinks">Candidate links; the first <c>n</c> of them (as sized by the context) go into the task.</param>
/// <returns>Task for iteration <c>wRecord.iteration + 1</c>; links that did not make it into the task get a cycle registration.</returns>
protected spiderTask __operation_GetLoadTaskCommon(modelSpiderSiteRecord wRecord, IEnumerable<spiderLink> activeLinks)
{
    operation_doControlAndStats(wRecord);

    // The sequence is consumed several times below (size query, Take, foreach).
    // Snapshot it once so a deferred query is neither re-evaluated nor allowed to
    // yield different elements between the passes.
    IList<spiderLink> candidates = activeLinks as IList<spiderLink> ?? activeLinks.ToList();

    int n = wRecord.context.GetNextIterationLTSize(candidates);

    spiderTask outputTask = new spiderTask(wRecord.iteration + 1, wRecord.web);
    outputTask.AddRange(candidates.Take(n));

    foreach (spiderLink candidate in candidates)
    {
        if (!outputTask.Contains(candidate))
        {
            candidate.marks.cycleRegistration(wRecord.iteration);
        }
    }

    return outputTask;
}
/// <summary>
/// Creates an error log through <c>crawlerErrorLog.CreateAndSave</c> and additionally
/// saves it as XML into the logs folder, named after the domain root name.
/// </summary>
/// <param name="ex">Exception being logged.</param>
/// <param name="wRecord">Site record; its domain root name forms the file name.</param>
/// <param name="crawlerDomainTask">Domain task associated with the error.</param>
/// <param name="errorType">Error category.</param>
/// <returns>The created error log.</returns>
internal crawlerErrorLog CreateAndSaveError(Exception ex, modelSpiderSiteRecord wRecord, crawlerDomainTask crawlerDomainTask, crawlerErrorEnum errorType)
{
    crawlerErrorLog errorLog = crawlerErrorLog.CreateAndSave(ex, wRecord, crawlerDomainTask, errorType);

    var logsFolder = folder[DRFolderEnum.logs];
    var crashFilePath = logsFolder.pathFor("DLC_crash_" + wRecord.domainInfo.domainRootName.getFilename());
    errorLog.SaveXML(crashFilePath);

    return errorLog;
}
/// <summary>
/// Reports the affirmative solution once more than <c>treshold</c> loaded pages each hold
/// more cross links than half of the loaded page count.
/// </summary>
/// <param name="sRecord">Not read by this method.</param>
/// <param name="resources">Not read by this method.</param>
/// <returns>A solution for <c>objective</c>, or null when fewer than <c>treshold</c> pages are loaded or the count is not exceeded.</returns>
public override spiderObjectiveSolution evaluate(modelSpiderSiteRecord sRecord, params object[] resources)
{
    // NOTE(review): works on the wRecord member rather than the sRecord argument -- confirm this is intended.
    if (wRecord.web.webPages.Count() < treshold)
    {
        return null;
    }

    int halfOfPages = wRecord.web.webPages.Count() / 2;
    int heavilyLinkedPages = 0;

    foreach (spiderPage loadedPage in wRecord.web.webPages.items.Values)
    {
        if (loadedPage.relationship.crossLinks.Count > halfOfPages)
        {
            heavilyLinkedPages++;
        }
    }

    return (heavilyLinkedPages > treshold)
        ? new spiderObjectiveSolution(objective, afirmative)
        : null;
}
/// <summary>
/// Marks a link as aborted when its score is at or below the first quartile of the learned scores.
/// </summary>
/// <param name="element">Link under evaluation.</param>
/// <param name="sRecord">Not read by this method.</param>
/// <param name="resources">Not read by this method.</param>
/// <returns>
/// A default-constructed solution when fewer than two scores were learned, when the active link count
/// does not exceed <c>treshold</c>, or when the link scores above the first quartile; otherwise an aborted solution.
/// </returns>
public override spiderObjectiveSolution evaluate(spiderLink element, modelSpiderSiteRecord sRecord, params object[] resources)
{
    spiderObjectiveSolution neutral = new spiderObjectiveSolution();

    if (scoreList.Count < 2)
    {
        return neutral;
    }

    // NOTE(review): works on the wRecord member rather than the sRecord argument -- confirm this is intended.
    if (!(wRecord.web.webActiveLinks.Count > treshold))
    {
        return neutral;
    }

    // int.MinValue appears to act as the "not computed yet" marker for the quartiles -- TODO confirm.
    if (q1 == int.MinValue)
    {
        double lowerQuartile;
        double upperQuartile;
        Measures.Quartiles(scoreList.ToArray(), out lowerQuartile, out upperQuartile, true);
        q1 = Convert.ToInt32(lowerQuartile);
        q3 = Convert.ToInt32(upperQuartile);
    }

    return (element.marks.score <= q1)
        ? new spiderObjectiveSolution(element, spiderObjectiveStatus.aborted)
        : neutral;
}
/// <summary>
/// Records the module-level statistics taken at the start of an evaluation:
/// input size, link age, cycler/recycler ratios and the URL assertion of the input targets.
/// </summary>
/// <param name="input">Module input whose active links are measured.</param>
/// <param name="wRecord">Site record supplying the current iteration.</param>
/// <param name="moduleInstance">Evaluating module; when it is a layer module its layer count is added to <c>targets</c>.</param>
public void reportEvaluateStart(spiderModuleData<spiderLink> input, modelSpiderSiteRecord wRecord, spiderModuleBase moduleInstance)
{
    start = DateTime.Now;
    iteration = wRecord.iteration;

    int cyclerCount = 0;
    int recyclerCount = 0;
    int cyclerAgeSum = 0; // NOTE(review): accumulated but not read anywhere in this method
    int inputAgeSum = 0;

    foreach (spiderLink link in input.active)
    {
        inputTargets_collection.Add(link.url);

        var linkAge = iteration - link.iterationDiscovery;

        if (link.marks.cycleCount > 0)
        {
            if (link.marks.cycleLastIteration == (iteration - 1))
            {
                // Cycled in the immediately preceding iteration.
                cyclerCount++;
                cyclerAgeSum += linkAge;
            }
            else if (link.marks.cycleLastIteration < (iteration - 1))
            {
                // Last cycled earlier than the preceding iteration.
                recyclerCount++;
            }
        }

        inputAgeSum += linkAge;
    }

    inputTargets = input.active.Count();
    processed = inputTargets; // the difference is only in the aggregation
    age = inputAgeSum.GetRatio(inputTargets);

    inputTargets_assertion = imbWEMManager.index.pageIndexTable.GetUrlAssertion(inputTargets_collection);
    inputPotentialPrecission = inputTargets_assertion.relevant;
    evaluationCertainty = inputTargets_assertion.certainty;
    inputTargets_assertion.performInfoGainEstimation();
    PotInputIP = inputTargets_assertion.IPnominal;

    targets = inputTargets;
    layerModule = moduleInstance as spiderLayerModuleBase;

    cyclers = cyclerCount.GetRatio(inputTargets);
    recyclers = recyclerCount.GetRatio(inputTargets);

    if (layerModule == null)
    {
        return;
    }

    accumulation = layerModule.layers.CountAll;
    targets += accumulation;
}
/// <summary>
/// Gets the <c>textByIteration</c> stored under the domain root name of the record,
/// adding a new one to <c>items</c> when none is stored yet.
/// </summary>
/// <param name="wRecord">Site record whose <c>domainInfo.domainRootName</c> is the lookup key.</param>
public textByIteration this[modelSpiderSiteRecord wRecord]
{
    get
    {
        string rootName = wRecord.domainInfo.domainRootName;

        // NOTE(review): a fresh instance is constructed on every access, even when the key is already
        // present; if items offers a factory overload of GetOrAdd, consider switching -- TODO confirm.
        textByIteration candidate = new textByIteration(rootName);

        return items.GetOrAdd(rootName, candidate);
    }
}
/// <summary>
/// Learns from a page: updates the running <c>min</c>/<c>max</c> cross-link counts
/// and appends the count to <c>scoreList</c>.
/// </summary>
/// <param name="element">Page whose cross links are counted.</param>
/// <param name="sRecord">Not read by this method.</param>
/// <param name="resources">Not read by this method.</param>
public override void learn(spiderPage element, modelSpiderSiteRecord sRecord, params object[] resources)
{
    int crossLinkCount = element.relationship.crossLinks.Count();

    min = Math.Min(crossLinkCount, min);
    max = Math.Max(crossLinkCount, max);

    double score = Convert.ToDouble(crossLinkCount);
    scoreList.Add(score);
}
/// <summary>
/// Decides whether the stage loop should be left. The checks run in a fixed order:
/// empty task, objective solution set, spider iteration limit, spider page-load limit,
/// stage iteration limit, global stage iteration limit.
/// </summary>
/// <param name="wRecord">Site record providing the logs, the loaded pages and the spider settings.</param>
/// <param name="oSet">Objective solution set handed to <c>operation_executeObjectiveSolutionSet</c>.</param>
/// <param name="task">Current spider task.</param>
/// <returns>True when the stage should be left; false after <c>stageIteration</c> has been incremented.</returns>
/// <exception cref="aceGeneralException"><c>stageIteration</c> exceeds <c>GLOBAL_stageIterationLimit</c>.</exception>
public bool CheckStage(modelSpiderSiteRecord wRecord, spiderObjectiveSolutionSet oSet, spiderTask task)
{
    if (task.Count() == 0)
    {
        wRecord.logBuilder.log("> Spider task [i:" + task.iteration + "] have no tasks defined. Aborting the stage loop.");
        return true;
    }

    // Objective solution set
    if (operation_executeObjectiveSolutionSet(oSet, wRecord))
    {
        return true;
    }

    // Spider limit override: iterations
    if (stageIteration > wRecord.tRecord.instance.settings.limitIterations)
    {
        wRecord.log("> Spider settings (limit iterations) trigered abort at [" + stageIteration + "] Aborting the stage loop.");
        return true;
    }

    // Spider limit override: total pages loaded
    if (wRecord.web.webPages.Count() > wRecord.tRecord.instance.settings.limitTotalPageLoad)
    {
        wRecord.log("> Spider settings (limit pages load) trigered abort at [" + wRecord.web.webPages.Count() + "] Aborting the stage loop.");
        return true;
    }

    if (stageIteration > stageIterationLimit)
    {
        wRecord.logBuilder.log("> Stage [" + name + "] iteration limit reached [ " + stageIterationLimit + " ] -- aborting [" + objectives.Count + "] objectives and move on");
        return true;
    }

    if (stageIteration > GLOBAL_stageIterationLimit)
    {
        throw new aceGeneralException("spiderStage [" + name + "] reached the " + nameof(GLOBAL_stageIterationLimit) + "(" + GLOBAL_stageIterationLimit.ToString() + ")");
    }

    stageIteration++;
    return false;
}
/// <summary>
/// Populates the relationship information (outflow, inflow and cross links) of every loaded page
/// and appends the per-page cross-link counts to a new <c>crossLinkStats</c> series.
/// </summary>
/// <param name="sRecord">Site record whose <c>web.webLinks</c> and <c>web.webPages</c> are processed.</param>
public void operation_detectCrossLinks(modelSpiderSiteRecord sRecord)
{
    // Pass 1: attach each link to every page whose key occurs inside the link key.
    foreach (KeyValuePair<string, spiderLink> ln_pair in sRecord.web.webLinks.items)
    {
        foreach (KeyValuePair<string, spiderPage> pg_pair in sRecord.web.webPages.items)
        {
            // Link keys are link signatures (they are looked up via getLinkSignature below), i.e.
            // identifiers rather than linguistic text, so the match is ordinal instead of the
            // culture-sensitive default of IndexOf(string).
            int pos = ln_pair.Key.IndexOf(pg_pair.Key, System.StringComparison.Ordinal);

            if (pos == -1)
            {
                // Page key does not occur in this link key: nothing to associate.
                continue;
            }

            // NOTE(review): a match within the first five characters is treated as outflow,
            // presumably because the page is then the link's source -- TODO confirm the threshold.
            if (pos < 5)
            {
                pg_pair.Value.relationship.outflowLinks.Add(ln_pair.Key, ln_pair.Value);
            }
            else
            {
                pg_pair.Value.relationship.inflowLinks.Add(ln_pair.Key, ln_pair.Value);
            }
        }
    }

    // Pass 2: an inflow link whose inverse signature is also a known link counts as a cross link.
    sRecord.crossLinkStats.StartNew();

    foreach (KeyValuePair<string, spiderPage> pg_pair in sRecord.web.webPages.items)
    {
        int totalCrossLinks = 0;

        foreach (KeyValuePair<string, spiderLink> ln in pg_pair.Value.relationship.inflowLinks)
        {
            string inverse = ln.Value.getLinkSignature(false, true);

            if (sRecord.web.webLinks.items.ContainsKey(inverse))
            {
                pg_pair.Value.relationship.crossLinks.Add(inverse, sRecord.web.webLinks.items[inverse]);
                totalCrossLinks++;
            }
        }

        sRecord.crossLinkStats.Current().Add(totalCrossLinks);
    }
}
/// <summary>
/// Reports the affirmative solution once the number of loaded pages exceeds <c>treshold</c>.
/// </summary>
/// <param name="sRecord">Not read by this method.</param>
/// <param name="resources">Not read by this method.</param>
/// <returns>A solution for <c>objective</c>, or null while the threshold is not exceeded.</returns>
public override spiderObjectiveSolution evaluate(modelSpiderSiteRecord sRecord, params object[] resources)
{
    // NOTE(review): works on the wRecord member rather than the sRecord argument -- confirm this is intended.
    bool thresholdExceeded = wRecord.web.webPages.Count() > treshold;

    return thresholdExceeded
        ? new spiderObjectiveSolution(objective, afirmative)
        : null;
}
/// <summary>
/// Announces the start of an iteration to the active layer rules, the passive layer rules
/// and the active target-ranking rules, in that order.
/// </summary>
/// <param name="currentIteration">Iteration that is starting.</param>
/// <param name="__wRecord">Site record passed through to each rule.</param>
public override void startIteration(int currentIteration, modelSpiderSiteRecord __wRecord)
{
    foreach (var activeLayerRule in layerActiveRules)
    {
        activeLayerRule.startIteration(currentIteration, __wRecord);
    }

    foreach (var passiveLayerRule in layerPassiveRules)
    {
        passiveLayerRule.startIteration(currentIteration, __wRecord);
    }

    foreach (var activeRankingRule in rankingTargetActiveRules)
    {
        activeRankingRule.startIteration(currentIteration, __wRecord);
    }

    // rankingTargetPassiveRules are not notified here.
}
/// <summary>
/// Creates a web loading task that contains exactly one link.
/// </summary>
/// <param name="lnk">The link to load.</param>
/// <param name="sReport">Site record whose <c>web</c> model the task is bound to.</param>
/// <param name="iteration">Iteration the task is created for.</param>
/// <returns>The new task holding <paramref name="lnk"/>.</returns>
public virtual spiderTask getSpiderSingleTask(spiderLink lnk, modelSpiderSiteRecord sReport, int iteration)
{
    return new spiderTask(iteration, sReport.web) { lnk };
}
/// <summary>
/// Reports the affirmative solution once the last time-series entry shows
/// <c>st_detected_p</c> at or above <c>treshold</c>.
/// </summary>
/// <param name="sRecord">Not read by this method.</param>
/// <param name="resources">Not read by this method.</param>
/// <returns>
/// A solution for <c>objective</c>; null when the threshold is not reached or when there is
/// no last entry of type <c>dataUnitSpiderIteration</c> to inspect.
/// </returns>
public override spiderObjectiveSolution evaluate(modelSpiderSiteRecord sRecord, params object[] resources)
{
    // NOTE(review): works on the wRecord member rather than the sRecord argument -- confirm this is intended.
    dataUnitSpiderIteration last = wRecord.timeseries.lastEntry as dataUnitSpiderIteration;

    // The "as" cast yields null for a missing or differently-typed entry; treat that the same as
    // "threshold not reached" instead of failing with a NullReferenceException.
    if (last == null)
    {
        return null;
    }

    if (last.st_detected_p >= treshold)
    {
        return new spiderObjectiveSolution(objective, afirmative);
    }

    return null;
}
/// <summary>
/// Adds the folder for one iteration under the folder of the given site record.
/// </summary>
/// <param name="iteration">Iteration number; formatted with three digits in the folder name.</param>
/// <param name="wRecord">Site record that selects the parent folder in <c>siteRecords</c>.</param>
/// <returns>The result of the <c>Add</c> call, or null when <c>siteRecords</c> is null.</returns>
public folderNode getIterationFolder(int iteration, modelSpiderSiteRecord wRecord)
{
    if (siteRecords == null)
    {
        return null;
    }

    var siteFolder = siteRecords[wRecord];

    return siteFolder.Add(
        "I" + iteration.ToString("D3"),
        wRecord.domainInfo.domainRootName + iteration.ToString("D3"),
        "Iteration " + iteration + " on domain: " + wRecord.domainInfo.domainName + ". " + iterationDescription);
}
/// <summary>
/// Writes the iteration output: the last general record goes to <c>modules_performance.xml</c>
/// and the last entry of every frontier module record goes to <c>module_[moduleName].xml</c>.
/// </summary>
/// <param name="wRecord">Site record whose <c>frontierDLC</c> module records are saved.</param>
/// <param name="fn">Folder that supplies the file paths.</param>
public void reportIterationOut(modelSpiderSiteRecord wRecord, folderNode fn)
{
    // Guarded the same way as the per-module entries below: nothing is written when no entry exists yet.
    var lastGeneralEntry = generalRecords.GetLastEntry();
    if (lastGeneralEntry != null)
    {
        lastGeneralEntry.saveObjectToXML(fn.pathFor("modules_performance.xml"));
    }

    foreach (moduleDLCRecord mod in wRecord.frontierDLC)
    {
        var lastModEntry = mod.GetLastEntry();
        if (lastModEntry != null)
        {
            lastModEntry.saveObjectToXML(fn.pathFor("module_" + mod.moduleName + ".xml"));
        }
    }
}
/// <summary>
/// Logs the stage header (name, description, iteration limits), then prepares and logs every objective.
/// </summary>
/// <param name="wRecord">Site record whose log builder receives the messages.</param>
/// <param name="sEvaluator">Not read by this method.</param>
public void EnterStage(modelSpiderSiteRecord wRecord, ISpiderEvaluatorBase sEvaluator)
{
    var stageLog = wRecord.logBuilder;

    stageLog.log("-- entering stage [" + name + "] with " + objectives.Count() + " objectives.");
    stageLog.log("> " + description + " (codename:" + codename + ")");
    stageLog.log("> stage iteration limit: " + stageIterationLimit + " (global limit:" + GLOBAL_stageIterationLimit + ")");

    foreach (spiderObjective stageObjective in objectives)
    {
        stageObjective.prepare();
        stageLog.log("> Objective [" + stageObjective.name + "] t:" + stageObjective.supervisor + " ");
    }
}
/// <summary>
/// Answers affirmatively when the current time-series entry shows an <c>avg_score_l_trend</c>
/// below minus <c>treshold</c>, and with a denial otherwise.
/// </summary>
/// <param name="sRecord">Not read by this method.</param>
/// <param name="resources">Not read by this method.</param>
/// <returns>A new solution for <c>objective</c>, carrying either <c>afirmative</c> or <c>denial</c>.</returns>
public override spiderObjectiveSolution evaluate(modelSpiderSiteRecord sRecord, params object[] resources)
{
    // NOTE(review): works on the wRecord member rather than the sRecord argument, and the "as" result
    // is used without a null check -- confirm currentEntry is always a dataUnitSpiderIteration here.
    dataUnitSpiderIteration current = wRecord.timeseries.currentEntry as dataUnitSpiderIteration;

    bool scoreIsFalling = current.avg_score_l_trend < -treshold;

    return scoreIsFalling
        ? new spiderObjectiveSolution(objective, afirmative)
        : new spiderObjectiveSolution(objective, denial);
}
/// <summary>
/// Creates the target registry for a site record: sets the name and description and creates
/// the link-token document set, plus the page-token document set when DLC TF-IDF is enabled.
/// </summary>
/// <param name="__wRecord">Site record the collection belongs to; supplies the domain, the spider name and the settings.</param>
public spiderTargetCollection(modelSpiderSiteRecord __wRecord)
{
    wRecord = __wRecord;

    string __domain = wRecord.domain;
    string __spiderName = wRecord.spider.name;

    // Fragments shared by the hash seeds and the descriptions below.
    string hashSeedSuffix = __domain + " " + __spiderName;
    string siteAndCrawler = __domain + " by the " + __spiderName + " crawler";

    name = __spiderName + " targets on " + __domain;
    description = "Registry of unique absolute URLs discovered on the web site: " + siteAndCrawler;

    dlTargetLinkTokens = new termDocumentSet(
        GetHash("links_" + hashSeedSuffix),
        "URL and anchor text tokens from links discovered on the web site: " + siteAndCrawler);

    if (!wRecord.tRecord.instance.settings.doEnableDLC_TFIDF)
    {
        return;
    }

    dlTargetPageTokens = new termDocumentSet(
        GetHash("pages_" + hashSeedSuffix),
        "Content text tokens from loaded pages the web site: " + siteAndCrawler);
}
/// <summary>
/// Feeds the URL of every page in the index domain's page set into the crawl context as a link,
/// using a page built from the domain URL as the page passed along with each link.
/// </summary>
/// <param name="wRecord">Site record whose context processes the links.</param>
/// <param name="domain">Index domain supplying the page set and the domain URL.</param>
public void SetActiveTargets(modelSpiderSiteRecord wRecord, indexDomain domain)
{
    List<indexPage> indexedPages = domain.getPageSet();

    // One page built from the domain URL is shared by all processed links.
    crawledPage domainCrawl = new crawledPage(domain.url, 0);
    spiderPage domainPage = new spiderPage(domainCrawl, 0, 0);

    foreach (indexPage indexed in indexedPages)
    {
        wRecord.context.processLink(new link(indexed.url), domainPage, false);
    }
}
/// <summary>
/// Announces the start of an iteration to every rule that is an <c>IRuleActiveBase</c> or,
/// failing that, a <c>layerDistributionRuleBase</c>; other rules are skipped.
/// </summary>
/// <param name="currentIteration">Iteration that is starting.</param>
/// <param name="__wRecord">Site record passed through to each notified rule.</param>
public virtual void startIteration(int currentIteration, modelSpiderSiteRecord __wRecord)
{
    foreach (IRuleBase rule in rules)
    {
        IRuleActiveBase activeRule = rule as IRuleActiveBase;
        if (activeRule != null)
        {
            activeRule.startIteration(currentIteration, __wRecord);
            continue;
        }

        // Only reached when the rule is not an active rule.
        layerDistributionRuleBase distributionRule = rule as layerDistributionRuleBase;
        if (distributionRule != null)
        {
            distributionRule.startIteration(currentIteration, __wRecord);
        }
    }
}