protected spiderModuleBase(string __name, string __desc, ISpiderEvaluatorBase __parent)
{
    name = __name;
    code = name[0].ToString().ToUpper();
    description = __desc;
    _parent = __parent;
}
public void eventDLCInitiated(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    // forward the event to every registered reporting plugin
    foreach (IPlugInCommonBase<crawlReportingStageEnum, directReporterBase> plug in allPlugins)
    {
        plug.eventDLCInitiated(__spider, __task, __wRecord);
    }
}
public languageModule(ISpiderEvaluatorBase __parent, basicLanguage __langA, basicLanguage __langB)
    : base("Language Module", "The Targets are distributed into layers by Passive and Active rules testing the tokens of the Target.", __parent)
{
    languageA = __langA;
    languageB = __langB;
}
public layerDistributionRuleBase(string __name, string __description, int __layerID, ISpiderEvaluatorBase __parent, int __layer2ID = 0)
    : base()
{
    parent = __parent;
    name = __name;
    description = __description;
    layerID = __layerID;
    layer2ID = __layer2ID;
}
public spiderEvalRuleForLinkBase(string __name, string __description, int __scoreUnit, int __penaltyUnit, ISpiderEvaluatorBase __parent)
{
    name = __name;
    description = __description;
    parent = __parent;
    penaltyUnit = __penaltyUnit;
    scoreUnit = __scoreUnit;
}
public layerLanguageTFIDF_ARule(basicLanguage __language, int __layerID, ISpiderEvaluatorBase __parent, int __layerID2 = -1)
    : base("Language TF-IDF Test ({0})",
           "Tests Target tokens against the specified language [{0}], sets layerID [{1}] and calculates the layer weight score as the sum of matched Target token TF-IDFs minus the sum of unmatched ones. " +
           "If the resulting weight score is greater than 0, layerID [{1}] is assigned; if it is less than 0, layer2ID [{2}] is assigned.",
           __layerID, __parent, __layerID2)
{
    language = __language;
    name = string.Format(name, language.languageEnglishName);
    description = string.Format(description, language.languageEnglishName, layerID, layer2ID);
}
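// Illustrative sketch (hypothetical helper, not part of the rule class above) of the layer
// decision the rule description spells out. The token source, language test and TF-IDF lookup
// are stand-ins for the framework internals. Requires: using System; using System.Collections.Generic;
public static int PickLayerByTFIDF(IEnumerable<string> tokens, Func<string, bool> isInLanguage,
                                   Func<string, double> tfidf, int layerID, int layer2ID)
{
    double weight = 0;
    foreach (string token in tokens)
    {
        // matched tokens add their TF-IDF, unmatched tokens subtract theirs
        weight += isInLanguage(token) ? tfidf(token) : -tfidf(token);
    }
    return (weight > 0) ? layerID : layer2ID;
}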
public static String GetCrawlFolderName(ISpiderEvaluatorBase spider, crawlerDomainTaskMachineSettings crawlerJobEngineSettings, String templateString)
{
    stringTemplate template = new stringTemplate(templateString);
    PropertyCollection data = GetData(crawlerJobEngineSettings, spider);
    return template.applyToContent(data);
}
public static String GetCrawlFolderName(ISpiderEvaluatorBase spider, ICrawlJobContext state, String templateString)
{
    stringTemplate template = new stringTemplate(templateString);
    PropertyCollection data = GetData(state, spider);
    return template.applyToContent(data);
}
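// Usage sketch for the overloads above. The placeholder syntax is an assumption about
// stringTemplate; the field names mirror the nameComposerFields keys filled by GetData below.
String folderName = GetCrawlFolderName(spider, state, "{crawlerClassName}_PL{variablePLmax}_LT{variableLT}");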
public spiderEvalRuleForPageBase(string __name, string __description, int __scoreUnit, int __penaltyUnit, ISpiderEvaluatorBase __parent)
{
    name = __name;
    description = __description;
    parent = __parent;
    scoreUnit = __scoreUnit;
    penaltyUnit = __penaltyUnit;
    mode = spiderEvalRuleResultEnum.active;
}
public rankPageRank(ISpiderEvaluatorBase __parent, double __alpha = 0.85, double __convergence = 0.0001, int __checkSteps = 10)
    : base("PageRankRule",
           "Integrates the PageRank algorithm from the C# open source project [https://github.com/jeffersonhwang/pagerank]. Damping value (d) is set to: " + __alpha
           + ", convergence (c): " + __convergence + ", check steps: " + __checkSteps
           + ". Targets are ranked by the associated PageRank multiplied by an integer factor: {0}. When no pages have been loaded yet, the maximum score is assigned.",
           1000, 10, __parent)
{
    description = string.Format(description, scoreUnit);
    alpha = __alpha;
    convergence = __convergence;
    checkSteps = __checkSteps;
}
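// Illustrative sketch of the PageRank iteration the rule above relies on (simplified, dense
// adjacency matrix; the referenced pagerank project uses its own structures). d is the damping
// factor (alpha above); iteration stops once the rank vector moves less than `convergence`.
// Dangling nodes are ignored in this sketch. Requires: using System;
static double[] PageRank(bool[,] link, double d, double convergence, int maxSteps)
{
    int n = link.GetLength(0);
    double[] rank = new double[n];
    for (int i = 0; i < n; i++) rank[i] = 1.0 / n;

    for (int step = 0; step < maxSteps; step++)
    {
        double[] next = new double[n];
        for (int j = 0; j < n; j++)
        {
            int outDegree = 0;
            for (int k = 0; k < n; k++) if (link[j, k]) outDegree++;
            if (outDegree == 0) continue;
            for (int k = 0; k < n; k++)
                if (link[j, k]) next[k] += rank[j] / outDegree;
        }
        double delta = 0;
        for (int k = 0; k < n; k++)
        {
            next[k] = (1 - d) / n + d * next[k];
            delta += Math.Abs(next[k] - rank[k]);
        }
        rank = next;
        if (delta < convergence) break;
    }
    return rank;
}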
public diversityModule(double ttd, double ptd, ISpiderEvaluatorBase __parent, int expansionSteps)
    : base("Diversity Module", "Uses inverse semantic similarity between a target and the Target link url/title corpus and the Target page content corpus", __parent)
{
    tt_diversityFactor = ttd;
    pt_diversityFactor = ptd;
    termExpansionSteps = expansionSteps;
}
public templateModule(bool isRankingAlternative, ISpiderEvaluatorBase __parent)
    : base("Template Module", "The Targets are distributed into layers according to the semantic role of the host content block. ", __parent)
{
    alternative = isRankingAlternative;
    if (alternative)
    {
        description = description + "This is an alternative implementation with XPath depth based ranking";
    }
}
public structureModule(bool isRankingAlternative, ISpiderEvaluatorBase __parent)
    : base("Structure Module", "The Targets are distributed into layers according to the URL-structure graph tree. ", __parent)
{
    alternative = isRankingAlternative;
    if (alternative)
    {
        description = description + "This is an alternative implementation with URL structure graph based ranking";
    }
}
public void EnterStage(modelSpiderSiteRecord wRecord, ISpiderEvaluatorBase sEvaluator)
{
    wRecord.logBuilder.log("-- entering stage [" + name + "] with " + objectives.Count() + " objectives.");
    wRecord.logBuilder.log("> " + description + " (codename: " + codename + ")");
    wRecord.logBuilder.log("> stage iteration limit: " + stageIterationLimit + " (global limit: " + GLOBAL_stageIterationLimit + ")");
    foreach (spiderObjective objective in objectives)
    {
        objective.prepare();
        wRecord.logBuilder.log("> Objective [" + objective.name + "] t:" + objective.supervisor);
    }
}
public rankHITS(ISpiderEvaluatorBase __parent, double __convergence = 0.0001, int __checkSteps = 10)
    : base("rankTargetsHITS",
           "Adaptation of the HITS algorithm for inner crawl frontier ranking. Convergence (c): " + __convergence
           + ", iteration steps: " + __checkSteps
           + ". Targets are ranked by the associated Authority + Hub score multiplied by an integer factor: {0}.",
           1000, 0, __parent)
{
    description = string.Format(description, scoreUnit);
    convergence = __convergence;
    checkSteps = __checkSteps;
    hits = new HITSRank();
}
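// Illustrative sketch of one HITS update round behind the rule above (simplified; the
// framework's HITSRank keeps its own link structures). Scores are L2-normalized each round,
// and rounds repeat until the change falls below the convergence threshold.
// Requires: using System;
static void HitsRound(bool[,] link, double[] hub, double[] authority)
{
    int n = link.GetLength(0);
    // authority(p) = sum of hub scores of pages linking to p
    for (int p = 0; p < n; p++)
    {
        authority[p] = 0;
        for (int q = 0; q < n; q++) if (link[q, p]) authority[p] += hub[q];
    }
    // hub(p) = sum of authority scores of the pages p links to
    for (int p = 0; p < n; p++)
    {
        hub[p] = 0;
        for (int q = 0; q < n; q++) if (link[p, q]) hub[p] += authority[q];
    }
    Normalize(authority);
    Normalize(hub);
}

static void Normalize(double[] v)
{
    double norm = 0;
    foreach (double x in v) norm += x * x;
    norm = Math.Sqrt(norm);
    if (norm > 0) for (int i = 0; i < v.Length; i++) v[i] /= norm;
}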
public rankDiversityALink(ISpiderEvaluatorBase __parent, double __target_sd = 0.5, double __page_sd = 0.5, int __expansionSteps = 3)
    : base("TokenDiversity",
           "Ranks links by semantic diversity (inverted similarity) against Target tokens (" + __target_sd.ToString("P")
           + ") and crawled pages tokens (" + __page_sd.ToString("P") + "). "
           + "Target tokens are semantically expanded in [" + __expansionSteps + "] steps using the Semantic Lexicon",
           100000, 0, __parent)
{
    expansionSteps = __expansionSteps;
    target_sd = __target_sd;
    page_sd = __page_sd;
    mode = spiderEvalRuleResultEnum.active;
    doAdjustScoreByLanguageDetection = imbWEMManager.settings.crawlAdHok.FLAG_doAdjustDiversityScore;
    __parent.settings.doEnableDLC_TFIDF = true;
}
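// Illustrative sketch (hypothetical helper, not framework code): semantic diversity as
// inverted cosine similarity between a link's token vector and a reference corpus vector,
// weighted by the target/page factors from the constructor above. Requires: using System;
static double DiversityScore(double[] linkVec, double[] corpusVec, double factor)
{
    double dot = 0, na = 0, nb = 0;
    for (int i = 0; i < linkVec.Length; i++)
    {
        dot += linkVec[i] * corpusVec[i];
        na += linkVec[i] * linkVec[i];
        nb += corpusVec[i] * corpusVec[i];
    }
    double similarity = (na > 0 && nb > 0) ? dot / (Math.Sqrt(na) * Math.Sqrt(nb)) : 0;
    return factor * (1.0 - similarity); // more diverse => higher score
}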
public static PropertyCollection GetData(ICrawlJobContext state, ISpiderEvaluatorBase crawler)
{
    PropertyCollection data = new PropertyCollection();
    data[nameComposerFields.crawlerClassName] = crawler.GetType().Name;
    data[nameComposerFields.crawlerTitleName] = crawler.name;
    data[nameComposerFields.crawlerFileFriendlyName] = crawler.name.getCleanFilepath().Replace("-", "");
    data[nameComposerFields.variablePLmax] = crawler.settings.limitTotalPageLoad;
    data[nameComposerFields.variableLT] = crawler.settings.limitIterationNewLinks;
    data[nameComposerFields.sampleSize] = state.sampleList.Count();
    return data;
}
public static void SetTestIDAndSignature(this IReportBenchmark target, ISpiderEvaluatorBase evaluator, ICrawlJobContext state, modelSpiderTestRecord tRecord)
{
    if (evaluator != null)
    {
        target.TestSignature = evaluator.name + "|DC:" + state.sampleList.Count
            + "|PL:" + evaluator.settings.limitTotalPageLoad
            + "|LT:" + evaluator.settings.limitIterationNewLinks
            + "|IID:" + imbWEMManager.index.current_indexID
            + "|SID:" + imbWEMManager.index.indexSessionEntry.SessionID.add(evaluator.settings.SignatureSufix);
        target.TestID = md5.GetMd5Hash(objectSerialization.ObjectToXML(imbWEMManager.settings))
            + "-" + evaluator.crawlerHash
            + "-" + md5.GetMd5Hash(target.TestSignature).toWidthMaximum(3, "");
    }
    else if (state != null)
    {
        target.TestID = state.setupHash_global + "-" + state.setupHash_crawler;
    }
    else
    {
        target.TestID = md5.GetMd5Hash(target.GetHashCode().ToString());
    }
    target.Crawler = tRecord.instance.name;
}
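// Minimal sketch of an MD5 hex-hash helper equivalent in spirit to md5.GetMd5Hash used
// above, built only on the BCL, for readers outside this codebase:
static string GetMd5Hex(string input)
{
    using (var md5 = System.Security.Cryptography.MD5.Create())
    {
        byte[] hash = md5.ComputeHash(System.Text.Encoding.UTF8.GetBytes(input));
        var sb = new System.Text.StringBuilder(hash.Length * 2);
        foreach (byte b in hash) sb.Append(b.ToString("x2"));
        return sb.ToString();
    }
}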
public void eventIteration(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    if (!IsEnabled) return;

    foreach (reportPlugIn_base plug in allPlugins)
    {
        try
        {
            plug.eventIteration(__spider, __task, __wRecord);
        }
        catch (Exception ex)
        {
            aceLog.log("Reporting Plugin [" + plug.name + "]:" + plug.GetType().Name + " crashed at status report execution: " + ex.Message);
            crawlerErrorLog cel = new crawlerErrorLog(ex, null, null, crawlerErrorEnum.indexPlugin);
            cel.SaveXML();
        }
    }
}
public static void Describe(this ISpiderEvaluatorBase evaluator, ITextRender output)
{
    output.AppendHeading("Crawler [" + evaluator.name + "]");
    output.AppendPair("Class name", evaluator.GetType().Name, true, ": ");
    output.AppendPair("Description", evaluator.description, true, ": ");

    if (evaluator is spiderModularEvaluatorBase)
    {
        spiderModularEvaluatorBase modularEvaluator = (spiderModularEvaluatorBase)evaluator;
        foreach (var md in modularEvaluator.modules)
        {
            md.DescribeModule(output);
        }
    }

    if (evaluator is spiderEvaluatorSimpleBase)
    {
        spiderEvaluatorSimpleBase simpleEvaluator = (spiderEvaluatorSimpleBase)evaluator;
        simpleEvaluator.linkActiveRules.ToList().ForEach(x => x.DescribeRule(output));
    }

    output.AppendHorizontalLine();
    output.open("div", "General configuration", "Crawler configuration properties declared in common settings class");
    evaluator.settings.GetUserManual(output, "", true, true);
    output.close();
}
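// Usage sketch for Describe: dumping a crawler's self-description into a report file.
// The builderForMarkdown output here is an assumption based on its use elsewhere in this
// codebase; any ITextRender implementation would do.
ITextRender output = new builderForMarkdown();
evaluator.Describe(output);
output.ToString().saveStringToFile(folder.pathFor("crawler_description.md"));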
public abstract void eventIteration(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord);
public layerBlockRolePRule(nodeBlockSemanticRoleEnum __semanticRole, int __layerID, ISpiderEvaluatorBase __parent, int __layer2ID = -1)
    : base("Block Role ({0})", "Links found in a block with semantic role [{0}] are set to layerID [{1}], otherwise to [{2}].", __layerID, __parent, __layer2ID)
{
    semanticRole = __semanticRole;
    name = string.Format(name, semanticRole);
    description = string.Format(description, semanticRole, layerID, layer2ID);
    __parent.settings.doEnableDLC_BlockTree = true;
}
/// <summary>
/// Called just after a new DLC thread has been prepared to run
/// </summary>
/// <param name="__spider">The spider.</param>
/// <param name="__task">The task.</param>
/// <param name="__wRecord">The w record.</param>
public abstract void eventDLCInitiated(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord);
public abstract void eventUniversal<TFirst, TSecond>(crawlerDomainTaskIterationPhase stage, ISpiderEvaluatorBase __parent, TFirst __task, TSecond __resource);
/// <summary>
/// Called when the plugin is installed, i.e. while a new spider instance is being constructed
/// </summary>
/// <param name="__spider">The spider.</param>
public abstract void eventPluginInstalled(ISpiderEvaluatorBase __spider);
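// Hypothetical minimal plugin (illustration only) showing how the abstract event hooks
// above fit together. The reportPlugIn_base base class is an assumption taken from its
// use in eventIteration earlier; the log calls mirror patterns used in this codebase.
public class consoleTracePlugin : reportPlugIn_base
{
    public override void eventPluginInstalled(ISpiderEvaluatorBase __spider)
    {
        aceLog.log("Plugin installed on crawler [" + __spider.name + "]");
    }

    public override void eventDLCInitiated(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
    {
        aceLog.log("DLC started for [" + __wRecord.domainInfo.domainName + "]");
    }

    public override void eventIteration(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
    {
        aceLog.log("Iteration " + __wRecord.iteration);
    }

    public override void eventUniversal<TFirst, TSecond>(crawlerDomainTaskIterationPhase stage, ISpiderEvaluatorBase __parent, TFirst __task, TSecond __resource)
    {
        // no-op in this sketch
    }
}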
public rankTargetUrlGraph(ISpiderEvaluatorBase __parent, int __scoreUnit = 100)
    : base("Target URL Graph", "In the Learning phase, builds a link-path graph from all links sent for evaluation; according to the normalized score of the matched node, it assigns a proportion of [{0}].", __scoreUnit, 0, __parent)
{
    description = string.Format(description, __scoreUnit);
}
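// Illustrative sketch (hypothetical helper): the proportional score assignment described
// above, i.e. a node's score normalized against the graph maximum and scaled by scoreUnit.
static int ProportionalScore(double nodeScore, double maxNodeScore, int scoreUnit)
{
    if (maxNodeScore <= 0) return 0;
    return (int)(scoreUnit * (nodeScore / maxNodeScore));
}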
protected spiderLayerModuleBase(string __name, string __desc, ISpiderEvaluatorBase __parent)
    : base(__name, __desc, __parent)
{
}
public spiderRankingModuleBase(string __name, string __desc, ISpiderEvaluatorBase __parent)
    : base(__name, __desc, __parent)
{
}
public ruleRandomNoiseScore(int maximum, int minimum, ISpiderEvaluatorBase __parent)
    : base("NoiseScore", "Passive rule adding a random score between {0} and {1} points, used to create initial noise in the frontier.", maximum, minimum, __parent)
{
    // format arguments ordered min-to-max so the description reads naturally
    description = string.Format(description, minimum, maximum);
    rnd = new Random();
}
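// Usage sketch: the per-link draw such a rule would make (assuming an inclusive upper
// bound; Random.Next's second argument is exclusive, hence the +1).
int noise = rnd.Next(minimum, maximum + 1);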
public void reportIteration(dataUnitSpiderIteration dataUnit, modelSpiderSiteRecord wRecord, ISpiderEvaluatorBase evaluator)
{
    iterationPerformanceRecord ip_record = new iterationPerformanceRecord(wRecord);
    wRecord.iterationTableRecord.Add(ip_record);

    folderNode fn;

    if (imbWEMManager.settings.directReportEngine.doIterationReport)
    {
        if (imbWEMManager.settings.directReportEngine.doDomainReport)
        {
            fn = getIterationFolder(dataUnit.iteration, wRecord);

            if (REPORT_WRECORD_LOG)
            {
                wRecord.logBuilder.getLastLine().saveStringToFile(fn.pathFor("wrecord.txt"));
            }

            textByIteration url_loaded = urlsLoaded[wRecord];
            textByIteration url_detected = urlsDetected[wRecord];

            if (REPORT_MODULES)
            {
                if (imbWEMManager.settings.directReportEngine.DR_ReportModules_XMLIteration)
                {
                    if (wRecord.tRecord.instance is spiderModularEvaluatorBase)
                    {
                        wRecord.frontierDLC.reportIterationOut(wRecord, fn);
                    }
                }
            }

            string its = dataUnit.iteration.ToString("D3");

            if (REPORT_TIMELINE)
            {
                objectSerialization.saveObjectToXML(ip_record, fn.pathFor("performance.xml"));
            }

            if (REPORT_ITERATION_URLS)
            {
                if (wRecord.iteration > 0)
                {
                    builderForMarkdown now_loaded = new builderForMarkdown();
                    List<spiderTarget> targets_loaded = wRecord.context.targets.GetLoadedInIteration(wRecord.iteration - 1);
                    int tc = 0;
                    foreach (spiderTarget t in targets_loaded)
                    {
                        reportTarget(t, fn, tc);
                        now_loaded.AppendLine(t.url);
                        now_loaded.AppendHorizontalLine();
                        now_loaded.Append(t.marks.GetActiveResults());
                        now_loaded.AppendHorizontalLine();
                        now_loaded.Append(t.marks.GetPassiveResults());
                        now_loaded.AppendHorizontalLine();

                        var dt = t.marks.getHistory(t.url, wRecord.tRecord.instance.name);
                        dt.Save(fn, imbWEMManager.authorNotation, its + "_loadedNow");
                        now_loaded.AppendTable(dt, false);
                        tc++;
                    }
                    now_loaded.ToString().saveStringToFile(fn.pathFor(its + "_loadedNow.txt"));

                    spiderTaskResult loadResults = wRecord.spiderTaskResults[wRecord.iteration - 1];
                    loadResults.getDataTable().GetReportAndSave(fn, notation, "loadResults", true);
                }

                fileunit detected = new fileunit(fn.pathFor(its + "_dt.txt"), false);
                fileunit loaded = new fileunit(fn.pathFor(its + "_ld.txt"), false);
                fileunit relp = new fileunit(fn.pathFor(its + "_srb_ld.txt"), false);
                relp.Append(wRecord.relevantPages, true);

                foreach (spiderTarget t in wRecord.context.targets)
                {
                    if (t.page != null)
                    {
                        loaded.Append(t.url);
                        url_loaded[dataUnit.iteration].Add(t.url);
                    }
                    else
                    {
                        detected.Append(t.url);
                        url_detected[dataUnit.iteration].Add(t.url);
                    }
                }

                // rank, url and score columns per active link
                string lineFormat = "{0,5} {1,30} [s:{2,6}]" + Environment.NewLine;
                fileunit active = new fileunit(fn.pathFor(its + "_act.txt"), false);
                int c = 1;
                foreach (var lnk in wRecord.web.webActiveLinks)
                {
                    active.Append(string.Format(lineFormat, c, lnk.url, lnk.marks.score));
                    active.Append(lnk.marks.GetLayerAssociation());
                    c++;
                }

                detected.Save();
                loaded.Save();
                relp.Save();
                active.Save();
            }
        }
    }

    wRecord.tRecord.instance.reportIteration(this, wRecord);
}