/// <summary>
/// Removes the target of every item in the given task result from the active link set
/// and logs how many entries were actually removed.
/// </summary>
/// <param name="stResult">The task result whose item targets are taken out of the active links.</param>
/// <returns>The number of targets that were removed from the active link set.</returns>
public int acceptLoadedPages(spiderTaskResult stResult)
{
    int removed = 0;

    foreach (spiderTaskResultItem item in stResult)
    {
        // Remove() reports whether the target was present in the set.
        bool wasActive = wRecord.web.webActiveLinks.items.Remove(item.target);
        removed += wasActive ? 1 : 0;
    }

    string message = "[" + removed.ToString() + "] active links were removed from the set as spiderTaskResult processed them.";
    wRecord.logBuilder.log(message);

    return removed;
}
/// <summary>
/// E1: Receives the result of a load task, passes it to the crawl context and stores
/// the resulting link and page counters in the time-series entry of that iteration.
/// </summary>
/// <param name="stResult">The task result; its <c>task.iteration</c> selects the time-series entry.</param>
/// <param name="wRecord">The site record that is updated from the result.</param>
/// <returns>The time-series entry for the result's iteration, with its counters filled in.</returns>
public dataUnitSpiderIteration operation_receiveResult(spiderTaskResult stResult, modelSpiderSiteRecord wRecord)
{
    dataUnitSpiderIteration iterationUnit = wRecord.timeseries[stResult.task.iteration];

    wRecord.logBuilder.log("Received: " + stResult.Count() + " (it:" + stResult.task.iteration + ")");
    wRecord.iteration = stResult.task.iteration;

    // Empty results are not stored.
    if (stResult.Any())
    {
        wRecord.spiderTaskResults.Add(stResult);
    }

    // Snapshot taken before processing, so the number of newly detected targets can be derived afterwards.
    int targetsBefore = wRecord.web.webTargets.items.Count();

    int flushedLinks = wRecord.context.acceptLoadedPages(stResult);
    int failedLoads = wRecord.context.processLoaderResult(
        stResult,
        settings.FRONTIER_doLinkResolver,
        settings.FRONTIER_doLinkHarvest,
        settings.FRONTIER_harvestNature,
        settings.FRONTIER_harvestScope);

    int detectedLinks = wRecord.web.webTargets.items.Count() - targetsBefore;

    iterationUnit.nw_detected_l = detectedLinks;
    iterationUnit.nw_failed_l = failedLoads;
    iterationUnit.tc_detected_l = wRecord.web.webLinks.items.Count();
    iterationUnit.tc_loaded_p = wRecord.web.webPages.items.Count();
    iterationUnit.tc_detected_p = wRecord.web.webTargets.items.Count();
    iterationUnit.tc_ingame_l = wRecord.web.webActiveLinks.items.Count();
    iterationUnit.nw_processed_l = flushedLinks;

    wRecord.logBuilder.log("Active links [" + wRecord.web.webActiveLinks.items.Count() + "] change [" + detectedLinks + "]");

    return iterationUnit;
}
/// <summary>
/// Runs the spider task: loads every link of the task (in parallel when
/// <c>crawlerDoParallelTaskLoads</c> is set), attaches each page that did not fail to the
/// site's targets and collects all items into the task result.
/// </summary>
/// <param name="sTask">The task whose links are loaded.</param>
/// <param name="wRecord">The site record that supplies the child page records and the target collection.</param>
/// <returns>The result created by <c>sTask.createResult()</c>, filled with the loaded items and finished.</returns>
/// <remarks>
/// Exceptions raised while loading are logged and swallowed; the (possibly partial) result is still returned.
/// </remarks>
public spiderTaskResult runSpiderTask(spiderTask sTask, modelSpiderSiteRecord wRecord)
{
    spiderTaskResult sResult = sTask.createResult();

    // Per-link work, shared by the parallel and the sequential path.
    Action<spiderLink> loadLink = ln =>
    {
        modelSpiderPageRecord pRecord = wRecord.getChildRecord(ln, ln.url);
        spiderTaskResultItem rItem = runSpiderTaskItem(ln, sTask.doTokenization, pRecord);

        if (rItem.status != pageStatus.failed)
        {
            wRecord.context.targets.AttachPage(rItem, pRecord.logBuilder, blockCount);
        }

        // NOTE(review): in the parallel path AddResult is called from several threads - confirm it is thread-safe.
        sResult.AddResult(rItem);
    };

    try
    {
        if (imbWEMManager.settings.crawlerJobEngine.crawlerDoParallelTaskLoads)
        {
            Parallel.ForEach(sTask, loadLink);
        }
        else
        {
            foreach (spiderLink ln in sTask)
            {
                loadLink(ln);
            }
        }
    }
    catch (AggregateException aex)
    {
        // Parallel.ForEach wraps worker failures; log each real cause instead of the generic wrapper message.
        foreach (Exception inner in aex.Flatten().InnerExceptions)
        {
            imbWEMManager.log.log("runSpiderTask exception: " + inner.Message);
        }
    }
    catch (Exception ex)
    {
        imbWEMManager.log.log("runSpiderTask exception: " + ex.Message);
    }

    loadIndex = loadIndex + sResult.Count();

    // After enough loads, force a collection and log how much memory was released.
    if (loadIndex > imbWEMManager.settings.crawlerJobEngine.loadCountForGC)
    {
        long mem = GC.GetTotalMemory(false);
        GC.Collect();
        // NOTE(review): WaitForFullGCComplete reports on a registered full-GC notification;
        // GC.WaitForPendingFinalizers may have been intended here - confirm.
        GC.WaitForFullGCComplete();
        long dmem = GC.GetTotalMemory(false);

        aceLog.log("Memory allocation reduction [after " + loadIndex + " tasks]: " + (mem - dmem).getMByteCountFormated());
        loadIndex = 0;
    }

    sResult.finish();
    return sResult;
}
/// <summary>
/// Processes a loader result: flags content duplicates, registers each remaining page in the
/// web page set and, for pages that did not fail, harvests their links.
/// </summary>
/// <param name="stResult">The task result whose items are processed.</param>
/// <param name="doLinkResolver">Not used by this method body; kept for interface compatibility.</param>
/// <param name="doLinkHarvest">When <c>true</c>, links matching <paramref name="nature"/> and <paramref name="scope"/> are passed to <c>processLink</c>.</param>
/// <param name="nature">The nature flags a link must have to be harvested.</param>
/// <param name="scope">The scope flags a link must have to be harvested.</param>
/// <returns>The number of result items whose status is <c>pageStatus.failed</c>.</returns>
public int processLoaderResult(spiderTaskResult stResult, bool doLinkResolver = true, bool doLinkHarvest = true, linkNature nature = linkNature.navigation, linkScope scope = linkScope.inner)
{
    int nw_failed_l = 0;

    // Iterate over all loaded pages.
    foreach (spiderTaskResultItem cresult in stResult)
    {
        spiderPage pg = cresult.sPage;
        bool isLoaded = cresult.status != pageStatus.failed;

        if (isLoaded)
        {
            web.webPageContentHashList.AddInstance(pg.contentHash, 1);

            // A count above one means this content hash was already seen: the page is a duplicate.
            if (web.webPageContentHashList[pg.contentHash] > 1)
            {
                if (imbWEMManager.settings.executionLog.doPageErrorOrDuplicateLog)
                {
                    aceLog.log("Page [" + pg.url + "] - is content duplicate ");
                }

                wRecord.listOfDuplicatedPages.Add(new contentHashAndAddressEntry(pg.url, pg.contentHash, web.webPageContentHashList[pg.contentHash]));
                wRecord.duplicateCount++;

                var t = targets.GetByTarget(cresult.target);
                if (t != null)
                {
                    t.isDuplicate = true;
                }

                // NOTE(review): duplicates skip cresult.dispose() below - confirm this is intended.
                continue;
            }
        }

        // Write the page reference into the link.
        cresult.target.targetedPage = pg;

        // Register the page in the webPages set.
        if (!wRecord.web.webPages.Add(pg))
        {
            wRecord.logBuilder.log("Web page [" + pg.url + "] was loaded before - check the algorithm");
        }

        if (isLoaded)
        {
            if (doLinkHarvest)
            {
                // Extract the links of the page that match the requested nature and scope.
                List<link> links = cresult.page.links.Where<link>(x => (x.nature.HasFlag(nature) && x.scope.HasFlag(scope))).ToList();

                foreach (link harvested in links)
                {
                    processLink(harvested, cresult.sPage);
                }
            }
        }
        else
        {
            nw_failed_l++;
        }

        cresult.dispose();
    }

    // Null-conditional invoke reads the handler once, avoiding the check-then-call race.
    OnLoaderTaskProcessed?.Invoke(wRecord, new modelSpiderSiteRecordEventArgs(stResult));

    return nw_failed_l;
}
/// <summary>
/// Creates event arguments that carry a load result and the type of the event.
/// </summary>
/// <param name="__result">The task result stored in <c>LoadResult</c>.</param>
/// <param name="__type">The event type stored in <c>type</c>; defaults to <c>DLCTaskProcessed</c>.</param>
public modelSpiderSiteRecordEventArgs(
    spiderTaskResult __result,
    modelSpiderSiteRecordEventType __type = modelSpiderSiteRecordEventType.DLCTaskProcessed)
{
    this.type = __type;
    this.LoadResult = __result;
}
/// <summary>
/// Records the performance of the current iteration and, when iteration and domain reporting
/// are both enabled, writes the per-iteration report files into the iteration folder.
/// The evaluator instance of the test record is always notified at the end.
/// </summary>
/// <param name="dataUnit">The iteration data unit; its <c>iteration</c> selects the folder and file prefix.</param>
/// <param name="wRecord">The site record being reported.</param>
/// <param name="evaluator">Not used by this method body.</param>
public void reportIteration(dataUnitSpiderIteration dataUnit, modelSpiderSiteRecord wRecord, ISpiderEvaluatorBase evaluator)
{
    iterationPerformanceRecord ip_record = new iterationPerformanceRecord(wRecord);
    wRecord.iterationTableRecord.Add(ip_record);

    if (imbWEMManager.settings.directReportEngine.doIterationReport
        && imbWEMManager.settings.directReportEngine.doDomainReport)
    {
        folderNode fn = getIterationFolder(dataUnit.iteration, wRecord);

        if (REPORT_WRECORD_LOG)
        {
            wRecord.logBuilder.getLastLine().saveStringToFile(fn.pathFor("wrecord.txt"));
        }

        // NOTE(review): fileprefix is not read anywhere below.
        string fileprefix = wRecord.domainInfo.domainRootName.getCleanFilepath();

        textByIteration url_loaded = urlsLoaded[wRecord];
        textByIteration url_detected = urlsDetected[wRecord];

        if (REPORT_MODULES
            && imbWEMManager.settings.directReportEngine.DR_ReportModules_XMLIteration
            && wRecord.tRecord.instance is spiderModularEvaluatorBase)
        {
            wRecord.frontierDLC.reportIterationOut(wRecord, fn);
        }

        // Zero-padded iteration number, used as file name prefix.
        string its = dataUnit.iteration.ToString("D3");

        if (REPORT_TIMELINE)
        {
            objectSerialization.saveObjectToXML(ip_record, fn.pathFor("performance.xml"));
        }

        if (REPORT_ITERATION_URLS)
        {
            if (wRecord.iteration > 0)
            {
                // Report the targets loaded in the previous iteration.
                builderForMarkdown now_loaded = new builderForMarkdown();
                List<spiderTarget> targets_loaded = wRecord.context.targets.GetLoadedInIteration(wRecord.iteration - 1);

                int tc = 0;
                foreach (spiderTarget t in targets_loaded)
                {
                    reportTarget(t, fn, tc);

                    now_loaded.AppendLine(t.url);
                    now_loaded.AppendHorizontalLine();
                    now_loaded.Append(t.marks.GetActiveResults());
                    now_loaded.AppendHorizontalLine();
                    now_loaded.Append(t.marks.GetPassiveResults());
                    now_loaded.AppendHorizontalLine();

                    var dt = t.marks.getHistory(t.url, wRecord.tRecord.instance.name);
                    dt.Save(fn, imbWEMManager.authorNotation, its + "_loadedNow");
                    now_loaded.AppendTable(dt, false);

                    tc++;
                }

                now_loaded.ToString().saveStringToFile(fn.pathFor(its + "_loadedNow.txt"));

                // NOTE(review): assumes spiderTaskResults holds an entry at index (iteration - 1) - confirm.
                spiderTaskResult loadResults = wRecord.spiderTaskResults[wRecord.iteration - 1];
                loadResults.getDataTable().GetReportAndSave(fn, notation, "loadResults", true);
            }

            fileunit detected = new fileunit(fn.pathFor(its + "_dt.txt"), false);
            fileunit loaded = new fileunit(fn.pathFor(its + "_ld.txt"), false);
            fileunit relp = new fileunit(fn.pathFor(its + "_srb_ld.txt"), false);

            // NOTE(review): relp is never explicitly saved; presumably the 'true' argument handles it - confirm.
            relp.Append(wRecord.relevantPages, true);

            // Targets with a page go to the "loaded" list, the rest to the "detected" list.
            foreach (spiderTarget t in wRecord.context.targets)
            {
                if (t.page != null)
                {
                    loaded.Append(t.url);
                    url_loaded[dataUnit.iteration].Add(t.url);
                }
                else
                {
                    detected.Append(t.url);
                    url_detected[dataUnit.iteration].Add(t.url);
                }
            }

            // {0} = ordinal, {1} = url, {2} = score. The original used {1} twice, so the score was never printed.
            string lineFormat = "{0,5} {1,30} [s:{2,6}]" + Environment.NewLine;

            fileunit active = new fileunit(fn.pathFor(its + "_act.txt"), false);
            int c = 1;
            foreach (var lnk in wRecord.web.webActiveLinks)
            {
                active.Append(string.Format(lineFormat, c, lnk.url, lnk.marks.score));
                active.Append(lnk.marks.GetLayerAssociation());
                c++;
            }

            detected.Save();
            loaded.Save();
            active.Save();
        }
    }

    wRecord.tRecord.instance.reportIteration(this, wRecord);
}
/// <summary>
/// Starts this domain task on the calling thread: loads the seed page (stage 1), then repeats
/// the receive / apply-rules / get-task / load cycle until the stage control or one of the limits
/// ends it (stage 2), and finally evaluates the pages (stage 3).
/// </summary>
/// <remarks>
/// On the first exception for a domain the task is re-initialized and started again;
/// on a repeated exception the status is set to aborted. In both cases the error is reported.
/// </remarks>
public void start()
{
    iterationStatus = crawlerDomainTaskIterationPhase.iterationProcessNotStarted;
    status = crawlerDomainTaskStatusEnum.working;
    executionThread = Thread.CurrentThread;

    // NOTE(review): status was assigned 'working' two statements above - confirm when this branch is expected to fire.
    if (status == crawlerDomainTaskStatusEnum.aborted)
    {
        aceLog.log("Aborted DomainTask --> start()");
        return;
    }

    lastIterationStart = DateTime.Now;
    startTime = DateTime.Now;

    aceLog.consoleControl.setAsOutput(wRecord, "" + wProfile.domain);
    parent.parent.reportPlugins.eventUniversal(crawlReportingStageEnum.DLCPreinitiation, reporter, this, wRecord);

    try
    {
        // ---------------------------------------------------------------- STAGE 1: seed page
        iterationStatus = crawlerDomainTaskIterationPhase.loadingSeedPage;

        spiderWebLoader webLoader = new spiderWebLoader(parent.parent.dataLoadTaker);
        webLoader.controler = parent.parent.webLoaderControler;

        stageControl.prepare();

        spiderTask task = evaluator.getSpiderSingleTask(web.seedLink, wRecord, 1);
        spiderTaskResult loadResult = webLoader.runSpiderTask(task, wRecord); // executes the seed load

        if (loadResult.calculateSuccessRate() == 0)
        {
            wRecord.log("Domain [" + wRecord.domain + "] is considered as failed since landing page load failed");
            parent.parent.webLoaderControler.SetFailedDomain(wProfile, wRecord);
        }

        spiderObjectiveSolutionSet solutions = null;

        stageControl.stage.EnterStage(wRecord, evaluator);

        parent.parent.plugins.eventDLCInitiated(parent.parent, this, wRecord);
        evaluator.plugins.eventDLCInitiated(evaluator as spiderEvaluatorBase, this, wRecord);
        imbWEMManager.index.plugins.eventDLCInitiated(imbWEMManager.index.experimentEntry, this, wRecord);
        parent.parent.reportPlugins.eventDLCInitiated(reporter, this, wRecord);

        int termsBefore = 0;

        // ---------------------------------------------------------------- STAGE 2: iteration loop
        do
        {
            iterationStatus = crawlerDomainTaskIterationPhase.iterationStart;
            lastIterationStart = DateTime.Now;

            dataUnitSpiderIteration iterationData = wRecord.timeseries.CreateEntry(null, task.iteration);

            iterationStatus = crawlerDomainTaskIterationPhase.receiveResult;

            if (imbWEMManager.MASTERKILL_SWITCH)
            {
                aceLog.log("MASTERKILL SWITCH ON :: crawlerDomainTask->" + iterationStatus.ToString());
                isStageAborted = true;
                loadResult.items.Clear();
                loadResult.task.Clear();
                evaluator.settings.limitIterations = wRecord.iteration - 5;
                evaluator.settings.limitTotalPageLoad = 0;
                Closing();
                return;
            }

            if (isStageAborted)
            {
                Closing();
                return;
            }

            evaluator.plugins.processLoaderResult(loadResult, wRecord, this);

            // Remember the term count before the result is received, to derive the delta later.
            var lastTouched = wRecord.iterationTableRecord.GetLastEntryTouched();
            if (lastTouched != null)
            {
                termsBefore = lastTouched.terms_all;
            }

            evaluator.operation_receiveResult(loadResult, wRecord);

            if (isStageAborted)
            {
                Closing();
                return;
            }

            iterationStatus = crawlerDomainTaskIterationPhase.applyLinkRules;
            evaluator.plugins.processAfterResultReceived(wRecord, this);
            solutions = evaluator.operation_applyLinkRules(wRecord);

            if (isStageAborted)
            {
                Closing();
                return;
            }

            iterationStatus = crawlerDomainTaskIterationPhase.getLoadTask;
            task = evaluator.operation_GetLoadTask(wRecord);

            if (isStageAborted)
            {
                Closing();
                return;
            }

            iterationStatus = crawlerDomainTaskIterationPhase.loadingTask;

            if (!isLoaderDisabled)
            {
                loadResult = webLoader.runSpiderTask(task, wRecord);
            }
            else
            {
                wRecord.log("-- Loader component is disabled for this [" + wRecord.domain + "] task.");
                loadResult = new spiderTaskResult();
            }

            if (isStageAborted)
            {
                Closing();
                return;
            }

            parent.parent.dataLoadTaker.AddIteration();

            iterationStatus = crawlerDomainTaskIterationPhase.updatingData;

            if (evaluator.settings.doEnableCrossLinkDetection)
            {
                evaluator.operation_detectCrossLinks(wRecord);
            }

            iterationData.checkData();

            targetLoaded = iterationData.tc_loaded_p;
            targetDetected = iterationData.tc_detected_p;

            if (reporter != null)
            {
                try
                {
                    int termDelta = 0;
                    var touchedNow = wRecord.iterationTableRecord.GetLastEntryTouched();
                    if (touchedNow != null)
                    {
                        termDelta = touchedNow.terms_all - termsBefore;
                    }

                    reporter.reportIteration(iterationData, wRecord, evaluator); // a new iteration is created here

                    imbWEMManager.index.plugins.eventIteration(imbWEMManager.index.experimentEntry, this, wRecord);

                    parent.parent.dataLoadTaker.AddContentPage(termDelta, loadResult.Count);
                }
                catch (Exception reportError)
                {
                    throw new aceGeneralException(reportError.Message, reportError, reporter, "Reporter.reportIteration() exception");
                }
            }

            parent.parent.reportPlugins.eventIteration(evaluator, this, wRecord);

            iterationStatus = crawlerDomainTaskIterationPhase.checkingRules;

            if (targetLoaded >= evaluator.settings.limitTotalPageLoad)
            {
                isStageAborted = true;
                wRecord.log("--- Loaded pages count meet limit [" + targetLoaded + "] on iteration [" + iterationData.iteration + "].");
            }

            if (iterationData.iteration >= evaluator.settings.limitIterations)
            {
                isStageAborted = true;
                wRecord.log("--- Iteration limit reached [" + iterationData.iteration + "].");
            }

            if (DateTime.Now.Subtract(startTime).TotalMinutes >= parent.parent._timeLimitForDLC)
            {
                isStageAborted = true;
                wRecord.log("--- Timeout : crawler domain task [" + wRecord.web.seedLink.url + "] aborted after [" + DateTime.Now.Subtract(startTime).TotalMinutes + "] minutes.");
            }

            if (isStageAborted)
            {
                break;
            }
        }
        while (!(stageControl.stage.CheckStage(wRecord, solutions, task) || isStageAborted));

        // ---------------------------------------------------------------- STAGE 3: page evaluation
        iterationStatus = crawlerDomainTaskIterationPhase.pageEvaluation;

        wRecord.resultPageSet = evaluator.operation_evaluatePages(wRecord);

        Closing();
    }
    catch (Exception ex)
    {
        // Map the phase in which the exception occurred to an error category.
        crawlerErrorEnum errorKind;
        switch (iterationStatus)
        {
            case crawlerDomainTaskIterationPhase.applyLinkRules:
            case crawlerDomainTaskIterationPhase.pageEvaluation:
                errorKind = crawlerErrorEnum.spiderModuleError;
                break;

            case crawlerDomainTaskIterationPhase.getLoadTask:
                errorKind = crawlerErrorEnum.spiderGetTaskError;
                break;

            case crawlerDomainTaskIterationPhase.loadingTask:
                errorKind = crawlerErrorEnum.spiderLoadingError;
                break;

            default:
                errorKind = crawlerErrorEnum.domainTaskError;
                break;
        }

        string crashedDomain = wRecord.domainInfo.domainName;

        if (tRecord.crashedDomains.Contains(crashedDomain))
        {
            // Second crash of the same domain: give up.
            status = crawlerDomainTaskStatusEnum.aborted;
            wRecord.log("Aborted by execution exception: " + ex.Message);
        }
        else
        {
            // First crash of this domain: remember it and restart the task once.
            wRecord.log("Domain crashed first time: " + ex.Message);
            aceLog.log("Domain [" + crashedDomain + "] crashed first time: " + ex.Message);
            aceLog.log("Domain [" + crashedDomain + "] is restarting... ");
            status = crawlerDomainTaskStatusEnum.waiting;

            tRecord.crashedDomains.Add(wRecord.domainInfo.domainName);

            reInitialization();
            start();
        }

        // NOTE(review): reporter is null-checked inside the loop but not here - confirm it cannot be null.
        var errorLog = reporter.CreateAndSaveError(ex, wRecord, this, errorKind);
        wRecord.log(errorLog.Message);
    }

    // NOTE(review): the early 'return' paths inside the try block do not reach this call.
    aceLog.consoleControl.removeFromOutput(wRecord);
}
/// <summary>
/// Forwards a loader result to the plug-ins by issuing the universal call for the
/// <c>receiveResult</c> phase.
/// </summary>
/// <param name="sResult">The task result handed to the plug-ins.</param>
/// <param name="wRecord">The site record the result belongs to.</param>
/// <param name="wTask">The domain task that produced the result.</param>
public void processLoaderResult(spiderTaskResult sResult, modelSpiderSiteRecord wRecord, crawlerDomainTask wTask)
{
    processUniversalCall(crawlerDomainTaskIterationPhase.receiveResult, wRecord, wTask, sResult);
}
/// <summary>
/// Dispatches a crawl phase to the enabled plug-ins registered for that phase.
/// Only four phases have a plug-in hook: <c>receiveResult</c>, <c>applyLinkRules</c>,
/// <c>getLoadTask</c> and <c>pageEvaluation</c>; every other phase is a no-op.
/// </summary>
/// <param name="phase">The phase being processed; selects the plug-in list and the hook to call.</param>
/// <param name="wRecord">The site record passed to the plug-in.</param>
/// <param name="wTask">The domain task passed to the plug-in.</param>
/// <param name="sResult">The task result; only passed on for the <c>receiveResult</c> phase.</param>
/// <param name="tRecord">Not used by this method body.</param>
/// <param name="dataUnit">Not used by this method body.</param>
/// <remarks>
/// An exception thrown by a plug-in is logged and saved as a <c>crawlerErrorLog</c>; the remaining plug-ins still run.
/// </remarks>
protected void processUniversalCall(crawlerDomainTaskIterationPhase phase, modelSpiderSiteRecord wRecord, crawlerDomainTask wTask, spiderTaskResult sResult = null, modelSpiderTestRecord tRecord = null, dataUnitSpiderIteration dataUnit = null)
{
    if (!IsEnabled)
    {
        return;
    }

    // Skip disabled plug-ins individually. The previous "any enabled" guard let
    // disabled plug-ins run whenever at least one other plug-in was enabled.
    foreach (ISpiderPlugIn plug in plugins[phase].Where(x => x.IsEnabled))
    {
        try
        {
            switch (phase)
            {
                case crawlerDomainTaskIterationPhase.applyLinkRules:
                    (plug as ISpiderPlugInForContent)?.processAfterResultReceived(wRecord, wTask);
                    break;

                case crawlerDomainTaskIterationPhase.getLoadTask:
                    (plug as ISpiderPlugInForContentPostprocess)?.processEndOfIteration(wRecord, wTask);
                    break;

                case crawlerDomainTaskIterationPhase.pageEvaluation:
                    (plug as ISpiderPlugInForContentPostprocess)?.processAtDLCFinished(wRecord, wTask);
                    break;

                case crawlerDomainTaskIterationPhase.receiveResult:
                    (plug as ISpiderPlugInForContent)?.processLoaderResult(sResult, wRecord, wTask);
                    break;

                default:
                    // No plug-in hook is defined for the remaining phases.
                    break;
            }
        }
        catch (Exception ex)
        {
            aceLog.log("Index Plugin [" + plug.name + "]:" + plug.GetType().Name + " at " + phase.ToString() + " execution crashed: " + ex.Message);

            crawlerErrorLog cel = new crawlerErrorLog(ex, wRecord, wTask, crawlerErrorEnum.crawlerPlugin);
            cel.SaveXML();
        }
    }
}
/// <summary>
/// Deploys information from <paramref name="wRecord"/> into this record: timing, block and term
/// counts, page counts (relevant / irrelevant / loaded / duplicate), recall and harvest ratios,
/// details of the last load result, and the key built from domain name and iteration.
/// </summary>
/// <param name="wRecord">The site record the values are read from.</param>
public void deploy(modelSpiderSiteRecord wRecord)
{
    double i_lm_harvest = 0;
    double i_lm_recall = 0;
    double i_pi_harvest = 0;
    double i_pi_nominal = 0;

    dataUnitSpiderIteration spi_first = wRecord.timeseries.GetData().FirstOrDefault() as dataUnitSpiderIteration;
    // spi_last is only referenced by the disabled gross-duration computation.
    dataUnitSpiderIteration spi_last = wRecord.timeseries.lastEntry as dataUnitSpiderIteration;
    dataUnitSpiderIteration spi_current = wRecord.timeseries.currentEntry as dataUnitSpiderIteration;

    // ------------------------------------------------------------ timing
    if (spi_current != null)
    {
        time_duration_s = creationTime.Subtract(spi_current.rowCreated).TotalSeconds;
    }
    else
    {
        time_duration_s = 0;
    }

    if (spi_first != null)
    {
        time_sincefirst_s = creationTime.Subtract(spi_first.rowCreated).TotalSeconds;
    }
    else
    {
        time_sincefirst_s = 0;
    }

    indexDomain idomain = wRecord.GetIndexInfo();

    // ------------------------------------------------------------ block and term counts
    iteration = wRecord.iteration;
    blocks_all = wRecord.context.targets.blocks.Count(false);
    blocks_relevant = wRecord.context.targets.blocks.Count(true);
    terms_all = wRecord.context.targets.termsAll.Count();
    terms_relevant = wRecord.context.targets.termSerbian.Count();

    var TFIDF = wRecord.MasterTFIDF;
    var mchs = TFIDF.GetMatches(wRecord.context.targets.termSerbian);

    key = wRecord.domainInfo.domainName + iteration.ToString("D3");

    int relCount = 0;
    int irelCount = 0;
    int lCount = 0;
    int rCount = 0;
    int dCount = 0;
    double fraDuration = 0;
    int modulesContained = 0;
    int rec = 0;

    // ------------------------------------------------------------ frontier ranking runtime
    foreach (frontierRankingAlgorithmIterationRecord gen in wRecord.frontierDLC.generalRecords)
    {
        rec++;
        fraDuration += gen.duration;
    }

    FRA_SummaryRuntime = fraDuration.GetRatio((double)rec);
    FRA_TimePercent = FRA_SummaryRuntime.GetRatio(time_duration_s);

    var rtake = wRecord.tRecord.measureTaker.GetLastTake();
    if (rtake != null)
    {
        CPU = rtake.cpuRateOfProcess;
    }

    if (imbWEMManager.settings.directReportEngine.DR_ReportModules)
    {
        // NOTE(review): modulesContained is accumulated here but never read afterwards.
        foreach (moduleDLCRecord mod in wRecord.frontierDLC)
        {
            if (mod != null)
            {
                modulesContained += mod.GetLastEntry().accumulated;
            }
        }
    }

    // ------------------------------------------------------------ page counts
    // Hashes of duplicate-flagged pages already counted once; a set gives O(1) membership checks.
    HashSet<string> seenHashes = new HashSet<string>();

    foreach (spiderTarget t in wRecord.context.targets.GetLoaded())
    {
        indexPage ipage = t.GetIndexPage();
        if (ipage != null)
        {
            i_pi_nominal += ipage.InfoPrize;
        }

        bool isDuplicate = t.isDuplicate;

        // The first page seen for a given hash is counted as a regular page;
        // Add() returns true only when the hash was not in the set yet.
        if (isDuplicate && seenHashes.Add(t.pageHash))
        {
            isDuplicate = false;
        }

        if (!isDuplicate)
        {
            if (t.IsRelevant)
            {
                relCount++;
            }
            else
            {
                irelCount++;
            }

            lCount++;
        }
        else
        {
            dCount++;
        }
    }

    relevantPageCount = relCount;
    irrelevantPageCount = irelCount;
    loadedPageCount = lCount;
    duplicateCount = dCount;

    // ------------------------------------------------------------ recall and harvest ratios
    int mchs_c = 0;
    int id_lm_c = 0;

    if (idomain != null)
    {
        id_lm_c = idomain.Lemmas;
    }

    if (mchs != null)
    {
        mchs_c = mchs.Count();
    }

    i_lm_harvest = mchs_c.GetRatio(loadedPageCount);

    IP = TFIDF.GetScoreForMatch(wRecord.context.targets.termSerbian);

    i_lm_recall = mchs_c.GetRatio(id_lm_c);
    if (i_lm_recall > 1)
    {
        i_lm_recall = 1;
    }

    if (idomain != null)
    {
        IP_recall = i_pi_nominal.GetRatio(idomain.InfoPrize).ClipToK();
        Term_recall = wRecord.context.targets.termSerbian.Count().GetRatio(idomain.Words).ClipToK();
    }

    Page_recall = relevantPageCount.GetRatio(wRecord.pageRecallTarget).ClipToK();

    // IP_recall above uses the summed value; from here on i_pi_nominal is the per-loaded-page ratio.
    i_pi_nominal = i_pi_nominal.GetRatio(loadedPageCount);
    i_pi_harvest = IP.GetRatio((double)lCount);

    // ------------------------------------------------------------ load results
    spiderTaskResult lastResult = null;
    foreach (spiderTaskResult r in wRecord.spiderTaskResults)
    {
        lastResult = r;
        rCount = rCount + r.Count;
    }

    realLoadsCount = rCount;

    if (lastResult != null)
    {
        targetUrl = "";
        targetLanguage = "";
        targetEvalRatio = "";

        foreach (spiderTaskResultItem item in lastResult.items.Values)
        {
            targetUrl = targetUrl.add(item.target.url, ",");

            var t = wRecord.context.targets.GetByTarget(item.target);
            if (t != null)
            {
                if (t.evaluation != null)
                {
                    targetLanguage = targetLanguage.add(t.evaluatedLanguage.ToString(), ";");
                    targetEvalRatio = targetEvalRatio.add(t.evaluation.result_ratio.ToString(), ";");
                }
                else
                {
                    // Without an evaluation, record why: duplicate page or unknown.
                    if (t.isDuplicate)
                    {
                        targetLanguage = targetLanguage.add("duplicate", ";");
                        targetEvalRatio = targetEvalRatio.add("duplicate", ";");
                    }
                    else
                    {
                        targetLanguage = targetLanguage.add("unknown", ";");
                        targetEvalRatio = targetEvalRatio.add("unknown", ";");
                    }
                }
            }
        }
    }

    // ------------------------------------------------------------ efficiency measures
    // The zero checks keep the divisions below from dividing by zero.
    if ((relevantPageCount == 0) || (loadedPageCount == 0))
    {
        E_PP = 0;
    }
    else
    {
        E_PP = (double)relevantPageCount / (double)loadedPageCount;
    }

    if ((wRecord.context.targets.termSerbian.Count == 0) || (wRecord.context.targets.termsAll.Count == 0) || (loadedPageCount == 0))
    {
        E_TP = 0;
        E_TH = 0;
    }
    else
    {
        E_TP = (double)wRecord.context.targets.termSerbian.Count / (double)wRecord.context.targets.termsAll.Count;
        E_TH = (double)wRecord.context.targets.termSerbian.Count / (double)loadedPageCount;
    }

    IPnominal = i_pi_nominal;
    IP_collected = i_pi_harvest;
    Lm_collected = i_lm_harvest;
    Lm_recall = i_lm_recall;
}