/// <summary>Recalculates crawl time by importing the exported dc_dataload tables and updating the performance exports for each crawl</summary>
/// <remarks><para>Loads the results record of the opened session to find all crawls, then imports each dc_dataload CSV table and sums its sampling periods</para></remarks>
/// <seealso cref="aceOperationSetExecutorBase"/>
public void aceOperation_repairRecoverTime()
{
    var __recordKeyProperty = "TestID";
    var homeFolder = new folderNode("index\\benchmark", "Home folder of plugin: benchmark", "Internal data for plugin: benchmark");
    var recordName = imbWEMManager.index.experimentEntry.SessionID;

    var records = new objectTable<reportPlugIn_benchmarkResults>(homeFolder.pathFor(recordName.add("results", "_")), true, __recordKeyProperty, name);
    records.description = "Summary report on the most relevant evaluation metrics.";

    // companion tables of the benchmark plugin, opened alongside the results table
    var record_performances = new objectTable<performanceRecord>(homeFolder.pathFor(recordName.add("performances", "_")), true, "TestID", name);
    var record_moduleImpact = new objectTable<moduleFinalOverview>(homeFolder.pathFor(recordName.add("modules", "_")), true, "ModuleName", name);

    // <---- making crawler list
    List<string> crawlerList = new List<string>();
    List<reportPlugIn_benchmarkResults> allRecords = records.GetList();
    var reportFolder = imbWEMManager.index.experimentEntry.sessionReportFolder;
    Dictionary<string, folderNode> foldersForResultExcel = new Dictionary<string, folderNode>();

    foreach (reportPlugIn_benchmarkResults result in allRecords)
    {
        crawlerList.Add(result.Crawler);
        output.AppendLine("Crawl found: " + result.Crawler);

        string pathCrawlerId = result.Crawler.Replace("-", "");
        folderNode resultNode = reportFolder[pathCrawlerId.ToUpper() + "\\crawler\\data"];
        string pathForData = resultNode.pathFor("dc_dataload_" + result.Crawler.ToLower() + ".csv");
        foldersForResultExcel.Add(result.Crawler, resultNode);

        output.AppendLine("Loading datatable: " + pathForData);
        DataTable dataTable = pathForData.deserializeDataTable(dataTableExportEnum.csv);
        output.AppendLine("DataTable loaded - rows[" + dataTable.Rows.Count + "]");

        // sum the sampling periods; the exported CSV may use ',' as the decimal separator
        DataColumn periodColumn = dataTable.Columns["Period"];
        double periodSum = 0;
        foreach (DataRow dr in dataTable.Rows)
        {
            string read = dr[periodColumn].ToString().Replace(",", ".");
            // parse with the invariant culture so the result does not depend on the machine's locale
            periodSum += double.Parse(read, System.Globalization.CultureInfo.InvariantCulture);
        }
        output.AppendLine("Total execution time in seconds: " + periodSum.ToString("F5"));

        result.CrawlTime = periodSum / 60.0; // seconds -> minutes
        records.AddOrUpdate(result);
    }

    // <---- fixing crawler results
    foreach (reportPlugIn_benchmarkResults result in allRecords)
    {
        folderNode resultFolder = foldersForResultExcel[result.Crawler];
        records.GetDataTable().GetReportAndSave(resultFolder, imbWEMManager.authorNotation, "results_timefix", true);
        output.AppendLine("Repaired result table saved to: " + resultFolder.path);
    }
}
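// The repair above hinges on parsing "Period" values whose decimal separator may be either
// ',' or '.', depending on the locale that produced the CSV export. Below is a minimal
// sketch of that technique as a standalone helper; SumPeriodColumn is a hypothetical name,
// not part of the original plugin, and it assumes only System.Data and the invariant culture.
private static double SumPeriodColumn(DataTable dataTable, string columnName = "Period")
{
    DataColumn periodColumn = dataTable.Columns[columnName];
    double periodSum = 0;
    foreach (DataRow dr in dataTable.Rows)
    {
        // normalize the decimal separator, then parse culture-invariantly
        string read = dr[periodColumn].ToString().Replace(",", ".");
        double readD;
        if (double.TryParse(read, System.Globalization.NumberStyles.Float,
                            System.Globalization.CultureInfo.InvariantCulture, out readD))
        {
            periodSum += readD; // skips malformed cells instead of throwing
        }
    }
    return periodSum;
}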
public override void eventAtInitiationOfCrawlJob(crawlerDomainTaskMachine __machine, modelSpiderTestRecord tRecord)
{
    imbWEMManager.index.domainIndexTable.recheck(imbWEMManager.index.pageIndexTable, output);

    string path = imbWEMManager.index.experimentEntry.sessionReportFolder.pathFor(recordFileName);
    records = new objectTable<indexDomain>(path, true, __recordKeyProperty, name);
    records.description = "Side index";

    // build lookup structures over the domains already present in the side index
    var domains = records.GetList();
    List<string> __url = new List<string>(); // domain urls, in http://www. form
    Dictionary<string, indexDomain> dict = new Dictionary<string, indexDomain>();
    domains.ForEach(x => __url.Add(x.url));
    domains.ForEach(x => dict.Add(x.url, x));

    int dc_ik = 0;
    List<crawlerDomainTask> tasks = new List<crawlerDomainTask>();
    foreach (var task in __machine.items.items)
    {
        if (__url.Contains(task.wRecord.instanceID)) // wRecord.instanceID is in http://www. form
        {
            // the domain was already processed in an earlier session
            task.status = crawlerDomainTaskStatusEnum.aborted;
            tasks.Add(task);
        }
        else if (imbWEMManager.settings.supportEngine.reportPlugIn_sideIndexer_UseIfPagesKnown)
        {
            indexDomain iDomainFromIndex = imbWEMManager.index.domainIndexTable.GetOrCreate(task.wRecord.instanceID);
            records.AddOrUpdate(iDomainFromIndex, objectTableUpdatePolicy.updateIfHigher);

            if (dict.ContainsKey(task.wRecord.instanceID))
            {
                indexDomain iDomain = dict[task.wRecord.instanceID];
                if ((iDomain.relevantPages + iDomain.notRelevantPages) >= tRecord.instance.settings.limitTotalPageLoad)
                {
                    // the index already holds enough pages for this domain
                    dc_ik++;
                    tasks.Add(task);
                }
            }
        }
    }

    // remove as many items from the schedule as there are flagged tasks;
    // note that TryDequeue pops from the head of the queue, not the specific flagged tasks
    int dc = 0;
    foreach (var task in tasks)
    {
        crawlerDomainTask t_out = null;
        if (__machine.items.items.TryDequeue(out t_out))
        {
            dc++;
        }
    }

    aceLog.consoleControl.setAsOutput(output, "SideIndex");
    if (dc > 0)
    {
        output.log("DLCs processed in an earlier session: " + dc);
    }
    if (dc_ik > 0)
    {
        output.log("DLCs removed from schedule because the index already has enough pages loaded: " + dc_ik);
    }
}
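// The dequeue loop above removes a *count* of items from the head of the schedule equal to
// the number of flagged tasks, relying on the flagged tasks sitting at the front. Below is a
// sketch of an order-preserving alternative, assuming the schedule is a ConcurrentQueue<T>;
// RemoveFlagged is a hypothetical helper, not part of the original plugin. It cycles the
// queue once, re-enqueuing every task that was not flagged.
private static int RemoveFlagged<T>(System.Collections.Concurrent.ConcurrentQueue<T> queue,
                                    HashSet<T> flagged)
{
    int removed = 0;
    int count = queue.Count; // snapshot: cycle exactly the items present now
    for (int i = 0; i < count; i++)
    {
        T item;
        if (!queue.TryDequeue(out item)) break;
        if (flagged.Contains(item))
        {
            removed++;           // drop the flagged task
        }
        else
        {
            queue.Enqueue(item); // keep it, preserving relative order
        }
    }
    return removed;
}
// Hypothetical usage, assuming __machine.items.items is a ConcurrentQueue<crawlerDomainTask>:
//   int dc = RemoveFlagged(__machine.items.items, new HashSet<crawlerDomainTask>(tasks));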