/// <summary>
/// Creates a new output file via <see cref="getNewOutput"/> and fills it with the given lines.
/// </summary>
/// <param name="content">Lines to place into the new output unit.</param>
/// <param name="name">Base name for the output file.</param>
/// <param name="extension">File extension, without the dot (default: "txt").</param>
/// <returns>The populated output fileunit.</returns>
public fileunit toNewOutput(List<String> content, String name, String extension = "txt")
{
    fileunit output = getNewOutput(name, extension);
    output.setContentLines(content);
    return output;
}
/// <summary>
/// Loads the table from the specified filepath, reporting progress roughly every 5% of lines.
/// </summary>
/// <param name="filepath">Path of the definitions file to load.</param>
/// <param name="loger">Log builder used for progress messages.</param>
public void Load(string filepath, ILogBuilder loger)
{
    sourceFile = new fileunit(filepath, true);
    int cl = sourceFile.contentLines.Count;
    loger.log("Lexicon terms coding twins definitions: " + cl);

    // Progress step is ~5% of the input. Guard against a zero step for small
    // files (cl < 20): the original cl / 20 == 0 made the progress branch fire
    // on every single line.
    int lmax = Math.Max(1, cl / 20);
    int lind = lmax;
    LoadCount = 0;

    foreach (string ln in sourceFile.contentLines)
    {
        lind--;
        SetEntryFromString(ln);
        LoadCount++;
        if (lind <= 0)
        {
            lind = lmax;
            loger.log("Coding twins loaded: " + LoadCount);
        }
    }

    // typo fix in log message: "completly" -> "completely"
    loger.log("Coding twins completely loaded: " + LoadCount);
}
/// <summary>
/// Saves the given file into the output folder, auto-renaming if a file from another date exists.
/// </summary>
/// <param name="file">The file to save.</param>
public void saveOutput(fileunit file)
{
    var filename = Path.GetFileName(file.path);
    var targetPath = folder[aceCCFolders.output].pathFor(filename);
    file.getContent().saveStringToFile(targetPath, getWritableFileMode.autoRenameExistingOnOtherDate);
}
/// <summary>
/// Writes a per-page report for the given target into a numbered sub-folder of <paramref name="fn"/>:
/// page text, outgoing link URLs, and (if present) the relevance evaluation as XML.
/// </summary>
/// <param name="t">Crawl target being reported.</param>
/// <param name="fn">Parent folder node that receives the page folder.</param>
/// <param name="c">Ordinal of the page, used in the folder name.</param>
private void reportTarget(spiderTarget t, folderNode fn, int c)
{
    string pageFolder = "P" + c.ToString("D3") + "_" + t.IsRelevant.ToString();

    // NOTE(review): ".".addLine(pageDescription) binds to the "." literal only, not to the whole
    // concatenated description string — confirm this produces the intended folder description.
    folderNode pfn = fn.Add(pageFolder, "Page " + c.ToString(), "Report on page " + t.url + " crawled by " + name + ". Target.IsRelevant: " + t.IsRelevant + ".".addLine(pageDescription));

    fileunit content = new fileunit(pfn.pathFor("content.txt"), false);
    fileunit links = new fileunit(pfn.pathFor("links.txt"), false);

    if (t.evaluation != null)
    {
        t.evaluation.saveObjectToXML(pfn.pathFor("relevance.xml"));
    }

    content.setContent(t.pageText);

    //t.page.relationship.outflowLinks
    if (t.page != null)
    {
        // Collect outgoing link URLs discovered on the page.
        // (Removed unused local copy `string rl = ln.url;` from the original loop.)
        foreach (spiderLink ln in t.page.relationship.outflowLinks.items.Values)
        {
            links.Append(ln.url);
        }
        //t.page.webpage.links.ForEach(x => links.Append(x.nature + " | " + x.name + " | " + x.url));
    }

    content.Save();
    links.Save();
    // marks.Save();
}
/// <summary>
/// Writes the given lines to a file at the specified path.
/// </summary>
/// <param name="content">Lines to persist.</param>
/// <param name="path">Destination file path.</param>
public void saveOutput(List<string> content, string path)
{
    var unit = new fileunit(path, false);
    unit.setContentLines(content);
    unit.Save();
}
/// <summary>
/// Opens a file from the input folder and records its full name in the loaded-inputs list.
/// </summary>
/// <param name="filename">Name of the file inside the input folder.</param>
/// <returns>The fileunit wrapping the input file.</returns>
public fileunit loadInput(String filename)
{
    var inputPath = folder[aceCCFolders.input].pathFor(filename);
    var unit = new fileunit(inputPath);
    loadedInputs.AddUnique(unit.info.FullName);
    return unit;
}
/// <summary>
/// For each iteration index, writes one file containing the unique i-th value of every item.
/// </summary>
/// <param name="iterations">Number of iteration files to generate.</param>
/// <param name="folder">Folder that receives the generated files.</param>
/// <param name="sufix">Suffix appended to each file name.</param>
/// <param name="prefix">Prefix of each file name (default: "IT").</param>
public void GenerateCumulative(int iterations, folderNode folder, string sufix, string prefix = "IT")
{
    for (int i = 0; i < iterations; i++)
    {
        string fileName = prefix + i.ToString("D3").add(sufix, "_").ensureEndsWith(".txt");
        fileunit iteration = new fileunit(folder.pathFor(fileName), false);

        foreach (var p in items)
        {
            iteration.AppendUnique(p.Value[i]);
        }

        iteration.Save();
    }
}
/// <summary>
/// Builds the machine over the given folder: loads the negatives cache, the ignore list,
/// and initializes the rule sets.
/// </summary>
/// <param name="node">Folder that holds the cache and ignore files.</param>
public morphMachine(folderNode node)
{
    folder = node;

    var cachePath = folder.findFile("lexiconCache_negatives.txt", SearchOption.AllDirectories, false);
    inputFile = new fileunit(cachePath, true);

    var ignoreFilePath = folder.pathFor("morphMachine_ignore.txt");
    ignoreFile = new fileunit(ignoreFilePath, true);

    SetRuleSets();
}
#pragma warning disable CS1574 // XML comment has cref attribute 'dataException' that could not be resolved
/// <summary>
/// Appends the file with the entries of the table
/// </summary>
/// <param name="filepath">The filepath.</param>
/// <exception cref="aceCommonTypes.core.exceptions.dataException">The source file was never defined - cant use save without filepath - null - Save() failed, no filepath</exception>
public void Append(string filepath = null)
#pragma warning restore CS1574 // XML comment has cref attribute 'dataException' that could not be resolved
{
    if (sourceFile == null)
    {
        // No backing file yet: we need a filepath to create one.
        if (imbSciStringExtensions.isNullOrEmptyString(filepath))
        {
            throw new dataException("The source file was never defined - cant use save without filepath", null, this, "Save() failed, no filepath");
        }

        sourceFile = new fileunit(filepath, true);
    }

    var entryList = GetEntriesAsStringPairs();
    sourceFile.AppendUnique(entryList);
    sourceFile.Save();
}
/// <summary>
/// Prepares the engine: opens the fail/domain-fail/duplicate list files in the given
/// folder and rebuilds the duplicates map from the duplicate list.
/// </summary>
/// <param name="loger">The loger.</param>
/// <param name="__folder">Folder that holds the list files.</param>
public void prepare(ILogBuilder loger, folderNode __folder)
{
    folder = __folder;

    failList = new fileunit(folder.pathFor(FILE_FAILLIST), true);
    domainFailList = new fileunit(folder.pathFor(FILE_DOMAINFAILLIST), true);
    duplicateList = new fileunit(folder.pathFor(FILE_DUPLICATE), true);

    foreach (string ln in duplicateList.contentLines)
    {
        var sp = ln.Split(new string[] { "|||" }, StringSplitOptions.RemoveEmptyEntries);

        // Guard against malformed lines: a line without the "|||" separator yields a
        // single-element array, and the original sp[1] access threw IndexOutOfRangeException.
        if (sp.Length >= 2)
        {
            duplicates.TryAdd(sp[0], sp[1]);
        }
    }
}
/// <summary>
/// Loads the cache files: negative queries and the encoding-twins table,
/// rebuilding the latter when it comes back empty.
/// </summary>
/// <param name="loger">Log builder used for progress messages.</param>
/// <param name="context">Lexicon context used when rebuilding the encoded twins.</param>
protected void LoadCacheFiles(ILogBuilder loger, semanticLexiconContext context)
{
    failedQueries = new fileunit(folder.pathFor("lexiconCache_negatives.txt"), true);
    loger.log("Negative queries loaded");
    AddTemp(failedQueries.contentLines, loger, true, true);

    loger.log("Loading encoding twins");
    twins.Load(twinsSavePath, loger);
    loger.log("Encoding twins loaded");

    if (twins.Count == 0)
    {
        rebuildEncodedTwins(loger, context);
    }

    failedQueries.Save();
}
/// <summary>
/// Writes the crawler-level report for a finished crawl: optional frontier timeline and URL
/// lists, CPU / data-load / resource metric tables, optional performance XML, the DLC
/// performance table, the signature, readme files, and the crawl log file.
/// </summary>
/// <param name="tRecord">Test record of the finished crawl.</param>
/// <remarks>
/// NOTE(review): the save order is significant (metric tables before signature and log);
/// code kept byte-identical, documentation only.
/// </remarks>
public void reportCrawler(modelSpiderTestRecord tRecord) { folderNode fn = folder[DRFolderEnum.crawler]; string fileprefix = tRecord.instance.name.getCleanFilePath(); //tRecord.name.getCleanFilepath(); if (REPORT_TIMELINE) { DataTable timeline = timeSeries.GetAggregatedTable("frontier_stats", dataPointAggregationAspect.overlapMultiTable); //.GetSumTable("timeline_" + fileprefix.Replace(" ", "")); timeline.GetReportAndSave(folder[DRFolderEnum.crawler], notation, "frontier_stats" + fileprefix); } if (REPORT_ITERATION_URLS) { tRecord.allUrls = urlsLoaded.GetAllUnique(); tRecord.allDetectedUrls = urlsDetected.GetAllUnique(); saveOutput(tRecord.allDetectedUrls, folder[DRFolderEnum.crawler].pathFor("urls_detected.txt")); saveOutput(tRecord.allUrls, folder[DRFolderEnum.crawler].pathFor("urls_loaded.txt")); saveOutput(tRecord.relevantPages, folder[DRFolderEnum.crawler].pathFor("urls_relevant_loaded.txt")); } // Int32 iterations = tRecord.instance.settings.limitIterations; DataTable cpuTable = tRecord.cpuTaker.GetDataTableBase("cpuMetrics").GetReportAndSave(folder[DRFolderEnum.crawler], notation, "cpu_" + fileprefix); DataTable dataTable = tRecord.dataLoadTaker.GetDataTableBase("dataLoadMetrics").GetReportAndSave(folder[DRFolderEnum.crawler], notation, "dataload_" + fileprefix); DataTable resourcesTable = tRecord.measureTaker.GetDataTableBase("resourceMetrics").GetReportAndSave(folder[DRFolderEnum.crawler], notation, "resource_" + fileprefix); if (imbWEMManager.settings.directReportEngine.doPublishPerformance) { tRecord.performance.folderName = folder.name; tRecord.performance.deploy(tRecord); tRecord.performance.saveObjectToXML(folder[DRFolderEnum.crawler].pathFor("performance.xml")); DataTable pTable = tRecord.performance.GetDataTable(true).GetReportAndSave(folder, notation, "crawler_performance" + fileprefix); } tRecord.lastDomainIterationTable.GetDataTable(null, imbWEMManager.index.experimentEntry.CrawlID).GetReportAndSave(folder, notation, "DLCs_performance_" + 
fileprefix); tRecord.reporter = this; signature.deployReport(tRecord); //signature.notation = notation; signature.saveObjectToXML(folder.pathFor("signature.xml")); folder.generateReadmeFiles(notation); fileunit tLog = new fileunit(folder[DRFolderEnum.logs].pathFor(fileprefix + ".txt"), false); tLog.setContent(tRecord.logBuilder.ContentToString(true)); tLog.Save(); tRecord.instance.reportCrawlFinished(this, tRecord); aceLog.consoleControl.setLogFileWriter(); }
/// <remarks>
/// Writes the per-domain (DLC) report: optional term-token datasets, per-page reports,
/// the domain log, URL lists, module reports, and the iteration performance table —
/// then disposes <paramref name="wRecord"/>. NOTE(review): code kept byte-identical,
/// documentation only; wRecord.Dispose() at the end means the record must not be used
/// by the caller afterwards — confirm callers respect this.
/// </remarks>
/// <summary> /// Runs when a DLC is finished /// </summary> /// <param name="wRecord">The w record.</param> public void reportDomainFinished(modelSpiderSiteRecord wRecord) { folderNode fn = null; string fileprefix = wRecord.domainInfo.domainRootName.getCleanFilepath(); if (imbWEMManager.settings.directReportEngine.doDomainReport) { fn = folder[DRFolderEnum.sites].Add(wRecord.domainInfo.domainRootName.getCleanFilepath(), "Report on " + wRecord.domainInfo.domainName, "Records on domain " + wRecord.domainInfo.domainName + " crawled by " + name); if (REPORT_DOMAIN_TERMS) { if (wRecord.tRecord.instance.settings.doEnableDLC_TFIDF) { if (wRecord.context.targets.dlTargetPageTokens != null) { wRecord.context.targets.dlTargetPageTokens.GetDataSet(true).serializeDataSet("token_ptkn", fn, dataTableExportEnum.excel, notation); } } if (wRecord.context.targets.dlTargetLinkTokens != null) { wRecord.context.targets.dlTargetLinkTokens.GetDataSet(true).serializeDataSet("token_ltkn", fn, dataTableExportEnum.excel, notation); } } if (REPORT_DOMAIN_PAGES) { int c = 1; foreach (spiderTarget t in wRecord.context.targets.GetLoadedInOrderOfLoad()) { reportTarget(t, fn, c); c++; } } fileunit wLog = new fileunit(folder[DRFolderEnum.logs].pathFor(fileprefix + ".txt"), false); wLog.setContent(wRecord.logBuilder.ContentToString(true)); wLog.Save(); if (REPORT_ITERATION_URLS) { textByIteration url_loaded = urlsLoaded[wRecord]; //.GetOrAdd(wRecord, new textByIteration()); textByIteration url_detected = urlsDetected[wRecord]; //, new textByIteration()); fileunit url_ld_out = new fileunit(folder[DRFolderEnum.sites].pathFor(fileprefix + "_url_ld.txt"), false); fileunit url_dt_out = new fileunit(folder[DRFolderEnum.sites].pathFor(fileprefix + "_url_dt.txt"), false); fileunit url_srb_out = new fileunit(folder[DRFolderEnum.sites].pathFor(fileprefix + "_url_srb_ld.txt"), false); url_ld_out.setContentLines(url_loaded.GetAllUnique()); url_dt_out.setContentLines(url_detected.GetAllUnique()); 
url_srb_out.setContentLines(wRecord.relevantPages); url_ld_out.Save(); url_dt_out.Save(); url_srb_out.Save(); } //terms_out.Save(); //sentence_out.Save(); } if (REPORT_MODULES) { if (wRecord.tRecord.instance is spiderModularEvaluatorBase) { wRecord.frontierDLC.reportDomainOut(wRecord, fn, fileprefix); } } if (REPORT_TIMELINE) { wRecord.iterationTableRecord.GetDataTable(null, "iteration_performace_" + fileprefix).GetReportAndSave(folder[DRFolderEnum.it], notation, "iteration_performace_" + fileprefix); //, notation); } //if (REPORT_TIMELINE) //{ // DataTable dt = wRecord.GetTimeSeriesPerformance(); // timeSeries.Add(dt); // dt.GetReportAndSave(folder[DRFolderEnum.it], notation, "iteration_frontier_stats_" + fileprefix); //} wRecord.tRecord.lastDomainIterationTable.Add(wRecord.iterationTableRecord.GetLastEntryTouched()); wRecord.tRecord.instance.reportDomainFinished(this, wRecord); wRecord.Dispose(); }
/// <summary>
/// Writes the per-iteration report for a domain crawl: records iteration performance,
/// optionally dumps the wRecord log, module XML, performance XML, per-target reports
/// with load results, and the detected / loaded / relevant / active URL files.
/// </summary>
/// <param name="dataUnit">Iteration data unit (supplies the iteration number).</param>
/// <param name="wRecord">Site record of the domain being crawled.</param>
/// <param name="evaluator">Evaluator instance (currently not read inside this method).</param>
/// <remarks>
/// NOTE(review): the line format "{0,5} {1,30} [s:{1,6}]" reuses placeholder {1} for both the
/// URL and the score slot while the score is passed as argument {2} — looks like {2} was
/// intended; confirm before changing, kept byte-identical here.
/// </remarks>
public void reportIteration(dataUnitSpiderIteration dataUnit, modelSpiderSiteRecord wRecord, ISpiderEvaluatorBase evaluator) { iterationPerformanceRecord ip_record = new iterationPerformanceRecord(wRecord); wRecord.iterationTableRecord.Add(ip_record); folderNode fn; //siteRecords[wRecord].Add(dataUnit.iteration.ToString("D3"), wRecord.domainInfo.domainRootName + dataUnit.iteration.ToString("D3"), "Iteration " + dataUnit.iteration + " on domain: " + wRecord.domainInfo.domainName); if (imbWEMManager.settings.directReportEngine.doIterationReport) { if (imbWEMManager.settings.directReportEngine.doDomainReport) { fn = getIterationFolder(dataUnit.iteration, wRecord); if (REPORT_WRECORD_LOG) { wRecord.logBuilder.getLastLine().saveStringToFile(fn.pathFor("wrecord.txt")); } string fileprefix = wRecord.domainInfo.domainRootName.getCleanFilepath(); textByIteration url_loaded = urlsLoaded[wRecord]; //.GetOrAdd(wRecord, new textByIteration()); textByIteration url_detected = urlsDetected[wRecord]; //, new textByIteration()); //textByIteration terms_ext = termsExtracted[wRecord]; //textByIteration sentence_ext = sentencesExtracted[wRecord]; if (REPORT_MODULES) { if (imbWEMManager.settings.directReportEngine.DR_ReportModules_XMLIteration) { if (wRecord.tRecord.instance is spiderModularEvaluatorBase) { wRecord.frontierDLC.reportIterationOut(wRecord, fn); } } } string its = dataUnit.iteration.ToString("D3"); //DataTable dt = wRecord.context.targets.GetDataTable(); //dt.SetTitle(fileprefix + "_targets"); //dt.serializeDataTable(aceCommonTypes.enums.dataTableExportEnum.csv, "", fn, notation); //sentence_ext[dataUnit.iteration].AddRangeUnique(wRecord.context.targets.blocks.GetHashList()); //if (REPORT_ITERATION_TERMS) //{ // fileunit blocks = new fileunit(fn.pathFor(its + "_blc.txt"), false); // blocks.setContentLines(sentence_ext[dataUnit.iteration]); // blocks.Save(); //} if (REPORT_TIMELINE) { objectSerialization.saveObjectToXML(ip_record, fn.pathFor("performance.xml")); } if 
(REPORT_ITERATION_URLS) { if (wRecord.iteration > 0) { builderForMarkdown now_loaded = new builderForMarkdown(); //fileunit now_loaded = new fileunit(fn.pathFor(its + "_loadedNow.txt"), false); List <spiderTarget> targets_loaded = wRecord.context.targets.GetLoadedInIteration(wRecord.iteration - 1); int tc = 0; foreach (spiderTarget t in targets_loaded) { reportTarget(t, fn, tc); now_loaded.AppendLine(t.url); now_loaded.AppendHorizontalLine(); now_loaded.Append(t.marks.GetActiveResults()); now_loaded.AppendHorizontalLine(); now_loaded.Append(t.marks.GetPassiveResults()); now_loaded.AppendHorizontalLine(); var dt = t.marks.getHistory(t.url, wRecord.tRecord.instance.name); dt.Save(fn, imbWEMManager.authorNotation, its + "_loadedNow"); now_loaded.AppendTable(dt, false); tc++; } now_loaded.ToString().saveStringToFile(fn.pathFor(its + "_loadedNow.txt")); spiderTaskResult loadResults = wRecord.spiderTaskResults[wRecord.iteration - 1]; loadResults.getDataTable().GetReportAndSave(fn, notation, "loadResults", true); // .serializeDataTable(aceCommonTypes.enums.dataTableExportEnum.excel, "loadResults", fn, notation); } fileunit detected = new fileunit(fn.pathFor(its + "_dt.txt"), false); fileunit loaded = new fileunit(fn.pathFor(its + "_ld.txt"), false); fileunit relp = new fileunit(fn.pathFor(its + "_srb_ld.txt"), false); relp.Append(wRecord.relevantPages, true); foreach (spiderTarget t in wRecord.context.targets) { if (t.page != null) { //t.contentBlocks.ForEach(x => sentence_ext[dataUnit.iteration].AddUnique(x.textHash)); loaded.Append(t.url); url_loaded[dataUnit.iteration].Add(t.url); } else { detected.Append(t.url); url_detected[dataUnit.iteration].Add(t.url); } } string lineFormat = "{0,5} {1,30} [s:{1,6}]" + Environment.NewLine; fileunit active = new fileunit(fn.pathFor(its + "_act.txt"), false); int c = 1; foreach (var lnk in wRecord.web.webActiveLinks) { active.Append(string.Format(lineFormat, c, lnk.url, lnk.marks.score)); 
active.Append(lnk.marks.GetLayerAssociation()); c++; } detected.Save(); loaded.Save(); active.Save(); } } } wRecord.tRecord.instance.reportIteration(this, wRecord); }