/// <summary>
/// Appends the supplied tokens to the ignore file and saves it.
/// </summary>
/// <remarks>
/// The whole sequence is appended first. Afterwards, every token for which
/// <c>isNonDosChars()</c> is true also gets its <c>toDosCleanDirect()</c> form
/// appended. The file is saved once, at the end.
/// </remarks>
/// <param name="tokens">Tokens to add; the sequence is enumerated more than once.</param>
public void SetIgnore(IEnumerable<string> tokens)
{
    ignoreFile.Append(tokens, false);

    foreach (string token in tokens)
    {
        if (!token.isNonDosChars())
        {
            continue;
        }

        var dosCleanToken = token.toDosCleanDirect();
        ignoreFile.Append(dosCleanToken);
    }

    ignoreFile.Save();
}
/// <summary>
/// Writes the report folder for one crawled target: the page text
/// (<c>content.txt</c>), the URLs of its outflow links (<c>links.txt</c>) and,
/// when the target has an evaluation, that evaluation as <c>relevance.xml</c>.
/// </summary>
/// <param name="t">Target being reported.</param>
/// <param name="fn">Folder node under which the page folder is added.</param>
/// <param name="c">Ordinal of the target; zero-padded to three digits in the folder name.</param>
private void reportTarget(spiderTarget t, folderNode fn, int c)
{
    string pageFolder = "P" + c.ToString("D3") + "_" + t.IsRelevant.ToString();
    string pageCaption = "Page " + c.ToString();

    // Member access binds tighter than '+', so addLine(pageDescription) is applied
    // to the trailing "." literal only, and its result is concatenated last.
    string pageInfo = "Report on page " + t.url + " crawled by " + name + ". Target.IsRelevant: " + t.IsRelevant + ".".addLine(pageDescription);

    folderNode pfn = fn.Add(pageFolder, pageCaption, pageInfo);

    fileunit content = new fileunit(pfn.pathFor("content.txt"), false);
    fileunit links = new fileunit(pfn.pathFor("links.txt"), false);

    if (t.evaluation != null)
    {
        t.evaluation.saveObjectToXML(pfn.pathFor("relevance.xml"));
    }

    content.setContent(t.pageText);

    // Links are only available once the page itself has been loaded.
    if (t.page != null)
    {
        foreach (spiderLink ln in t.page.relationship.outflowLinks.items.Values)
        {
            links.Append(ln.url);
        }
    }

    content.Save();
    links.Save();
}
/// <summary>
/// Registers a performance record for the iteration and, when both
/// <c>doIterationReport</c> and <c>doDomainReport</c> are enabled in the direct
/// report engine settings, writes the per-iteration report files. Finally it
/// forwards the call to <c>wRecord.tRecord.instance.reportIteration</c>, which
/// happens regardless of the settings.
/// </summary>
/// <param name="dataUnit">Iteration data unit; its <c>iteration</c> value selects the
/// iteration folder, the three-digit file prefix and the per-iteration URL lists.</param>
/// <param name="wRecord">Site record whose targets, links and task results are reported.</param>
/// <param name="evaluator">Not read by this method.</param>
public void reportIteration(dataUnitSpiderIteration dataUnit, modelSpiderSiteRecord wRecord, ISpiderEvaluatorBase evaluator)
{
    iterationPerformanceRecord ip_record = new iterationPerformanceRecord(wRecord);
    wRecord.iterationTableRecord.Add(ip_record);

    if (imbWEMManager.settings.directReportEngine.doIterationReport
        && imbWEMManager.settings.directReportEngine.doDomainReport)
    {
        folderNode fn = getIterationFolder(dataUnit.iteration, wRecord);

        if (REPORT_WRECORD_LOG)
        {
            wRecord.logBuilder.getLastLine().saveStringToFile(fn.pathFor("wrecord.txt"));
        }

        // Looked up before the URL section, exactly as in the original statement order.
        textByIteration url_loaded = urlsLoaded[wRecord];
        textByIteration url_detected = urlsDetected[wRecord];

        if (REPORT_MODULES
            && imbWEMManager.settings.directReportEngine.DR_ReportModules_XMLIteration
            && wRecord.tRecord.instance is spiderModularEvaluatorBase)
        {
            wRecord.frontierDLC.reportIterationOut(wRecord, fn);
        }

        // Three-digit iteration number used as prefix for the file names below.
        string its = dataUnit.iteration.ToString("D3");

        if (REPORT_TIMELINE)
        {
            objectSerialization.saveObjectToXML(ip_record, fn.pathFor("performance.xml"));
        }

        if (REPORT_ITERATION_URLS)
        {
            if (wRecord.iteration > 0)
            {
                // Report on the targets loaded in the previous iteration of the record.
                builderForMarkdown now_loaded = new builderForMarkdown();
                List<spiderTarget> targets_loaded = wRecord.context.targets.GetLoadedInIteration(wRecord.iteration - 1);

                int tc = 0;
                foreach (spiderTarget t in targets_loaded)
                {
                    reportTarget(t, fn, tc);

                    now_loaded.AppendLine(t.url);
                    now_loaded.AppendHorizontalLine();
                    now_loaded.Append(t.marks.GetActiveResults());
                    now_loaded.AppendHorizontalLine();
                    now_loaded.Append(t.marks.GetPassiveResults());
                    now_loaded.AppendHorizontalLine();

                    var dt = t.marks.getHistory(t.url, wRecord.tRecord.instance.name);
                    // NOTE(review): every target saves its table under the same name
                    // (its + "_loadedNow"); presumably later saves replace earlier ones - confirm.
                    dt.Save(fn, imbWEMManager.authorNotation, its + "_loadedNow");
                    now_loaded.AppendTable(dt, false);

                    tc++;
                }

                now_loaded.ToString().saveStringToFile(fn.pathFor(its + "_loadedNow.txt"));

                spiderTaskResult loadResults = wRecord.spiderTaskResults[wRecord.iteration - 1];
                loadResults.getDataTable().GetReportAndSave(fn, notation, "loadResults", true);
            }

            fileunit detected = new fileunit(fn.pathFor(its + "_dt.txt"), false);
            fileunit loaded = new fileunit(fn.pathFor(its + "_ld.txt"), false);
            fileunit relp = new fileunit(fn.pathFor(its + "_srb_ld.txt"), false);

            // NOTE(review): relp is filled here but Save() is never called on it in this
            // method - confirm whether Append(..., true) persists it or a Save() is missing.
            relp.Append(wRecord.relevantPages, true);

            // Targets with a page go to the "loaded" list, all others to "detected".
            foreach (spiderTarget t in wRecord.context.targets)
            {
                if (t.page != null)
                {
                    loaded.Append(t.url);
                    url_loaded[dataUnit.iteration].Add(t.url);
                }
                else
                {
                    detected.Append(t.url);
                    url_detected[dataUnit.iteration].Add(t.url);
                }
            }

            // {0} = ordinal, {1} = link URL, {2} = link score.
            // The score slot previously referenced index 1, so the URL was printed twice
            // and the score argument was never rendered.
            string lineFormat = "{0,5} {1,30} [s:{2,6}]" + Environment.NewLine;
            fileunit active = new fileunit(fn.pathFor(its + "_act.txt"), false);

            int c = 1;
            foreach (var lnk in wRecord.web.webActiveLinks)
            {
                active.Append(string.Format(lineFormat, c, lnk.url, lnk.marks.score));
                active.Append(lnk.marks.GetLayerAssociation());
                c++;
            }

            detected.Save();
            loaded.Save();
            active.Save();
        }
    }

    wRecord.tRecord.instance.reportIteration(this, wRecord);
}