/// <summary>
/// Runs when a DLC is finished: writes the per-domain outputs selected by the report switches,
/// appends the last touched iteration entry to the test record, notifies the crawler instance
/// and finally disposes the site record.
/// </summary>
/// <param name="wRecord">Site record of the finished domain; it is disposed before this method returns.</param>
public void reportDomainFinished(modelSpiderSiteRecord wRecord)
{
    // Stays null when the domain report is switched off; the module report below still receives it as-is.
    folderNode domainFolder = null;
    string fileprefix = wRecord.domainInfo.domainRootName.getCleanFilepath();

    if (imbWEMManager.settings.directReportEngine.doDomainReport)
    {
        domainFolder = folder[DRFolderEnum.sites].Add(
            wRecord.domainInfo.domainRootName.getCleanFilepath(),
            "Report on " + wRecord.domainInfo.domainName,
            "Records on domain " + wRecord.domainInfo.domainName + " crawled by " + name);

        if (REPORT_DOMAIN_TERMS)
        {
            // The page-token table is exported only when DLC TF-IDF is enabled;
            // the link-token table is exported whenever it exists.
            if (wRecord.tRecord.instance.settings.doEnableDLC_TFIDF && (wRecord.context.targets.dlTargetPageTokens != null))
            {
                wRecord.context.targets.dlTargetPageTokens.GetDataSet(true).serializeDataSet("token_ptkn", domainFolder, dataTableExportEnum.excel, notation);
            }

            if (wRecord.context.targets.dlTargetLinkTokens != null)
            {
                wRecord.context.targets.dlTargetLinkTokens.GetDataSet(true).serializeDataSet("token_ltkn", domainFolder, dataTableExportEnum.excel, notation);
            }
        }

        if (REPORT_DOMAIN_PAGES)
        {
            // One report per loaded target, numbered from 1 in order of load.
            int ordinal = 0;
            foreach (spiderTarget loaded in wRecord.context.targets.GetLoadedInOrderOfLoad())
            {
                ordinal++;
                reportTarget(loaded, domainFolder, ordinal);
            }
        }

        // Dump the log collected for this domain into the logs folder.
        fileunit domainLog = new fileunit(folder[DRFolderEnum.logs].pathFor(fileprefix + ".txt"), false);
        domainLog.setContent(wRecord.logBuilder.ContentToString(true));
        domainLog.Save();

        if (REPORT_ITERATION_URLS)
        {
            textByIteration loadedByIteration = urlsLoaded[wRecord];
            textByIteration detectedByIteration = urlsDetected[wRecord];

            var sitesFolder = folder[DRFolderEnum.sites];
            fileunit loadedFile = new fileunit(sitesFolder.pathFor(fileprefix + "_url_ld.txt"), false);
            fileunit detectedFile = new fileunit(sitesFolder.pathFor(fileprefix + "_url_dt.txt"), false);
            fileunit relevantFile = new fileunit(sitesFolder.pathFor(fileprefix + "_url_srb_ld.txt"), false);

            loadedFile.setContentLines(loadedByIteration.GetAllUnique());
            detectedFile.setContentLines(detectedByIteration.GetAllUnique());
            relevantFile.setContentLines(wRecord.relevantPages);

            loadedFile.Save();
            detectedFile.Save();
            relevantFile.Save();
        }
    }

    // Module-level report is produced only for modular evaluators.
    if (REPORT_MODULES && (wRecord.tRecord.instance is spiderModularEvaluatorBase))
    {
        wRecord.frontierDLC.reportDomainOut(wRecord, domainFolder, fileprefix);
    }

    if (REPORT_TIMELINE)
    {
        // The same name is used for the data table and for the saved report.
        string timelineName = "iteration_performace_" + fileprefix;
        wRecord.iterationTableRecord.GetDataTable(null, timelineName).GetReportAndSave(folder[DRFolderEnum.it], notation, timelineName);
    }

    wRecord.tRecord.lastDomainIterationTable.Add(wRecord.iterationTableRecord.GetLastEntryTouched());
    wRecord.tRecord.instance.reportDomainFinished(this, wRecord);
    wRecord.Dispose();
}
/// <summary>
/// Performs full domain reevaluation: deploys every loaded target of the record into the index,
/// (re)computes its words, lemmas and InfoPrize (sum of <c>tf_idf</c> over the terms matched in
/// <paramref name="mTFIDF"/>) and, when domain-entry updating is enabled, refreshes and stores the domain entry.
/// </summary>
/// <param name="settings">Index engine configuration; only <c>plugIn_indexDBUpdater_optimizedMode</c> is read from it here.</param>
/// <param name="loger">
/// Log output. May be <c>null</c>, in which case this method writes no log lines.
/// It is also passed unchanged to <c>GetTermsForPage</c> — NOTE(review): confirm that helper tolerates <c>null</c>.
/// </param>
/// <param name="__wRecord">Site record whose loaded targets are evaluated; each target and the record itself are disposed by this method.</param>
/// <param name="evaluator">Evaluator forwarded to <c>GetTermsForPage</c> when a page has no stored words.</param>
/// <param name="mTFIDF">Compiled weight table used to match page terms; it is dereferenced for every evaluated page.</param>
public void doDomainEvaluation(IndexEngineConfiguration settings, builderForLog loger, modelSpiderSiteRecord __wRecord, multiLanguageEvaluator evaluator, weightTableCompiled mTFIDF)
{
    indexDomain idomain = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
    idomain.url = __wRecord.domain;

    double dIP = 0;   // InfoPrize accumulated over all pages of the domain
    int p = 0;        // number of processed pages

    List<string> dDistinctTerms = new List<string>();
    List<string> dLemmas = new List<string>();
    List<string> dWords = new List<string>();
    List<string> urls = new List<string>();

    // Turns false as soon as one page reuses its stored evaluation instead of being recomputed.
    bool doEvalD = true;

    foreach (spiderTarget target in __wRecord.context.targets.GetLoaded())
    {
        indexPage ipage = imbWEMManager.index.deployTarget(target, __wRecord, idomain);

        bool doEval = true;

        if (settings.plugIn_indexDBUpdater_optimizedMode)
        {
            // In optimized mode a page already marked relevant, with positive InfoPrize and lemma count,
            // is skipped — unless its stored word or lemma list is missing.
            if ((ipage.InfoPrize > 0) && (ipage.Lemmas > 0) && (ipage.relevancyText == nameof(indexPageRelevancyEnum.isRelevant)))
            {
                doEval = ipage.AllWords.isNullOrEmpty() || ipage.AllLemmas.isNullOrEmpty();
            }
        }

        if (doEval)
        {
            // Reuse the stored word list when present, otherwise extract terms from the page.
            List<string> terms;
            if (ipage.AllWords.isNullOrEmpty())
            {
                terms = GetTermsForPage(target, idomain, ipage, evaluator, loger);
            }
            else
            {
                terms = ipage.AllWords.SplitSmart(",", "", true);
            }

            ipage.AllWords = terms.toCsvInLine();

            List<IWeightTableTerm> mchl = mTFIDF.GetMatches(terms);

            // Reuse the stored lemma list when present, otherwise take the nominal forms of the matched terms.
            List<string> lemmas;
            if (ipage.AllLemmas.isNullOrEmpty())
            {
                lemmas = new List<string>(mchl.Select(x => x.nominalForm));
            }
            else
            {
                lemmas = ipage.AllLemmas.SplitSmart(",", "", true);
            }

            double IP = 0;
            foreach (weightTableTermCompiled cterm in mchl)
            {
                IP += cterm.tf_idf;

                // Terms with document frequency 1 are collected as the domain's distinct lemmas.
                if (cterm.df == 1)
                {
                    dDistinctTerms.AddUnique(cterm.nominalForm);
                }
            }

            ipage.InfoPrize = IP;
            dIP += IP;

            ipage.Lemmas = lemmas.Count;
            ipage.AllLemmas = lemmas.toCsvInLine();

            dWords.AddRange(terms);
            dLemmas.AddRange(lemmas);

            ipage.Note = "indexUpdate" + SessionID;

            imbWEMManager.index.pageIndexTable.AddOrUpdate(ipage);
        }
        else
        {
            dIP += ipage.InfoPrize;
            doEvalD = false;
        }

        urls.Add(ipage.url);
        p++;

        // The logger is optional (the domain summary below guards it as well), so guard it here too.
        if (loger != null)
        {
            loger.AppendLine(string.Format("[{0,25}] [{1,70}] IP[{2,7}] LM[{3,6}]", idomain.domain, ipage.url.toWidthMaximum(60), ipage.InfoPrize.ToString("F4"), ipage.Lemmas.ToString("D5")));
        }

        target.Dispose();
    }

    if (imbWEMManager.settings.indexEngine.plugIn_indexDBUpdater_updateDomainEntry)
    {
        if (!doEvalD)
        {
            // At least one page was not recomputed, so the lists collected above do not cover the whole domain
            // (presumably why the lemma count is taken from the experiment's DLC TF-IDF instead — TODO confirm).
            var dlc_tf = imbWEMManager.index.experimentEntry.GetTFIDF_DLC(idomain);
            int dlc_c = dlc_tf.Count;

            idomain.TFIDFcompiled = (dlc_c > 0);
            idomain.Lemmas = dlc_c;
        }
        else
        {
            idomain.Lemmas = dLemmas.Count;
            idomain.Words = dWords.Count;
            idomain.TFIDFcompiled = (dLemmas.Count > 0);
            idomain.DistinctLemmas = dDistinctTerms.toCsvInLine();
            idomain.AllLemmas = dLemmas.toCsvInLine();
            idomain.AllWords = dWords.toCsvInLine();
        }

        idomain.InfoPrize = dIP;

        var urlAssert = imbWEMManager.index.pageIndexTable.GetUrlAssertion(urls);

        idomain.relevantPages = urlAssert[indexPageEvaluationEntryState.isRelevant].Count;
        idomain.notRelevantPages = urlAssert[indexPageEvaluationEntryState.notRelevant].Count;
        idomain.detected = urlAssert[indexPageEvaluationEntryState.haveNoEvaluationEntry].Count;
        idomain.Crawled = urlAssert.certainty;
        idomain.RelevantContentRatio = urlAssert.relevant;

        if (loger != null)
        {
            loger.AppendLine(string.Format("[{0,25}] Pages [{1,10}] IP[{2,10}] LM[{3,10}]", idomain.domain, p, idomain.InfoPrize.ToString("F5"), idomain.Lemmas.ToString("D7")));
        }

        imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);
    }

    imbWEMManager.index.wRecordsDeployed++;

    __wRecord.Dispose();
}