/// <summary>
/// Sets up the plug-in state: binds the experiment session, registers the logger as a console output,
/// switches off three engine flags and builds the language evaluator.
/// </summary>
public override void eventPluginInstalled()
{
    session = imbWEMManager.index.experimentEntry;

    // Register the plug-in logger with the console control, labelled with the session id.
    string consoleLabel = "IndexDB:" + session.SessionID;
    aceLog.consoleControl.setAsOutput(loger, consoleLabel);

    // Index engine flags turned off for this plug-in.
    settings = imbWEMManager.settings.indexEngine;
    settings.doIndexUpdateOnDLC = false;
    settings.doIndexFullTrustMode = false;

    imbWEMManager.settings.directReportEngine.doPublishPerformance = false;

    // Evaluator is constructed for English, Serbian and Serbian (Cyrillic) with fixed token limits.
    evaluator = new multiLanguageEvaluator(basicLanguageEnum.english, basicLanguageEnum.serbian, basicLanguageEnum.serbianCyr)
    {
        testTokenLimit = 5000,
        tokenLengthMin = 3,
        validTokenTarget = 2500
    };
}
/// <summary>
/// Performs full domain reevaluation: each loaded target of the site record is deployed into the page index,
/// scored against <paramref name="mTFIDF"/> when it needs (re)evaluation, and the domain entry is refreshed
/// when the global <c>plugIn_indexDBUpdater_updateDomainEntry</c> setting is on.
/// </summary>
/// <param name="settings">Index engine settings; only <c>plugIn_indexDBUpdater_optimizedMode</c> is read from it here.</param>
/// <param name="loger">Receives one line per page and one summary line per domain. May be <c>null</c>, in which case nothing is logged.</param>
/// <param name="__wRecord">The site record whose loaded targets are evaluated. Each target and the record itself are disposed by this method.</param>
/// <param name="evaluator">The evaluator forwarded to <c>GetTermsForPage</c> when a page has no stored words.</param>
/// <param name="mTFIDF">Compiled weight table used to match page terms; the <c>tf_idf</c> of the matches is summed into the page InfoPrize.</param>
public void doDomainEvaluation(IndexEngineConfiguration settings, builderForLog loger, modelSpiderSiteRecord __wRecord, multiLanguageEvaluator evaluator, weightTableCompiled mTFIDF)
{
    indexDomain idomain = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
    idomain.url = __wRecord.domain;

    double dIP = 0;   // sum of page InfoPrize values for the domain
    int p = 0;        // number of pages processed
    List<string> dDistinctTerms = new List<string>();
    List<string> dLemmas = new List<string>();
    List<string> dWords = new List<string>();
    List<string> urls = new List<string>();

    // Stays true only if every page was evaluated in this call (none reused stored values).
    bool doEvalD = true;

    foreach (spiderTarget target in __wRecord.context.targets.GetLoaded())
    {
        indexPage ipage = imbWEMManager.index.deployTarget(target, __wRecord, idomain);

        // In optimized mode a page already marked relevant, with a positive InfoPrize and lemma count,
        // is reused as long as both its stored word and lemma lists are present.
        bool doEval = true;
        if (settings.plugIn_indexDBUpdater_optimizedMode)
        {
            if ((ipage.InfoPrize > 0) && (ipage.Lemmas > 0) && (ipage.relevancyText == nameof(indexPageRelevancyEnum.isRelevant)))
            {
                doEval = ipage.AllWords.isNullOrEmpty() || ipage.AllLemmas.isNullOrEmpty();
            }
        }

        if (doEval)
        {
            List<string> terms;
            if (ipage.AllWords.isNullOrEmpty())
            {
                terms = GetTermsForPage(target, idomain, ipage, evaluator, loger);
            }
            else
            {
                terms = ipage.AllWords.SplitSmart(",", "", true);
            }
            ipage.AllWords = terms.toCsvInLine();

            List<IWeightTableTerm> mchl = mTFIDF.GetMatches(terms);

            List<string> lemmas = new List<string>();
            if (ipage.AllLemmas.isNullOrEmpty())
            {
                lemmas.AddRange(mchl.Select(x => x.nominalForm));
            }
            else
            {
                lemmas = ipage.AllLemmas.SplitSmart(",", "", true);
            }

            double IP = 0;
            foreach (weightTableTermCompiled cterm in mchl)
            {
                IP += cterm.tf_idf;

                // Terms with a document frequency of exactly 1 are collected as the domain's distinct lemmas.
                if (cterm.df == 1)
                {
                    dDistinctTerms.AddUnique(cterm.nominalForm);
                }
            }

            ipage.InfoPrize = IP;
            dIP += IP;

            ipage.Lemmas = lemmas.Count;
            ipage.AllLemmas = lemmas.toCsvInLine();

            dWords.AddRange(terms);
            dLemmas.AddRange(lemmas);

            ipage.Note = "indexUpdate" + SessionID;

            imbWEMManager.index.pageIndexTable.AddOrUpdate(ipage);
        }
        else
        {
            dIP += ipage.InfoPrize;
            doEvalD = false;
        }

        urls.Add(ipage.url);
        p++;

        // Guarded like the domain summary below: a null logger must not abort the update half-way.
        if (loger != null)
        {
            loger.AppendLine(string.Format("[{0,25}] [{1,70}] IP[{2,7}] LM[{3,6}]", idomain.domain, ipage.url.toWidthMaximum(60), ipage.InfoPrize.ToString("F4"), ipage.Lemmas.ToString("D5")));
        }

        target.Dispose();
    }

    // NOTE(review): this flag is read from the global settings, not from the `settings` parameter - kept as in the original; confirm intent.
    if (imbWEMManager.settings.indexEngine.plugIn_indexDBUpdater_updateDomainEntry)
    {
        if (!doEvalD)
        {
            // At least one page reused stored values, so the locally collected lists are incomplete;
            // the lemma count is taken from the domain's TF-IDF table of the experiment entry instead.
            var dlc_tf = imbWEMManager.index.experimentEntry.GetTFIDF_DLC(idomain);
            int dlc_c = dlc_tf.Count;
            idomain.TFIDFcompiled = (dlc_c > 0);
            idomain.Lemmas = dlc_c;
        }
        else
        {
            idomain.Lemmas = dLemmas.Count;
            idomain.Words = dWords.Count;
            idomain.TFIDFcompiled = (dLemmas.Count > 0);
            idomain.DistinctLemmas = dDistinctTerms.toCsvInLine();
            idomain.AllLemmas = dLemmas.toCsvInLine();
            idomain.AllWords = dWords.toCsvInLine();
        }

        idomain.InfoPrize = dIP;

        var urlAssert = imbWEMManager.index.pageIndexTable.GetUrlAssertion(urls);

        idomain.relevantPages = urlAssert[indexPageEvaluationEntryState.isRelevant].Count;
        idomain.notRelevantPages = urlAssert[indexPageEvaluationEntryState.notRelevant].Count;
        idomain.detected = urlAssert[indexPageEvaluationEntryState.haveNoEvaluationEntry].Count;
        idomain.Crawled = urlAssert.certainty;
        idomain.RelevantContentRatio = urlAssert.relevant;

        if (loger != null)
        {
            loger.AppendLine(string.Format("[{0,25}] Pages [{1,10}] IP[{2,10}] LM[{3,10}]", idomain.domain, p, idomain.InfoPrize.ToString("F5"), idomain.Lemmas.ToString("D7")));
        }

        imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);
    }

    imbWEMManager.index.wRecordsDeployed++;

    __wRecord.Dispose();
}