public override void eventPluginInstalled()
{
    // Bind the plugin to the current experiment session and route the logger to console output.
    session = imbWEMManager.index.experimentEntry;
    aceLog.consoleControl.setAsOutput(loger, "IndexDB:" + session.SessionID);

    // Take the shared index engine settings and switch off the options this plugin overrides.
    settings = imbWEMManager.settings.indexEngine;
    settings.doIndexUpdateOnDLC = false;
    settings.doIndexFullTrustMode = false;
    imbWEMManager.settings.directReportEngine.doPublishPerformance = false;

    // Language evaluator for English, Serbian (Latin) and Serbian (Cyrillic) with fixed token limits.
    evaluator = new multiLanguageEvaluator(basicLanguageEnum.english, basicLanguageEnum.serbian, basicLanguageEnum.serbianCyr)
    {
        testTokenLimit = 5000,
        tokenLengthMin = 3,
        validTokenTarget = 2500
    };
}
/// <summary>
/// Creates a token language filter node. The node is registered as a
/// <see cref="pipelineNodeTypeEnum.distributor"/> and owns a language evaluator
/// that is set up with the supplied languages.
/// </summary>
/// <param name="evaluationSettings">Evaluation task settings stored on the node.</param>
/// <param name="__languages">Languages the internal evaluator is set up with.</param>
/// <param name="__primLanguage">Language stored as the primary language of the node.</param>
public pipelineTokenLanguageFilterNode(multiLanguageEvaluationTask evaluationSettings, List<basicLanguageEnum> __languages, basicLanguageEnum __primLanguage)
{
    this._nodeType = pipelineNodeTypeEnum.distributor;

    this.languages = __languages;
    this.languagePrimary = __primLanguage;

    // The evaluator is configured from the stored language list.
    this.mLanguageEval = new multiLanguageEvaluator();
    this.mLanguageEval.setup(this.languages);

    this.settings = evaluationSettings;
}
/// <summary>
/// Creates a page language filter node. The node is registered as a
/// <see cref="pipelineNodeTypeEnum.distributor"/> and owns a language evaluator
/// that is set up with the supplied languages.
/// </summary>
/// <param name="evaluationSettings">Evaluation task settings stored on the node.</param>
/// <param name="__languages">Languages the internal evaluator is set up with.</param>
/// <param name="__primLanguage">Language stored as the primary language of the node.</param>
/// <param name="limitPageCount">Limit of the page count, stored as <c>limitValidPageCount</c>: positively evaluated pages are allowed only until the specified count is reached.</param>
public pipelinePageLanguageFilterNode(multiLanguageEvaluationTask evaluationSettings, List<basicLanguageEnum> __languages, basicLanguageEnum __primLanguage, Int32 limitPageCount)
{
    this._nodeType = pipelineNodeTypeEnum.distributor;

    this.languages = __languages;
    this.languagePrimary = __primLanguage;

    // The evaluator is configured from the stored language list.
    this.mLanguageEval = new multiLanguageEvaluator();
    this.mLanguageEval.setup(this.languages);

    this.limitValidPageCount = limitPageCount;
    this.settings = evaluationSettings;
}
/// <summary>
/// Performs full domain reevaluation: every loaded target of the site record is deployed to the page index,
/// scored against the supplied TF-IDF table, and - when enabled by the index engine settings - the domain
/// index entry is refreshed and stored.
/// </summary>
/// <param name="settings">Index engine configuration; its <c>plugIn_indexDBUpdater_optimizedMode</c> flag allows already evaluated pages to be reused.</param>
/// <param name="loger">Log builder receiving the per-page and per-domain report lines. Report lines are skipped when it is <c>null</c>.</param>
/// <param name="__wRecord">Site record whose loaded targets are evaluated. Each target and the record itself are disposed by this method.</param>
/// <param name="evaluator">Language evaluator forwarded to <c>GetTermsForPage</c> when page terms have to be extracted.</param>
/// <param name="mTFIDF">Compiled TF-IDF table used to match page terms and to compute the information prize.</param>
public void doDomainEvaluation(IndexEngineConfiguration settings, builderForLog loger, modelSpiderSiteRecord __wRecord, multiLanguageEvaluator evaluator, weightTableCompiled mTFIDF)
{
    // No locking is performed around the index tables in this method.
    indexDomain idomain = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
    idomain.url = __wRecord.domain;

    double dIP = 0;   // sum of page information prizes for the domain
    int p = 0;        // number of processed pages

    List<string> dDistinctTerms = new List<string>();
    List<string> dLemmas = new List<string>();
    List<string> dWords = new List<string>();
    List<string> urls = new List<string>();

    // Stays true only if every page was (re)evaluated in this pass.
    bool doEvalD = true;

    foreach (spiderTarget target in __wRecord.context.targets.GetLoaded())
    {
        indexPage ipage = imbWEMManager.index.deployTarget(target, __wRecord, idomain);

        bool doEval = true;

        if (settings.plugIn_indexDBUpdater_optimizedMode)
        {
            if ((ipage.InfoPrize > 0) && (ipage.Lemmas > 0) && (ipage.relevancyText == nameof(indexPageRelevancyEnum.isRelevant)))
            {
                // An already scored, relevant page is reused unless its stored word or lemma list is missing.
                doEval = ipage.AllWords.isNullOrEmpty() || ipage.AllLemmas.isNullOrEmpty();
            }
        }

        if (doEval)
        {
            List<string> terms;
            if (ipage.AllWords.isNullOrEmpty())
            {
                terms = GetTermsForPage(target, idomain, ipage, evaluator, loger);
            }
            else
            {
                terms = ipage.AllWords.SplitSmart(",", "", true);
            }
            ipage.AllWords = terms.toCsvInLine();

            double IP = 0;
            List<string> lemmas = new List<string>();
            List<IWeightTableTerm> mchl = mTFIDF.GetMatches(terms);

            if (ipage.AllLemmas.isNullOrEmpty())
            {
                lemmas.AddRange(mchl.Select(x => x.nominalForm));
            }
            else
            {
                lemmas = ipage.AllLemmas.SplitSmart(",", "", true);
            }

            foreach (weightTableTermCompiled cterm in mchl)
            {
                IP += cterm.tf_idf;

                // Terms with document frequency 1 are collected as distinct terms of the domain.
                if (cterm.df == 1)
                {
                    dDistinctTerms.AddUnique(cterm.nominalForm);
                }
            }

            ipage.InfoPrize = IP;
            dIP += IP;

            ipage.Lemmas = lemmas.Count;
            ipage.AllLemmas = lemmas.toCsvInLine();

            dWords.AddRange(terms);
            dLemmas.AddRange(lemmas);

            ipage.Note = "indexUpdate" + SessionID;

            imbWEMManager.index.pageIndexTable.AddOrUpdate(ipage);
        }
        else
        {
            // Existing page evaluation is reused; domain word/lemma lists can no longer be rebuilt from this pass.
            dIP += ipage.InfoPrize;
            doEvalD = false;
        }

        urls.Add(ipage.url);
        p++;

        // Guarded like the domain summary below, so a null logger does not abort the evaluation.
        if (loger != null)
        {
            loger.AppendLine(string.Format("[{0,25}] [{1,70}] IP[{2,7}] LM[{3,6}]", idomain.domain, ipage.url.toWidthMaximum(60), ipage.InfoPrize.ToString("F4"), ipage.Lemmas.ToString("D5")));
        }

        target.Dispose();
    }

    if (imbWEMManager.settings.indexEngine.plugIn_indexDBUpdater_updateDomainEntry)
    {
        if (!doEvalD)
        {
            // At least one page was reused: take the lemma count from the stored DLC TF-IDF table of the domain.
            var dlc_tf = imbWEMManager.index.experimentEntry.GetTFIDF_DLC(idomain);
            int dlc_c = dlc_tf.Count;
            idomain.TFIDFcompiled = (dlc_c > 0);
            idomain.Lemmas = dlc_c;
        }
        else
        {
            idomain.Lemmas = dLemmas.Count;
            idomain.Words = dWords.Count;
            idomain.TFIDFcompiled = (dLemmas.Count > 0);
            idomain.DistinctLemmas = dDistinctTerms.toCsvInLine();
            idomain.AllLemmas = dLemmas.toCsvInLine();
            idomain.AllWords = dWords.toCsvInLine();
        }

        idomain.InfoPrize = dIP;

        var urlAssert = imbWEMManager.index.pageIndexTable.GetUrlAssertion(urls);

        idomain.relevantPages = urlAssert[indexPageEvaluationEntryState.isRelevant].Count;
        idomain.notRelevantPages = urlAssert[indexPageEvaluationEntryState.notRelevant].Count;
        idomain.detected = urlAssert[indexPageEvaluationEntryState.haveNoEvaluationEntry].Count;
        idomain.Crawled = urlAssert.certainty;
        idomain.RelevantContentRatio = urlAssert.relevant;

        if (loger != null)
        {
            loger.AppendLine(string.Format("[{0,25}] Pages [{1,10}] IP[{2,10}] LM[{3,10}]", idomain.domain, p, idomain.InfoPrize.ToString("F5"), idomain.Lemmas.ToString("D7")));
        }

        imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);
    }

    imbWEMManager.index.wRecordsDeployed++;

    __wRecord.Dispose();
}
/// <summary>
/// Gets the cached DLC TF-IDF table of the domain, or creates it from the loaded targets of the site record.
/// </summary>
/// <param name="__wRecord">Site record providing the domain and the loaded targets.</param>
/// <param name="loger">Log builder used for progress and result messages.</param>
/// <param name="__useExisting">if set to <c>true</c>, an existing cache file is loaded and returned instead of rebuilding the table.</param>
/// <param name="__saveToCache">if set to <c>true</c>, the newly compiled table is saved to the cache file.</param>
/// <param name="evaluator">Language evaluator; when <c>null</c>, the evaluator of <c>__wRecord.tRecord</c> is used.</param>
/// <returns>The compiled DLC TF-IDF table for the domain.</returns>
public weightTableCompiled GetOrCreateTFIDF_DLC(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator = null)
{
    indexDomain idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(__wRecord.domain);

    FileInfo TFIDF_DLC_File = GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);
    weightTableCompiled TFIDF_DLC = null;

    if (TFIDF_DLC_File.Exists && __useExisting)
    {
        TFIDF_DLC = new weightTableCompiled(TFIDF_DLC_File.FullName, true, idomain.domain + "_DLC_TF_IDF");
        loger.log("DLC TF-IDF[" + TFIDF_DLC.Count + "] cache found for: " + idomain.domain);
        return TFIDF_DLC;
    }

    // Evaluator selection: fall back to the evaluator attached to the record.
    if (evaluator == null)
    {
        evaluator = __wRecord.tRecord.evaluator;
    }

    if (imbWEMManager.settings.TFIDF.doUseHeuristicDLCTFIDFConstruction)
    {
        TFIDF_DLC = GetOrCreateTFIDF_DLC_Heuristic(__wRecord, loger, __useExisting, __saveToCache, evaluator);
    }
    else
    {
        loger.log("DLC TF-IDF construction for: " + idomain.domain + " initiated.");

        termDocumentSet domainSet = new termDocumentSet("DomainTFIDF_source");

        var tLoaded = __wRecord.context.targets.GetLoaded();

        int tc = tLoaded.Count;
        int ti = 0;        // pages visited so far
        int c = 0;         // pages visited since the last progress report
        int input_c = 0;   // number of terms fed into the page tables
        int output_c = 0;  // number of entries the page tables report

        foreach (spiderTarget target in tLoaded)
        {
            ti++;
            c++;

            if (target.IsRelevant)
            {
                var wordlist = GetTermsForPage(target, idomain, null, evaluator, loger);
                input_c += wordlist.Count;

                // Direct cast: an unexpected table type fails here with InvalidCastException
                // instead of a NullReferenceException on the next line.
                termDocument pageTF = (termDocument)domainSet.AddTable(target.pageHash);
                pageTF.expansion = 1;
                pageTF.AddTokens(wordlist, loger);
                output_c += pageTF.Count();
            }

            if (c > 10)
            {
                c = 0;

                // Ratios are only needed when a progress line is written.
                double tp = ti.GetRatio(tc);
                double io_r = output_c.GetRatio(input_c);
                aceLog.consoleControl.writeToConsole("Pages processed [" + tp.ToString("P2") + "] Semantic compression rate: " + io_r.ToString("P2"), loger, false, 0);
            }
        }

        loger.log("[" + idomain.domain + "] preprocess finished. DLC TF-IDF terms [" + domainSet.CountAllDocuments() + "]");

        TFIDF_DLC = domainSet.AggregateDocument.GetCompiledTable(loger);
        TFIDF_DLC.name = "DLC-TFIDF " + idomain.domain;
    }

    idomain.Lemmas = TFIDF_DLC.Count;

    if (__saveToCache)
    {
        if (TFIDF_DLC.SaveAs(TFIDF_DLC_File.FullName, getWritableFileMode.overwrite))
        {
            loger.log("[" + idomain.domain + "] DLC TF-IDF compiled table cache saved to: " + TFIDF_DLC_File.FullName);
        }
        else
        {
            loger.log("[" + idomain.domain + "] DLC TF-IDF compiled table save failed");
        }
    }

    imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);

    return TFIDF_DLC;
}
/// <summary>
/// Returns the term list of a page: either the saved word list file of the page (when enabled and present)
/// or the tokens extracted from the transliterated page text.
/// </summary>
/// <param name="target">Spider target whose page text is processed.</param>
/// <param name="idomain">Domain index entry; when <c>null</c> it is resolved from the target's site record.</param>
/// <param name="ipage">Page index entry; when <c>null</c> it is resolved from the MD5 hash of the target URL.</param>
/// <param name="evaluator">Language evaluator; when <c>null</c> the evaluator of the target's record is used.</param>
/// <param name="loger">Log builder; not used by this method.</param>
/// <returns>Encoded terms of the page; empty when the page text is not evaluated as Serbian.</returns>
public List<string> GetTermsForPage(spiderTarget target, indexDomain idomain = null, indexPage ipage = null, multiLanguageEvaluator evaluator = null, builderForLog loger = null)
{
    idomain = idomain ?? imbWEMManager.index.domainIndexTable.GetOrCreate(target.parent.wRecord.domain);
    ipage = ipage ?? imbWEMManager.index.pageIndexTable.GetOrCreate(md5.GetMd5Hash(target.url));

    FileInfo wordListFile = GetWordList_File(idomain, ipage);

    // A previously saved word list short-circuits the whole extraction.
    if (imbWEMManager.settings.TFIDF.doUseSavedPageWordlists && wordListFile.Exists)
    {
        return wordListFile.FullName.openFileToList(true);
    }

    string pageContent = target.pageText.transliterate();

    if (evaluator == null)
    {
        evaluator = target.parent.wRecord.tRecord.evaluator;
    }

    multiLanguageEvaluation evaluation = evaluator.evaluate(pageContent);

    List<string> pageTerms = new List<string>();

    if (evaluation.result_language == basicLanguageEnum.serbian)
    {
        List<string> candidates = new List<string>(evaluation.singleLanguageTokens);

        if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
        {
            candidates.AddRange(evaluation.multiLanguageTokens);
        }

        // Drop tokens that are not clean words, as well as purely symbolic ones.
        candidates.RemoveAll(x => !x.isCleanWord() || x.isSymbolicContentOnly());

        pageTerms.AddRange(semanticLexiconManager.lexiconCache.encodeTwins(candidates));
    }

    if (imbWEMManager.settings.TFIDF.doSavePageWordlist)
    {
        pageTerms.saveContentOnFilePath(wordListFile.FullName);
    }

    return pageTerms;
}
/// <summary>
/// Gets the cached DLC TF-IDF table of the domain, or builds it heuristically from the term list
/// returned by <c>GetDLCTerms_Heuristics</c>.
/// </summary>
/// <param name="__wRecord">Site record providing the domain and the loaded targets.</param>
/// <param name="loger">Log builder used for progress and result messages.</param>
/// <param name="__useExisting">if set to <c>true</c>, an existing compiled table cache file (and, further down, an existing domain word list) is reused.</param>
/// <param name="__saveToCache">if set to <c>true</c>, the domain word list may be saved by <c>GetDLCTerms_Heuristics</c>. The compiled table itself is not saved by this method.</param>
/// <param name="evaluator">Language evaluator; when <c>null</c>, the evaluator of <c>__wRecord.tRecord</c> is used.</param>
/// <returns>The compiled DLC TF-IDF table for the domain.</returns>
public weightTableCompiled GetOrCreateTFIDF_DLC_Heuristic(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator = null)
{
    indexDomain idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(__wRecord.domain);

    FileInfo TFIDF_DLC_File = GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);
    weightTableCompiled TFIDF_DLC = null;

    if (TFIDF_DLC_File.Exists && __useExisting)
    {
        TFIDF_DLC = new weightTableCompiled(TFIDF_DLC_File.FullName, true, idomain.domain + "_DLC_TF_IDF");
        loger.log("DLC TF-IDF[" + TFIDF_DLC.Count + "] cache found for: " + idomain.domain);
        return TFIDF_DLC;
    }

    // Evaluator selection: fall back to the evaluator attached to the record.
    if (evaluator == null)
    {
        evaluator = __wRecord.tRecord.evaluator;
    }

    loger.log("DLC TF-IDF heuristic construction for: " + idomain.domain + " initiated.");

    termDocument domainTable = new termDocument();
    domainTable.expansion = 1;

    // The caller's __saveToCache flag is forwarded; previously __useExisting was passed in its place,
    // so the save request of the caller was ignored.
    var DLCTerms = GetDLCTerms_Heuristics(__wRecord, loger, __useExisting, __saveToCache, evaluator, idomain);

    domainTable.AddTokens(DLCTerms, loger);

    // Ratio of table entries to input terms, reported as "semantic compression".
    double tp = domainTable.Count().GetRatio(DLCTerms.Count);

    loger.log("[" + idomain.domain + "] preprocess finished. DLC TF-IDF terms [" + domainTable.Count() + "] - Semantic compression: " + tp.ToString("P2"));

    TFIDF_DLC = domainTable.GetCompiledTable(loger);
    TFIDF_DLC.name = "DLC-TFIDF " + idomain.domain;

    return TFIDF_DLC;
}
//public webSitePageTFSet GetTFIDF_MasterConstruct()
//{
//    if (globalTFIDFSet == null)
//    {
//        globalTFIDFSet = new webSitePageTFSet(SessionID, "Temporary TF-IDF table for master table construction");
//    }
//    return globalTFIDFSet;
//}

/// <summary>
/// Collects the domain-level term list: either loaded from the saved domain word list file,
/// or extracted from the text of all relevant loaded targets and passed through the language evaluator.
/// </summary>
/// <param name="__wRecord">Site record providing the loaded targets.</param>
/// <param name="loger">Log builder passed to the console progress output.</param>
/// <param name="__useExisting">if set to <c>true</c>, an existing domain word list file is loaded and returned.</param>
/// <param name="__saveToCache">if set to <c>true</c> (and domain word list saving is enabled in settings), the result is written to the word list file.</param>
/// <param name="evaluator">Language evaluator applied to the collected tokens.</param>
/// <param name="idomain">Domain index entry used to locate the word list file.</param>
/// <returns>Encoded domain terms.</returns>
private List<string> GetDLCTerms_Heuristics(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator, indexDomain idomain)
{
    FileInfo dlcWordList = GetWordList_File(idomain);

    if (dlcWordList.Exists && __useExisting)
    {
        return dlcWordList.FullName.openFileToList(true);
    }

    List<string> collectedTokens = new List<string>();

    var tLoaded = __wRecord.context.targets.GetLoaded();
    int total = tLoaded.Count;
    int visited = 0;
    int sinceReport = 0;

    foreach (spiderTarget target in tLoaded)
    {
        visited++;
        sinceReport++;

        double progress = visited.GetRatio(total);

        if (target.IsRelevant)
        {
            string pageText = WebUtility.HtmlDecode(target.pageText.transliterate());
            collectedTokens.AddRange(pageText.getTokens(true, true, true, true, 4));
        }

        // Progress line once more than ten pages were visited since the previous one.
        if (sinceReport > 10)
        {
            sinceReport = 0;
            aceLog.consoleControl.writeToConsole("Pages processed [" + progress.ToString("P2") + "]", loger, false, 0);
        }
    }

    multiLanguageEvaluation evaluation = evaluator.evaluate(collectedTokens, null, null);

    List<string> domainTerms = new List<string>();
    domainTerms.AddRange(evaluation.singleLanguageTokens);

    if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
    {
        domainTerms.AddRange(evaluation.multiLanguageTokens);
    }

    domainTerms = semanticLexiconManager.lexiconCache.encodeTwins(domainTerms);

    if (imbWEMManager.settings.TFIDF.doSaveDomainWordList && __saveToCache)
    {
        domainTerms.saveContentOnFilePath(dlcWordList.FullName);
    }

    return domainTerms;
}