/// <summary>
/// DLC-initiated handler: when the crawler's <c>FRONTIER_doLinkHarvest</c> setting is off,
/// feeds every page already indexed for the domain into the crawl context as a link.
/// </summary>
/// <param name="__session">The experiment session (not used by this handler).</param>
/// <param name="__task">The domain task (not used by this handler).</param>
/// <param name="__wRecord">Site record whose context receives the links.</param>
public override void eventDLCInitiated(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    // Nothing to do while link harvesting is enabled.
    if (__wRecord.tRecord.instance.settings.FRONTIER_doLinkHarvest)
    {
        return;
    }

    // The domain entry is looked up but its value is not used below; the call is kept as in the original.
    indexDomain domainEntry = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
    List<indexPage> knownPages = imbWEMManager.index.pageIndexTable.GetPagesForDomain(__wRecord.domainInfo.domainName);

    // First loaded target (if any) supplies the page passed along with every link.
    var firstLoaded = __wRecord.context.targets.GetLoaded().FirstOrDefault();
    var originPage = firstLoaded?.page;

    if (originPage != null)
    {
        loger.AppendLine(__wRecord.domain + " seed page selected -> " + originPage.url);
    }

    foreach (indexPage known in knownPages)
    {
        link pageLink = new link(known.url);
        __wRecord.context.processLink(pageLink, originPage, false);
    }
}
/// <summary>
/// Returns the domain index entry for this record, looking it up in the domain index table
/// on first use and keeping the result in <c>indexDomainInfo</c> afterwards.
/// </summary>
/// <returns>The stored or freshly looked-up <c>indexDomain</c>; whatever the lookup returned.</returns>
public indexDomain GetIndexInfo()
{
    if (indexDomainInfo != null)
    {
        return indexDomainInfo;
    }

    // A null lookup result is not stored as "resolved": the next call repeats the lookup.
    indexDomainInfo = imbWEMManager.index.domainIndexTable.GetDomain(domainInfo.domainName);
    return indexDomainInfo;
}
/// <summary>
/// DLC-initiated handler: schedules every page already indexed for the domain as a link,
/// unless a DLC TF-IDF file exists and <c>doSchedulePagesWithDLCTable</c> is off.
/// </summary>
/// <param name="__session">The experiment session; supplies the DLC file location.</param>
/// <param name="__task">The domain task (not used by this handler).</param>
/// <param name="__wRecord">Site record whose context receives the links.</param>
public override void eventDLCInitiated(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    // Read but not used further in this handler; kept as in the original.
    var state = __session.state;

    indexDomain domainEntry = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
    List<indexPage> knownPages = imbWEMManager.index.pageIndexTable.GetPagesForDomain(__wRecord.domainInfo.domainName);

    // First loaded target (if any) supplies the page passed along with every link.
    var firstLoaded = __wRecord.context.targets.GetLoaded().FirstOrDefault();
    var originPage = firstLoaded?.page;

    if (originPage != null)
    {
        loger.AppendLine(__wRecord.domain + " seed page selected -> " + originPage.url);
    }

    FileInfo dlcFile = __session.GetTFIDF_DLC_File(domainEntry);

    // An existing DLC file is enough unless the settings ask for the pages to be scheduled anyway.
    bool cacheIsSufficient = dlcFile.Exists && !imbWEMManager.settings.TFIDF.doSchedulePagesWithDLCTable;
    if (cacheIsSufficient)
    {
        loger.AppendLine(__wRecord.domain + " -> DLC cache found: " + dlcFile.FullName);
        return;
    }

    foreach (indexPage known in knownPages)
    {
        link pageLink = new link(known.url);

        // A page URL without the domain root name is only reported (log line + beep); it is still processed.
        bool belongsToDomain = known.url.Contains(__wRecord.domainInfo.domainRootName);
        if (!belongsToDomain)
        {
            loger.AppendLine(__wRecord.domain + " -X-> " + known.url + " Wrong link association?");
            aceTerminalInput.doBeepViaConsole(1600, 200, 3);
        }

        __wRecord.context.processLink(pageLink, originPage, false);
    }

    loger.AppendLine(__wRecord.domain + " -> " + __wRecord.web.webActiveLinks.Count + " targets set for load");
}
/// <summary>
/// Passes every page in the domain's page set to the record's context as a link,
/// using a synthetic spider page built from the domain URL as the link origin.
/// </summary>
/// <param name="wRecord">Site record whose context receives the links.</param>
/// <param name="domain">Index domain supplying the page set and the origin URL.</param>
public void SetActiveTargets(modelSpiderSiteRecord wRecord, indexDomain domain)
{
    List<indexPage> domainPages = domain.getPageSet();

    // Origin page is constructed from the domain URL with zeroed numeric arguments.
    spiderPage origin = new spiderPage(new crawledPage(domain.url, 0), 0, 0);

    foreach (indexPage entry in domainPages)
    {
        link entryLink = new link(entry.url);
        wRecord.context.processLink(entryLink, origin, false);
    }
}
/// <summary>
/// Returns the compiled DLC TF-IDF table for the record's domain, built with the heuristic
/// (single term document per domain) procedure, or loaded from the existing DLC file.
/// </summary>
/// <param name="__wRecord">Site record supplying the domain and, by default, the evaluator.</param>
/// <param name="loger">Log builder; it is dereferenced unconditionally, so it must not be null.</param>
/// <param name="__useExisting">When <c>true</c> and the DLC file exists, the table is loaded from it and returned immediately.</param>
/// <param name="__saveToCache">Forwarded to the term-collection step, which uses it to decide whether the domain word list is written.</param>
/// <param name="evaluator">Language evaluator; when <c>null</c>, the evaluator of <c>__wRecord.tRecord</c> is used.</param>
/// <returns>The loaded or freshly compiled table. This method does not itself write the compiled table to disk.</returns>
public weightTableCompiled GetOrCreateTFIDF_DLC_Heuristic(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator = null)
{
    indexDomain idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(__wRecord.domain);
    FileInfo TFIDF_DLC_File = GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);

    weightTableCompiled TFIDF_DLC = null;

    // Cache hit: load the compiled table from the DLC file and skip construction.
    if (TFIDF_DLC_File.Exists && __useExisting)
    {
        TFIDF_DLC = new weightTableCompiled(TFIDF_DLC_File.FullName, true, idomain.domain + "_DLC_TF_IDF");
        loger.log("DLC TF-IDF[" + TFIDF_DLC.Count + "] cache found for: " + idomain.domain);
        return TFIDF_DLC;
    }

    // Evaluator selection: fall back to the evaluator attached to the record.
    if (evaluator == null)
    {
        evaluator = __wRecord.tRecord.evaluator;
    }

    loger.log("DLC TF-IDF heuristic construction for: " + idomain.domain + " initiated.");

    termDocument domainTable = new termDocument();
    domainTable.expansion = 1;

    // FIX: the fourth argument used to be __useExisting (passed twice), which made
    // __saveToCache a dead parameter and tied word-list saving to the wrong flag.
    var DLCTerms = GetDLCTerms_Heuristics(__wRecord, loger, __useExisting, __saveToCache, evaluator, idomain);

    domainTable.AddTokens(DLCTerms, loger);

    // Ratio of entries in the term document to the number of input terms, reported as "semantic compression".
    double tp = domainTable.Count().GetRatio(DLCTerms.Count);

    loger.log("[" + idomain.domain + "] preprocess finished. DLC TF-IDF terms [" + domainTable.Count() + "] - Semantic compression: " + tp.ToString("P2"));

    TFIDF_DLC = domainTable.GetCompiledTable(loger);
    TFIDF_DLC.name = "DLC-TFIDF " + idomain.domain;

    return TFIDF_DLC;
}
/// <summary>
/// Copies the state of a spider target into its page index entry, stores the entry
/// in the page index table and returns it.
/// </summary>
/// <param name="target">Target whose URL, tokens and (when loaded) evaluation data are copied.</param>
/// <param name="wRecord">Site record supplying the domain name.</param>
/// <param name="idomain">Domain entry; looked up by domain name when <c>null</c>. It is not used further in this method.</param>
/// <returns>The updated page index entry.</returns>
public indexPage deployTarget(spiderTarget target, modelSpiderSiteRecord wRecord, indexDomain idomain)
{
    indexPage entry = pageIndexTable.GetPageForUrl(target.url);

    if (idomain == null)
    {
        idomain = domainIndexTable.GetDomain(wRecord.domainInfo.domainName);
    }

    // Fields that are written regardless of load state.
    entry.url = target.url;
    entry.tst = target.tokens.ToList().toCsvInLine(",");
    entry.domain = wRecord.domainInfo.domainName;

    if (target.isLoaded)
    {
        // Language evaluation details are copied only when a language was actually detected.
        bool languageDetected = (target.evaluation != null)
                                && (target.evaluation.result_language != basicLanguageEnum.unknown);
        if (languageDetected)
        {
            entry.langTestRatio = target.evaluation.result_ratio;
            entry.singleMatchTokens = target.evaluation.singleLanguageTokens.toCsvInLine(",");
            entry.multiMatchTokens = target.evaluation.multiLanguageTokens.toCsvInLine(",");
            entry.wordCount = target.evaluation.allContentTokens.Count();
            entry.AllWords = target.evaluation.allContentTokens.toCsvInLine();
            entry.language = target.evaluation.result_language.ToString();
        }

        // Relevant wins; otherwise an unknown evaluated language maps to "unknown", anything else to "notRelevant".
        entry.relevancy = target.IsRelevant
            ? indexPageRelevancyEnum.isRelevant
            : (target.evaluatedLanguage == basicLanguageEnum.unknown
                ? indexPageRelevancyEnum.unknown
                : indexPageRelevancyEnum.notRelevant);

        entry.byteSize = target.page.spiderResult.page.result.byteSize;
    }

    pageIndexTable.AddOrUpdate(entry);
    return entry;
}
/// <summary>
/// Deploys information from wRecord, including the key.
/// Fills this record's timing, block/term counts, page counts, recall and efficiency members
/// from the current state of the given site record.
/// </summary>
/// <param name="wRecord">The site record whose timeseries, targets, TF-IDF table and task results are read.</param>
public void deploy(modelSpiderSiteRecord wRecord)
{
    // Local accumulators; copied into the record's members at the very end.
    double i_lm_harvest = 0;
    double i_lm_recall = 0;
    double i_pi_harvest = 0;
    double i_pi_nominal = 0;

    // First, last and current timeseries entries; each is null when absent or of another type.
    dataUnitSpiderIteration spi_first = wRecord.timeseries.GetData().FirstOrDefault() as dataUnitSpiderIteration;
    dataUnitSpiderIteration spi_last = wRecord.timeseries.lastEntry as dataUnitSpiderIteration;
    dataUnitSpiderIteration spi_current = wRecord.timeseries.currentEntry as dataUnitSpiderIteration;

    // Seconds between the current entry's creation and this record's creationTime (0 when there is no current entry).
    if (spi_current != null)
    {
        time_duration_s = creationTime.Subtract(spi_current.rowCreated).TotalSeconds;
    }
    else
    {
        time_duration_s = 0;
    }

    // NOTE(review): the gross-duration assignment below appears to be commented out, which would
    // leave time_duration_gross_s unset by this method and spi_last unused — confirm against the original file.
    // if (spi_last != null) time_duration_gross_s = creationTime.Subtract(spi_last.rowCreated).TotalSeconds; else time_duration_gross_s = 0;

    // Seconds since the first timeseries entry (0 when there is none).
    if (spi_first != null)
    {
        time_sincefirst_s = creationTime.Subtract(spi_first.rowCreated).TotalSeconds;
    }
    else
    {
        time_sincefirst_s = 0;
    }

    // May be null; every later use of idomain is null-checked.
    indexDomain idomain = wRecord.GetIndexInfo(); // imbWEMManager.index.domainIndexTable.GetDomain(wRecord.domainInfo.domainName);

    iteration = wRecord.iteration;

    blocks_all = wRecord.context.targets.blocks.Count(false);
    blocks_relevant = wRecord.context.targets.blocks.Count(true);

    terms_all = wRecord.context.targets.termsAll.Count();
    terms_relevant = wRecord.context.targets.termSerbian.Count();

    // TFIDF is dereferenced without a null check here and again further down.
    var TFIDF = wRecord.MasterTFIDF; // imbWEMManager.index.experimentEntry.globalTFIDFCompiled;

    var mchs = TFIDF.GetMatches(wRecord.context.targets.termSerbian);

    //TFIDF.GetScoreAggregate()

    // Record key: domain name followed by the iteration number padded to three digits.
    key = wRecord.domainInfo.domainName + iteration.ToString("D3");

    int relCount = 0;
    int irelCount = 0;
    int lCount = 0;
    int rCount = 0;
    int dCount = 0;

    double fraDuration = 0;
    // Accumulated below but not read again inside this method.
    int modulesContained = 0;

    int rec = 0;

    // Sum the durations of the general frontier-ranking records and count them.
    foreach (frontierRankingAlgorithmIterationRecord gen in wRecord.frontierDLC.generalRecords)
    {
        rec++;
        fraDuration += gen.duration;
    }

    // Ratio of summed duration to record count, then that value relative to the iteration duration.
    FRA_SummaryRuntime = fraDuration.GetRatio((double)rec);
    FRA_TimePercent = FRA_SummaryRuntime.GetRatio(time_duration_s);

    // CPU is only overwritten when a measurement take is available.
    var rtake = wRecord.tRecord.measureTaker.GetLastTake();
    if (rtake != null)
    {
        CPU = rtake.cpuRateOfProcess;
    }

    if (imbWEMManager.settings.directReportEngine.DR_ReportModules)
    {
        foreach (moduleDLCRecord mod in wRecord.frontierDLC)
        {
            if (mod != null)
            {
                modulesContained += mod.GetLastEntry().accumulated;
            }
        }
    }

    // Page hashes already seen among targets flagged as duplicates.
    List<string> hashList = new List<string>();
    // Filled below but not read again inside this method.
    List<spiderTarget> nonDuplicate = new List<spiderTarget>();

    foreach (spiderTarget t in wRecord.context.targets.GetLoaded())
    {
        indexPage ipage = t.GetIndexPage(); //imbWEMManager.index.pageIndexTable.GetPageForUrl(t.url);

        //i_pi_harvest += ipage.InfoPrize;
        // At this point i_pi_nominal is a running SUM of InfoPrize over loaded targets that have an index page.
        if (ipage != null)
        {
            i_pi_nominal += ipage.InfoPrize;
        }

        bool isDuplicate = t.isDuplicate;

        // The first flagged-duplicate target seen for a given page hash is counted as a regular page;
        // only later targets with the same hash count as duplicates.
        if (isDuplicate)
        {
            if (!hashList.Contains(t.pageHash))
            {
                hashList.Add(t.pageHash);
                isDuplicate = false;
            }
        }

        if (!isDuplicate)
        {
            if (t.IsRelevant)
            {
                relCount++;
            }
            else
            {
                irelCount++;
            }
            lCount++;
            nonDuplicate.Add(t);
        }
        else
        {
            dCount++;
        }
    }

    relevantPageCount = relCount;
    irrelevantPageCount = irelCount;
    loadedPageCount = lCount;
    duplicateCount = dCount;

    int mchs_c = 0;
    int id_lm_c = 0;

    if (idomain != null)
    {
        id_lm_c = idomain.Lemmas;
    }
    if (mchs != null)
    {
        mchs_c = mchs.Count();
    }

    // Matched terms relative to the number of non-duplicate loaded pages.
    i_lm_harvest = mchs_c.GetRatio(loadedPageCount);

    IP = TFIDF.GetScoreForMatch(wRecord.context.targets.termSerbian);

    // Matched terms relative to the domain's lemma count, capped at 1.
    i_lm_recall = mchs_c.GetRatio(id_lm_c);
    if (i_lm_recall > 1)
    {
        i_lm_recall = 1;
    }

    // Uses the InfoPrize SUM; must run before i_pi_nominal is divided by the page count below.
    if (idomain != null)
    {
        IP_recall = i_pi_nominal.GetRatio(idomain.InfoPrize).ClipToK();
    }

    if (idomain != null)
    {
        Term_recall = wRecord.context.targets.termSerbian.Count().GetRatio(idomain.Words).ClipToK();
    }

    Page_recall = relevantPageCount.GetRatio(wRecord.pageRecallTarget).ClipToK();

    // From here on i_pi_nominal is InfoPrize relative to the number of non-duplicate loaded pages.
    i_pi_nominal = i_pi_nominal.GetRatio(loadedPageCount);
    i_pi_harvest = IP.GetRatio((double)lCount);

    // Total item count over all task results; lastResult ends up as the final result enumerated.
    spiderTaskResult lastResult = null;
    foreach (spiderTaskResult r in wRecord.spiderTaskResults)
    {
        lastResult = r;
        rCount = rCount + r.Count;
    }

    realLoadsCount = rCount;

    // Describe the items of the last task result; the three strings are only reset when such a result exists.
    if (lastResult != null)
    {
        targetUrl = "";
        targetLanguage = "";
        targetEvalRatio = "";

        foreach (spiderTaskResultItem item in lastResult.items.Values)
        {
            // The URL is appended for every item, while language/ratio are appended only when the
            // target is found in the context, so the lists can differ in length.
            targetUrl = targetUrl.add(item.target.url, ",");

            var t = wRecord.context.targets.GetByTarget(item.target);
            if (t != null)
            {
                if (t.evaluation != null)
                {
                    targetLanguage = targetLanguage.add(t.evaluatedLanguage.ToString(), ";");
                    targetEvalRatio = targetEvalRatio.add(t.evaluation.result_ratio.ToString(), ";");
                }
                else
                {
                    // No evaluation available: record a placeholder in both lists.
                    if (t.isDuplicate)
                    {
                        targetLanguage = targetLanguage.add("duplicate", ";");
                        targetEvalRatio = targetEvalRatio.add("duplicate", ";");
                    }
                    else
                    {
                        targetLanguage = targetLanguage.add("unknown", ";");
                        targetEvalRatio = targetEvalRatio.add("unknown", ";");
                    }
                }
            }
        }
    }

    // E_PP: relevant pages / loaded pages, 0 when either count is zero.
    if ((relevantPageCount == 0) || (loadedPageCount == 0))
    {
        E_PP = 0;
    }
    else
    {
        E_PP = (double)relevantPageCount / (double)loadedPageCount;
    }

    // E_TP: termSerbian count / termsAll count; E_TH: termSerbian count / loaded pages; both 0 when any input is zero.
    if ((wRecord.context.targets.termSerbian.Count == 0) || (wRecord.context.targets.termsAll.Count == 0) || (loadedPageCount == 0))
    {
        E_TP = 0;
        E_TH = 0;
    }
    else
    {
        E_TP = (double)wRecord.context.targets.termSerbian.Count / (double)wRecord.context.targets.termsAll.Count;
        E_TH = (double)wRecord.context.targets.termSerbian.Count / (double)loadedPageCount;
    }

    // Publish the local accumulators.
    IPnominal = i_pi_nominal;
    IP_collected = i_pi_harvest;
    Lm_collected = i_lm_harvest;
    Lm_recall = i_lm_recall;
}
/// <summary>
/// Performs full domain reevaluation: deploys every loaded target of the record into the page index,
/// (re)computes per-page terms, lemmas and InfoPrize, and optionally updates the domain index entry.
/// The targets and the record itself are disposed by this call.
/// </summary>
/// <param name="settings">The settings; only <c>plugIn_indexDBUpdater_optimizedMode</c> is read from it here.</param>
/// <param name="loger">The loger; may be <c>null</c>, in which case the per-page and summary lines are not written.</param>
/// <param name="__wRecord">The w record. Disposed at the end of the call.</param>
/// <param name="evaluator">The evaluator, forwarded to <c>GetTermsForPage</c>.</param>
/// <param name="mTFIDF">TF-IDF table used to match page terms; dereferenced without a null check.</param>
public void doDomainEvaluation(IndexEngineConfiguration settings, builderForLog loger, modelSpiderSiteRecord __wRecord, multiLanguageEvaluator evaluator, weightTableCompiled mTFIDF)
{
    // NOTE(review): GetDomain's result is used without a null check; other code in this project
    // null-checks the same lookup, so confirm the domain entry always exists at this point.
    indexDomain idomain = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
    idomain.url = __wRecord.domain;

    double dIP = 0;   // InfoPrize summed over all processed pages
    int p = 0;        // number of processed pages

    List<string> dDistinctTerms = new List<string>();
    List<string> dLemmas = new List<string>();
    List<string> dWords = new List<string>();
    List<string> urls = new List<string>();

    // Turns false as soon as one page is taken from the index without re-evaluation.
    bool doEvalD = true;

    foreach (spiderTarget target in __wRecord.context.targets.GetLoaded())
    {
        indexPage ipage = imbWEMManager.index.deployTarget(target, __wRecord, idomain);

        bool doEval = true;

        // Optimized mode: skip re-evaluation for pages that already carry a positive InfoPrize,
        // lemmas, relevant status and non-empty word/lemma lists.
        if (settings.plugIn_indexDBUpdater_optimizedMode)
        {
            if ((ipage.InfoPrize > 0) && (ipage.Lemmas > 0) && (ipage.relevancyText == nameof(indexPageRelevancyEnum.isRelevant)))
            {
                doEval = false;
                if (ipage.AllWords.isNullOrEmpty())
                {
                    doEval = true;
                }
                if (ipage.AllLemmas.isNullOrEmpty())
                {
                    doEval = true;
                }
            }
        }

        if (doEval)
        {
            // Terms: reuse the stored word list when present, otherwise extract them from the page.
            List<string> terms = new List<string>();
            if (ipage.AllWords.isNullOrEmpty())
            {
                terms = GetTermsForPage(target, idomain, ipage, evaluator, loger);
            }
            else
            {
                terms = ipage.AllWords.SplitSmart(",", "", true);
            }

            ipage.AllWords = terms.toCsvInLine();

            double IP = 0;
            List<string> lemmas = new List<string>();
            List<IWeightTableTerm> mchl = mTFIDF.GetMatches(terms);

            // Lemmas: reuse the stored list when present, otherwise take the nominal forms of the matches.
            if (ipage.AllLemmas.isNullOrEmpty())
            {
                lemmas.AddRange(mchl.Select(x => x.nominalForm));
            }
            else
            {
                lemmas = ipage.AllLemmas.SplitSmart(",", "", true);
            }

            // Page InfoPrize is the sum of tf_idf over matched terms; terms with df == 1 are collected as distinct.
            foreach (weightTableTermCompiled cterm in mchl)
            {
                IP += cterm.tf_idf;
                if (cterm.df == 1)
                {
                    dDistinctTerms.AddUnique(cterm.nominalForm);
                }
            }

            ipage.InfoPrize = IP;
            dIP += IP;

            ipage.Lemmas = lemmas.Count;
            ipage.AllLemmas = lemmas.toCsvInLine();

            dWords.AddRange(terms);
            dLemmas.AddRange(lemmas);

            ipage.Note = "indexUpdate" + SessionID;

            imbWEMManager.index.pageIndexTable.AddOrUpdate(ipage);
        }
        else
        {
            // Existing index values are trusted; the domain-level word/lemma lists are then incomplete,
            // which is why doEvalD switches the domain update below to the DLC table.
            dIP += ipage.InfoPrize;
            doEvalD = false;
        }

        urls.Add(ipage.url);
        p++;

        // FIX: this call used to dereference loger unconditionally although the domain summary
        // below is null-guarded; a null logger made the whole evaluation throw on the first page.
        if (loger != null)
        {
            loger.AppendLine(string.Format("[{0,25}] [{1,70}] IP[{2,7}] LM[{3,6}]", idomain.domain, ipage.url.toWidthMaximum(60), ipage.InfoPrize.ToString("F4"), ipage.Lemmas.ToString("D5")));
        }

        target.Dispose();
    }

    if (imbWEMManager.settings.indexEngine.plugIn_indexDBUpdater_updateDomainEntry)
    {
        if (!doEvalD)
        {
            // At least one page was not re-evaluated: take the lemma count from the stored DLC TF-IDF table.
            var dlc_tf = imbWEMManager.index.experimentEntry.GetTFIDF_DLC(idomain);
            int dlc_c = dlc_tf.Count;

            idomain.TFIDFcompiled = (dlc_c > 0);
            idomain.Lemmas = dlc_c;
        }
        else
        {
            // Every page was evaluated: domain statistics come from the collected lists.
            idomain.Lemmas = dLemmas.Count;
            idomain.Words = dWords.Count;
            idomain.TFIDFcompiled = (dLemmas.Count > 0);
            idomain.DistinctLemmas = dDistinctTerms.toCsvInLine();
            idomain.AllLemmas = dLemmas.toCsvInLine();
            idomain.AllWords = dWords.toCsvInLine();
        }

        idomain.InfoPrize = dIP;

        // Page-state statistics for the URLs processed above.
        var urlAssert = imbWEMManager.index.pageIndexTable.GetUrlAssertion(urls);

        idomain.relevantPages = urlAssert[indexPageEvaluationEntryState.isRelevant].Count;
        idomain.notRelevantPages = urlAssert[indexPageEvaluationEntryState.notRelevant].Count;
        idomain.detected = urlAssert[indexPageEvaluationEntryState.haveNoEvaluationEntry].Count;
        idomain.Crawled = urlAssert.certainty;
        idomain.RelevantContentRatio = urlAssert.relevant;

        string rpp = string.Format("[{0,25}] Pages [{1,10}] IP[{2,10}] LM[{3,10}]", idomain.domain, p, idomain.InfoPrize.ToString("F5"), idomain.Lemmas.ToString("D7"));
        if (loger != null)
        {
            loger.AppendLine(rpp);
        }
    }

    if (imbWEMManager.settings.indexEngine.plugIn_indexDBUpdater_updateDomainEntry)
    {
        imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);
    }

    imbWEMManager.index.wRecordsDeployed++;

    __wRecord.Dispose();
}
/// <summary>
/// Returns the compiled DLC TF-IDF table for the record's domain: loaded from the DLC file when
/// allowed and present, otherwise constructed (heuristically or page by page, depending on settings).
/// After construction the domain's lemma count is updated and the domain entry is stored.
/// </summary>
/// <param name="__wRecord">The w record.</param>
/// <param name="loger">The loger; dereferenced unconditionally.</param>
/// <param name="__useExisting">if set to <c>true</c>, an existing DLC file is loaded and returned directly.</param>
/// <param name="__saveToCache">if set to <c>true</c>, the constructed table is saved to the DLC file.</param>
/// <param name="evaluator">The evaluator; defaults to the evaluator of <c>__wRecord.tRecord</c>.</param>
/// <returns>The loaded or constructed compiled table.</returns>
public weightTableCompiled GetOrCreateTFIDF_DLC(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator = null)
{
    // Progress is reported after every 11th target.
    const int progressInterval = 11;

    indexDomain idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(__wRecord.domain);
    FileInfo cacheFile = GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);

    // Cache hit: nothing below (lemma count update, save, domain store) runs in this case.
    if (cacheFile.Exists && __useExisting)
    {
        weightTableCompiled cached = new weightTableCompiled(cacheFile.FullName, true, idomain.domain + "_DLC_TF_IDF");
        loger.log("DLC TF-IDF[" + cached.Count + "] cache found for: " + idomain.domain);
        return cached;
    }

    // Evaluator selection.
    evaluator = evaluator ?? __wRecord.tRecord.evaluator;

    weightTableCompiled compiled;

    if (imbWEMManager.settings.TFIDF.doUseHeuristicDLCTFIDFConstruction)
    {
        compiled = GetOrCreateTFIDF_DLC_Heuristic(__wRecord, loger, __useExisting, __saveToCache, evaluator);
    }
    else
    {
        loger.log("DLC TF-IDF construction for: " + idomain.domain + " initiated.");

        termDocumentSet documents = new termDocumentSet("DomainTFIDF_source");

        var loadedTargets = __wRecord.context.targets.GetLoaded();
        int total = loadedTargets.Count;
        int position = 0;
        int termsIn = 0;
        int termsOut = 0;

        foreach (spiderTarget target in loadedTargets)
        {
            position++;
            double progress = position.GetRatio(total);

            // One term document per relevant page, registered under the page hash.
            if (target.IsRelevant)
            {
                var pageTerms = GetTermsForPage(target, idomain, null, evaluator, loger);
                termsIn += pageTerms.Count;

                termDocument pageDocument = documents.AddTable(target.pageHash) as termDocument;
                pageDocument.expansion = 1;
                pageDocument.AddTokens(pageTerms, loger);

                termsOut += pageDocument.Count();
            }

            if (position % progressInterval == 0)
            {
                double compression = termsOut.GetRatio(termsIn);
                aceLog.consoleControl.writeToConsole("Pages processed [" + progress.ToString("P2") + "] Semantic compression rate: " + compression.ToString("P2"), loger, false, 0);
            }
        }

        loger.log("[" + idomain.domain + "] preprocess finished. DLC TF-IDF terms [" + documents.CountAllDocuments() + "]");

        compiled = documents.AggregateDocument.GetCompiledTable(loger);
        compiled.name = "DLC-TFIDF " + idomain.domain;
    }

    idomain.Lemmas = compiled.Count;

    if (__saveToCache)
    {
        bool saved = compiled.SaveAs(cacheFile.FullName, getWritableFileMode.overwrite);
        string outcome = saved
            ? "[" + idomain.domain + "] DLC TF-IDF compiled table cache saved to: " + cacheFile.FullName
            : "[" + idomain.domain + "] DLC TF-IDF compiled table save failed";
        loger.log(outcome);
    }

    imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);

    return compiled;
}
/// <summary>
/// Returns the term list for a page: read from the page's word-list file when allowed and present,
/// otherwise extracted from the page text. Terms are only extracted when the evaluated language is Serbian.
/// </summary>
/// <param name="target">Target supplying the page text and URL.</param>
/// <param name="idomain">Domain entry; obtained from the domain index table when <c>null</c>.</param>
/// <param name="ipage">Page entry; obtained from the page index table (keyed by the MD5 of the URL) when <c>null</c>.</param>
/// <param name="evaluator">Language evaluator; defaults to the one attached to the target's record.</param>
/// <param name="loger">Not used inside this method.</param>
/// <returns>The term list; empty when the page language is not evaluated as Serbian.</returns>
public List<string> GetTermsForPage(spiderTarget target, indexDomain idomain = null, indexPage ipage = null, multiLanguageEvaluator evaluator = null, builderForLog loger = null)
{
    idomain = idomain ?? imbWEMManager.index.domainIndexTable.GetOrCreate(target.parent.wRecord.domain);
    ipage = ipage ?? imbWEMManager.index.pageIndexTable.GetOrCreate(md5.GetMd5Hash(target.url));

    FileInfo wordListFile = GetWordList_File(idomain, ipage);

    // Saved word list takes precedence when the settings allow it.
    if (imbWEMManager.settings.TFIDF.doUseSavedPageWordlists && wordListFile.Exists)
    {
        return wordListFile.FullName.openFileToList(true);
    }

    List<string> terms = new List<string>();

    string content = target.pageText.transliterate();

    evaluator = evaluator ?? target.parent.wRecord.tRecord.evaluator;

    multiLanguageEvaluation languageTest = evaluator.evaluate(content);

    if (languageTest.result_language == basicLanguageEnum.serbian)
    {
        // Single-language tokens always; multi-language tokens unless restricted by settings.
        List<string> candidates = new List<string>();
        candidates.AddRange(languageTest.singleLanguageTokens);

        if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
        {
            candidates.AddRange(languageTest.multiLanguageTokens);
        }

        // Drop tokens that are not clean words, and clean ones that are symbolic content only.
        candidates.RemoveAll(x => !x.isCleanWord() || x.isSymbolicContentOnly());

        terms.AddRange(semanticLexiconManager.lexiconCache.encodeTwins(candidates));
    }

    // The (possibly empty) list is written whenever saving is enabled.
    if (imbWEMManager.settings.TFIDF.doSavePageWordlist)
    {
        terms.saveContentOnFilePath(wordListFile.FullName);
    }

    return terms;
}
//public webSitePageTFSet GetTFIDF_MasterConstruct()
//{
//    if (globalTFIDFSet == null)
//    {
//        globalTFIDFSet = new webSitePageTFSet(SessionID, "Temporary TF-IDF table for master table construction");
//    }
//    return globalTFIDFSet;
//}

/// <summary>
/// Collects the domain-level term list for the heuristic DLC TF-IDF construction: tokens of all
/// relevant loaded pages are pooled, evaluated once, and the accepted tokens are returned.
/// </summary>
/// <param name="__wRecord">Site record supplying the loaded targets.</param>
/// <param name="loger">Log builder passed to the console progress output.</param>
/// <param name="__useExisting">When <c>true</c> and the domain word-list file exists, its content is returned directly.</param>
/// <param name="__saveToCache">When <c>true</c> (and <c>doSaveDomainWordList</c> is on), the result is written to the word-list file.</param>
/// <param name="evaluator">Language evaluator applied to the pooled tokens.</param>
/// <param name="idomain">Domain entry that determines the word-list file.</param>
/// <returns>The term list read from file or freshly collected.</returns>
private List<string> GetDLCTerms_Heuristics(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator, indexDomain idomain)
{
    // Progress is reported after every 11th target.
    const int progressInterval = 11;

    FileInfo wordListFile = GetWordList_File(idomain);

    if (wordListFile.Exists && __useExisting)
    {
        return wordListFile.FullName.openFileToList(true);
    }

    List<string> pooledTokens = new List<string>();

    var loadedTargets = __wRecord.context.targets.GetLoaded();
    int total = loadedTargets.Count;
    int position = 0;

    foreach (spiderTarget target in loadedTargets)
    {
        position++;
        double progress = position.GetRatio(total);

        // Only relevant pages contribute tokens.
        if (target.IsRelevant)
        {
            string content = WebUtility.HtmlDecode(target.pageText.transliterate());
            pooledTokens.AddRange(content.getTokens(true, true, true, true, 4));
        }

        if (position % progressInterval == 0)
        {
            aceLog.consoleControl.writeToConsole("Pages processed [" + progress.ToString("P2") + "]", loger, false, 0);
        }
    }

    multiLanguageEvaluation languageTest = evaluator.evaluate(pooledTokens, null, null);

    // Single-language tokens always; multi-language tokens unless restricted by settings.
    List<string> accepted = new List<string>();
    accepted.AddRange(languageTest.singleLanguageTokens);

    if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
    {
        accepted.AddRange(languageTest.multiLanguageTokens);
    }

    accepted = semanticLexiconManager.lexiconCache.encodeTwins(accepted);

    if (imbWEMManager.settings.TFIDF.doSaveDomainWordList && __saveToCache)
    {
        accepted.saveContentOnFilePath(wordListFile.FullName);
    }

    return accepted;
}
/// <summary>
/// Creates a compiled weight table bound to the domain's DLC file
/// (<c>dlc_&lt;HashCode&gt;.xml</c> inside the TF-IDF construct folder), named after the domain.
/// </summary>
/// <param name="idomain">Domain whose hash code and name are used.</param>
/// <param name="mode">Not used by this method.</param>
/// <returns>A new <c>weightTableCompiled</c> created from the file path.</returns>
public weightTableCompiled GetTFIDF_DLC(indexDomain idomain, getWritableFileMode mode = getWritableFileMode.newOrExisting)
{
    var dlcPath = TFIDF_ConstructFolder.pathFor("dlc_" + idomain.HashCode + ".xml");

    // FileInfo is only used to obtain the full path; existence is not checked here.
    FileInfo dlcFile = new FileInfo(dlcPath);

    weightTableCompiled table = new weightTableCompiled(dlcFile.FullName, true, idomain.domain);
    return table;
}
/// <summary>
/// Resolves the DLC TF-IDF file of a domain: <c>dlc_&lt;HashCode&gt;.xml</c> inside the TF-IDF construct folder.
/// </summary>
/// <param name="idomain">Domain whose hash code forms the file name.</param>
/// <param name="mode">File mode forwarded to <c>getWritableFile</c>.</param>
/// <returns>The <c>FileInfo</c> produced by <c>getWritableFile</c> for that path.</returns>
public FileInfo GetTFIDF_DLC_File(indexDomain idomain, getWritableFileMode mode = getWritableFileMode.newOrExisting)
{
    var dlcPath = TFIDF_ConstructFolder.pathFor("dlc_" + idomain.HashCode + ".xml");
    FileInfo dlcFile = dlcPath.getWritableFile(mode);
    return dlcFile;
}
/// <summary>
/// Resolves the domain word-list file: <c>d_words_&lt;HashCode&gt;.txt</c> inside the TF-IDF construct folder,
/// always requested in <c>newOrExisting</c> mode.
/// </summary>
/// <param name="idomain">Domain whose hash code forms the file name.</param>
/// <returns>The <c>FileInfo</c> produced by <c>getWritableFile</c> for that path.</returns>
public FileInfo GetWordList_File(indexDomain idomain)
{
    var wordListPath = TFIDF_ConstructFolder.pathFor("d_words_" + idomain.HashCode + ".txt");
    FileInfo wordListFile = wordListPath.getWritableFile(getWritableFileMode.newOrExisting);
    return wordListFile;
}