/// <summary>
/// Builds a <c>weightTableCompiled</c> for every file in <c>TFIDF_ConstructFolder</c> that matches <c>dlc_*.xml</c>.
/// </summary>
/// <param name="loger">Optional log builder. When supplied, the number of detected files and the loading progress are reported.</param>
/// <returns>The tables, in the order in which the files were returned by the folder search.</returns>
public List<weightTableCompiled> GetTFIDF_DLC_AllCached(builderForLog loger = null)
{
    List<weightTableCompiled> tables = new List<weightTableCompiled>();
    List<string> cachedFiles = TFIDF_ConstructFolder.findFiles("dlc_*.xml");

    if (loger != null)
    {
        loger.log($"[{cachedFiles.Count}] DLC TFIDF files detected in the cache folder [{TFIDF_ConstructFolder.path}]");
    }

    int total = cachedFiles.Count;
    for (int index = 0; index < total; index++)
    {
        // Tables are named after their 1-based position, padded to five digits
        int ordinal = index + 1;
        tables.Add(new weightTableCompiled(cachedFiles[index], true, ordinal.ToString("D5")));

        double progress = ordinal.GetRatio(total);
        if (loger != null)
        {
            aceLog.consoleControl.writeToConsole(progress.ToString("P2") + " ", loger, false, 0);
        }
    }

    return tables;
}
/// <summary>
/// Starts a session: stores the identifiers and the crawl job state, creates the session and crawl report folders,
/// points the index and TF-IDF cache folders at the index folder, makes sure the global master TF-IDF table
/// instance exists and finally builds the sample list hash.
/// </summary>
/// <param name="__CrawlID">Crawl identifier; also used as the name of the crawl report sub folder.</param>
/// <param name="__indexID">Not used by this method.</param>
/// <param name="__SessionID">Session identifier; also used as the name of the session report folder.</param>
/// <param name="__state">Crawl job context stored in <c>state</c>.</param>
public void StartSession(string __CrawlID, indexPerformanceEntry __indexID, string __SessionID, ICrawlJobContext __state)
{
    CrawlID = __CrawlID;
    SessionID = __SessionID;
    state = __state;

    // Report folders: reportOutput / <session> / <crawl>
    folderNode reportRoot = new folderNode("reportOutput", "reporting module", "");
    sessionReportFolder = reportRoot.createDirectory(
        SessionID,
        "",
        imbWEMManager.settings.directReportEngine.doAutoRenameSessionFolder);

    TestID = CrawlID + "-" + SessionID;
    ReportPath = sessionReportFolder.path;

    string crawlFolderDescription = "Report folder for Crawl [" + __CrawlID + "] - part of session: " + SessionID;
    sessionCrawlerFolder = sessionReportFolder.createDirectory(__CrawlID, crawlFolderDescription);

    // Both the sub-index folder and the TF-IDF cache folder are the index folder itself
    indexSubFolder = imbWEMManager.index.folder;
    TFIDF_ConstructFolder = imbWEMManager.index.folder;

    FileInfo masterFile = GetTFIDF_Master_File();
    if (globalTFIDFCompiled == null)
    {
        globalTFIDFCompiled = new weightTableCompiled(masterFile.FullName, true, SessionID);
        globalTFIDFCompiled.ReadOnlyMode = true;
    }

    // The randomization flag is set before the sample is built
    SampleRandomOrder = imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleOrder;
    SampleListHash = randomizeSample();
}
// <--------------------------- data getters

/// <summary>
/// Clears the members listed below by setting each of them to <c>null</c>.
/// No <c>Dispose</c> call is made on the referenced objects here.
/// </summary>
public void Dispose()
{
    // Time series and the last input / output snapshots
    timeseries = null;
    lastInput = null;
    lastOutput = null;
    currentModuleData = null;

    // Web and profile references
    web = null;
    wProfile = null;

    // Collected page sets and statistics
    listOfDuplicatedPages = null;
    resultPageSet = null;
    spiderTaskResults = null;
    crossLinkStats = null;

    linkHierarchy = null;
    _MasterTFIDF = null;
}
/// <summary>
/// Returns the DLC TF-IDF table for the domain of <paramref name="__wRecord"/>: the cached file is used when it exists and
/// <paramref name="__useExisting"/> is set, otherwise the table is compiled from the terms returned by <c>GetDLCTerms_Heuristics</c>.
/// </summary>
/// <param name="__wRecord">Site record whose domain is processed.</param>
/// <param name="loger">Log builder; used without a null check, so it must be supplied.</param>
/// <param name="__useExisting">When <c>true</c>, an existing cache file is returned instead of building a new table.</param>
/// <param name="__saveToCache">Not used by this method.</param>
/// <param name="evaluator">Evaluator to use; when <c>null</c>, the evaluator of <c>__wRecord.tRecord</c> is taken.</param>
/// <returns>The cached or newly compiled table. This method does not write the table to disk.</returns>
public weightTableCompiled GetOrCreateTFIDF_DLC_Heuristic(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator = null)
{
    indexDomain idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(__wRecord.domain);
    FileInfo cacheFile = GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);

    if (cacheFile.Exists && __useExisting)
    {
        weightTableCompiled cached = new weightTableCompiled(cacheFile.FullName, true, idomain.domain + "_DLC_TF_IDF");
        loger.log("DLC TF-IDF[" + cached.Count + "] cache found for: " + idomain.domain);
        return cached;
    }

    // <--------------- evaluator selection
    if (evaluator == null)
    {
        evaluator = __wRecord.tRecord.evaluator;
    }

    loger.log("DLC TF-IDF heuristic construction for: " + idomain.domain + " initiated.");

    termDocument domainTable = new termDocument { expansion = 1 };

    // NOTE(review): __useExisting is passed twice and __saveToCache is never used in this method —
    // confirm against the signature of GetDLCTerms_Heuristics whether the second flag should be __saveToCache.
    var dlcTerms = GetDLCTerms_Heuristics(__wRecord, loger, __useExisting, __useExisting, evaluator, idomain);
    domainTable.AddTokens(dlcTerms, loger);

    // Ratio of terms in the table to the number of supplied tokens
    double compression = domainTable.Count().GetRatio(dlcTerms.Count);
    loger.log("[" + idomain.domain + "] preprocess finished. DLC TF-IDF terms [" + domainTable.Count() + "] - Semantic compression: " + compression.ToString("P2"));

    weightTableCompiled compiled = domainTable.GetCompiledTable(loger);
    compiled.name = "DLC-TFIDF " + idomain.domain;
    return compiled;
}
/// <summary>
/// Returns a master TF-IDF table built from the master file.
/// </summary>
/// <param name="loadNew">
/// When <c>true</c>, a new read-only instance is created on every call and the shared instance is left untouched.
/// When <c>false</c>, the shared <c>globalTFIDFCompiled</c> instance is returned and created first if it does not exist yet.
/// </param>
/// <returns>A table with <c>ReadOnlyMode</c> set by this method whenever it creates the instance.</returns>
public weightTableCompiled LoadNewTFIDF_Master(bool loadNew = true)
{
    FileInfo masterFile = GetTFIDF_Master_File();

    if (loadNew)
    {
        return new weightTableCompiled(masterFile.FullName, true, SessionID) { ReadOnlyMode = true };
    }

    if (globalTFIDFCompiled == null)
    {
        globalTFIDFCompiled = new weightTableCompiled(masterFile.FullName, true, SessionID);
        globalTFIDFCompiled.ReadOnlyMode = true;
    }

    return globalTFIDFCompiled;
}
/// <summary>
/// Called when the DLC of one domain has finished: asks the session to get or create the DLC TF-IDF table
/// for the domain of <paramref name="__wRecord"/>.
/// </summary>
/// <remarks>
/// The returned table is not needed here, so it is discarded; the call is made for whatever
/// <c>GetOrCreateTFIDF_DLC</c> does while obtaining the table. The cache usage and cache saving flags come from
/// <c>imbWEMManager.settings.TFIDF</c>. A large block of commented-out legacy code (per-page TF-IDF application
/// and master table update) used to live in this method; it was removed and remains available in version control.
/// </remarks>
/// <param name="__session">Session that owns the TF-IDF tables.</param>
/// <param name="__task">Not used by this method.</param>
/// <param name="__wRecord">Site record of the finished domain.</param>
public override void eventDLCFinished(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    __session.GetOrCreateTFIDF_DLC(
        __wRecord,
        loger,
        imbWEMManager.settings.TFIDF.doUseCachedDLCTables,
        imbWEMManager.settings.TFIDF.doSaveCacheOfDLCTables,
        evaluator);
}
/// <summary>
/// Performs full domain reevaluation: every loaded target of the site record is deployed into the page index,
/// scored against the master TF-IDF table and stored; afterwards the domain entry is optionally updated.
/// </summary>
/// <remarks>
/// Each processed target and, at the end, <paramref name="__wRecord"/> itself are disposed by this method.
/// </remarks>
/// <param name="settings">The settings; only <c>plugIn_indexDBUpdater_optimizedMode</c> is read from this instance.</param>
/// <param name="loger">The loger. May be <c>null</c>, in which case the report lines written by this method are skipped; it is also forwarded to <c>GetTermsForPage</c>.</param>
/// <param name="__wRecord">The w record.</param>
/// <param name="evaluator">The evaluator, forwarded to <c>GetTermsForPage</c> for pages without a stored word list.</param>
/// <param name="mTFIDF">Master TF-IDF table used to match the page terms; used without a null check.</param>
public void doDomainEvaluation(IndexEngineConfiguration settings, builderForLog loger, modelSpiderSiteRecord __wRecord, multiLanguageEvaluator evaluator, weightTableCompiled mTFIDF)
{
    indexDomain idomain = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
    idomain.url = __wRecord.domain;

    double dIP = 0;
    int p = 0;

    List<string> dDistinctTerms = new List<string>();
    List<string> dLemmas = new List<string>();
    List<string> dWords = new List<string>();
    List<string> urls = new List<string>();

    // Becomes false as soon as one page keeps its stored values instead of being evaluated
    bool doEvalD = true;

    foreach (spiderTarget target in __wRecord.context.targets.GetLoaded())
    {
        indexPage ipage = imbWEMManager.index.deployTarget(target, __wRecord, idomain);

        // In optimized mode a page that already has a positive InfoPrize, a positive lemma count and is marked
        // relevant is not evaluated again, unless its stored word list or lemma list is empty.
        bool doEval = true;
        if (settings.plugIn_indexDBUpdater_optimizedMode)
        {
            if ((ipage.InfoPrize > 0) && (ipage.Lemmas > 0) && (ipage.relevancyText == nameof(indexPageRelevancyEnum.isRelevant)))
            {
                doEval = ipage.AllWords.isNullOrEmpty() || ipage.AllLemmas.isNullOrEmpty();
            }
        }

        if (doEval)
        {
            // Words: reuse the stored list when present, otherwise extract them from the page
            List<string> terms;
            if (ipage.AllWords.isNullOrEmpty())
            {
                terms = GetTermsForPage(target, idomain, ipage, evaluator, loger);
            }
            else
            {
                terms = ipage.AllWords.SplitSmart(",", "", true);
            }
            ipage.AllWords = terms.toCsvInLine();

            List<IWeightTableTerm> mchl = mTFIDF.GetMatches(terms);

            // Lemmas: reuse the stored list when present, otherwise take the nominal forms of the matches
            List<string> lemmas;
            if (ipage.AllLemmas.isNullOrEmpty())
            {
                lemmas = new List<string>();
                lemmas.AddRange(mchl.Select(x => x.nominalForm));
            }
            else
            {
                lemmas = ipage.AllLemmas.SplitSmart(",", "", true);
            }

            // InfoPrize of the page is the sum of TF-IDF weights of all matched terms
            double IP = 0;
            foreach (weightTableTermCompiled cterm in mchl)
            {
                IP += cterm.tf_idf;
                if (cterm.df == 1)
                {
                    dDistinctTerms.AddUnique(cterm.nominalForm);
                }
            }

            ipage.InfoPrize = IP;
            dIP += IP;
            ipage.Lemmas = lemmas.Count;
            ipage.AllLemmas = lemmas.toCsvInLine();

            dWords.AddRange(terms);
            dLemmas.AddRange(lemmas);

            ipage.Note = "indexUpdate" + SessionID;
            imbWEMManager.index.pageIndexTable.AddOrUpdate(ipage);
        }
        else
        {
            dIP += ipage.InfoPrize;
            doEvalD = false;
        }

        urls.Add(ipage.url);
        p++;

        // Guarded like the domain-level report below: the loger is optional
        if (loger != null)
        {
            loger.AppendLine(string.Format("[{0,25}] [{1,70}] IP[{2,7}] LM[{3,6}]", idomain.domain, ipage.url.toWidthMaximum(60), ipage.InfoPrize.ToString("F4"), ipage.Lemmas.ToString("D5")));
        }

        target.Dispose();
    }

    if (imbWEMManager.settings.indexEngine.plugIn_indexDBUpdater_updateDomainEntry)
    {
        if (!doEvalD)
        {
            // At least one page was not evaluated in this pass, so the lists collected above do not cover
            // the whole domain; the lemma count is taken from the DLC TF-IDF table instead.
            var dlc_tf = imbWEMManager.index.experimentEntry.GetTFIDF_DLC(idomain);
            int dlc_c = dlc_tf.Count;
            idomain.TFIDFcompiled = (dlc_c > 0);
            idomain.Lemmas = dlc_c;
        }
        else
        {
            idomain.Lemmas = dLemmas.Count;
            idomain.Words = dWords.Count;
            idomain.TFIDFcompiled = (dLemmas.Count > 0);
            idomain.DistinctLemmas = dDistinctTerms.toCsvInLine();
            idomain.AllLemmas = dLemmas.toCsvInLine();
            idomain.AllWords = dWords.toCsvInLine();
        }

        idomain.InfoPrize = dIP;

        var urlAssert = imbWEMManager.index.pageIndexTable.GetUrlAssertion(urls);
        idomain.relevantPages = urlAssert[indexPageEvaluationEntryState.isRelevant].Count;
        idomain.notRelevantPages = urlAssert[indexPageEvaluationEntryState.notRelevant].Count;
        idomain.detected = urlAssert[indexPageEvaluationEntryState.haveNoEvaluationEntry].Count;
        idomain.Crawled = urlAssert.certainty;
        idomain.RelevantContentRatio = urlAssert.relevant;

        if (loger != null)
        {
            loger.AppendLine(string.Format("[{0,25}] Pages [{1,10}] IP[{2,10}] LM[{3,10}]", idomain.domain, p, idomain.InfoPrize.ToString("F5"), idomain.Lemmas.ToString("D7")));
        }

        imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);
    }

    imbWEMManager.index.wRecordsDeployed++;
    __wRecord.Dispose();
}
/// <summary>
/// Gets the DLC TF-IDF table of the domain of <paramref name="__wRecord"/> from its cache file, or creates it.
/// </summary>
/// <remarks>
/// A newly created table is built either by <c>GetOrCreateTFIDF_DLC_Heuristic</c> (when
/// <c>imbWEMManager.settings.TFIDF.doUseHeuristicDLCTFIDFConstruction</c> is set) or from the terms of all loaded
/// targets flagged as relevant. After creation the lemma count of the domain entry is set to the table size and
/// the domain entry is stored. When the cache file is used, the method returns before any of that happens.
/// </remarks>
/// <param name="__wRecord">The w record.</param>
/// <param name="loger">The loger; used without a null check, so it must be supplied.</param>
/// <param name="__useExisting">if set to <c>true</c>, an existing cache file is loaded and returned.</param>
/// <param name="__saveToCache">if set to <c>true</c>, a newly created table is written to the cache file.</param>
/// <param name="evaluator">The evaluator; when <c>null</c>, the evaluator of <c>__wRecord.tRecord</c> is used.</param>
/// <returns>The cached or newly created compiled table.</returns>
public weightTableCompiled GetOrCreateTFIDF_DLC(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator = null)
{
    // Number of pages that must be exceeded between two progress reports
    const int reportInterval = 10;

    indexDomain idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(__wRecord.domain);
    FileInfo cacheFile = GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);

    if (cacheFile.Exists && __useExisting)
    {
        weightTableCompiled cached = new weightTableCompiled(cacheFile.FullName, true, idomain.domain + "_DLC_TF_IDF");
        loger.log("DLC TF-IDF[" + cached.Count + "] cache found for: " + idomain.domain);
        return cached;
    }

    // <--------------- evaluator selection
    if (evaluator == null)
    {
        evaluator = __wRecord.tRecord.evaluator;
    }

    weightTableCompiled dlcTable;

    if (imbWEMManager.settings.TFIDF.doUseHeuristicDLCTFIDFConstruction)
    {
        dlcTable = GetOrCreateTFIDF_DLC_Heuristic(__wRecord, loger, __useExisting, __saveToCache, evaluator);
    }
    else
    {
        loger.log("DLC TF-IDF construction for: " + idomain.domain + " initiated.");

        termDocumentSet domainSet = new termDocumentSet("DomainTFIDF_source");
        var loadedTargets = __wRecord.context.targets.GetLoaded();

        int totalTargets = loadedTargets.Count;
        int visited = 0;
        int sinceLastReport = 0;
        int inputTerms = 0;
        int outputTerms = 0;

        foreach (spiderTarget target in loadedTargets)
        {
            visited++;
            sinceLastReport++;
            double progress = visited.GetRatio(totalTargets);

            if (target.IsRelevant)
            {
                var wordlist = GetTermsForPage(target, idomain, null, evaluator, loger);
                inputTerms += wordlist.Count;

                termDocument pageTF = domainSet.AddTable(target.pageHash) as termDocument;
                pageTF.expansion = 1;
                pageTF.AddTokens(wordlist, loger);
                outputTerms += pageTF.Count();
            }

            if (sinceLastReport > reportInterval)
            {
                sinceLastReport = 0;
                double compression = outputTerms.GetRatio(inputTerms);
                aceLog.consoleControl.writeToConsole("Pages processed [" + progress.ToString("P2") + "] Semantic compression rate: " + compression.ToString("P2"), loger, false, 0);
            }
        }

        loger.log("[" + idomain.domain + "] preprocess finished. DLC TF-IDF terms [" + domainSet.CountAllDocuments() + "]");

        dlcTable = domainSet.AggregateDocument.GetCompiledTable(loger);
        dlcTable.name = "DLC-TFIDF " + idomain.domain;
    }

    idomain.Lemmas = dlcTable.Count;

    if (__saveToCache)
    {
        if (dlcTable.SaveAs(cacheFile.FullName, getWritableFileMode.overwrite))
        {
            loger.log("[" + idomain.domain + "] DLC TF-IDF compiled table cache saved to: " + cacheFile.FullName);
        }
        else
        {
            loger.log("[" + idomain.domain + "] DLC TF-IDF compiled table save failed");
        }
    }

    imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);
    return dlcTable;
}
/// <summary>
/// Gets the tfidf master: loads from file or returns any existing instance, rebuilding it from the cached
/// DLC TF-IDF tables when required.
/// </summary>
/// <remarks>
/// A rebuild happens when <paramref name="__useExisting"/> is <c>false</c> or when the table is empty.
/// The rebuilt table replaces <c>globalTFIDFCompiled</c>.
/// </remarks>
/// <param name="loger">Optional log builder; every report made by this method is skipped when it is <c>null</c>.</param>
/// <param name="__useExisting">Passed to the table constructor when the shared instance is created; <c>false</c> forces a rebuild.</param>
/// <param name="__saveToCache">When <c>true</c>, the table is written to the master file on every call, rebuilt or not.</param>
/// <returns>The shared master table instance.</returns>
public weightTableCompiled GetTFIDF_Master(builderForLog loger, bool __useExisting = true, bool __saveToCache = true)
{
    bool rebuild = !__useExisting;
    FileInfo master_file = GetTFIDF_Master_File();

    if (globalTFIDFCompiled == null)
    {
        globalTFIDFCompiled = new weightTableCompiled(master_file.FullName, __useExisting, SessionID);
        globalTFIDFCompiled.ReadOnlyMode = true;
    }

    if (globalTFIDFCompiled.Count == 0)
    {
        rebuild = true;
    }
    else if (loger != null)
    {
        loger.log("Master table loaded [" + globalTFIDFCompiled.Count + "]");
    }

    if (rebuild)
    {
        List<weightTableCompiled> allDLC_TFIDFs = GetTFIDF_DLC_AllCached(loger);

        if (loger != null)
        {
            loger.log("Rebuilding Master Table ");
        }

        termDocumentSet construct = new termDocumentSet(SessionID, "Temporary TF-IDF construct table for session: " + SessionID);

        int input_c = 0;
        int tc = allDLC_TFIDFs.Count;
        int c = 0;

        foreach (weightTableCompiled dlc in allDLC_TFIDFs)
        {
            c++;
            termDocument td = construct.Add(dlc) as termDocument;
            input_c += td.Count();

            if (loger != null)
            {
                double progress = c.GetRatio(tc);
                aceLog.consoleControl.writeToConsole(progress.ToString("P2") + " ", loger, false, 0);
            }
        }

        globalTFIDFCompiled = construct.AggregateDocument.GetCompiledTable(loger);
        int output_c = construct.AggregateDocument.Count();

        if (loger != null)
        {
            // Output over input, the same orientation as the compression rate reported while building DLC tables
            double compression = output_c.GetRatio(input_c);
            loger.log("Master Table - final semantic compression rate: [" + compression.ToString("P2") + "]");
        }
    }

    if (__saveToCache)
    {
        // Report the outcome only after the save was attempted, and do not claim success when it failed
        if (globalTFIDFCompiled.SaveAs(master_file.FullName, getWritableFileMode.overwrite))
        {
            if (loger != null)
            {
                loger.log("Master Table saved to:[" + master_file.FullName + "]");
            }
        }
        else
        {
            if (loger != null)
            {
                loger.log("Master Table save failed:[" + master_file.FullName + "]");
            }
        }
    }

    return globalTFIDFCompiled;
}
/// <summary>
/// Load hook: assigns <c>TermTable</c> a new <c>weightTableCompiled</c> constructed from <c>TermTablePath</c>
/// and named after the <c>TermTable</c> member.
/// </summary>
public override void OnLoaded() =>
    TermTable = new weightTableCompiled(TermTablePath, true, nameof(TermTable));