/// <summary>
/// Loads every DLC TF-IDF table whose file matches <c>dlc_*.xml</c> in the TF-IDF construct folder.
/// </summary>
/// <param name="loger">Optional log output. When <c>null</c>, neither the file count nor the progress is reported.</param>
/// <returns>One compiled weight table per matching file, in the order the files were returned by the folder search.</returns>
public List<weightTableCompiled> GetTFIDF_DLC_AllCached(builderForLog loger = null)
{
    List<weightTableCompiled> tables = new List<weightTableCompiled>();
    List<string> cachedFiles = TFIDF_ConstructFolder.findFiles("dlc_*.xml");
    bool reportProgress = loger != null;

    if (reportProgress)
    {
        loger.log("[" + cachedFiles.Count + "] DLC TFIDF files detected in the cache folder [" + TFIDF_ConstructFolder.path + "]");
    }

    int total = cachedFiles.Count;

    for (int index = 0; index < cachedFiles.Count; index++)
    {
        // 1-based position: used both as the table's zero-padded name and for the progress ratio.
        int position = index + 1;

        weightTableCompiled table = new weightTableCompiled(cachedFiles[index], true, position.ToString("D5"));
        tables.Add(table);

        double progress = position.GetRatio(total);
        if (reportProgress)
        {
            aceLog.consoleControl.writeToConsole(progress.ToString("P2") + " ", loger, false, 0);
        }
    }

    return tables;
}
/// <summary>
/// Deprecated. Creates an instance of the plugin registered under <paramref name="plugin_className"/>,
/// logs which plugin category the instance belongs to and optionally installs it into a collection.
/// </summary>
/// <param name="plugin_className">Name of the plugin class; used as the key into the support engine's plugin registry.</param>
/// <param name="loger">Log output. May be <c>null</c>, in which case nothing is logged.</param>
/// <param name="collection">Collection the new instance is installed into; installation is skipped when <c>null</c>.</param>
/// <returns>
/// The created plugin, or <c>null</c> when <paramref name="plugin_className"/> is not registered
/// or the created instance is not a <c>plugIn_base</c>.
/// </returns>
public static plugIn_base GetPluginInstance(string plugin_className, builderForLog loger, IAPlugInCollectionBase collection)
{
    var registry = imbWEMManager.settings.supportEngine.plugins;

    if (!registry.Keys.Contains(plugin_className))
    {
        loger?.AppendLine("Plugin [" + plugin_className + "] not found... ");
        return null;
    }

    plugIn_base plug = registry[plugin_className].getInstance() as plugIn_base;

    if (plug == null)
    {
        // The "as" cast yields null when getInstance() returns null or a non-plugIn_base object.
        // Previously this fell through to plug.name and crashed with a NullReferenceException.
        loger?.AppendLine("Plugin [" + plugin_className + "] instance could not be created as plugIn_base... ");
        return null;
    }

    // Category is only reported; it does not affect where the plugin is installed.
    if (plug is indexPlugIn_base)
    {
        loger?.log("Plugin instance [" + plug.name + "] for Index Engine created");
    }
    else if (plug is enginePlugIn_base)
    {
        loger?.log("Plugin instance [" + plug.name + "] for Crawl Job Engine created");
    }
    else if (plug is crawlerPlugIn_base)
    {
        loger?.log("Plugin instance [" + plug.name + "] for Crawler created");
    }
    else if (plug is reportPlugIn_base)
    {
        loger?.log("Plugin instance [" + plug.name + "] for Reporting created");
    }
    else
    {
        loger?.log("Plugin instance [" + plug.name + "] of unknown category created... ");
    }

    if (collection != null)
    {
        collection.installPlugIn(plug);
    }

    return plug;
}
/// <summary>
/// Sets up the repository log (<c>log.txt</c>) and the site table (<c>siteTable.xml</c>),
/// both located through <c>folder.pathFor</c>, and records the access in the log.
/// </summary>
public override void OnLoaded()
{
    var logPath = folder.pathFor("log.txt");
    loger = new builderForLog(logPath, true, getWritableFileMode.appendFile);
    loger.log("Repository [" + name + "] accessed");

    var tablePath = folder.pathFor("siteTable.xml");
    // NOTE(review): the third argument is presumably the key column of the table - confirm against objectTable.
    string keyName = nameof(imbMCWebSiteEntry.domain);
    siteTable = new objectTable<imbMCWebSiteEntry>(tablePath, true, keyName, "siteTable");
    siteTable.description = "Index datatable with all stored MCWebSite repo-entries";
}
/// <summary>
/// Returns the DLC TF-IDF table for the domain of <paramref name="__wRecord"/>. The table is loaded from the
/// domain's cache file when that file exists and <paramref name="__useExisting"/> is set; otherwise it is
/// compiled from the terms returned by <c>GetDLCTerms_Heuristics</c>.
/// This method does not write the compiled table to the cache file itself.
/// </summary>
/// <param name="__wRecord">Site record whose domain is processed.</param>
/// <param name="loger">Log output; dereferenced unconditionally, so it must not be <c>null</c>.</param>
/// <param name="__useExisting">When <c>true</c>, an existing cache file is returned instead of rebuilding.</param>
/// <param name="__saveToCache">Forwarded to <c>GetDLCTerms_Heuristics</c>.</param>
/// <param name="evaluator">Evaluator to use; when <c>null</c>, the evaluator of the record's <c>tRecord</c> is used.</param>
/// <returns>The cached or newly compiled weight table.</returns>
public weightTableCompiled GetOrCreateTFIDF_DLC_Heuristic(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator = null)
{
    indexDomain idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(__wRecord.domain);
    FileInfo TFIDF_DLC_File = GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);

    if (TFIDF_DLC_File.Exists && __useExisting)
    {
        weightTableCompiled cached = new weightTableCompiled(TFIDF_DLC_File.FullName, true, idomain.domain + "_DLC_TF_IDF");
        loger.log("DLC TF-IDF[" + cached.Count + "] cache found for: " + idomain.domain);
        return cached;
    }

    // Evaluator selection: fall back to the one attached to the record.
    if (evaluator == null)
    {
        evaluator = __wRecord.tRecord.evaluator;
    }

    loger.log("DLC TF-IDF heuristic construction for: " + idomain.domain + " initiated.");

    termDocument domainTable = new termDocument();
    domainTable.expansion = 1;

    // Fix: the fourth argument used to repeat __useExisting, which left __saveToCache unused in this method.
    // NOTE(review): assumes the callee's fourth parameter is the save-to-cache flag - confirm against GetDLCTerms_Heuristics.
    var DLCTerms = GetDLCTerms_Heuristics(__wRecord, loger, __useExisting, __saveToCache, evaluator, idomain);
    domainTable.AddTokens(DLCTerms, loger);

    // Table entries relative to the number of input terms.
    var termCount = domainTable.Count();
    double compression = termCount.GetRatio(DLCTerms.Count);

    loger.log("[" + idomain.domain + "] preprocess finished. DLC TF-IDF terms [" + termCount + "] - Semantic compression: " + compression.ToString("P2"));

    weightTableCompiled TFIDF_DLC = domainTable.GetCompiledTable(loger);
    TFIDF_DLC.name = "DLC-TFIDF " + idomain.domain;
    return TFIDF_DLC;
}
/// <summary>
/// Extension shortcut that forwards the string to <c>terminal.log</c>.
/// </summary>
/// <param name="message">Text to pass to the terminal log.</param>
public static void log(this String message) => terminal.log(message);
/// <summary>
/// Returns the DLC TF-IDF table for the domain of <paramref name="__wRecord"/>: loaded from the domain's cache
/// file when permitted, otherwise built either by the heuristic constructor or page by page from the record's
/// loaded targets. After a build, the domain index entry is updated with the table's term count.
/// </summary>
/// <param name="__wRecord">The site record whose domain is processed.</param>
/// <param name="loger">Log output; dereferenced unconditionally.</param>
/// <param name="__useExisting">if set to <c>true</c>, an existing cache file is returned without rebuilding.</param>
/// <param name="__saveToCache">if set to <c>true</c>, a newly built table is written to the domain's cache file.</param>
/// <param name="evaluator">The evaluator; when <c>null</c>, the evaluator of the record's <c>tRecord</c> is used.</param>
/// <returns>The cached or newly compiled weight table.</returns>
public weightTableCompiled GetOrCreateTFIDF_DLC(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator = null)
{
    indexDomain domainEntry = imbWEMManager.index.domainIndexTable.GetOrCreate(__wRecord.domain);
    FileInfo cacheFile = GetTFIDF_DLC_File(domainEntry, getWritableFileMode.existing);

    // Cache hit: return immediately, without touching the domain index.
    if (cacheFile.Exists && __useExisting)
    {
        weightTableCompiled cached = new weightTableCompiled(cacheFile.FullName, true, domainEntry.domain + "_DLC_TF_IDF");
        loger.log("DLC TF-IDF[" + cached.Count + "] cache found for: " + domainEntry.domain);
        return cached;
    }

    // Evaluator selection: fall back to the one attached to the record.
    if (evaluator == null)
    {
        evaluator = __wRecord.tRecord.evaluator;
    }

    weightTableCompiled compiled;

    if (imbWEMManager.settings.TFIDF.doUseHeuristicDLCTFIDFConstruction)
    {
        compiled = GetOrCreateTFIDF_DLC_Heuristic(__wRecord, loger, __useExisting, __saveToCache, evaluator);
    }
    else
    {
        loger.log("DLC TF-IDF construction for: " + domainEntry.domain + " initiated.");

        termDocumentSet pageSet = new termDocumentSet("DomainTFIDF_source");
        var loadedTargets = __wRecord.context.targets.GetLoaded();

        int totalTargets = loadedTargets.Count;
        int visited = 0;       // targets seen so far
        int sinceReport = 0;   // targets seen since the last console update
        int termsIn = 0;       // terms fed into the page tables
        int termsOut = 0;      // entries the page tables ended up with

        foreach (spiderTarget target in loadedTargets)
        {
            visited++;
            sinceReport++;
            double progress = visited.GetRatio(totalTargets);

            if (target.IsRelevant)
            {
                var pageTerms = GetTermsForPage(target, domainEntry, null, evaluator, loger);
                termsIn += pageTerms.Count;

                termDocument pageDocument = pageSet.AddTable(target.pageHash) as termDocument;
                pageDocument.expansion = 1;
                pageDocument.AddTokens(pageTerms, loger);
                termsOut += pageDocument.Count();
            }

            // Console update only once more than 10 targets have passed since the previous one.
            if (sinceReport <= 10)
            {
                continue;
            }

            sinceReport = 0;
            double compression = termsOut.GetRatio(termsIn);
            aceLog.consoleControl.writeToConsole("Pages processed [" + progress.ToString("P2") + "] Semantic compression rate: " + compression.ToString("P2"), loger, false, 0);
        }

        loger.log("[" + domainEntry.domain + "] preprocess finished. DLC TF-IDF terms [" + pageSet.CountAllDocuments() + "]");

        compiled = pageSet.AggregateDocument.GetCompiledTable(loger);
        compiled.name = "DLC-TFIDF " + domainEntry.domain;
    }

    domainEntry.Lemmas = compiled.Count;

    if (__saveToCache)
    {
        if (compiled.SaveAs(cacheFile.FullName, getWritableFileMode.overwrite))
        {
            loger.log("[" + domainEntry.domain + "] DLC TF-IDF compiled table cache saved to: " + cacheFile.FullName);
        }
        else
        {
            loger.log("[" + domainEntry.domain + "] DLC TF-IDF compiled table save failed");
        }
    }

    imbWEMManager.index.domainIndexTable.AddOrUpdate(domainEntry);
    return compiled;
}
/// <summary>
/// Gets the TF-IDF master table: returns the instance already held in <c>globalTFIDFCompiled</c> or creates it
/// from the master file. The table is rebuilt from all cached DLC TF-IDF tables when
/// <paramref name="__useExisting"/> is <c>false</c> or when the current table is empty.
/// </summary>
/// <param name="loger">Log output. May be <c>null</c>, in which case nothing is logged or written to the console.</param>
/// <param name="__useExisting">When <c>false</c>, a rebuild is forced; the value is also passed to the table constructor.</param>
/// <param name="__saveToCache">When <c>true</c>, the resulting table is written to the master file (overwrite mode).</param>
/// <returns>The master table held in <c>globalTFIDFCompiled</c> after loading or rebuilding.</returns>
public weightTableCompiled GetTFIDF_Master(builderForLog loger, bool __useExisting = true, bool __saveToCache = true)
{
    bool rebuild = !__useExisting;
    FileInfo master_file = GetTFIDF_Master_File();

    if (globalTFIDFCompiled == null)
    {
        globalTFIDFCompiled = new weightTableCompiled(master_file.FullName, __useExisting, SessionID);
        globalTFIDFCompiled.ReadOnlyMode = true;
    }

    if (globalTFIDFCompiled.Count == 0)
    {
        // Nothing usable was loaded, so the table has to be constructed from the DLC caches.
        rebuild = true;
    }
    else
    {
        loger?.log("Master table loaded [" + globalTFIDFCompiled.Count + "]");
    }

    if (rebuild)
    {
        List<weightTableCompiled> allDLC_TFIDFs = GetTFIDF_DLC_AllCached(loger);

        loger?.log("Rebuilding Master Table ");

        termDocumentSet construct = new termDocumentSet(SessionID, "Temporary TF-IDF construct table for session: " + SessionID);

        int total = allDLC_TFIDFs.Count;
        int processed = 0;
        int input_c = 0;

        foreach (weightTableCompiled dlc in allDLC_TFIDFs)
        {
            processed++;

            termDocument td = construct.Add(dlc) as termDocument;
            input_c += td.Count();

            double progress = processed.GetRatio(total);
            if (loger != null)
            {
                aceLog.consoleControl.writeToConsole(progress.ToString("P2") + " ", loger, false, 0);
            }
        }

        globalTFIDFCompiled = construct.AggregateDocument.GetCompiledTable(loger);
        int output_c = construct.AggregateDocument.Count();

        // Fix: report output / input (entries remaining after aggregation relative to entries fed in).
        // The previous code computed input / output, the inverse of how the DLC builders report the same figure.
        double compression = output_c.GetRatio(input_c);

        loger?.log("Master Table - final semantic compression rate: [" + compression.ToString("P2") + "]");
    }

    if (__saveToCache)
    {
        // Fix: save first, then log the real outcome. Previously "saved" was logged before the write
        // was attempted, and a failed save went unreported.
        bool saved = globalTFIDFCompiled.SaveAs(master_file.FullName, getWritableFileMode.overwrite);

        if (loger != null)
        {
            if (saved)
            {
                loger.log("Master Table saved to:[" + master_file.FullName + "]");
            }
            else
            {
                loger.log("Master Table save failed:[" + master_file.FullName + "]");
            }
        }
    }

    return globalTFIDFCompiled;
}