Ejemplo n.º 1
0
        /// <summary>
        /// Loads every DLC TF-IDF table whose file matches <c>dlc_*.xml</c> in <c>TFIDF_ConstructFolder</c>.
        /// </summary>
        /// <param name="loger">Optional log builder; when <c>null</c> neither the file count nor the progress ratio is reported.</param>
        /// <returns>One <c>weightTableCompiled</c> per matched file, in the order the files were returned by the folder search.</returns>
        public List <weightTableCompiled> GetTFIDF_DLC_AllCached(builderForLog loger = null)
        {
            List<weightTableCompiled> loadedTables = new List<weightTableCompiled>();
            List<string> cacheFiles = TFIDF_ConstructFolder.findFiles("dlc_*.xml");
            bool reportProgress = loger != null;

            if (reportProgress)
            {
                loger.log("[" + cacheFiles.Count + "] DLC TFIDF files detected in the cache folder [" + TFIDF_ConstructFolder.path + "]");
            }

            int total = cacheFiles.Count;

            for (int index = 0; index < total; index++)
            {
                // Tables are named after their 1-based position, zero padded to five digits.
                int ordinal = index + 1;
                loadedTables.Add(new weightTableCompiled(cacheFiles[index], true, ordinal.ToString("D5")));

                double ratio = ordinal.GetRatio(total);
                if (reportProgress)
                {
                    aceLog.consoleControl.writeToConsole(ratio.ToString("P2") + " ", loger, false, 0);
                }
            }

            return loadedTables;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Deprecated. Creates an instance of the plugin registered under <paramref name="plugin_className"/> in
        /// <c>imbWEMManager.settings.supportEngine.plugins</c>, logs which plugin category the instance belongs to
        /// and, when a collection is given, installs the instance into it.
        /// </summary>
        /// <param name="plugin_className">Name of the plugin class, used as the key into the plugin registry.</param>
        /// <param name="loger">Log builder receiving the status messages; when <c>null</c> nothing is logged.</param>
        /// <param name="collection">Collection the created instance is installed into; skipped when <c>null</c>.</param>
        /// <returns>
        /// The created plugin instance, or <c>null</c> when the class name is not registered or the created object
        /// is not a <c>plugIn_base</c>.
        /// </returns>
        public static plugIn_base GetPluginInstance(string plugin_className, builderForLog loger, IAPlugInCollectionBase collection)
        {
            if (!imbWEMManager.settings.supportEngine.plugins.Keys.Contains(plugin_className))
            {
                if (loger != null)
                {
                    loger.AppendLine("Plugin [" + plugin_className + "] not found... ");
                }
                return null;
            }

            plugIn_base plug = imbWEMManager.settings.supportEngine.plugins[plugin_className].getInstance() as plugIn_base;

            // The "as" cast yields null when the created object is not a plugIn_base; without this guard
            // the category logging below would dereference plug.name and throw a NullReferenceException.
            if (plug == null)
            {
                if (loger != null)
                {
                    loger.AppendLine("Plugin [" + plugin_className + "] could not be instantiated as plugIn_base... ");
                }
                return null;
            }

            // The category only affects the log message; order of the checks is kept from the original.
            string categoryMessage;
            if (plug is indexPlugIn_base)
            {
                categoryMessage = "for Index Engine created";
            }
            else if (plug is enginePlugIn_base)
            {
                categoryMessage = "for Crawl Job Engine created";
            }
            else if (plug is crawlerPlugIn_base)
            {
                categoryMessage = "for Crawler created";
            }
            else if (plug is reportPlugIn_base)
            {
                categoryMessage = "for Reporting created";
            }
            else
            {
                categoryMessage = "of unknown category created... ";
            }

            if (loger != null)
            {
                loger.log("Plugin instance [" + plug.name + "] " + categoryMessage);
            }

            if (collection != null)
            {
                collection.installPlugIn(plug);
            }

            return plug;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Load hook: creates the log builder over <c>log.txt</c> (using <c>getWritableFileMode.appendFile</c>),
        /// records that the repository was accessed and creates the site table over <c>siteTable.xml</c>,
        /// keyed by <c>imbMCWebSiteEntry.domain</c>.
        /// </summary>
        public override void OnLoaded()
        {
            var logPath = folder.pathFor("log.txt");
            loger = new builderForLog(logPath, true, getWritableFileMode.appendFile);
            loger.log("Repository [" + name + "] accessed");

            var siteTablePath = folder.pathFor("siteTable.xml");
            siteTable = new objectTable <imbMCWebSiteEntry>(siteTablePath, true, nameof(imbMCWebSiteEntry.domain), "siteTable")
            {
                description = "Index datatable with all stored MCWebSite repo-entries"
            };
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Returns the DLC TF-IDF table for the domain of <paramref name="__wRecord"/>. The cached file is loaded
        /// when it exists and <paramref name="__useExisting"/> is set; otherwise the table is compiled from the
        /// terms returned by <c>GetDLCTerms_Heuristics</c>.
        /// </summary>
        /// <param name="__wRecord">Site record supplying the domain and, as a fallback, the evaluator.</param>
        /// <param name="loger">Log builder receiving progress messages.</param>
        /// <param name="__useExisting">if set to <c>true</c>, an existing cache file is loaded and returned directly.</param>
        /// <param name="__saveToCache">Not read by this method.</param>
        /// <param name="evaluator">Evaluator to use; when <c>null</c>, <c>__wRecord.tRecord.evaluator</c> is used.</param>
        /// <returns>The loaded or newly compiled table.</returns>
        public weightTableCompiled GetOrCreateTFIDF_DLC_Heuristic(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator = null)
        {
            indexDomain idomain   = imbWEMManager.index.domainIndexTable.GetOrCreate(__wRecord.domain);
            FileInfo    cacheFile = GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);

            // Fast path: reuse the cached table.
            if (cacheFile.Exists && __useExisting)
            {
                weightTableCompiled cachedTable = new weightTableCompiled(cacheFile.FullName, true, idomain.domain + "_DLC_TF_IDF");
                loger.log("DLC TF-IDF[" + cachedTable.Count + "] cache found for: " + idomain.domain);
                return cachedTable;
            }

            // Evaluator selection: fall back to the evaluator attached to the record.
            if (evaluator == null)
            {
                evaluator = __wRecord.tRecord.evaluator;
            }

            loger.log("DLC TF-IDF heuristic construction for: " + idomain.domain + " initiated.");

            termDocument termTable = new termDocument();
            termTable.expansion = 1;

            // NOTE(review): __useExisting is passed for both boolean arguments while __saveToCache is never used
            // in this method - confirm against the GetDLCTerms_Heuristics signature whether that is intended.
            var heuristicTerms = GetDLCTerms_Heuristics(__wRecord, loger, __useExisting, __useExisting, evaluator, idomain);

            termTable.AddTokens(heuristicTerms, loger);

            // Ratio of the table's term count to the number of input terms.
            double compression = termTable.Count().GetRatio(heuristicTerms.Count);

            loger.log("[" + idomain.domain + "] preprocess finished. DLC TF-IDF terms [" + termTable.Count() + "] - Semantic compression: " + compression.ToString("P2"));

            weightTableCompiled compiledTable = termTable.GetCompiledTable(loger);
            compiledTable.name = "DLC-TFIDF " + idomain.domain;

            return compiledTable;
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Extension shortcut that forwards the string to <c>terminal.log</c>.
 /// </summary>
 /// <param name="message">Text handed to <c>terminal.log</c>.</param>
 public static void log(this String message) => terminal.log(message);
Ejemplo n.º 6
0
        /// <summary>
        /// Provides the DLC TF-IDF table for the domain of the given site record: loaded from the cache file when
        /// allowed, otherwise built either heuristically or from the terms of the relevant loaded pages.
        /// </summary>
        /// <param name="__wRecord">Site record supplying the domain, the loaded targets and, as a fallback, the evaluator.</param>
        /// <param name="loger">Log builder receiving progress messages.</param>
        /// <param name="__useExisting">if set to <c>true</c>, an existing cache file is loaded and returned without rebuilding.</param>
        /// <param name="__saveToCache">if set to <c>true</c>, the newly built table is written to the cache file.</param>
        /// <param name="evaluator">Evaluator to use; when <c>null</c>, <c>__wRecord.tRecord.evaluator</c> is used.</param>
        /// <returns>The loaded or newly compiled table.</returns>
        public weightTableCompiled GetOrCreateTFIDF_DLC(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator = null)
        {
            // Progress is written to the console once more than this many pages were seen since the last report.
            const int progressStep = 10;

            indexDomain idomain   = imbWEMManager.index.domainIndexTable.GetOrCreate(__wRecord.domain);
            FileInfo    cacheFile = GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);

            // Cache hit: return immediately, the domain index is not touched on this path.
            if (cacheFile.Exists && __useExisting)
            {
                weightTableCompiled cachedTable = new weightTableCompiled(cacheFile.FullName, true, idomain.domain + "_DLC_TF_IDF");
                loger.log("DLC TF-IDF[" + cachedTable.Count + "] cache found for: " + idomain.domain);
                return cachedTable;
            }

            // Evaluator selection: fall back to the evaluator attached to the record.
            if (evaluator == null)
            {
                evaluator = __wRecord.tRecord.evaluator;
            }

            weightTableCompiled builtTable;

            if (imbWEMManager.settings.TFIDF.doUseHeuristicDLCTFIDFConstruction)
            {
                builtTable = GetOrCreateTFIDF_DLC_Heuristic(__wRecord, loger, __useExisting, __saveToCache, evaluator);
            }
            else
            {
                loger.log("DLC TF-IDF construction for: " + idomain.domain + " initiated.");

                termDocumentSet pageSet = new termDocumentSet("DomainTFIDF_source");

                var loadedTargets = __wRecord.context.targets.GetLoaded();
                int targetTotal   = loadedTargets.Count;
                int targetIndex   = 0;
                int sinceReport   = 0;
                int termsIn       = 0;
                int termsOut      = 0;

                foreach (spiderTarget target in loadedTargets)
                {
                    targetIndex++;
                    sinceReport++;
                    double progress = targetIndex.GetRatio(targetTotal);

                    // Only relevant pages contribute a term table to the set.
                    if (target.IsRelevant)
                    {
                        var pageTerms = GetTermsForPage(target, idomain, null, evaluator, loger);
                        termsIn += pageTerms.Count;

                        termDocument pageTable = pageSet.AddTable(target.pageHash) as termDocument;
                        pageTable.expansion = 1;
                        pageTable.AddTokens(pageTerms, loger);

                        termsOut += pageTable.Count();
                    }

                    if (sinceReport > progressStep)
                    {
                        sinceReport = 0;
                        double compression = termsOut.GetRatio(termsIn);
                        aceLog.consoleControl.writeToConsole("Pages processed [" + progress.ToString("P2") + "] Semantic compression rate: " + compression.ToString("P2"), loger, false, 0);
                    }
                }

                loger.log("[" + idomain.domain + "] preprocess finished. DLC TF-IDF terms [" + pageSet.CountAllDocuments() + "]");

                builtTable      = pageSet.AggregateDocument.GetCompiledTable(loger);
                builtTable.name = "DLC-TFIDF " + idomain.domain;
            }

            idomain.Lemmas = builtTable.Count;

            if (__saveToCache)
            {
                bool saved = builtTable.SaveAs(cacheFile.FullName, getWritableFileMode.overwrite);

                string outcome = saved
                    ? "[" + idomain.domain + "] DLC TF-IDF compiled table cache saved to: " + cacheFile.FullName
                    : "[" + idomain.domain + "] DLC TF-IDF compiled table save failed";

                loger.log(outcome);
            }

            imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);

            return builtTable;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Gets the TF-IDF master table: returns the instance already held in <c>globalTFIDFCompiled</c> or loads it
        /// from the master file, rebuilds it from all cached DLC TF-IDF tables when requested or when the loaded
        /// table is empty, and optionally saves the result back to the master file.
        /// </summary>
        /// <param name="loger">Log builder receiving progress messages; when <c>null</c> nothing is logged.</param>
        /// <param name="__useExisting">if set to <c>false</c>, the master table is always rebuilt from the cached DLC tables.</param>
        /// <param name="__saveToCache">if set to <c>true</c>, the master table is written to the master file before returning.</param>
        /// <returns>The master table now held in <c>globalTFIDFCompiled</c>.</returns>
        public weightTableCompiled GetTFIDF_Master(builderForLog loger, bool __useExisting = true, bool __saveToCache = true)
        {
            bool     rebuild     = !__useExisting;
            FileInfo master_file = GetTFIDF_Master_File();

            if (globalTFIDFCompiled == null)
            {
                globalTFIDFCompiled = new weightTableCompiled(master_file.FullName, __useExisting, SessionID);
                globalTFIDFCompiled.ReadOnlyMode = true;
            }

            if (globalTFIDFCompiled.Count == 0)
            {
                // Nothing usable was loaded, so a rebuild is forced regardless of __useExisting.
                rebuild = true;
            }
            else if (loger != null)
            {
                loger.log("Master table loaded [" + globalTFIDFCompiled.Count + "]");
            }

            if (rebuild)
            {
                int input_c  = 0;
                int output_c = 0;

                List <weightTableCompiled> allDLC_TFIDFs = GetTFIDF_DLC_AllCached(loger);

                if (loger != null)
                {
                    loger.log("Rebuilding Master Table ");
                }

                termDocumentSet construct = new termDocumentSet(SessionID, "Temporary TF-IDF construct table for session: " + SessionID);

                int    tc = allDLC_TFIDFs.Count;
                double tr = 0;
                int    c  = 0;

                foreach (weightTableCompiled dlc in allDLC_TFIDFs)
                {
                    c++;

                    // The "as" cast yields null when Add returns something other than a termDocument;
                    // such an entry simply does not contribute to the input count instead of crashing.
                    termDocument td = construct.Add(dlc) as termDocument;
                    if (td != null)
                    {
                        input_c += td.Count();
                    }

                    tr = c.GetRatio(tc);
                    if (loger != null)
                    {
                        aceLog.consoleControl.writeToConsole(tr.ToString("P2") + " ", loger, false, 0);
                    }
                }

                globalTFIDFCompiled = construct.AggregateDocument.GetCompiledTable(loger);
                output_c            = construct.AggregateDocument.Count();

                // Compression rate is output terms over input terms, matching how the DLC-level
                // methods compute their "semantic compression" figures.
                tr = output_c.GetRatio(input_c);
                if (loger != null)
                {
                    loger.log("Master Table - final semantic compression rate: [" + tr.ToString("P2") + "]");
                }
            }

            if (__saveToCache)
            {
                // Save first and report the actual outcome; SaveAs signals success through its return value.
                bool saved = globalTFIDFCompiled.SaveAs(master_file.FullName, getWritableFileMode.overwrite);

                if (loger != null)
                {
                    if (saved)
                    {
                        loger.log("Master Table saved to:[" + master_file.FullName + "]");
                    }
                    else
                    {
                        loger.log("Master Table save failed:[" + master_file.FullName + "]");
                    }
                }
            }

            return globalTFIDFCompiled;
        }