Beispiel #1
0
        /// <summary>
        /// Writes a listing of all non-empty plug-in groups to the supplied log builder.
        /// </summary>
        /// <param name="loger">Log builder that receives one header line per group, followed by one line per plug-in name.</param>
        public void ToString(builderForLog loger)
        {
            foreach (plugInGroupEnum groupKey in Keys)
            {
                var members = this[groupKey];

                // Groups without any plug-in are left out of the listing.
                if (members.Count <= 0)
                {
                    continue;
                }

                loger.AppendLine("--- " + groupKey.ToString() + " [" + members.Count + "]");

                foreach (IPlugInCommonBase plugin in members)
                {
                    loger.AppendLine(plugin.name);
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Deprecated. Creates an instance of the plug-in registered under <paramref name="plugin_className"/>,
        /// logs which plug-in category the instance belongs to and optionally installs it into <paramref name="collection"/>.
        /// </summary>
        /// <param name="plugin_className">Name of the plugin class; used as key into <c>imbWEMManager.settings.supportEngine.plugins</c>.</param>
        /// <param name="loger">The loger that receives the creation / failure messages.</param>
        /// <param name="collection">The collection the new instance is installed into; installation is skipped when <c>null</c>.</param>
        /// <returns>
        /// The created plug-in instance, or <c>null</c> when <paramref name="plugin_className"/> is not registered
        /// or the created object is not a <c>plugIn_base</c>.
        /// </returns>
        public static plugIn_base GetPluginInstance(string plugin_className, builderForLog loger, IAPlugInCollectionBase collection)
        {
            if (!imbWEMManager.settings.supportEngine.plugins.Keys.Contains(plugin_className))
            {
                loger.AppendLine("Plugin [" + plugin_className + "] not found... ");
                return null;
            }

            plugIn_base plug = imbWEMManager.settings.supportEngine.plugins[plugin_className].getInstance() as plugIn_base;

            // The "as" cast yields null when the created object is not a plugIn_base; report it
            // instead of failing with a NullReferenceException on plug.name below.
            if (plug == null)
            {
                loger.AppendLine("Plugin [" + plugin_className + "] is not a plugIn_base instance... ");
                return null;
            }

            if (plug is indexPlugIn_base)
            {
                loger.log("Plugin instance [" + plug.name + "] for Index Engine created");
            }
            else if (plug is enginePlugIn_base)
            {
                loger.log("Plugin instance [" + plug.name + "] for Crawl Job Engine created");
            }
            else if (plug is crawlerPlugIn_base)
            {
                loger.log("Plugin instance [" + plug.name + "] for Crawler created");
            }
            else if (plug is reportPlugIn_base)
            {
                loger.log("Plugin instance [" + plug.name + "] for Reporting created");
            }
            else
            {
                loger.log("Plugin instance [" + plug.name + "] of unknown category created... ");
            }

            if (collection != null)
            {
                collection.installPlugIn(plug);
            }

            return plug;
        }
        /// <summary>
        /// Creates the synonym, concept-to-lemma and concept-to-concept links for the given term model,
        /// saves the lexicon context and writes the link log to a text file.
        /// </summary>
        /// <remarks>
        /// The three <c>links_*</c> counters on <paramref name="termModel"/> are reset to zero and then
        /// incremented once for every link that is actually created by this call.
        /// </remarks>
        /// <param name="termModel">The term model providing the lemma form, synonyms and WordNet synset codes.</param>
        /// <param name="saveModel">When <c>true</c>, <c>saveTermModel</c> is called for <paramref name="termModel"/> at the end.</param>
        public void addSynonymsAndConceptLinks(termExploreModel termModel, bool saveModel = false)
        {
            var lemmas = manager.getLemma(termModel.lemmaForm);

            var lemmasyns = manager.getLemmas(termModel.synonyms);

            builderForLog linkLog = new builderForLog();

            // <----------- ADDING SYNONYMS ----------- >
            linkLog.open("Creating synonym-2-lemma links");

            termModel.links_synonym = 0;
            foreach (ITermLemma lemma in lemmas)
            {
                foreach (ITermLemma lemsyn in lemmasyns)
                {
                    // A link is created only when none exists yet in either direction.
                    bool added = false;
                    if (!lemma.relatedTo.Contains(lemsyn) && !lemma.relatedFrom.Contains(lemsyn))
                    {
                        lemma.relatedTo.Add(lemsyn);
                        added = true;
                    }

                    if (added)
                    {
                        termModel.links_synonym++;
                        linkLog.AppendLine("[" + termModel.links_synonym.ToString("D5") + "] " + lemma.name + " -> " + lemsyn.name);
                    }
                    else
                    {
                        linkLog.AppendLine("[Link exists] " + lemma.name + " -> " + lemsyn.name);
                    }
                }
            }
            linkLog.close();

            // <----------- ADDING SYNSETS ----------- >
            linkLog.open("Creating concept 2 lemma links");
            List<Concept> concepts = new List<Concept>();

            termModel.links_lemmaConcept = 0;
            foreach (string code in termModel.wordnetPrimarySymsets)
            {
                if (code.isCleanWord())
                {
                    aceLog.log("wrong symset code -- [" + code + "]  -- ignored!");
                    continue;
                }

                Concept con = manager.getConcept(code, true, "WordNet Code");
                foreach (TermLemma lemma in lemmas)
                {
                    // The flag is scoped per lemma: a lemma that is already linked must not be
                    // counted as new just because an earlier lemma was added to the same concept.
                    bool added = false;
                    if (!con.lemmas.Contains(lemma))
                    {
                        con.lemmas.Add(lemma);
                        added = true;
                    }

                    if (added)
                    {
                        termModel.links_lemmaConcept++;
                        linkLog.AppendLine("[" + termModel.links_lemmaConcept.ToString("D5") + "] " + con.name + " -> " + lemma.name);
                    }
                    else
                    {
                        linkLog.AppendLine("[Link exists] " + con.name + " -> " + lemma.name);
                    }
                }

                concepts.Add(con);
            }
            linkLog.close();

            // <----------- LINKING SYNSET CONCEPTS ----------- >
            linkLog.open("Creating concept 2 concept links");
            termModel.links_conceptConcept = 0;
            foreach (Concept con in concepts)
            {
                // NOTE(review): the inner loop also visits con itself, so a concept having at least one
                // lemma gets linked to itself - confirm whether self-links are intended.
                foreach (Concept con2 in concepts)
                {
                    bool added = false;
                    if (!con2.relatedTo.Contains(con) && !con2.relatedFrom.Contains(con))
                    {
                        // Two concepts are linked only when they share at least one lemma.
                        if (con2.lemmas.Any(x => con.lemmas.Contains(x)))
                        {
                            con2.relatedTo.Add(con);
                            added = true;
                        }
                    }

                    if (added)
                    {
                        termModel.links_conceptConcept++;
                        linkLog.AppendLine("[" + termModel.links_conceptConcept.ToString("D5") + "] " + con2.name + " -> " + con.name);
                    }
                    else
                    {
                        linkLog.AppendLine("[Link exists] " + con2.name + " -> " + con.name);
                    }
                }
            }
            linkLog.close();

            manager.lexiconContext.SaveChanges();
            string pth = projectFolderStructure[lexiconConstructorProjectFolder.links].pathFor(termModel.filename(".txt"));

            linkLog.ToString().saveStringToFile(pth, getWritableFileMode.overwrite);

            if (saveModel)
            {
                saveTermModel(termModel);
            }
        }
        /// <summary>
        /// Performs full domain reevaluation: every loaded target of the site record is deployed to the page index,
        /// (re)evaluated against the compiled TF-IDF table where required, and the results are aggregated into the domain entry.
        /// </summary>
        /// <remarks>
        /// Each target is disposed after it was processed and <paramref name="__wRecord"/> is disposed at the end of the call.
        /// NOTE(review): no locking is performed around the index tables here - confirm thread-safety with the callers.
        /// </remarks>
        /// <param name="settings">The settings; <c>plugIn_indexDBUpdater_optimizedMode</c> allows pages that already carry evaluation data to be skipped.</param>
        /// <param name="loger">The loger that receives one line per page and one summary line per domain; may be <c>null</c>.</param>
        /// <param name="__wRecord">The w record whose loaded targets are evaluated.</param>
        /// <param name="evaluator">The evaluator, passed through to <c>GetTermsForPage</c>.</param>
        /// <param name="mTFIDF">The compiled weight table used to match page terms and to sum their TF-IDF into the page InfoPrize.</param>
        public void doDomainEvaluation(IndexEngineConfiguration settings, builderForLog loger, modelSpiderSiteRecord __wRecord, multiLanguageEvaluator evaluator, weightTableCompiled mTFIDF)
        {
            indexDomain idomain = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);

            idomain.url = __wRecord.domain;

            double dIP = 0; // sum of page InfoPrize values over the domain
            int    p   = 0; // number of processed pages

            List<string> dDistinctTerms = new List<string>();
            List<string> dLemmas        = new List<string>();
            List<string> dWords         = new List<string>();
            List<string> urls           = new List<string>();

            // Turns false as soon as one page is taken over from the index without reevaluation.
            bool doEvalD = true;

            foreach (spiderTarget target in __wRecord.context.targets.GetLoaded())
            {
                indexPage ipage = imbWEMManager.index.deployTarget(target, __wRecord, idomain);

                bool doEval = true;

                if (settings.plugIn_indexDBUpdater_optimizedMode)
                {
                    if ((ipage.InfoPrize > 0) && (ipage.Lemmas > 0) && (ipage.relevancyText == nameof(indexPageRelevancyEnum.isRelevant)))
                    {
                        // An already evaluated relevant page is skipped, unless its stored word or lemma list is missing.
                        doEval = ipage.AllWords.isNullOrEmpty() || ipage.AllLemmas.isNullOrEmpty();
                    }
                }

                if (doEval)
                {
                    List<string> terms;

                    if (ipage.AllWords.isNullOrEmpty())
                    {
                        terms = GetTermsForPage(target, idomain, ipage, evaluator, loger);
                    }
                    else
                    {
                        terms = ipage.AllWords.SplitSmart(",", "", true);
                    }

                    ipage.AllWords = terms.toCsvInLine();

                    double IP = 0;

                    List<string> lemmas = new List<string>();

                    List<IWeightTableTerm> mchl = mTFIDF.GetMatches(terms);

                    if (ipage.AllLemmas.isNullOrEmpty())
                    {
                        lemmas.AddRange(mchl.Select(x => x.nominalForm));
                    }
                    else
                    {
                        lemmas = ipage.AllLemmas.SplitSmart(",", "", true);
                    }

                    foreach (weightTableTermCompiled cterm in mchl)
                    {
                        IP += cterm.tf_idf;

                        // Terms with df == 1 are collected as the distinct lemmas of the domain.
                        if (cterm.df == 1)
                        {
                            dDistinctTerms.AddUnique(cterm.nominalForm);
                        }
                    }

                    ipage.InfoPrize = IP;

                    dIP += IP;

                    ipage.Lemmas = lemmas.Count;

                    ipage.AllLemmas = lemmas.toCsvInLine();

                    dWords.AddRange(terms);
                    dLemmas.AddRange(lemmas);

                    ipage.Note = "indexUpdate" + SessionID;

                    imbWEMManager.index.pageIndexTable.AddOrUpdate(ipage);
                }
                else
                {
                    dIP    += ipage.InfoPrize;
                    doEvalD = false;
                }

                urls.Add(ipage.url);

                p++;

                // The loger is optional (the domain summary below guards it as well), so guard the per-page line too.
                if (loger != null)
                {
                    loger.AppendLine(string.Format("[{0,25}] [{1,70}] IP[{2,7}] LM[{3,6}]", idomain.domain, ipage.url.toWidthMaximum(60), ipage.InfoPrize.ToString("F4"), ipage.Lemmas.ToString("D5")));
                }

                target.Dispose();
            }

            if (imbWEMManager.settings.indexEngine.plugIn_indexDBUpdater_updateDomainEntry)
            {
                if (!doEvalD)
                {
                    // At least one page was not reevaluated, so the collected lists are incomplete:
                    // take the lemma count from the TF-IDF table obtained via experimentEntry.GetTFIDF_DLC instead.
                    var dlc_tf = imbWEMManager.index.experimentEntry.GetTFIDF_DLC(idomain);
                    int dlc_c  = dlc_tf.Count;

                    idomain.TFIDFcompiled = (dlc_c > 0);
                    idomain.Lemmas        = dlc_c;
                }
                else
                {
                    idomain.Lemmas         = dLemmas.Count;
                    idomain.Words          = dWords.Count;
                    idomain.TFIDFcompiled  = (dLemmas.Count > 0);
                    idomain.DistinctLemmas = dDistinctTerms.toCsvInLine();
                    idomain.AllLemmas      = dLemmas.toCsvInLine();
                    idomain.AllWords       = dWords.toCsvInLine();
                }
                idomain.InfoPrize = dIP;

                var urlAssert = imbWEMManager.index.pageIndexTable.GetUrlAssertion(urls);

                idomain.relevantPages        = urlAssert[indexPageEvaluationEntryState.isRelevant].Count;
                idomain.notRelevantPages     = urlAssert[indexPageEvaluationEntryState.notRelevant].Count;
                idomain.detected             = urlAssert[indexPageEvaluationEntryState.haveNoEvaluationEntry].Count;
                idomain.Crawled              = urlAssert.certainty;
                idomain.RelevantContentRatio = urlAssert.relevant;

                string rpp = string.Format("[{0,25}] Pages [{1,10}] IP[{2,10}] LM[{3,10}]", idomain.domain, p, idomain.InfoPrize.ToString("F5"), idomain.Lemmas.ToString("D7"));
                if (loger != null)
                {
                    loger.AppendLine(rpp);
                }

                imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);
            }

            imbWEMManager.index.wRecordsDeployed++;

            __wRecord.Dispose();
        }