Ejemplo n.º 1
0
        /// <summary>
        /// Returns the term list for a single crawled page: the saved per-page word list
        /// when saved lists are enabled and the file exists, otherwise a list freshly
        /// extracted from the page text.
        /// </summary>
        /// <param name="target">Page to process; supplies the URL, the page text and, through its parent record, the default domain and evaluator.</param>
        /// <param name="idomain">Domain index entry; when <c>null</c> it is obtained from <c>domainIndexTable.GetOrCreate</c> using the target's domain.</param>
        /// <param name="ipage">Page index entry; when <c>null</c> it is obtained from <c>pageIndexTable.GetOrCreate</c> using the MD5 hash of the target URL.</param>
        /// <param name="evaluator">Language evaluator; when <c>null</c> the evaluator of the target's parent record is used.</param>
        /// <param name="loger">Not used by this method; kept so the signature stays compatible with existing callers.</param>
        /// <returns>
        /// The saved word list on a cache hit. Otherwise the result of <c>encodeTwins</c> over the
        /// filtered page tokens, or an empty list when the page is not evaluated as Serbian.
        /// </returns>
        public List <string> GetTermsForPage(spiderTarget target, indexDomain idomain = null, indexPage ipage = null, multiLanguageEvaluator evaluator = null, builderForLog loger = null)
        {
            if (idomain == null)
            {
                idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(target.parent.wRecord.domain);
            }

            if (ipage == null)
            {
                ipage = imbWEMManager.index.pageIndexTable.GetOrCreate(md5.GetMd5Hash(target.url));
            }

            FileInfo wordListFile = GetWordList_File(idomain, ipage);

            // Cache hit: reuse the previously saved word list instead of re-evaluating the page.
            if (imbWEMManager.settings.TFIDF.doUseSavedPageWordlists && wordListFile.Exists)
            {
                return wordListFile.FullName.openFileToList(true);
            }

            // NOTE(review): unlike GetDLCTerms_Heuristics, the text is not HTML-decoded here — confirm this is intended.
            string content = target.pageText.transliterate();

            if (evaluator == null)
            {
                evaluator = target.parent.wRecord.tRecord.evaluator;
            }

            multiLanguageEvaluation evaluation = evaluator.evaluate(content);

            List<string> output = new List<string>();

            // Only pages evaluated as Serbian contribute terms; any other result leaves the list empty.
            if (evaluation.result_language == basicLanguageEnum.serbian)
            {
                List<string> pageTokens = new List<string>();
                pageTokens.AddRange(evaluation.singleLanguageTokens);

                if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
                {
                    pageTokens.AddRange(evaluation.multiLanguageTokens);
                }

                // Single pass: drop tokens that are not clean words or that are symbolic content only.
                pageTokens.RemoveAll(x => !x.isCleanWord() || x.isSymbolicContentOnly());

                output.AddRange(semanticLexiconManager.lexiconCache.encodeTwins(pageTokens));
            }

            // The list is saved even when it is empty (non-Serbian page), so a later cache hit returns an empty list.
            if (imbWEMManager.settings.TFIDF.doSavePageWordlist)
            {
                output.saveContentOnFilePath(wordListFile.FullName);
            }

            return output;
        }
Ejemplo n.º 2
0
        //public webSitePageTFSet GetTFIDF_MasterConstruct()
        //{
        //    if (globalTFIDFSet == null)
        //    {
        //        globalTFIDFSet = new webSitePageTFSet(SessionID, "Temporary TF-IDF table for master table construction");

        //    }
        //    return globalTFIDFSet;
        //}

        /// <summary>
        /// Produces the domain-level term list: the saved domain word list when reuse is requested
        /// and the file exists, otherwise a list built from the text of all relevant loaded pages.
        /// </summary>
        /// <param name="__wRecord">Site record whose loaded targets (<c>context.targets.GetLoaded()</c>) are scanned.</param>
        /// <param name="loger">Log builder passed to the console progress messages.</param>
        /// <param name="__useExisting">When <c>true</c> and the domain word-list file exists, its content is returned without scanning pages.</param>
        /// <param name="__saveToCache">When <c>true</c> (and <c>doSaveDomainWordList</c> is enabled) the resulting list is written to the domain word-list file.</param>
        /// <param name="evaluator">Language evaluator applied to the tokens collected from all relevant pages.</param>
        /// <param name="idomain">Domain index entry used to locate the domain word-list file.</param>
        /// <returns>The saved list on a cache hit, otherwise the result of <c>encodeTwins</c> over the evaluated tokens.</returns>
        private List <string> GetDLCTerms_Heuristics(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator, indexDomain idomain)
        {
            FileInfo dlcWordList = GetWordList_File(idomain);

            // Cache hit: reuse the previously saved domain word list.
            if (__useExisting && dlcWordList.Exists)
            {
                return dlcWordList.FullName.openFileToList(true);
            }

            // A progress line is written once more than this many pages were visited since the last one.
            const int progressStep = 10;

            var loadedTargets    = __wRecord.context.targets.GetLoaded();
            int totalPages       = loadedTargets.Count;
            int processedPages   = 0;
            int sinceLastReport  = 0;

            List<string> allTerms = new List<string>();

            foreach (spiderTarget target in loadedTargets)
            {
                processedPages++;
                sinceLastReport++;

                // Only relevant pages contribute tokens; every visited page still counts toward progress.
                if (target.IsRelevant)
                {
                    string content = target.pageText.transliterate();
                    content = WebUtility.HtmlDecode(content);

                    allTerms.AddRange(content.getTokens(true, true, true, true, 4));
                }

                if (sinceLastReport > progressStep)
                {
                    sinceLastReport = 0;

                    // The ratio is only needed for the progress message, so it is computed here.
                    double ratio = processedPages.GetRatio(totalPages);
                    aceLog.consoleControl.writeToConsole("Pages processed [" + ratio.ToString("P2") + "]", loger, false, 0);
                }
            }

            multiLanguageEvaluation evaluation = evaluator.evaluate(allTerms, null, null);

            List<string> DLCTerms = new List<string>();
            DLCTerms.AddRange(evaluation.singleLanguageTokens);

            if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
            {
                DLCTerms.AddRange(evaluation.multiLanguageTokens);
            }

            DLCTerms = semanticLexiconManager.lexiconCache.encodeTwins(DLCTerms);

            // Saving requires both the global setting and the caller's explicit request.
            if (imbWEMManager.settings.TFIDF.doSaveDomainWordList && __saveToCache)
            {
                DLCTerms.saveContentOnFilePath(dlcWordList.FullName);
            }

            return DLCTerms;
        }