/// <summary>
        /// Builds the term list for a crawled page, reusing a previously saved word list when that is enabled.
        /// </summary>
        /// <param name="target">Crawl target whose <c>pageText</c> is processed; also supplies the fallback domain, URL and evaluator.</param>
        /// <param name="idomain">Domain index entry; when <c>null</c> it is obtained from <c>domainIndexTable.GetOrCreate</c> for the target's domain.</param>
        /// <param name="ipage">Page index entry; when <c>null</c> it is obtained from <c>pageIndexTable.GetOrCreate</c> using the MD5 hash of the target URL.</param>
        /// <param name="evaluator">Language evaluator; when <c>null</c>, <c>target.parent.wRecord.tRecord.evaluator</c> is used.</param>
        /// <param name="loger">Not used by this method.</param>
        /// <returns>
        /// The saved word list when <c>doUseSavedPageWordlists</c> is set and the file exists. Otherwise the
        /// twin-encoded tokens of the page when it evaluates as Serbian, or an empty list for any other language.
        /// </returns>
        public List<string> GetTermsForPage(spiderTarget target, indexDomain idomain = null, indexPage ipage = null, multiLanguageEvaluator evaluator = null, builderForLog loger = null)
        {
            if (idomain == null)
            {
                idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(target.parent.wRecord.domain);
            }

            if (ipage == null)
            {
                ipage = imbWEMManager.index.pageIndexTable.GetOrCreate(md5.GetMd5Hash(target.url));
            }

            FileInfo file = GetWordList_File(idomain, ipage);

            // Cached word list takes precedence over re-evaluating the page text.
            if (imbWEMManager.settings.TFIDF.doUseSavedPageWordlists && file.Exists)
            {
                return file.FullName.openFileToList(true);
            }

            string cont = target.pageText.transliterate();

            if (evaluator == null)
            {
                evaluator = target.parent.wRecord.tRecord.evaluator;
            }

            multiLanguageEvaluation evaluation = evaluator.evaluate(cont);

            List<string> output = new List<string>();

            // Only pages evaluated as Serbian contribute terms; for other languages the list stays empty.
            if (evaluation.result_language == basicLanguageEnum.serbian)
            {
                List<string> pageTokens = new List<string>();
                pageTokens.AddRange(evaluation.singleLanguageTokens);

                if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
                {
                    pageTokens.AddRange(evaluation.multiLanguageTokens);
                }

                // Single pass: drop tokens that are not clean words or that are symbolic content only.
                pageTokens.RemoveAll(x => !x.isCleanWord() || x.isSymbolicContentOnly());

                output.AddRange(semanticLexiconManager.lexiconCache.encodeTwins(pageTokens));
            }

            // Note: the list is saved even when it is empty (page not evaluated as Serbian).
            if (imbWEMManager.settings.TFIDF.doSavePageWordlist)
            {
                output.saveContentOnFilePath(file.FullName);
            }

            return output;
        }
Beispiel #2
0
        /// <summary>
        /// Attaches the page - if the page was already attached returns <c>false</c>
        /// </summary>
        /// <remarks>
        /// When an HTML document is available, the page text and its hash are extracted, the optional block tree
        /// and TF-IDF term document are built, the page language is evaluated (or taken from the index in full
        /// trust mode), the tokens are added to the shared term collections and the attached callback is invoked.
        /// </remarks>
        /// <param name="__page">The page.</param>
        /// <param name="response">Log builder forwarded to <c>content.AddTokens</c> when <c>doEnableDLC_TFIDF</c> is set.</param>
        /// <param name="targetBlockCount">Block count passed to <c>contentTree.getBlocks</c> when <c>doEnableDLC_BlockTree</c> is set.</param>
        /// <returns>
        /// <c>true</c> when <paramref name="__page"/> differs from the currently attached page (also when no HTML
        /// document could be obtained for it); <c>false</c> when the same page was already attached.
        /// </returns>
        public bool AttachPage(spiderPage __page, ILogBuilder response, int targetBlockCount = 3)
        {
            if (page != __page)
            {
                page = __page;

                HtmlDocument htmlDoc = GetHtmlDocument();

                iterationLoaded = parent.wRecord.iteration;

                if (htmlDoc != null)
                {
                    XPathNavigator xnav = htmlDoc.DocumentNode.CreateNavigator();

                    pageText = xnav.retriveText(imbWEMManager.settings.contentProcessor.textRetrieve);

                    pageText = WebUtility.HtmlDecode(pageText);
                    pageHash = md5.GetMd5Hash(pageText);

                    if (parent.wRecord.tRecord.instance.settings.doEnableDLC_BlockTree)
                    {
                        contentTree   = htmlDoc.buildTree(page.webpage.domain);
                        contentBlocks = contentTree.getBlocks(targetBlockCount);
                        contentBlocks.CalculateScores();
                    }

                    var ignoreTokens = parent.wRecord.domainInfo.domainWords;

                    var preprocessedTokens = parent.wRecord.tRecord.evaluator.GetAllProperTokensSortedByFrequency(pageText);

                    if (parent.wRecord.tRecord.instance.settings.doEnableDLC_TFIDF)
                    {
                        content           = parent.dlTargetPageTokens.AddTable(key) as termDocument;
                        content.expansion = parent.wRecord.tRecord.instance.settings.TermExpansionForContent;
                        // A fresh copy of the tokens is passed so the callee works on its own list.
                        content.AddTokens(preprocessedTokens.ToList(), response);
                    }

                    // The index is consulted only in full trust mode; otherwise the page is treated as not indexed.
                    indexPageEvaluationEntryState pageState;

                    if (imbWEMManager.settings.indexEngine.doIndexFullTrustMode)
                    {
                        pageState = imbWEMManager.index.pageIndexTable.GetPageAssertion(url);
                    }
                    else
                    {
                        pageState = indexPageEvaluationEntryState.notInTheIndex;
                    }

                    if (pageState.HasFlag(indexPageEvaluationEntryState.haveEvaluationEntry))
                    {
                        // NOTE(review): result_language receives the value evaluatedLanguage held *before* it is
                        // set to Serbian on the next line, and the index's isRelevant flag is not consulted here.
                        // Statement order is kept as found - confirm against IsRelevant whether this is intended.
                        evaluation = new multiLanguageEvaluation();
                        evaluation.result_language = evaluatedLanguage;
                        evaluatedLanguage = basicLanguageEnum.serbian;
                    }
                    else
                    {
                        evaluation        = parent.wRecord.tRecord.evaluator.evaluate(pageText, ignoreTokens, preprocessedTokens.ToList());
                        evaluatedLanguage = evaluation.result_language;
                    }

                    lock (RelevantPageLock)
                    {
                        if (IsRelevant)
                        {
                            parent.wRecord.context.targets.termSerbian.AddRange(preprocessedTokens);

                            parent.wRecord.relevantPages.AddUnique(__page.url);

                            parent.wRecord.tRecord.relevantPages.AddUnique(__page.url);
                        }
                        else
                        {
                            parent.wRecord.context.targets.termOther.AddRange(preprocessedTokens);
                        }

                        parent.wRecord.context.targets.termsAll.AddRange(preprocessedTokens);
                    }

                    // Snapshot the delegate so it cannot become null between the check and the call
                    // if another thread detaches the handler concurrently.
                    var onTargetPageAttached = parent.wRecord.context.OnTargetPageAttached;

                    if (onTargetPageAttached != null)
                    {
                        var targs = new modelSpiderSiteRecordEventArgs(this);

                        onTargetPageAttached(parent.wRecord, targs);
                    }
                }

                return true;
            }

            return false;
        }
        //public webSitePageTFSet GetTFIDF_MasterConstruct()
        //{
        //    if (globalTFIDFSet == null)
        //    {
        //        globalTFIDFSet = new webSitePageTFSet(SessionID, "Temporary TF-IDF table for master table construction");

        //    }
        //    return globalTFIDFSet;
        //}

        /// <summary>
        /// Builds the domain-level term list from the text of all loaded, relevant targets of a site record,
        /// or returns the previously saved list when reuse is requested and the file exists.
        /// </summary>
        /// <param name="__wRecord">Site record whose loaded targets (<c>context.targets.GetLoaded()</c>) are processed.</param>
        /// <param name="loger">Log builder passed to the console progress output.</param>
        /// <param name="__useExisting">When <c>true</c> and the domain word-list file exists, its content is returned without processing.</param>
        /// <param name="__saveToCache">When <c>true</c> (and <c>doSaveDomainWordList</c> is set) the result is written to the domain word-list file.</param>
        /// <param name="evaluator">Language evaluator applied to the collected tokens.</param>
        /// <param name="idomain">Domain index entry used to locate the word-list file.</param>
        /// <returns>Twin-encoded single-language tokens, plus multi-language tokens unless <c>doUseOnlySingleMatch</c> is set.</returns>
        private List<string> GetDLCTerms_Heuristics(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator, indexDomain idomain)
        {
            // A progress message is written once more than this many pages were processed since the last one.
            const int progressStep = 10;

            FileInfo dlcWordList = GetWordList_File(idomain);

            if (dlcWordList.Exists && __useExisting)
            {
                return dlcWordList.FullName.openFileToList(true);
            }

            var loadedTargets = __wRecord.context.targets.GetLoaded();
            int totalCount = loadedTargets.Count;
            int processed = 0;
            int sinceLastReport = 0;

            List<string> allTerms = new List<string>();

            foreach (spiderTarget target in loadedTargets)
            {
                processed++;
                sinceLastReport++;

                // Only relevant pages contribute tokens; irrelevant ones still count toward progress.
                if (target.IsRelevant)
                {
                    string cont = target.pageText.transliterate();
                    cont = WebUtility.HtmlDecode(cont);

                    allTerms.AddRange(cont.getTokens(true, true, true, true, 4));
                }

                if (sinceLastReport > progressStep)
                {
                    sinceLastReport = 0;
                    double ratio = processed.GetRatio(totalCount);
                    aceLog.consoleControl.writeToConsole("Pages processed [" + ratio.ToString("P2") + "]", loger, false, 0);
                }
            }

            multiLanguageEvaluation evaluation = evaluator.evaluate(allTerms, null, null);

            List<string> DLCTerms = new List<string>();
            DLCTerms.AddRange(evaluation.singleLanguageTokens);

            if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
            {
                DLCTerms.AddRange(evaluation.multiLanguageTokens);
            }

            DLCTerms = semanticLexiconManager.lexiconCache.encodeTwins(DLCTerms);

            // Both the global setting and the caller's flag must allow saving.
            if (imbWEMManager.settings.TFIDF.doSaveDomainWordList && __saveToCache)
            {
                DLCTerms.saveContentOnFilePath(dlcWordList.FullName);
            }

            return DLCTerms;
        }