/// <summary>
        /// Handler invoked when a target page is attached: if the active repository accepts the
        /// target, a web page repository entry is built for it, populated with its index entry
        /// and HTML/XML sources, and saved into the folder of the matching web site repository.
        /// </summary>
        /// <param name="__wRecord">Site record whose <c>domain</c> selects the web site repository.</param>
        /// <param name="__args">Event arguments supplying the target and its HTML/XML source.</param>
        private void onTargetPageAttached(modelSpiderSiteRecord __wRecord, modelSpiderSiteRecordEventArgs __args)
        {
            imbMCRepository activeRepo     = mcm.activeRepository;
            imbMCWebSite    siteRepo       = webSiteReposByDomain[__wRecord.domain];
            ISpiderTarget   attachedTarget = __args.Target;

            // Targets rejected by the active repository are not stored.
            if (!activeRepo.isTargetProper(attachedTarget))
            {
                return;
            }

            imbMCWebPage pageRepo = activeRepo.BuildWebPage(attachedTarget, siteRepo, loger);

            pageRepo.indexEntry     = imbWEMManager.index.pageIndexTable.GetPageForUrl(attachedTarget.url);
            pageRepo.HtmlSourceCode = __args.sourceHtml;
            pageRepo.XmlSourceCode  = __args.sourceXml;

            pageRepo.SaveDataStructure(siteRepo.folder, loger);
        }
Beispiel #2
0
        /// <summary>
        /// Attaches the page to this target and processes its content: extracts and hashes the page text,
        /// optionally builds the content block tree and the TF-IDF term document, resolves the page
        /// language evaluation (from the index in full-trust mode, otherwise via the evaluator), records
        /// the page tokens in the shared term collections and finally raises <c>OnTargetPageAttached</c>.
        /// </summary>
        /// <param name="__page">The page to attach.</param>
        /// <param name="response">Log builder passed on to the term document when tokens are added.</param>
        /// <param name="targetBlockCount">Block count requested from the content tree when the block tree is enabled.</param>
        /// <returns>
        /// <c>false</c> if <paramref name="__page"/> is the page that was already attached; otherwise <c>true</c>,
        /// including the case where no HTML document could be obtained and content processing was skipped.
        /// </returns>
        public bool AttachPage(spiderPage __page, ILogBuilder response, int targetBlockCount = 3)
        {
            // Same comparison operator as before, so any custom inequality on spiderPage is still honoured.
            bool isDifferentPage = page != __page;
            if (!isDifferentPage)
            {
                return false;
            }

            page = __page;

            HtmlDocument htmlDoc = GetHtmlDocument();

            iterationLoaded = parent.wRecord.iteration;

            // Without an HTML document the page counts as attached, but there is nothing to process.
            if (htmlDoc == null)
            {
                return true;
            }

            XPathNavigator xnav = htmlDoc.DocumentNode.CreateNavigator();

            pageText = xnav.retriveText(imbWEMManager.settings.contentProcessor.textRetrieve);
            pageText = WebUtility.HtmlDecode(pageText);
            pageHash = md5.GetMd5Hash(pageText);

            if (parent.wRecord.tRecord.instance.settings.doEnableDLC_BlockTree)
            {
                contentTree   = htmlDoc.buildTree(page.webpage.domain);
                contentBlocks = contentTree.getBlocks(targetBlockCount);
                contentBlocks.CalculateScores();
            }

            var ignoreTokens       = parent.wRecord.domainInfo.domainWords;
            var preprocessedTokens = parent.wRecord.tRecord.evaluator.GetAllProperTokensSortedByFrequency(pageText);

            if (parent.wRecord.tRecord.instance.settings.doEnableDLC_TFIDF)
            {
                content           = parent.dlTargetPageTokens.AddTable(key) as termDocument;
                content.expansion = parent.wRecord.tRecord.instance.settings.TermExpansionForContent;
                content.AddTokens(preprocessedTokens.ToList(), response);
            }

            // The index is consulted only in full-trust mode; otherwise the page is treated as not indexed.
            indexPageEvaluationEntryState pageState = imbWEMManager.settings.indexEngine.doIndexFullTrustMode
                ? imbWEMManager.index.pageIndexTable.GetPageAssertion(url)
                : indexPageEvaluationEntryState.notInTheIndex;

            if (pageState.HasFlag(indexPageEvaluationEntryState.haveEvaluationEntry))
            {
                // The language is set before it is copied, so the new evaluation carries the same
                // language as evaluatedLanguage (as in the else branch) instead of a stale value.
                // NOTE(review): the isRelevant flag of pageState is not consulted here; every page with
                // an evaluation entry is marked as Serbian - confirm this is intended.
                evaluatedLanguage          = basicLanguageEnum.serbian;
                evaluation                 = new multiLanguageEvaluation();
                evaluation.result_language = evaluatedLanguage;
            }
            else
            {
                evaluation        = parent.wRecord.tRecord.evaluator.evaluate(pageText, ignoreTokens, preprocessedTokens.ToList());
                evaluatedLanguage = evaluation.result_language;
            }

            lock (RelevantPageLock)
            {
                if (IsRelevant)
                {
                    parent.wRecord.context.targets.termSerbian.AddRange(preprocessedTokens);

                    parent.wRecord.relevantPages.AddUnique(__page.url);
                    parent.wRecord.tRecord.relevantPages.AddUnique(__page.url);
                }
                else
                {
                    parent.wRecord.context.targets.termOther.AddRange(preprocessedTokens);
                }

                parent.wRecord.context.targets.termsAll.AddRange(preprocessedTokens);
            }

            // Snapshot the delegate before the null check: re-reading it for the call could hit null
            // if a handler is detached in between (this method already locks RelevantPageLock, so
            // concurrent use is expected).
            var attachedHandler = parent.wRecord.context.OnTargetPageAttached;
            if (attachedHandler != null)
            {
                var targs = new modelSpiderSiteRecordEventArgs(this);

                attachedHandler(parent.wRecord, targs);
            }

            return true;
        }