/// <summary>
/// Handler for the target-page-attached notification. When the active repository accepts the
/// attached target, a web page repository entry is built for it, populated with its index entry
/// and HTML/XML source, and saved via <c>SaveDataStructure</c> using the web site repository folder.
/// </summary>
/// <param name="__wRecord">Site record; its <c>domain</c> selects the web site repository.</param>
/// <param name="__args">Event arguments supplying the target and its HTML and XML source.</param>
private void onTargetPageAttached(modelSpiderSiteRecord __wRecord, modelSpiderSiteRecordEventArgs __args)
{
    imbMCRepository repository = mcm.activeRepository;

    // Looked up before the target check on purpose: same evaluation order as before.
    imbMCWebSite siteRepo = webSiteReposByDomain[__wRecord.domain];
    ISpiderTarget attachedTarget = __args.Target;

    if (!repository.isTargetProper(attachedTarget))
    {
        return;
    }

    imbMCWebPage pageRepo = repository.BuildWebPage(attachedTarget, siteRepo, loger);

    pageRepo.indexEntry = imbWEMManager.index.pageIndexTable.GetPageForUrl(attachedTarget.url);
    pageRepo.HtmlSourceCode = __args.sourceHtml;
    pageRepo.XmlSourceCode = __args.sourceXml;

    pageRepo.SaveDataStructure(siteRepo.folder, loger);
}
/// <summary>
/// Attaches the page. If the given page is the one already attached, nothing is done and
/// <c>false</c> is returned.
/// </summary>
/// <remarks>
/// When an HTML document is available the page text is extracted, decoded and hashed; the
/// optional block tree and term document are built according to the crawler settings; the page
/// language is either taken from the index (full trust mode) or computed by the evaluator; the
/// tokens are added to the shared term collections; and finally the
/// <c>OnTargetPageAttached</c> delegate of the site context is invoked.
/// </remarks>
/// <param name="__page">The page.</param>
/// <param name="response">Log builder handed to the term document when tokens are added.</param>
/// <param name="targetBlockCount">Number of blocks requested from the content tree when block-tree processing is enabled.</param>
/// <returns>
/// <c>true</c> if <paramref name="__page"/> differs from the currently attached page (also when
/// no HTML document could be obtained); <c>false</c> if it was already attached.
/// </returns>
public bool AttachPage(spiderPage __page, ILogBuilder response, int targetBlockCount = 3)
{
    if (page == __page)
    {
        return false;
    }

    page = __page;
    HtmlDocument htmlDoc = GetHtmlDocument();
    iterationLoaded = parent.wRecord.iteration;

    if (htmlDoc == null)
    {
        // The page counts as attached even though there is no document to analyse.
        return true;
    }

    XPathNavigator xnav = htmlDoc.DocumentNode.CreateNavigator();
    pageText = xnav.retriveText(imbWEMManager.settings.contentProcessor.textRetrieve);
    pageText = WebUtility.HtmlDecode(pageText);
    pageHash = md5.GetMd5Hash(pageText);

    if (parent.wRecord.tRecord.instance.settings.doEnableDLC_BlockTree)
    {
        contentTree = htmlDoc.buildTree(page.webpage.domain);
        contentBlocks = contentTree.getBlocks(targetBlockCount);
        contentBlocks.CalculateScores();
    }

    var ignoreTokens = parent.wRecord.domainInfo.domainWords;
    var preprocessedTokens = parent.wRecord.tRecord.evaluator.GetAllProperTokensSortedByFrequency(pageText);

    if (parent.wRecord.tRecord.instance.settings.doEnableDLC_TFIDF)
    {
        content = parent.dlTargetPageTokens.AddTable(key) as termDocument;
        content.expansion = parent.wRecord.tRecord.instance.settings.TermExpansionForContent;
        content.AddTokens(preprocessedTokens.ToList(), response);
    }

    // The index is consulted only in full trust mode; otherwise the page is treated as not indexed.
    indexPageEvaluationEntryState pageState;
    if (imbWEMManager.settings.indexEngine.doIndexFullTrustMode)
    {
        pageState = imbWEMManager.index.pageIndexTable.GetPageAssertion(url);
    }
    else
    {
        pageState = indexPageEvaluationEntryState.notInTheIndex;
    }

    if (pageState.HasFlag(indexPageEvaluationEntryState.haveEvaluationEntry))
    {
        // Trusted index entry: the evaluator is skipped. Both the evaluation and the page get
        // the same language, so the new evaluation no longer records the stale value that
        // evaluatedLanguage held before this attach.
        // NOTE(review): the isRelevant flag of the index entry is not consulted here, so every
        // page with an evaluation entry is marked Serbian - confirm this is intended.
        evaluation = new multiLanguageEvaluation();
        evaluation.result_language = basicLanguageEnum.serbian;
        evaluatedLanguage = basicLanguageEnum.serbian;
    }
    else
    {
        evaluation = parent.wRecord.tRecord.evaluator.evaluate(pageText, ignoreTokens, preprocessedTokens.ToList());
        evaluatedLanguage = evaluation.result_language;
    }

    lock (RelevantPageLock)
    {
        if (IsRelevant)
        {
            parent.wRecord.context.targets.termSerbian.AddRange(preprocessedTokens);
            parent.wRecord.relevantPages.AddUnique(__page.url);
            parent.wRecord.tRecord.relevantPages.AddUnique(__page.url);
        }
        else
        {
            parent.wRecord.context.targets.termOther.AddRange(preprocessedTokens);
        }

        parent.wRecord.context.targets.termsAll.AddRange(preprocessedTokens);
    }

    // Copy the delegate to a local so it cannot become null between the check and the call.
    var attachedHandler = parent.wRecord.context.OnTargetPageAttached;
    if (attachedHandler != null)
    {
        var targs = new modelSpiderSiteRecordEventArgs(this);
        attachedHandler(parent.wRecord, targs);
    }

    return true;
}