/// <summary>
/// Produces the list of terms for the page loaded into <paramref name="target"/>.
/// </summary>
/// <remarks>
/// When <c>imbWEMManager.settings.TFIDF.doUseSavedPageWordlists</c> is set and the page word list file
/// exists, the content of that file is returned directly. Otherwise the transliterated page text is
/// evaluated; only when the detected language is <c>basicLanguageEnum.serbian</c> are its tokens filtered
/// and passed through <c>encodeTwins</c>. For any other language the returned list is empty. The result
/// (including an empty one) is written to the word list file when <c>doSavePageWordlist</c> is set.
/// </remarks>
/// <param name="target">The target whose <c>pageText</c> is processed.</param>
/// <param name="idomain">Domain index entry; when <c>null</c> it is fetched with <c>GetOrCreate</c> using the domain of the target's parent record.</param>
/// <param name="ipage">Page index entry; when <c>null</c> it is fetched with <c>GetOrCreate</c> using the MD5 hash of the target URL.</param>
/// <param name="evaluator">Language evaluator; when <c>null</c> the evaluator of the target's parent record is used.</param>
/// <param name="loger">Not used by this method.</param>
/// <returns>The terms of the page, either loaded from the saved word list or freshly extracted.</returns>
public List<string> GetTermsForPage(spiderTarget target, indexDomain idomain = null, indexPage ipage = null, multiLanguageEvaluator evaluator = null, builderForLog loger = null)
{
    idomain = idomain ?? imbWEMManager.index.domainIndexTable.GetOrCreate(target.parent.wRecord.domain);
    ipage = ipage ?? imbWEMManager.index.pageIndexTable.GetOrCreate(md5.GetMd5Hash(target.url));

    FileInfo wordListFile = GetWordList_File(idomain, ipage);

    // Cache hit: the previously saved word list is returned without touching the page text.
    if (imbWEMManager.settings.TFIDF.doUseSavedPageWordlists && wordListFile.Exists)
    {
        return wordListFile.FullName.openFileToList(true);
    }

    string pageContent = target.pageText.transliterate();

    evaluator = evaluator ?? target.parent.wRecord.tRecord.evaluator;
    multiLanguageEvaluation languageCheck = evaluator.evaluate(pageContent);

    List<string> terms = new List<string>();

    if (languageCheck.result_language == basicLanguageEnum.serbian)
    {
        List<string> candidates = new List<string>(languageCheck.singleLanguageTokens);

        if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
        {
            candidates.AddRange(languageCheck.multiLanguageTokens);
        }

        // Drop everything that is not a clean word, and whatever remains that is symbolic content only.
        candidates.RemoveAll(token => !token.isCleanWord() || token.isSymbolicContentOnly());

        terms.AddRange(semanticLexiconManager.lexiconCache.encodeTwins(candidates));
    }

    if (imbWEMManager.settings.TFIDF.doSavePageWordlist)
    {
        terms.saveContentOnFilePath(wordListFile.FullName);
    }

    return terms;
}
/// <summary>
/// Attaches the page - if the page was already attached returns <c>false</c>
/// </summary>
/// <remarks>
/// When an HTML document is available the method extracts and HTML-decodes the page text, hashes it,
/// optionally builds the content block tree and the TF-IDF token table (both controlled by the crawler
/// instance settings), determines the page language (from the page index in full trust mode, otherwise
/// by running the evaluator), registers the page tokens with the shared target collection and finally
/// invokes <c>OnTargetPageAttached</c> if a handler is set.
/// </remarks>
/// <param name="__page">The page.</param>
/// <param name="response">Log builder handed to <c>content.AddTokens</c> when the TF-IDF table is enabled.</param>
/// <param name="targetBlockCount">Block count passed to <c>contentTree.getBlocks</c> when the block tree is enabled.</param>
/// <returns>
/// <c>true</c> when <paramref name="__page"/> was newly attached (also when no HTML document could be
/// obtained for it); <c>false</c> when it is the page that is already attached.
/// </returns>
public bool AttachPage(spiderPage __page, ILogBuilder response, int targetBlockCount = 3)
{
    if (page == __page)
    {
        return false;
    }

    page = __page;
    HtmlDocument htmlDoc = GetHtmlDocument();
    iterationLoaded = parent.wRecord.iteration;

    if (htmlDoc == null)
    {
        // Nothing to analyse, but the page itself is now attached.
        return true;
    }

    var siteRecord = parent.wRecord;
    var crawlerSettings = siteRecord.tRecord.instance.settings;

    XPathNavigator xnav = htmlDoc.DocumentNode.CreateNavigator();
    pageText = xnav.retriveText(imbWEMManager.settings.contentProcessor.textRetrieve);
    pageText = WebUtility.HtmlDecode(pageText);
    pageHash = md5.GetMd5Hash(pageText);

    if (crawlerSettings.doEnableDLC_BlockTree)
    {
        contentTree = htmlDoc.buildTree(page.webpage.domain);
        contentBlocks = contentTree.getBlocks(targetBlockCount);
        contentBlocks.CalculateScores();
    }

    var ignoreTokens = siteRecord.domainInfo.domainWords;
    var preprocessedTokens = siteRecord.tRecord.evaluator.GetAllProperTokensSortedByFrequency(pageText);

    if (crawlerSettings.doEnableDLC_TFIDF)
    {
        content = parent.dlTargetPageTokens.AddTable(key) as termDocument;
        content.expansion = crawlerSettings.TermExpansionForContent;
        content.AddTokens(preprocessedTokens.ToList(), response);
    }

    // Outside full trust mode the index is not consulted and the page is always evaluated below.
    indexPageEvaluationEntryState pageState;
    if (imbWEMManager.settings.indexEngine.doIndexFullTrustMode)
    {
        pageState = imbWEMManager.index.pageIndexTable.GetPageAssertion(url);
    }
    else
    {
        pageState = indexPageEvaluationEntryState.notInTheIndex;
    }

    if (pageState.HasFlag(indexPageEvaluationEntryState.haveEvaluationEntry))
    {
        // Trusted index entry: the evaluator is not run. The language is assigned first so that the
        // placeholder evaluation carries the same language as evaluatedLanguage (previously it was
        // copied before the assignment and therefore held a stale value).
        // NOTE(review): the isRelevant flag of pageState is not consulted here - the language is set to
        // serbian for every indexed page. Confirm whether non-relevant entries need a different value.
        evaluatedLanguage = basicLanguageEnum.serbian;
        evaluation = new multiLanguageEvaluation();
        evaluation.result_language = evaluatedLanguage;
    }
    else
    {
        evaluation = siteRecord.tRecord.evaluator.evaluate(pageText, ignoreTokens, preprocessedTokens.ToList());
        evaluatedLanguage = evaluation.result_language;
    }

    lock (RelevantPageLock)
    {
        if (IsRelevant)
        {
            siteRecord.context.targets.termSerbian.AddRange(preprocessedTokens);
            siteRecord.relevantPages.AddUnique(__page.url);
            siteRecord.tRecord.relevantPages.AddUnique(__page.url);
        }
        else
        {
            siteRecord.context.targets.termOther.AddRange(preprocessedTokens);
        }

        siteRecord.context.targets.termsAll.AddRange(preprocessedTokens);
    }

    // Snapshot the delegate so it cannot become null between the check and the call.
    var attachedHandler = siteRecord.context.OnTargetPageAttached;
    if (attachedHandler != null)
    {
        var targs = new modelSpiderSiteRecordEventArgs(this);
        attachedHandler(siteRecord, targs);
    }

    return true;
}
//public webSitePageTFSet GetTFIDF_MasterConstruct()
//{
//    if (globalTFIDFSet == null)
//    {
//        globalTFIDFSet = new webSitePageTFSet(SessionID, "Temporary TF-IDF table for master table construction");
//    }
//    return globalTFIDFSet;
//}

/// <summary>
/// Builds the domain-level term list from the relevant targets that are loaded in the site record.
/// </summary>
/// <remarks>
/// The text of every loaded target whose <c>IsRelevant</c> is set is transliterated, HTML-decoded and
/// tokenized; the collected tokens are evaluated in a single call and the resulting single-language
/// tokens (plus multi-language tokens unless <c>doUseOnlySingleMatch</c> is set) are passed through
/// <c>encodeTwins</c>. Progress is written to the console each time more than ten targets have been
/// processed since the previous report.
/// </remarks>
/// <param name="__wRecord">Site record whose loaded targets are processed.</param>
/// <param name="loger">Log builder handed to the console progress output.</param>
/// <param name="__useExisting">When <c>true</c> and the domain word list file exists, the file content is returned instead of recomputing.</param>
/// <param name="__saveToCache">When <c>true</c> and <c>doSaveDomainWordList</c> is set, the result is written to the domain word list file.</param>
/// <param name="evaluator">Evaluator applied to the collected tokens; it is dereferenced without a null check.</param>
/// <param name="idomain">Domain index entry used to locate the word list file.</param>
/// <returns>The domain term list, either loaded from the word list file or freshly computed.</returns>
private List<string> GetDLCTerms_Heuristics(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator, indexDomain idomain)
{
    FileInfo cacheFile = GetWordList_File(idomain);

    if (cacheFile.Exists && __useExisting)
    {
        return cacheFile.FullName.openFileToList(true);
    }

    var loadedTargets = __wRecord.context.targets.GetLoaded();
    int total = loadedTargets.Count;
    int processed = 0;
    int sinceLastReport = 0;
    List<string> collected = new List<string>();

    foreach (spiderTarget target in loadedTargets)
    {
        processed++;
        sinceLastReport++;
        double progress = processed.GetRatio(total);

        if (target.IsRelevant)
        {
            string decoded = WebUtility.HtmlDecode(target.pageText.transliterate());
            collected.AddRange(decoded.getTokens(true, true, true, true, 4));
        }

        if (sinceLastReport > 10)
        {
            sinceLastReport = 0;
            aceLog.consoleControl.writeToConsole("Pages processed [" + progress.ToString("P2") + "]", loger, false, 0);
        }
    }

    multiLanguageEvaluation evaluation = evaluator.evaluate(collected, null, null);

    List<string> domainTerms = new List<string>(evaluation.singleLanguageTokens);

    if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
    {
        domainTerms.AddRange(evaluation.multiLanguageTokens);
    }

    domainTerms = semanticLexiconManager.lexiconCache.encodeTwins(domainTerms);

    if (imbWEMManager.settings.TFIDF.doSaveDomainWordList && __saveToCache)
    {
        domainTerms.saveContentOnFilePath(cacheFile.FullName);
    }

    return domainTerms;
}