/// <summary>
/// Returns the list of terms for a single crawled page. When saved page word lists are enabled and
/// the word-list file for the page exists, the terms are read from that file; otherwise the
/// transliterated page text is evaluated and the resulting tokens are filtered and encoded.
/// </summary>
/// <param name="target">Page whose <c>pageText</c> and <c>url</c> are used.</param>
/// <param name="idomain">Domain index entry; when null it is resolved from the domain index table using <c>target.parent.wRecord.domain</c>.</param>
/// <param name="ipage">Page index entry; when null it is resolved from the page index table using the MD5 hash of <c>target.url</c>.</param>
/// <param name="evaluator">Language evaluator; when null, <c>target.parent.wRecord.tRecord.evaluator</c> is used.</param>
/// <param name="loger">Not used by this method; kept so existing callers keep compiling.</param>
/// <returns>
/// The content of the saved word-list file, or the encoded tokens when the evaluation reports
/// <c>basicLanguageEnum.serbian</c>, or an empty list for any other evaluation result.
/// </returns>
public List<string> GetTermsForPage(spiderTarget target, indexDomain idomain = null, indexPage ipage = null, multiLanguageEvaluator evaluator = null, builderForLog loger = null)
{
    if (idomain == null)
    {
        idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(target.parent.wRecord.domain);
    }

    if (ipage == null)
    {
        ipage = imbWEMManager.index.pageIndexTable.GetOrCreate(md5.GetMd5Hash(target.url));
    }

    FileInfo file = GetWordList_File(idomain, ipage);

    // Cached path: return the saved word list without touching the page text.
    if (imbWEMManager.settings.TFIDF.doUseSavedPageWordlists && file.Exists)
    {
        return file.FullName.openFileToList(true);
    }

    // NOTE(review): an HTML-decoding step (cont.imbHtmlDecode()) used to be commented out here;
    // the page text is only transliterated in this method, not HTML-decoded.
    string cont = target.pageText.transliterate();

    if (evaluator == null)
    {
        evaluator = target.parent.wRecord.tRecord.evaluator;
    }

    multiLanguageEvaluation evaluation = evaluator.evaluate(cont);

    List<string> output = new List<string>();

    // Only pages evaluated as Serbian contribute terms; any other result leaves the list empty.
    if (evaluation.result_language == basicLanguageEnum.serbian)
    {
        List<string> pt = new List<string>();
        pt.AddRange(evaluation.singleLanguageTokens);

        if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
        {
            pt.AddRange(evaluation.multiLanguageTokens);
        }

        pt.RemoveAll(x => !x.isCleanWord());
        pt.RemoveAll(x => x.isSymbolicContentOnly());

        output.AddRange(semanticLexiconManager.lexiconCache.encodeTwins(pt));
    }

    // The list is saved even when it is empty, so a later call can hit the cached path above.
    if (imbWEMManager.settings.TFIDF.doSavePageWordlist)
    {
        output.saveContentOnFilePath(file.FullName);
    }

    return output;
}
//public webSitePageTFSet GetTFIDF_MasterConstruct()
//{
//    if (globalTFIDFSet == null)
//    {
//        globalTFIDFSet = new webSitePageTFSet(SessionID, "Temporary TF-IDF table for master table construction");
//    }
//    return globalTFIDFSet;
//}

/// <summary>
/// Builds the domain-level term list from the text of all loaded, relevant pages of a site record,
/// or reads it from the domain word-list file when that file exists and reuse is requested.
/// </summary>
/// <param name="__wRecord">Site record whose <c>context.targets.GetLoaded()</c> supplies the pages.</param>
/// <param name="loger">Log builder passed to the console progress output.</param>
/// <param name="__useExisting">When true and the domain word-list file exists, its content is returned directly.</param>
/// <param name="__saveToCache">When true (and <c>doSaveDomainWordList</c> is enabled), the result is written to the domain word-list file.</param>
/// <param name="evaluator">Language evaluator applied to the collected tokens; it is dereferenced without a null check.</param>
/// <param name="idomain">Domain index entry used to locate the word-list file.</param>
/// <returns>The saved word list, or the encoded single-language (and optionally multi-language) tokens.</returns>
private List<string> GetDLCTerms_Heuristics(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator, indexDomain idomain)
{
    // A progress line is written once the per-batch counter exceeds this value.
    const int progressInterval = 10;

    FileInfo dlcWordList = GetWordList_File(idomain);

    if (dlcWordList.Exists && __useExisting)
    {
        return dlcWordList.FullName.openFileToList(true);
    }

    List<string> allTerms = new List<string>();

    var tLoaded = __wRecord.context.targets.GetLoaded();
    int tc = tLoaded.Count;
    int ti = 0;
    int c = 0;

    foreach (spiderTarget target in tLoaded)
    {
        ti++;
        c++;

        if (target.IsRelevant)
        {
            string cont = WebUtility.HtmlDecode(target.pageText.transliterate());
            allTerms.AddRange(cont.getTokens(true, true, true, true, 4));
        }

        if (c > progressInterval)
        {
            c = 0;

            // The ratio is needed only for the progress message, so it is computed here.
            double tp = ti.GetRatio(tc);
            aceLog.consoleControl.writeToConsole("Pages processed [" + tp.ToString("P2") + "]", loger, false, 0);
        }
    }

    multiLanguageEvaluation evaluation = evaluator.evaluate(allTerms, null, null);

    List<string> DLCTerms = new List<string>();
    DLCTerms.AddRange(evaluation.singleLanguageTokens);

    if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
    {
        DLCTerms.AddRange(evaluation.multiLanguageTokens);
    }

    DLCTerms = semanticLexiconManager.lexiconCache.encodeTwins(DLCTerms);

    if (imbWEMManager.settings.TFIDF.doSaveDomainWordList && __saveToCache)
    {
        DLCTerms.saveContentOnFilePath(dlcWordList.FullName);
    }

    return DLCTerms;
}