/// <summary>
/// Wraps the URL of <paramref name="page"/> in a <c>link</c> and passes it to the context of <paramref name="wRecord"/> for processing.
/// </summary>
/// <param name="wRecord">Site record whose <c>context</c> processes the link and whose first web page entry is sent along with it.</param>
/// <param name="page">Index page whose <c>url</c> becomes the link target.</param>
public void SetTarget(modelSpiderSiteRecord wRecord, indexPage page)
{
    var targetLink = new link(page.url);

    // Value of the first entry in the site's web page collection (default entry when the collection is empty).
    var firstWebPage = wRecord.web.webPages.items.FirstOrDefault().Value;

    wRecord.context.processLink(targetLink, firstWebPage, false);
}
/// <summary>
/// Gets the <see cref="indexPage"/> entry for this instance's <c>url</c> from the current Index database instance.
/// The entry is looked up only while <c>indexPageInfo</c> is <c>null</c>; afterwards the stored entry is returned.
/// </summary>
/// <returns>The stored <c>indexPageInfo</c>, or the result of the page index lookup for <c>url</c>.</returns>
public indexPage GetIndexPage()
{
    if (indexPageInfo != null)
    {
        return indexPageInfo;
    }

    indexPageInfo = imbWEMManager.index.pageIndexTable.GetPageForUrl(url);
    return indexPageInfo;
}
/// <summary>
/// Copies the state of a crawled <paramref name="target"/> into its page index entry and stores the entry.
/// URL, token CSV and domain name are always written; language evaluation results, relevancy and byte size
/// are written only when the target is loaded.
/// </summary>
/// <param name="target">Spider target supplying URL, tokens, evaluation and page result.</param>
/// <param name="wRecord">Site record supplying the domain name.</param>
/// <param name="idomain">Domain entry; when <c>null</c> it is looked up in <c>domainIndexTable</c> by domain name.</param>
/// <returns>The page index entry after it was passed to <c>pageIndexTable.AddOrUpdate</c>.</returns>
public indexPage deployTarget(spiderTarget target, modelSpiderSiteRecord wRecord, indexDomain idomain)
{
    indexPage entry = pageIndexTable.GetPageForUrl(target.url);

    // The looked-up domain is not read again in this method; the lookup call itself is preserved.
    if (idomain == null)
    {
        idomain = domainIndexTable.GetDomain(wRecord.domainInfo.domainName);
    }

    entry.url = target.url;
    entry.tst = target.tokens.ToList().toCsvInLine(",");
    entry.domain = wRecord.domainInfo.domainName;

    if (target.isLoaded)
    {
        // Language details are copied only when an evaluation exists and detected a language.
        if ((target.evaluation != null) && (target.evaluation.result_language != basicLanguageEnum.unknown))
        {
            entry.langTestRatio = target.evaluation.result_ratio;
            entry.singleMatchTokens = target.evaluation.singleLanguageTokens.toCsvInLine(",");
            entry.multiMatchTokens = target.evaluation.multiLanguageTokens.toCsvInLine(",");
            entry.wordCount = target.evaluation.allContentTokens.Count();
            entry.AllWords = target.evaluation.allContentTokens.toCsvInLine();
            entry.language = target.evaluation.result_language.ToString();
        }

        entry.relevancy = target.IsRelevant
            ? indexPageRelevancyEnum.isRelevant
            : (target.evaluatedLanguage == basicLanguageEnum.unknown
                ? indexPageRelevancyEnum.unknown
                : indexPageRelevancyEnum.notRelevant);

        entry.byteSize = target.page.spiderResult.page.result.byteSize;
    }

    pageIndexTable.AddOrUpdate(entry);
    return entry;
}
/// <summary>
/// Closes the session. Optionally stores page entries for queried URLs that were not in the index,
/// updates and saves the index session entry when one exists, saves the aggregate TF-IDF table to XML,
/// and finally closes the experiment session.
/// </summary>
/// <param name="tRecords">Test records forwarded to <c>experimentManager.CloseSession</c>.</param>
public void CloseSession(IEnumerable<modelSpiderTestRecord> tRecords)
{
    if (imbWEMManager.settings.indexEngine.doSaveFailedURLQueries)
    {
        pageIndexTable.ReadOnlyMode = false;

        int sinceLastLog = 0;
        int processed = 0;
        int total = pageIndexTable.urlsNotInIndex.Count;
        int logStep = total / 10; // progress is logged roughly every tenth of the URLs

        aceLog.log("Deploying queried URLs that were not in the index (" + total.ToString() + ")");

        foreach (string url in pageIndexTable.urlsNotInIndex)
        {
            sinceLastLog++;
            processed++;

            indexPage page = pageIndexTable.GetPageForUrl(url);
            page.url = url;

            domainAnalysis analysis = new domainAnalysis(url);
            page.domain = analysis.domainName;

            pageIndexTable.AddOrUpdate(page);

            if (sinceLastLog < logStep)
            {
                continue;
            }

            aceLog.log("URL processed: " + processed.GetRatio(total).ToString("P2") + " (" + processed + ")");
            sinceLastLog = 0;
        }
    }

    if (indexSessionEntry != null)
    {
        aceLog.log("Saving index engine performance : ... ");

        if (!SKIP_INDEXUPDATE)
        {
            var domainAssertion = imbWEMManager.index.domainIndexTable.GetDomainIndexAssertion(null, true);
            aceLog.log("Saving index engine performance : DomainAssetion done ");

            indexSessionEntry.Domains = domainIndexTable.Count;
            indexSessionEntry.Pages = pageIndexTable.Count;
            indexSessionEntry.PagesEvaluated = pageIndexTable.Where(x => !collectionExtensions.isNullOrEmpty(x.relevancyText)).Count();
            indexSessionEntry.CrawlerHash = experimentManager.CurrentSession.state.setupHash_crawler;
            indexSessionEntry.GlobalSetupHash = experimentManager.CurrentSession.state.setupHash_global;
            indexSessionEntry.Duration = DateTime.Now.Subtract(indexSessionEntry.Start).TotalMinutes;
            aceLog.log("Saving index engine performance : PagesEvaluated counted ");

            indexSessionEntry.CertainityPP = domainAssertion.certainty;
            indexSessionEntry.MasterTFIDFCoverage = domainAssertion.masterTFIDFApplied;
            indexSessionEntry.DomainTFIDFs = domainAssertion[indexDomainContentEnum.completeDomainTFIDF].Count;
        }

        aceLog.log("Saving index engine performance : Saving index ");

        if (imbWEMManager.settings.directReportEngine.doPublishIndexPerformanceTable)
        {
            indexSessionRecords.AddOrUpdate(indexSessionEntry);
        }

        var aggregateTable = experimentManager.globalTFIDFSet.GetAggregateDataTable();
        var compiledPath = folder.pathFor(experimentSessionRegistry.PATH_CompiledFTIDF);
        aggregateTable.saveObjectToXML(compiledPath);
    }

    experimentManager.CloseSession(tRecords);
}
/// <summary>
/// Adds <paramref name="page"/> to <c>pageSet</c>.
/// </summary>
/// <param name="page">Index page to add.</param>
public void addToPageSet(indexPage page) => pageSet.Add(page);
/// <summary>
/// Deploys information from wRecord, including the key.
/// Fills this record's timing, block/term counts, FRA runtime, CPU, page counts, recall values,
/// last-task target descriptions and efficiency ratios from the given site record.
/// </summary>
/// <param name="wRecord">The site record whose time series, context targets, frontier records and task results are read.</param>
public void deploy(modelSpiderSiteRecord wRecord)
{
    double i_lm_harvest = 0;
    double i_lm_recall = 0;
    double i_pi_harvest = 0;
    double i_pi_nominal = 0;

    dataUnitSpiderIteration spi_first = wRecord.timeseries.GetData().FirstOrDefault() as dataUnitSpiderIteration;
    dataUnitSpiderIteration spi_last = wRecord.timeseries.lastEntry as dataUnitSpiderIteration;
    dataUnitSpiderIteration spi_current = wRecord.timeseries.currentEntry as dataUnitSpiderIteration;

    if (spi_current != null)
    {
        time_duration_s = creationTime.Subtract(spi_current.rowCreated).TotalSeconds;
    }
    else
    {
        time_duration_s = 0;
    }

    // NOTE(review): the gross-duration assignment below appears to be commented out in the original
    // (it is the only unbraced if/else in the method) - confirm whether it should be active.
    // if (spi_last != null) time_duration_gross_s = creationTime.Subtract(spi_last.rowCreated).TotalSeconds; else time_duration_gross_s = 0;

    if (spi_first != null)
    {
        time_sincefirst_s = creationTime.Subtract(spi_first.rowCreated).TotalSeconds;
    }
    else
    {
        time_sincefirst_s = 0;
    }

    indexDomain idomain = wRecord.GetIndexInfo();

    iteration = wRecord.iteration;
    blocks_all = wRecord.context.targets.blocks.Count(false);
    blocks_relevant = wRecord.context.targets.blocks.Count(true);
    terms_all = wRecord.context.targets.termsAll.Count();
    terms_relevant = wRecord.context.targets.termSerbian.Count();

    var TFIDF = wRecord.MasterTFIDF;
    var mchs = TFIDF.GetMatches(wRecord.context.targets.termSerbian);

    key = wRecord.domainInfo.domainName + iteration.ToString("D3");

    int relCount = 0;
    int irelCount = 0;
    int lCount = 0;
    int rCount = 0;
    int dCount = 0;
    double fraDuration = 0;
    int modulesContained = 0;
    int rec = 0;

    // Average duration over the general frontier ranking records.
    foreach (frontierRankingAlgorithmIterationRecord gen in wRecord.frontierDLC.generalRecords)
    {
        rec++;
        fraDuration += gen.duration;
    }

    FRA_SummaryRuntime = fraDuration.GetRatio((double)rec);
    FRA_TimePercent = FRA_SummaryRuntime.GetRatio(time_duration_s);

    var rtake = wRecord.tRecord.measureTaker.GetLastTake();
    if (rtake != null)
    {
        CPU = rtake.cpuRateOfProcess;
    }

    if (imbWEMManager.settings.directReportEngine.DR_ReportModules)
    {
        // NOTE(review): the accumulated total is not stored anywhere in this method; the loop is
        // kept so that the GetLastEntry() calls still happen as before.
        foreach (moduleDLCRecord mod in wRecord.frontierDLC)
        {
            if (mod != null)
            {
                modulesContained += mod.GetLastEntry().accumulated;
            }
        }
    }

    // Page hashes already seen among targets flagged as duplicates. A set gives constant-time
    // membership checks instead of scanning a list for every target.
    HashSet<string> seenHashes = new HashSet<string>();

    foreach (spiderTarget t in wRecord.context.targets.GetLoaded())
    {
        indexPage ipage = t.GetIndexPage();
        if (ipage != null)
        {
            i_pi_nominal += ipage.InfoPrize;
        }

        // The first target carrying a given hash is still counted as a regular page;
        // only later targets with the same hash are counted as duplicates.
        bool isDuplicate = t.isDuplicate;
        if (isDuplicate && seenHashes.Add(t.pageHash))
        {
            isDuplicate = false;
        }

        if (!isDuplicate)
        {
            if (t.IsRelevant)
            {
                relCount++;
            }
            else
            {
                irelCount++;
            }
            lCount++;
        }
        else
        {
            dCount++;
        }
    }

    relevantPageCount = relCount;
    irrelevantPageCount = irelCount;
    loadedPageCount = lCount;
    duplicateCount = dCount;

    int mchs_c = 0;
    int id_lm_c = 0;

    if (idomain != null)
    {
        id_lm_c = idomain.Lemmas;
    }

    if (mchs != null)
    {
        mchs_c = mchs.Count();
    }

    i_lm_harvest = mchs_c.GetRatio(loadedPageCount);
    IP = TFIDF.GetScoreForMatch(wRecord.context.targets.termSerbian);

    i_lm_recall = mchs_c.GetRatio(id_lm_c);
    if (i_lm_recall > 1)
    {
        i_lm_recall = 1;
    }

    // Recall values relative to the domain index entry are only available when the entry exists.
    if (idomain != null)
    {
        IP_recall = i_pi_nominal.GetRatio(idomain.InfoPrize).ClipToK();
        Term_recall = wRecord.context.targets.termSerbian.Count().GetRatio(idomain.Words).ClipToK();
    }

    Page_recall = relevantPageCount.GetRatio(wRecord.pageRecallTarget).ClipToK();

    i_pi_nominal = i_pi_nominal.GetRatio(loadedPageCount);
    i_pi_harvest = IP.GetRatio((double)lCount);

    // Sum the load counts of all task results and remember the last one.
    spiderTaskResult lastResult = null;
    foreach (spiderTaskResult r in wRecord.spiderTaskResults)
    {
        lastResult = r;
        rCount = rCount + r.Count;
    }
    realLoadsCount = rCount;

    if (lastResult != null)
    {
        targetUrl = "";
        targetLanguage = "";
        targetEvalRatio = "";

        foreach (spiderTaskResultItem item in lastResult.items.Values)
        {
            targetUrl = targetUrl.add(item.target.url, ",");

            var itemTarget = wRecord.context.targets.GetByTarget(item.target);
            if (itemTarget == null)
            {
                continue;
            }

            if (itemTarget.evaluation != null)
            {
                targetLanguage = targetLanguage.add(itemTarget.evaluatedLanguage.ToString(), ";");
                targetEvalRatio = targetEvalRatio.add(itemTarget.evaluation.result_ratio.ToString(), ";");
            }
            else if (itemTarget.isDuplicate)
            {
                targetLanguage = targetLanguage.add("duplicate", ";");
                targetEvalRatio = targetEvalRatio.add("duplicate", ";");
            }
            else
            {
                targetLanguage = targetLanguage.add("unknown", ";");
                targetEvalRatio = targetEvalRatio.add("unknown", ";");
            }
        }
    }

    if ((relevantPageCount == 0) || (loadedPageCount == 0))
    {
        E_PP = 0;
    }
    else
    {
        E_PP = (double)relevantPageCount / (double)loadedPageCount;
    }

    if ((wRecord.context.targets.termSerbian.Count == 0) || (wRecord.context.targets.termsAll.Count == 0) || (loadedPageCount == 0))
    {
        E_TP = 0;
        E_TH = 0;
    }
    else
    {
        E_TP = (double)wRecord.context.targets.termSerbian.Count / (double)wRecord.context.targets.termsAll.Count;
        E_TH = (double)wRecord.context.targets.termSerbian.Count / (double)loadedPageCount;
    }

    IPnominal = i_pi_nominal;
    IP_collected = i_pi_harvest;
    Lm_collected = i_lm_harvest;
    Lm_recall = i_lm_recall;
}
/// <summary>
/// Performs full domain reevaluation: deploys every loaded target of the site record into the page index,
/// (re)computes words, lemmas and InfoPrize per page where needed, optionally updates the domain index entry,
/// and disposes the targets and the site record.
/// </summary>
/// <param name="settings">The settings; <c>plugIn_indexDBUpdater_optimizedMode</c> allows reuse of already evaluated pages.</param>
/// <param name="loger">The loger; may be <c>null</c>, in which case nothing is logged.</param>
/// <param name="__wRecord">The w record. It is disposed at the end of this call.</param>
/// <param name="evaluator">The evaluator, forwarded to <c>GetTermsForPage</c>.</param>
/// <param name="mTFIDF">Compiled weight table used to match page terms.</param>
public void doDomainEvaluation(IndexEngineConfiguration settings, builderForLog loger, modelSpiderSiteRecord __wRecord, multiLanguageEvaluator evaluator, weightTableCompiled mTFIDF)
{
    indexDomain idomain = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
    idomain.url = __wRecord.domain;

    double dIP = 0;
    int p = 0;

    List<string> dDistinctTerms = new List<string>();
    List<string> dLemmas = new List<string>();
    List<string> dWords = new List<string>();
    List<string> urls = new List<string>();

    // Stays true only when every page was evaluated in this call (no stored result was reused).
    bool doEvalD = true;

    foreach (spiderTarget target in __wRecord.context.targets.GetLoaded())
    {
        indexPage ipage = imbWEMManager.index.deployTarget(target, __wRecord, idomain);

        bool doEval = true;

        if (settings.plugIn_indexDBUpdater_optimizedMode)
        {
            // A relevant page that already has InfoPrize and lemma count can be reused,
            // unless its stored word or lemma list is missing.
            if ((ipage.InfoPrize > 0) && (ipage.Lemmas > 0) && (ipage.relevancyText == nameof(indexPageRelevancyEnum.isRelevant)))
            {
                doEval = false;
                if (ipage.AllWords.isNullOrEmpty())
                {
                    doEval = true;
                }
                if (ipage.AllLemmas.isNullOrEmpty())
                {
                    doEval = true;
                }
            }
        }

        if (doEval)
        {
            List<string> terms;
            if (ipage.AllWords.isNullOrEmpty())
            {
                terms = GetTermsForPage(target, idomain, ipage, evaluator, loger);
            }
            else
            {
                terms = ipage.AllWords.SplitSmart(",", "", true);
            }
            ipage.AllWords = terms.toCsvInLine();

            double IP = 0;
            List<string> lemmas = new List<string>();
            List<IWeightTableTerm> mchl = mTFIDF.GetMatches(terms);

            if (ipage.AllLemmas.isNullOrEmpty())
            {
                lemmas.AddRange(mchl.Select(x => x.nominalForm));
            }
            else
            {
                lemmas = ipage.AllLemmas.SplitSmart(",", "", true);
            }

            foreach (weightTableTermCompiled cterm in mchl)
            {
                IP += cterm.tf_idf;
                if (cterm.df == 1)
                {
                    dDistinctTerms.AddUnique(cterm.nominalForm);
                }
            }

            ipage.InfoPrize = IP;
            dIP += IP;
            ipage.Lemmas = lemmas.Count;
            ipage.AllLemmas = lemmas.toCsvInLine();

            dWords.AddRange(terms);
            dLemmas.AddRange(lemmas);

            ipage.Note = "indexUpdate" + SessionID;

            imbWEMManager.index.pageIndexTable.AddOrUpdate(ipage);
        }
        else
        {
            dIP += ipage.InfoPrize;
            doEvalD = false;
        }

        urls.Add(ipage.url);
        p++;

        // The logger is optional (it is null-checked for the domain summary below as well),
        // so the per-page line must not dereference it unconditionally.
        if (loger != null)
        {
            loger.AppendLine(string.Format("[{0,25}] [{1,70}] IP[{2,7}] LM[{3,6}]", idomain.domain, ipage.url.toWidthMaximum(60), ipage.InfoPrize.ToString("F4"), ipage.Lemmas.ToString("D5")));
        }

        target.Dispose();
    }

    if (imbWEMManager.settings.indexEngine.plugIn_indexDBUpdater_updateDomainEntry)
    {
        if (!doEvalD)
        {
            // At least one page reused stored results, so the locally collected lists are incomplete;
            // the lemma count is taken from the domain's TF-IDF table instead.
            var dlc_tf = imbWEMManager.index.experimentEntry.GetTFIDF_DLC(idomain);
            int dlc_c = dlc_tf.Count;
            idomain.TFIDFcompiled = (dlc_c > 0);
            idomain.Lemmas = dlc_c;
        }
        else
        {
            idomain.Lemmas = dLemmas.Count;
            idomain.Words = dWords.Count;
            idomain.TFIDFcompiled = (dLemmas.Count > 0);
            idomain.DistinctLemmas = dDistinctTerms.toCsvInLine();
            idomain.AllLemmas = dLemmas.toCsvInLine();
            idomain.AllWords = dWords.toCsvInLine();
        }

        idomain.InfoPrize = dIP;

        var urlAssert = imbWEMManager.index.pageIndexTable.GetUrlAssertion(urls);
        idomain.relevantPages = urlAssert[indexPageEvaluationEntryState.isRelevant].Count;
        idomain.notRelevantPages = urlAssert[indexPageEvaluationEntryState.notRelevant].Count;
        idomain.detected = urlAssert[indexPageEvaluationEntryState.haveNoEvaluationEntry].Count;
        idomain.Crawled = urlAssert.certainty;
        idomain.RelevantContentRatio = urlAssert.relevant;

        string rpp = string.Format("[{0,25}] Pages [{1,10}] IP[{2,10}] LM[{3,10}]", idomain.domain, p, idomain.InfoPrize.ToString("F5"), idomain.Lemmas.ToString("D7"));
        if (loger != null)
        {
            loger.AppendLine(rpp);
        }
    }

    if (imbWEMManager.settings.indexEngine.plugIn_indexDBUpdater_updateDomainEntry)
    {
        imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);
    }

    imbWEMManager.index.wRecordsDeployed++;

    __wRecord.Dispose();
}
/// <summary>
/// Returns the word list for the page behind <paramref name="target"/>.
/// A saved word list file is returned when saved lists are enabled and the file exists; otherwise the
/// transliterated page text is evaluated and, for Serbian content, its cleaned tokens are encoded and returned.
/// The result is written to the word list file when saving is enabled.
/// </summary>
/// <param name="target">Spider target whose page text is evaluated.</param>
/// <param name="idomain">Domain index entry; fetched or created from the target's site record domain when <c>null</c>.</param>
/// <param name="ipage">Page index entry; fetched or created from the MD5 hash of the target URL when <c>null</c>.</param>
/// <param name="evaluator">Language evaluator; taken from the target's test record when <c>null</c>.</param>
/// <param name="loger">Not used by this method.</param>
/// <returns>The list of terms; empty when the content was not evaluated as Serbian.</returns>
public List<string> GetTermsForPage(spiderTarget target, indexDomain idomain = null, indexPage ipage = null, multiLanguageEvaluator evaluator = null, builderForLog loger = null)
{
    if (idomain == null)
    {
        idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(target.parent.wRecord.domain);
    }

    if (ipage == null)
    {
        ipage = imbWEMManager.index.pageIndexTable.GetOrCreate(md5.GetMd5Hash(target.url));
    }

    FileInfo wordListFile = GetWordList_File(idomain, ipage);

    // Reuse the stored word list when allowed and available.
    if (imbWEMManager.settings.TFIDF.doUseSavedPageWordlists && wordListFile.Exists)
    {
        return wordListFile.FullName.openFileToList(true);
    }

    List<string> result = new List<string>();

    string content = target.pageText.transliterate();

    if (evaluator == null)
    {
        evaluator = target.parent.wRecord.tRecord.evaluator;
    }

    multiLanguageEvaluation evaluation = evaluator.evaluate(content);

    if (evaluation.result_language == basicLanguageEnum.serbian)
    {
        List<string> pageTokens = new List<string>();
        pageTokens.AddRange(evaluation.singleLanguageTokens);

        if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
        {
            pageTokens.AddRange(evaluation.multiLanguageTokens);
        }

        pageTokens.RemoveAll(token => !token.isCleanWord());
        pageTokens.RemoveAll(token => token.isSymbolicContentOnly());

        var encoded = semanticLexiconManager.lexiconCache.encodeTwins(pageTokens);
        result.AddRange(encoded);
    }

    if (imbWEMManager.settings.TFIDF.doSavePageWordlist)
    {
        result.saveContentOnFilePath(wordListFile.FullName);
    }

    return result;
}
/// <summary>
/// Builds the word list file for a page from the hash codes of its domain and page index entries,
/// located under <c>TFIDF_ConstructFolder</c>.
/// </summary>
/// <param name="idomain">Domain index entry supplying the first hash code of the file name.</param>
/// <param name="ipage">Page index entry supplying the second hash code of the file name.</param>
/// <returns>The file obtained via <c>getWritableFile</c> in <c>newOrExisting</c> mode.</returns>
public FileInfo GetWordList_File(indexDomain idomain, indexPage ipage)
{
    var fileName = "p_words_" + idomain.HashCode + "-" + ipage.HashCode + ".txt";
    var filePath = TFIDF_ConstructFolder.pathFor(fileName);

    return filePath.getWritableFile(getWritableFileMode.newOrExisting);
}