/// <summary>
/// Called when a domain-level crawl (DLC) is initiated. When link harvesting is switched off in the
/// crawler settings, every page already indexed for the domain is passed to the crawl context as a link.
/// </summary>
/// <param name="__session">The experiment session (not read by this handler).</param>
/// <param name="__task">The domain task (not read by this handler).</param>
/// <param name="__wRecord">Site record of the domain whose crawl is starting.</param>
public override void eventDLCInitiated(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    // Nothing to schedule while the frontier harvests links on its own.
    if (__wRecord.tRecord.instance.settings.FRONTIER_doLinkHarvest)
    {
        return;
    }

    // NOTE(review): the domain entry is looked up but never used below; the call is kept as-is -- confirm whether it is needed.
    indexDomain domainEntry = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
    List<indexPage> knownPages = imbWEMManager.index.pageIndexTable.GetPagesForDomain(__wRecord.domainInfo.domainName);

    // The first loaded target (if any) supplies the page that the scheduled links are attributed to.
    var firstLoaded = __wRecord.context.targets.GetLoaded().FirstOrDefault();
    var originPage = firstLoaded?.page;
    if (originPage != null)
    {
        loger.AppendLine(__wRecord.domain + " seed page selected -> " + originPage.url);
    }

    foreach (indexPage knownPage in knownPages)
    {
        link pageLink = new link(knownPage.url);
        __wRecord.context.processLink(pageLink, originPage, false);
    }
}
/// <summary>
/// Called once the plugin is installed: routes this plugin's log to the console under a
/// session-specific label and creates the multi-language evaluator used by the plugin.
/// </summary>
public override void eventPluginInstalled()
{
    experimentSessionEntry activeSession = imbWEMManager.index.experimentEntry;
    aceLog.consoleControl.setAsOutput(loger, "TFIDF:" + activeSession.SessionID);

    // NOTE: the per-domain TF-IDF dictionary (domainTF_IDF), the loading of the master TF-IDF
    // construct / compiled table and the domain index assertion were commented out in the
    // original code and remain disabled here.

    evaluator = new multiLanguageEvaluator(
        basicLanguageEnum.english,
        basicLanguageEnum.serbian,
        basicLanguageEnum.serbianCyr);

    evaluator.testTokenLimit = 5000;
    evaluator.tokenLengthMin = 3;
    evaluator.validTokenTarget = 2500;
}
/// <summary>
/// Called when a domain-level crawl (DLC) is initiated. If a DLC TF-IDF file already exists for the
/// domain and scheduling with a DLC table is not requested by settings, only a log line is written.
/// Otherwise every indexed page of the domain is passed to the crawl context as a link.
/// </summary>
/// <param name="__session">The experiment session; supplies the DLC TF-IDF file location.</param>
/// <param name="__task">The domain task (not read by this handler).</param>
/// <param name="__wRecord">Site record of the domain whose crawl is starting.</param>
public override void eventDLCInitiated(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    // NOTE(review): the session state is read but not used below; kept as in the original.
    var sessionState = __session.state;

    indexDomain domainEntry = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
    List<indexPage> knownPages = imbWEMManager.index.pageIndexTable.GetPagesForDomain(__wRecord.domainInfo.domainName);

    // (Switching off FRONTIER_doLinkHarvest / FRONTIER_doLinkResolver here was commented out in the original.)

    // The first loaded target (if any) supplies the page that the scheduled links are attributed to.
    var firstLoaded = __wRecord.context.targets.GetLoaded().FirstOrDefault();
    var originPage = firstLoaded?.page;
    if (originPage != null)
    {
        loger.AppendLine(__wRecord.domain + " seed page selected -> " + originPage.url);
    }

    FileInfo cacheFile = __session.GetTFIDF_DLC_File(domainEntry);

    // Pages are skipped only when the file exists AND scheduling with a DLC table is not requested.
    if (cacheFile.Exists && !imbWEMManager.settings.TFIDF.doSchedulePagesWithDLCTable)
    {
        loger.AppendLine(__wRecord.domain + " -> DLC cache found: " + cacheFile.FullName);
        return;
    }

    foreach (indexPage knownPage in knownPages)
    {
        link pageLink = new link(knownPage.url);

        // Warn (log line + console beep) when the page URL does not contain the domain root name;
        // the link is still processed afterwards.
        bool urlMatchesDomain = knownPage.url.Contains(__wRecord.domainInfo.domainRootName);
        if (!urlMatchesDomain)
        {
            loger.AppendLine(__wRecord.domain + " -X-> " + knownPage.url + " Wrong link association?");
            aceTerminalInput.doBeepViaConsole(1600, 200, 3);
        }

        __wRecord.context.processLink(pageLink, originPage, false);
    }

    loger.AppendLine(__wRecord.domain + " -> " + __wRecord.web.webActiveLinks.Count + " targets set for load");
}
/// <summary>
/// Called when a domain-level crawl (DLC) has finished. Runs the domain evaluation against a master
/// TF-IDF table, logs the index save status and saves the index once the number of deployed records
/// reaches the configured threshold.
/// </summary>
/// <remarks>
/// The handler works with the plugin's own <c>session</c> and <c>settings</c> members; the
/// <paramref name="__session"/> and <paramref name="__task"/> arguments are not read.
/// </remarks>
/// <param name="__session">The experiment session (not read by this handler).</param>
/// <param name="__task">The domain task (not read by this handler).</param>
/// <param name="__wRecord">Site record of the domain whose crawl has finished.</param>
public override void eventDLCFinished(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    if (settings.plugIn_indexDBUpdater_TFIDF_per_DLC)
    {
        // Per-DLC mode: build a new table instance from the master file for this evaluation.
        FileInfo masterFile = session.GetTFIDF_Master_File();
        var freshMaster = new weightTableCompiled(masterFile.FullName, true, session.SessionID);
        session.doDomainEvaluation(settings, loger, __wRecord, evaluator, freshMaster);
    }
    else
    {
        // Otherwise take the master table as provided by the session.
        var sessionMaster = session.GetTFIDF_Master(loger, true, false);
        session.doDomainEvaluation(settings, loger, __wRecord, evaluator, sessionMaster);
    }

    var index = imbWEMManager.index;

    loger.AppendLine("Last index save: " + index.lastIndexSave.ToShortTimeString() + " [" + index.wRecordsDeployed + " / " + settings.doIndexAutoSaveOnDLCs + " ] ");

    bool saveThresholdReached = index.wRecordsDeployed >= settings.doIndexAutoSaveOnDLCs;
    if (saveThresholdReached)
    {
        index.Save();
    }
}
/// <summary>
/// Starts a new index session: obtains the session record keyed by the current local date and
/// time, fills in its identifiers, starts the experiment session, installs an enabled plugin
/// collection, deploys the domain index table for the session and, when enabled in settings,
/// publishes the index.
/// </summary>
/// <param name="crawlId">The crawl identifier stored on the session record and passed to the experiment manager.</param>
/// <param name="state">Optional crawl job context forwarded to the experiment manager; may be <c>null</c>.</param>
/// <returns>The session record obtained for this session.</returns>
public indexPerformanceEntry StartSession(string crawlId, ICrawlJobContext state = null)
{
    // Take a single timestamp so the record key and the Start value always agree. Separate
    // DateTime.Now reads could combine the date of one day with the time of the next when the
    // call happens right at midnight.
    DateTime startedAt = DateTime.Now;

    // NOTE(review): the key is culture-dependent and has minute resolution, so two sessions started
    // within the same minute map to the same record -- left unchanged to keep existing keys stable.
    string sessionKey = startedAt.ToShortDateString() + "-" + startedAt.ToShortTimeString();

    indexSessionEntry = indexSessionRecords.GetOrCreate(sessionKey);
    indexSessionEntry.SessionID = experimentManager.SessionID;
    indexSessionEntry.CrawlID = crawlId;
    indexSessionEntry.IndexRepository = current_indexID;
    indexSessionEntry.Start = startedAt;

    experimentEntry = experimentManager.StartSession(crawlId, indexSessionEntry, state);

    plugins = new indexPlugInCollection(experimentEntry);
    plugins.IsEnabled = true;

    domainIndexTable.deploySession();

    if (imbWEMManager.settings.indexEngine.doIndexPublishAndBackupOnOpenSession)
    {
        Publish(imbWEMManager.authorNotation, experimentEntry.indexSubFolder);
    }

    return indexSessionEntry;
}
/// <summary>
/// Universal maintenance-stage hook. This override performs no work.
/// </summary>
/// <remarks>
/// In this signature <c>indexDomain</c> and <c>indexPage</c> are generic type parameters.
/// An <c>INSTALL_POINTS</c> override was commented out in the original code and remains disabled.
/// </remarks>
/// <param name="stage">The maintenance stage being signalled (not read).</param>
/// <param name="__parent">The experiment session (not read).</param>
/// <param name="__domain">The domain argument (not read).</param>
/// <param name="__page">The page argument (not read).</param>
public override void eventUniversal<indexDomain, indexPage>(indexMaintenanceStageEnum stage, experimentSessionEntry __parent, indexDomain __domain, indexPage __page)
{
    // No work is done for this event.
}
/// <summary>
/// Per-iteration hook. This override performs no work.
/// </summary>
/// <param name="__session">The experiment session (not read).</param>
/// <param name="__task">The domain task (not read).</param>
/// <param name="__wRecord">The site record (not read).</param>
public override void eventIteration(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    // No work is done for this event.
}
/// <summary>
/// Evaluates the criteria that move the workload plugin between states: falls back to
/// <c>preparing</c> while the CPU trend has no macro mean, terminates once the configured number of
/// finished DLCs is reached, and - while <c>active</c> - starts the next measure group and reacts to
/// the sample-tail and memory-limit flags.
/// </summary>
/// <param name="_machine">The task machine whose counters are inspected and that may be terminated.</param>
/// <param name="tRecord">The test record, passed on to the calls made here.</param>
/// <param name="plugin">The workload plugin providing settings and the record store.</param>
/// <param name="entry">The experiment session entry, passed on to the calls made here.</param>
public void doCheckCriteria(crawlerDomainTaskMachine _machine, modelSpiderTestRecord tRecord, reportPlugIn_workload plugin, experimentSessionEntry entry)
{
    bool macroMeanAvailable = tCPU.SampleState.HasFlag(measureTrendSampleState.macroMean);
    if (!macroMeanAvailable)
    {
        pluginState = workloadPluginState.preparing;
    }

    // The finished-DLC termination criterion is enabled only when its limit is above zero.
    if (plugin.plugin_settings.term_DLCFinished > 0 && _machine.taskDone >= plugin.plugin_settings.term_DLCFinished)
    {
        terminate(_machine);
    }

    // Everything below applies only to the active state.
    if (pluginState != workloadPluginState.active)
    {
        return;
    }

    if (_machine.taskDone >= DLCDoneForNext)
    {
        // The current measure group is complete: step up, refresh the state and persist the records.
        doStartNextGroup(_machine, tRecord, plugin, entry);
        stateUpdate(_machine, tRecord, plugin, entry);
        plugin.records.Save(getWritableFileMode.overwrite);
    }

    if (isSampleTail)
    {
        if (plugin.plugin_settings.term_JLCinTail)
        {
            terminate(_machine);
        }
        else
        {
            pluginState = workloadPluginState.sampleTail;
        }
    }

    if (isMemoryLimit)
    {
        if (plugin.plugin_settings.term_availableMemory > 0)
        {
            terminate(_machine);
        }
        else
        {
            pluginState = workloadPluginState.cooldown;
            cooldownIndex = plugin.plugin_settings.warmingUpTicks;
        }
    }
}
/// <summary>
/// Closes the current measure group and prepares the next one: re-reads the data, sets the DLC count
/// at which the next group ends, enters the warming-up state (or goes straight to <c>active</c> when
/// no warming-up ticks are configured), advances the group counter and raises the thread limit.
/// </summary>
/// <param name="_machine">The task machine whose <c>maxThreads</c> is increased.</param>
/// <param name="tRecord">The test record (not read here).</param>
/// <param name="plugin">The workload plugin providing settings and the log.</param>
/// <param name="entry">The experiment session entry (not read here).</param>
private void doStartNextGroup(crawlerDomainTaskMachine _machine, modelSpiderTestRecord tRecord, reportPlugIn_workload plugin, experimentSessionEntry entry)
{
    doReadData(_machine);

    DLCDoneForNext = _machine.taskDone + plugin.plugin_settings.stepUp_DLCCount;
    pluginState = workloadPluginState.wormingUp;

    string groupDoneMessage = $"Measure group {measureGroup} completed -- DLCs done: {_machine.taskDone}";
    plugin.loger.log(groupDoneMessage);
    comment = comment.add($"Group {measureGroup} done");

    // With zero warming-up ticks the warming-up state is skipped.
    wormingUpIndex = plugin.plugin_settings.warmingUpTicks;
    if (wormingUpIndex == 0)
    {
        pluginState = workloadPluginState.active;
    }

    measureGroup++;
    _machine.maxThreads += plugin.plugin_settings.stepUp_step;
}
/// <summary>
/// Checks the measured facts against the termination limits: raises a warning when available memory
/// is below the limit or when no DLCs are waiting, sets the matching flag once the warning count
/// reaches the configured number, reports a clean state otherwise, and switches to cooldown when
/// more tasks are running than the thread limit allows.
/// </summary>
/// <param name="_machine">The task machine whose counters are inspected.</param>
/// <param name="tRecord">The test record (not read here).</param>
/// <param name="plugin">The workload plugin providing settings and the log.</param>
/// <param name="entry">The experiment session entry (not read here).</param>
private void doCheckFacts(crawlerDomainTaskMachine _machine, modelSpiderTestRecord tRecord, reportPlugIn_workload plugin, experimentSessionEntry entry)
{
    if (mMemory < plugin.plugin_settings.term_availableMemory)
    {
        string lowMemoryWarning = "Available RAM [" + mMemory.ToString("P2") + "] is below the termination limit [" + plugin.plugin_settings.term_availableMemory.ToString("P2") + "]";
        warningUpDate(lowMemoryWarning, true, plugin);

        if (terminationWarning >= plugin.plugin_settings.term_warningCount)
        {
            isMemoryLimit = true;
        }
    }
    else if (_machine.taskWaiting == 0)
    {
        string noWaitingWarning = "There is no DLCs waiting [" + _machine.taskWaiting + "] - no way to run DLCs up to TC_max [" + _machine.maxThreads + "]";
        warningUpDate(noWaitingWarning, true, plugin);

        if (terminationWarning >= plugin.plugin_settings.term_warningCount)
        {
            isSampleTail = true;
        }
    }
    else if (terminationWarning > 0)
    {
        // No criterion is violated any more; report the clean state for the outstanding warnings.
        warningUpDate("All termination criteria clean", false, plugin);
    }

    bool overThreadLimit = _machine.taskRunning > _machine.maxThreads;
    if (overThreadLimit)
    {
        plugin.loger.log($" Running {_machine.taskRunning} more then TC_max {_machine.maxThreads} - switching to cooldown");
        cooldownIndex = plugin.plugin_settings.warmingUpTicks;
        pluginState = workloadPluginState.cooldown;
    }
}
/// <summary>
/// Performs the action that belongs to the current plugin state: counts down cooldown and warming-up
/// (returning to <c>active</c> when the counter is used up), leaves <c>preparing</c> once the CPU
/// trend has a macro mean, and terminates the machine in the <c>terminating</c> state.
/// </summary>
/// <param name="_machine">The task machine, terminated in the <c>terminating</c> state.</param>
/// <param name="tRecord">The test record (not read here).</param>
/// <param name="plugin">The workload plugin providing the log.</param>
/// <param name="entry">The experiment session entry (not read here).</param>
private void doPerform(crawlerDomainTaskMachine _machine, modelSpiderTestRecord tRecord, reportPlugIn_workload plugin, experimentSessionEntry entry)
{
    switch (pluginState)
    {
        case workloadPluginState.cooldown:
            if (cooldownIndex > 0)
            {
                comment = comment.add($"Coolingdown [{cooldownIndex}]");
                // The counter drops by the number of samples taken since the last recorded one.
                cooldownIndex -= thisSampleID - lastSampleID;
            }
            else
            {
                comment = comment.add($"Cooldown finished");
                pluginState = workloadPluginState.active;
            }
            break;

        case workloadPluginState.preparing:
            if (tCPU.SampleState.HasFlag(measureTrendSampleState.macroMean))
            {
                pluginState = workloadPluginState.active;
                plugin.loger.log("Workload plugin ready");
            }
            break;

        case workloadPluginState.terminating:
            terminate(_machine);
            break;

        case workloadPluginState.wormingUp:
            if (wormingUpIndex > 0)
            {
                comment = comment.add($"WormingUp [{wormingUpIndex}]");
                // Same countdown rule as for cooldown.
                wormingUpIndex -= thisSampleID - lastSampleID;
            }
            else
            {
                comment = comment.add($"WormingUp finished");
                pluginState = workloadPluginState.active;
            }
            break;

        default:
            // active, disabled, none and sampleTail require no action here.
            break;
    }
}
/// <summary>
/// Runs one full update cycle of the workload plugin: clears the comment, reads the data, checks
/// facts, performs the state action, creates the record entry, checks the criteria and finally
/// prints the status and trend lines to the plugin log.
/// </summary>
/// <remarks>
/// <c>doCheckCriteria</c> may call this method again when a measure group completes.
/// NOTE(review): the status lines read <c>lastEntry</c> without a null check - confirm it is always
/// set before the first call.
/// </remarks>
/// <param name="_machine">The task machine being measured.</param>
/// <param name="tRecord">The test record, passed on to the steps.</param>
/// <param name="plugin">The workload plugin providing settings, records and the log.</param>
/// <param name="entry">The experiment session entry, passed on to the steps.</param>
public void stateUpdate(crawlerDomainTaskMachine _machine, modelSpiderTestRecord tRecord, reportPlugIn_workload plugin, experimentSessionEntry entry)
{
    comment = "";

    // ---- data collection
    doReadData(_machine);

    // ---- state decision
    doCheckFacts(_machine, tRecord, plugin, entry);
    doPerform(_machine, tRecord, plugin, entry);
    doCreateEntry(_machine, tRecord, plugin, entry);
    doCheckCriteria(_machine, tRecord, plugin, entry);

    // ---- printing out
    plugin.loger.AppendHorizontalLine();

    if (pluginState != workloadPluginState.disabled)
    {
        // The active state is emphasised with surrounding underscores.
        string stateName = pluginState.ToString();
        string stateLabel = pluginState == workloadPluginState.active ? "_" + stateName + "_" : stateName;

        string firstStatusLine = string.Format(STATUSLINE_ONE, stateLabel, lastEntry.RecordID.ToString("D3"), lastEntry.measureGroup, lastEntry.dlcMaximum, lastEntry.dlcRunning, lastEntry.dlcWaiting);
        plugin.loger.AppendLine(firstStatusLine.toWidthExact(Console.BufferWidth - 11, "="));
    }

    plugin.loger.AppendLine(tMemory.GetTrendInline() + " | " + tCPU.GetTrendInline() + " | " + tCPUm.GetTrendInline());
    plugin.loger.AppendLine(tDataLoad.GetTrendInline() + " | " + tContentPages.GetTrendInline() + " | " + tIterations.GetTrendInline());

    if (pluginState != workloadPluginState.disabled)
    {
        string secondStatusLine = string.Format(STATUSLINE_TWO, mMemory.ToString("P2"), lastEntry.terminationWarning.ToString("D3"), lastEntry.dlcDone, DLCDoneForNext, thisSampleID, lastSampleID);
        plugin.loger.AppendLine(secondStatusLine.toWidthExact(Console.BufferWidth - 11, "="));
    }
}
/// <summary>
/// Creates (or updates) the workload record for the current sample. A record is written only when
/// the current sample ID differs from the last recorded one; afterwards the last sample ID is set to
/// the current one.
/// </summary>
/// <param name="_machine">
/// The task machine supplying the sample count and DLC counters. May be <c>null</c>: the sample ID is
/// then left unchanged and the DLC counters of a written record are not touched.
/// </param>
/// <param name="tRecord">The test record used for the entry's test ID and signature.</param>
/// <param name="plugin">The workload plugin whose record store receives the entry.</param>
/// <param name="entry">The experiment session entry whose state is used for the signature.</param>
/// <returns>The ID of the record that was written, or an empty string when no record was written.</returns>
public string doCreateEntry(crawlerDomainTaskMachine _machine, modelSpiderTestRecord tRecord, reportPlugIn_workload plugin, experimentSessionEntry entry)
{
    string recID = "";

    if (_machine != null)
    {
        thisSampleID = _machine.dataLoadTaker.CountTakes();
    }

    // -1 is normalised to 0 before the comparison.
    if (lastSampleID == -1)
    {
        lastSampleID = 0;
    }

    if (thisSampleID != lastSampleID)
    {
        RecordID++;

        // ---- record creation
        recID = GetEntryID(RecordID, measureGroup);
        lastEntry = plugin.records.GetOrCreate(recID);
        lastEntry.RecordID = RecordID;
        lastEntry.pluginState = pluginState.ToString();

        // Only records taken in the active state carry the measure group; all others get -1.
        lastEntry.measureGroup = pluginState == workloadPluginState.active ? measureGroup : -1;

        lastEntry.SetTestIDAndSignature(tRecord.instance, entry.state, tRecord);
        lastEntry.terminationWarning = terminationWarning;
        lastEntry.availableMemory = mMemory;
        lastEntry.ContentPages = tContentPages.MicroMean;
        lastEntry.cpuRateOfMachine = tCPUm.MicroMean;
        lastEntry.cpuRateOfProcess = tCPU.MicroMean;
        lastEntry.physicalMemory = tMemory.MicroMean;
        lastEntry.CrawlerIterations = tIterations.MicroMean;
        lastEntry.DataLoad = tDataLoad.MicroMean;

        // The method accepts a null machine above, so it must not be dereferenced blindly here.
        // Previously this threw a NullReferenceException after RecordID had already been
        // incremented and the record created.
        if (_machine != null)
        {
            lastEntry.dlcDone = _machine.taskDone;
            lastEntry.dlcRunning = _machine.taskRunning;
            lastEntry.dlcWaiting = _machine.taskWaiting;
            lastEntry.dlcMaximum = _machine.maxThreads;
        }

        plugin.records.AddOrUpdate(lastEntry);
    }

    lastSampleID = thisSampleID;
    return recID;
}
/// <summary>
/// Called when a domain-level crawl (DLC) has finished. Asks the session for the DLC TF-IDF table of
/// the domain, passing the cache settings and the evaluator.
/// </summary>
/// <param name="__session">The experiment session that provides the DLC TF-IDF table.</param>
/// <param name="__task">The domain task (not read by this handler).</param>
/// <param name="__wRecord">Site record of the domain whose crawl has finished.</param>
public override void eventDLCFinished(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    var useCachedTables = imbWEMManager.settings.TFIDF.doUseCachedDLCTables;
    var saveTableCache = imbWEMManager.settings.TFIDF.doSaveCacheOfDLCTables;

    // The returned table is not used further in this handler.
    weightTableCompiled dlcTable = __session.GetOrCreateTFIDF_DLC(__wRecord, loger, useCachedTables, saveTableCache, evaluator);

    // NOTE: the original carried a long commented-out legacy block at this point. It iterated over
    // the indexed pages of the domain, stored the distinct and all lemmas on each page
    // (DistinctLemmas, RelevantTerms, TFIDFcompiled), saved per-page tables as XML, added the
    // collected terms to the master TF-IDF construct, saved the domain table as XML and updated
    // the domain's Lemmas count in the domain index. All of it remains disabled.
}