Ejemplo n.º 1
0
        /// <summary>
        /// When the crawler's <c>FRONTIER_doLinkHarvest</c> setting is off, passes a link for every
        /// indexed page of the domain to <c>__wRecord.context.processLink</c>.
        /// </summary>
        /// <param name="__session">The experiment session (not used by this handler).</param>
        /// <param name="__task">The domain task (not used by this handler).</param>
        /// <param name="__wRecord">The site record whose context receives the links.</param>
        public override void eventDLCInitiated(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            // Only act when the link-harvest setting is switched off.
            if (__wRecord.tRecord.instance.settings.FRONTIER_doLinkHarvest)
            {
                return;
            }

            string domainName = __wRecord.domainInfo.domainName;

            // NOTE(review): the domain entry is looked up but its value is not used below.
            indexDomain domainEntry = imbWEMManager.index.domainIndexTable.GetDomain(domainName);
            List<indexPage> indexedPages = imbWEMManager.index.pageIndexTable.GetPagesForDomain(domainName);

            // The page of the first loaded target (null when there is none) accompanies every link.
            var seedPage = __wRecord.context.targets.GetLoaded().FirstOrDefault()?.page;

            if (seedPage != null)
            {
                loger.AppendLine(__wRecord.domain + " seed page selected -> " + seedPage.url);
            }

            foreach (indexPage indexed in indexedPages)
            {
                __wRecord.context.processLink(new link(indexed.url), seedPage, false);
            }
        }
        //    public aceConcurrentDictionary<weightTableCompiled> domainTF_IDF { get; set; } = new aceConcurrentDictionary<weightTableCompiled>();

        //protected



        /// <summary>
        /// Registers <c>loger</c> with <c>aceLog.consoleControl.setAsOutput</c> under a label built
        /// from the current experiment session id, then creates the multi-language evaluator
        /// (English, Serbian, Serbian Cyrillic) and sets its token limits.
        /// </summary>
        public override void eventPluginInstalled()
        {
            experimentSessionEntry currentSession = imbWEMManager.index.experimentEntry;
            string outputLabel = "TFIDF:" + currentSession.SessionID;

            aceLog.consoleControl.setAsOutput(loger, outputLabel);

            evaluator = new multiLanguageEvaluator(basicLanguageEnum.english, basicLanguageEnum.serbian, basicLanguageEnum.serbianCyr)
            {
                testTokenLimit   = 5000,
                tokenLengthMin   = 3,
                validTokenTarget = 2500
            };
        }
        /// <summary>
        /// Passes a link for every indexed page of the domain to <c>__wRecord.context.processLink</c>,
        /// unless the DLC TF-IDF file for the domain already exists and
        /// <c>doSchedulePagesWithDLCTable</c> is off; in that case only the cache path is logged.
        /// </summary>
        /// <param name="__session">The experiment session that supplies the DLC file location.</param>
        /// <param name="__task">The domain task (not used by this handler).</param>
        /// <param name="__wRecord">The site record whose context receives the links.</param>
        public override void eventDLCInitiated(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            // NOTE(review): read here but never used afterwards.
            var sessionState = __session.state;

            indexDomain domainEntry = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
            List<indexPage> indexedPages = imbWEMManager.index.pageIndexTable.GetPagesForDomain(__wRecord.domainInfo.domainName);

            // The page of the first loaded target (null when there is none) accompanies every link.
            var seedPage = __wRecord.context.targets.GetLoaded().FirstOrDefault()?.page;

            if (seedPage != null)
            {
                loger.AppendLine(__wRecord.domain + " seed page selected -> " + seedPage.url);
            }

            FileInfo cacheFile = __session.GetTFIDF_DLC_File(domainEntry);

            // An existing cache file suppresses scheduling unless the setting asks for both.
            bool cacheOnly = cacheFile.Exists && !imbWEMManager.settings.TFIDF.doSchedulePagesWithDLCTable;
            if (cacheOnly)
            {
                loger.AppendLine(__wRecord.domain + " -> DLC cache found: " + cacheFile.FullName);
                return;
            }

            foreach (indexPage indexed in indexedPages)
            {
                link pageLink = new link(indexed.url);

                // Warn (log line plus console beep) when the page URL does not contain the domain root name.
                bool belongsToDomain = indexed.url.Contains(__wRecord.domainInfo.domainRootName);
                if (!belongsToDomain)
                {
                    loger.AppendLine(__wRecord.domain + " -X-> " + indexed.url + " Wrong link association?");
                    aceTerminalInput.doBeepViaConsole(1600, 200, 3);
                }

                // The link is processed even when the warning above was raised.
                __wRecord.context.processLink(pageLink, seedPage, false);
            }

            loger.AppendLine(__wRecord.domain + " -> " + __wRecord.web.webActiveLinks.Count + " targets set for load");
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Runs the domain evaluation for the finished site record against the master TF-IDF table,
        /// logs the index save status and saves the index once the deployed-record count reaches
        /// <c>settings.doIndexAutoSaveOnDLCs</c>.
        /// </summary>
        /// <param name="__session">The experiment session (not used; the <c>session</c> member is used instead).</param>
        /// <param name="__task">The domain task (not used by this handler).</param>
        /// <param name="__wRecord">The site record passed to the domain evaluation.</param>
        public override void eventDLCFinished(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            if (settings.plugIn_indexDBUpdater_TFIDF_per_DLC)
            {
                // Build the table directly from the master file for this DLC.
                var masterTable = new weightTableCompiled(session.GetTFIDF_Master_File().FullName, true, session.SessionID);
                session.doDomainEvaluation(settings, loger, __wRecord, evaluator, masterTable);
            }
            else
            {
                // Otherwise obtain the master table through the session.
                var masterTable = session.GetTFIDF_Master(loger, true, false);
                session.doDomainEvaluation(settings, loger, __wRecord, evaluator, masterTable);
            }

            var index = imbWEMManager.index;

            string saveStatus = "Last index save: " + index.lastIndexSave.ToShortTimeString()
                                + " [" + index.wRecordsDeployed + " / " + settings.doIndexAutoSaveOnDLCs + " ] ";
            loger.AppendLine(saveStatus);

            bool saveDue = index.wRecordsDeployed >= settings.doIndexAutoSaveOnDLCs;
            if (saveDue)
            {
                index.Save();
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Opens a new index session: gets or creates the index session record, starts the
        /// experiment session, creates and enables the plugin collection, deploys the domain
        /// index table and, when configured, publishes the index.
        /// </summary>
        /// <param name="crawlId">The crawl identifier stored on the session record and passed to the experiment manager.</param>
        /// <param name="state">Optional crawl job context forwarded to <c>experimentManager.StartSession</c>.</param>
        /// <returns>The index session record that was obtained for this session.</returns>
        public indexPerformanceEntry StartSession(string crawlId, ICrawlJobContext state = null)
        {
            // Read the clock once: the date and time halves of the record key then always belong
            // to the same instant (no midnight straddle) and agree with the Start stamp.
            DateTime startedAt = DateTime.Now;

            indexSessionEntry                 = indexSessionRecords.GetOrCreate(startedAt.ToShortDateString() + "-" + startedAt.ToShortTimeString());
            indexSessionEntry.SessionID       = experimentManager.SessionID;
            indexSessionEntry.CrawlID         = crawlId;
            indexSessionEntry.IndexRepository = current_indexID;
            indexSessionEntry.Start           = startedAt;

            experimentEntry   = experimentManager.StartSession(crawlId, indexSessionEntry, state);
            plugins           = new indexPlugInCollection(experimentEntry);
            plugins.IsEnabled = true;
            domainIndexTable.deploySession();

            // Publish into the experiment's index sub-folder only when the setting requests it.
            if (imbWEMManager.settings.indexEngine.doIndexPublishAndBackupOnOpenSession)
            {
                Publish(imbWEMManager.authorNotation, experimentEntry.indexSubFolder);
            }

            return indexSessionEntry;
        }
Ejemplo n.º 6
0
        //public override Enum[] INSTALL_POINTS
        //{
        //    get
        //    {
        //        throw new NotImplementedException();
        //    }
        //}

        /// <summary>
        /// Override with an empty body: no action is taken for any maintenance stage.
        /// Note that <c>indexDomain</c> and <c>indexPage</c> are generic type parameters of this method.
        /// </summary>
        public override void eventUniversal<indexDomain, indexPage>(indexMaintenanceStageEnum stage, experimentSessionEntry __parent, indexDomain __domain, indexPage __page)
        {
            // No-op.
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Override with an empty body: nothing is done on a crawl iteration.
 /// </summary>
 public override void eventIteration(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
     // No-op.
 }
Ejemplo n.º 8
0
        /// <summary>
        /// Applies the state-transition and termination criteria: falls back to <c>preparing</c>
        /// when the CPU trend lacks the macro-mean flag, terminates once the finished-DLC limit is
        /// reached and, while the plugin is active, starts the next measure group and reacts to
        /// the sample-tail and memory-limit flags.
        /// </summary>
        /// <param name="_machine">The task machine whose counters are checked.</param>
        /// <param name="tRecord">The test record forwarded to the group/state helpers.</param>
        /// <param name="plugin">The workload plugin that supplies settings and records.</param>
        /// <param name="entry">The experiment session forwarded to the group/state helpers.</param>
        public void doCheckCriteria(crawlerDomainTaskMachine _machine, modelSpiderTestRecord tRecord, reportPlugIn_workload plugin, experimentSessionEntry entry)
        {
            bool hasMacroMean = tCPU.SampleState.HasFlag(measureTrendSampleState.macroMean);
            if (!hasMacroMean)
            {
                pluginState = workloadPluginState.preparing;
            }

            // The finished-DLC criterion is enabled only when its limit is positive.
            if (plugin.plugin_settings.term_DLCFinished > 0
                && _machine.taskDone >= plugin.plugin_settings.term_DLCFinished)
            {
                terminate(_machine);
            }

            // Everything below applies to the active state only.
            if (pluginState != workloadPluginState.active)
            {
                return;
            }

            if (_machine.taskDone >= DLCDoneForNext)
            {
                doStartNextGroup(_machine, tRecord, plugin, entry);

                stateUpdate(_machine, tRecord, plugin, entry);

                plugin.records.Save(getWritableFileMode.overwrite);
            }

            // The two flag checks below run even if the state was changed above.
            if (isSampleTail)
            {
                if (plugin.plugin_settings.term_JLCinTail)
                {
                    terminate(_machine);
                }
                else
                {
                    pluginState = workloadPluginState.sampleTail;
                }
            }

            if (isMemoryLimit)
            {
                if (plugin.plugin_settings.term_availableMemory > 0)
                {
                    terminate(_machine);
                }
                else
                {
                    pluginState   = workloadPluginState.cooldown;
                    cooldownIndex = plugin.plugin_settings.warmingUpTicks;
                }
            }
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Closes the current measure group: refreshes the readings, sets the DLC count that
        /// triggers the next group, enters warm-up (or goes straight back to active when
        /// <c>warmingUpTicks</c> is zero), advances the group counter and raises the machine's
        /// <c>maxThreads</c> by <c>stepUp_step</c>.
        /// </summary>
        /// <param name="_machine">The task machine whose thread limit is raised.</param>
        /// <param name="tRecord">The test record (not used here).</param>
        /// <param name="plugin">The workload plugin that supplies settings and the logger.</param>
        /// <param name="entry">The experiment session (not used here).</param>
        private void doStartNextGroup(crawlerDomainTaskMachine _machine, modelSpiderTestRecord tRecord, reportPlugIn_workload plugin, experimentSessionEntry entry)
        {
            doReadData(_machine);

            DLCDoneForNext = _machine.taskDone + plugin.plugin_settings.stepUp_DLCCount;
            pluginState    = workloadPluginState.wormingUp;

            // Both messages report the group that has just completed, i.e. before the increment below.
            plugin.loger.log($"Measure group {measureGroup} completed -- DLCs done: {_machine.taskDone}");
            comment = comment.add($"Group {measureGroup} done");

            wormingUpIndex = plugin.plugin_settings.warmingUpTicks;
            bool skipWarmUp = wormingUpIndex == 0;
            if (skipWarmUp)
            {
                pluginState = workloadPluginState.active;
            }

            measureGroup++;
            _machine.maxThreads += plugin.plugin_settings.stepUp_step;
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Updates the termination warnings and the <c>isMemoryLimit</c> / <c>isSampleTail</c>
        /// flags from the current memory and waiting-task readings, and switches to cooldown when
        /// more tasks are running than <c>maxThreads</c>.
        /// </summary>
        /// <param name="_machine">The task machine whose counters are inspected.</param>
        /// <param name="tRecord">The test record (not used here).</param>
        /// <param name="plugin">The workload plugin that supplies settings and the logger.</param>
        /// <param name="entry">The experiment session (not used here).</param>
        private void doCheckFacts(crawlerDomainTaskMachine _machine, modelSpiderTestRecord tRecord, reportPlugIn_workload plugin, experimentSessionEntry entry)
        {
            var limits = plugin.plugin_settings;

            if (mMemory < limits.term_availableMemory)
            {
                string memoryWarning = "Available RAM [" + mMemory.ToString("P2") + "] is below the termination limit [" + limits.term_availableMemory.ToString("P2") + "]";
                warningUpDate(memoryWarning, true, plugin);

                // The flag is raised only after enough warnings; it is never cleared here.
                if (terminationWarning >= limits.term_warningCount)
                {
                    isMemoryLimit = true;
                }
            }
            else if (_machine.taskWaiting == 0)
            {
                string tailWarning = "There is no DLCs waiting [" + _machine.taskWaiting + "] - no way to run DLCs up to TC_max [" + _machine.maxThreads + "]";
                warningUpDate(tailWarning, true, plugin);

                if (terminationWarning >= limits.term_warningCount)
                {
                    isSampleTail = true;
                }
            }
            else if (terminationWarning > 0)
            {
                // Neither criterion is hit: report the clean state while warnings are outstanding.
                warningUpDate("All termination criteria clean", false, plugin);
            }

            bool overCommitted = _machine.taskRunning > _machine.maxThreads;
            if (overCommitted)
            {
                plugin.loger.log($" Running {_machine.taskRunning} more then TC_max {_machine.maxThreads} - switching to cooldown");
                cooldownIndex = limits.warmingUpTicks;
                pluginState   = workloadPluginState.cooldown;
            }
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Performs the action that belongs to the current plugin state: counts down cooldown and
        /// warm-up, promotes <c>preparing</c> to <c>active</c> once the CPU trend carries the
        /// macro-mean flag, and calls <c>terminate</c> in the terminating state.
        /// </summary>
        /// <param name="_machine">The task machine passed to <c>terminate</c>.</param>
        /// <param name="tRecord">The test record (not used here).</param>
        /// <param name="plugin">The workload plugin that supplies the logger.</param>
        /// <param name="entry">The experiment session (not used here).</param>
        private void doPerform(crawlerDomainTaskMachine _machine, modelSpiderTestRecord tRecord, reportPlugIn_workload plugin, experimentSessionEntry entry)
        {
            switch (pluginState)
            {
            // States without any per-tick action.
            case workloadPluginState.active:
            case workloadPluginState.disabled:
            case workloadPluginState.none:
            case workloadPluginState.sampleTail:
                break;

            case workloadPluginState.cooldown:
                if (cooldownIndex > 0)
                {
                    // Count down by the number of samples taken since the last entry.
                    comment        = comment.add($"Coolingdown [{cooldownIndex}]");
                    cooldownIndex -= thisSampleID - lastSampleID;
                }
                else
                {
                    comment     = comment.add("Cooldown finished");
                    pluginState = workloadPluginState.active;
                }
                break;

            case workloadPluginState.preparing:
                if (tCPU.SampleState.HasFlag(measureTrendSampleState.macroMean))
                {
                    pluginState = workloadPluginState.active;
                    plugin.loger.log("Workload plugin ready");
                }
                break;

            case workloadPluginState.terminating:
                terminate(_machine);
                break;

            case workloadPluginState.wormingUp:
                if (wormingUpIndex > 0)
                {
                    // Count down by the number of samples taken since the last entry.
                    comment         = comment.add($"WormingUp [{wormingUpIndex}]");
                    wormingUpIndex -= thisSampleID - lastSampleID;
                }
                else
                {
                    comment     = comment.add("WormingUp finished");
                    pluginState = workloadPluginState.active;
                }
                break;
            }
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Runs one update cycle of the workload plugin: clears the comment, reads data, checks
        /// facts, performs the state action, creates a record entry, checks the criteria and then
        /// prints the status and trend lines to the plugin logger.
        /// </summary>
        /// <param name="_machine">The task machine being monitored.</param>
        /// <param name="tRecord">The test record forwarded to the helper steps.</param>
        /// <param name="plugin">The workload plugin that supplies the logger.</param>
        /// <param name="entry">The experiment session forwarded to the helper steps.</param>
        public void stateUpdate(crawlerDomainTaskMachine _machine, modelSpiderTestRecord tRecord, reportPlugIn_workload plugin, experimentSessionEntry entry)
        {
            comment = "";

            // Data collection.
            doReadData(_machine);

            // State decision; the order of these steps is significant.
            doCheckFacts(_machine, tRecord, plugin, entry);
            doPerform(_machine, tRecord, plugin, entry);
            doCreateEntry(_machine, tRecord, plugin, entry);
            doCheckCriteria(_machine, tRecord, plugin, entry);

            // Printing out.
            plugin.loger.AppendHorizontalLine();

            if (pluginState != workloadPluginState.disabled)
            {
                // The active state is emphasised with surrounding underscores.
                string stateLabel = pluginState == workloadPluginState.active
                    ? "_" + pluginState.ToString() + "_"
                    : pluginState.ToString();

                string firstLine = string.Format(STATUSLINE_ONE, stateLabel, lastEntry.RecordID.ToString("D3"), lastEntry.measureGroup, lastEntry.dlcMaximum, lastEntry.dlcRunning, lastEntry.dlcWaiting);
                plugin.loger.AppendLine(firstLine.toWidthExact(Console.BufferWidth - 11, "="));
            }

            string machineTrends = tMemory.GetTrendInline() + " | " + tCPU.GetTrendInline() + " | " + tCPUm.GetTrendInline();
            plugin.loger.AppendLine(machineTrends);

            string crawlTrends = tDataLoad.GetTrendInline() + " | " + tContentPages.GetTrendInline() + " | " + tIterations.GetTrendInline();
            plugin.loger.AppendLine(crawlTrends);

            if (pluginState != workloadPluginState.disabled)
            {
                string secondLine = string.Format(STATUSLINE_TWO, mMemory.ToString("P2"),
                                                  lastEntry.terminationWarning.ToString("D3"), lastEntry.dlcDone, DLCDoneForNext, thisSampleID, lastSampleID);
                plugin.loger.AppendLine(secondLine.toWidthExact(Console.BufferWidth - 11, "="));
            }
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Creates (or updates) a workload record when the sample counter has advanced since the
        /// previous call, filling it with the current state, trend means and machine counters.
        /// </summary>
        /// <param name="_machine">The task machine that supplies the sample count and DLC counters.</param>
        /// <param name="tRecord">The test record used for the entry's test id and signature.</param>
        /// <param name="plugin">The workload plugin whose record set receives the entry.</param>
        /// <param name="entry">The experiment session whose state is used for the signature.</param>
        /// <returns>The id of the record that was written, or an empty string when no record was written.</returns>
        public string doCreateEntry(crawlerDomainTaskMachine _machine, modelSpiderTestRecord tRecord, reportPlugIn_workload plugin, experimentSessionEntry entry)
        {
            string createdId = "";

            if (_machine != null)
            {
                thisSampleID = _machine.dataLoadTaker.CountTakes();
            }

            // -1 is normalised to 0 before the comparison below.
            if (lastSampleID == -1)
            {
                lastSampleID = 0;
            }

            bool sampleAdvanced = thisSampleID != lastSampleID;
            if (sampleAdvanced)
            {
                RecordID++;

                createdId             = GetEntryID(RecordID, measureGroup);
                lastEntry             = plugin.records.GetOrCreate(createdId);
                lastEntry.RecordID    = RecordID;
                lastEntry.pluginState = pluginState.ToString();

                // Only entries taken in the active state carry the measure group; all others get -1.
                lastEntry.measureGroup = pluginState == workloadPluginState.active ? measureGroup : -1;

                lastEntry.SetTestIDAndSignature(tRecord.instance, entry.state, tRecord);

                lastEntry.terminationWarning = terminationWarning;
                lastEntry.availableMemory    = mMemory;

                lastEntry.ContentPages      = tContentPages.MicroMean;
                lastEntry.cpuRateOfMachine  = tCPUm.MicroMean;
                lastEntry.cpuRateOfProcess  = tCPU.MicroMean;
                lastEntry.physicalMemory    = tMemory.MicroMean;
                lastEntry.CrawlerIterations = tIterations.MicroMean;
                lastEntry.DataLoad          = tDataLoad.MicroMean;

                // NOTE(review): _machine is null-checked above but dereferenced unconditionally
                // here; confirm callers never pass null.
                lastEntry.dlcDone    = _machine.taskDone;
                lastEntry.dlcRunning = _machine.taskRunning;
                lastEntry.dlcWaiting = _machine.taskWaiting;
                lastEntry.dlcMaximum = _machine.maxThreads;

                plugin.records.AddOrUpdate(lastEntry);
            }

            lastSampleID = thisSampleID;
            return createdId;
        }
        /// <summary>
        /// Asks the session to get or create the DLC TF-IDF table for the finished site record,
        /// passing the cache-use and cache-save flags from the TF-IDF settings.
        /// </summary>
        /// <param name="__session">The experiment session that owns the DLC TF-IDF tables.</param>
        /// <param name="__task">The domain task (not used by this handler).</param>
        /// <param name="__wRecord">The site record the table is requested for.</param>
        public override void eventDLCFinished(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            var tfidfSettings = imbWEMManager.settings.TFIDF;

            // The returned table is not used by this handler. A legacy inline implementation
            // (per-page term extraction and master table construction) used to be kept here as
            // commented-out code.
            __session.GetOrCreateTFIDF_DLC(
                __wRecord,
                loger,
                tfidfSettings.doUseCachedDLCTables,
                tfidfSettings.doSaveCacheOfDLCTables,
                evaluator);
        }