/// <summary>
/// Relays the DLC-finished notification to every plug-in held in <c>allPlugins</c>.
/// </summary>
/// <param name="__parent">Caller context; forwarded as <c>experimentSessionEntry</c> (null if it is of another type).</param>
/// <param name="__task">Domain task passed through to each plug-in.</param>
/// <param name="__wRecord">Site record passed through to each plug-in.</param>
public void eventDLCFinished(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
     experimentSessionEntry typedParent = __parent as experimentSessionEntry;

     foreach (IPlugInCommonBase <indexMaintenanceStageEnum, experimentSessionEntry> handler in allPlugins)
     {
         handler.eventDLCFinished(typedParent, __task, __wRecord);
     }
 }
示例#2
0
        /// <summary>
        /// When <c>FRONTIER_doLinkHarvest</c> is off, feeds every indexed page of the domain
        /// into the record's context via <c>processLink</c>, using the first loaded target's page as the source page.
        /// </summary>
        /// <param name="__session">Experiment session (not used by this implementation).</param>
        /// <param name="__task">Domain task (not used by this implementation).</param>
        /// <param name="__wRecord">Site record whose context receives the links.</param>
        public override void eventDLCInitiated(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            // Only act when link harvesting is switched off.
            if (__wRecord.tRecord.instance.settings.FRONTIER_doLinkHarvest)
            {
                return;
            }

            // The domain entry is looked up but not used further in this method.
            indexDomain      domainEntry  = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
            List <indexPage> indexedPages = imbWEMManager.index.pageIndexTable.GetPagesForDomain(__wRecord.domainInfo.domainName);

            var firstLoaded = __wRecord.context.targets.GetLoaded().FirstOrDefault();
            var seedPage    = firstLoaded?.page;

            if (seedPage != null)
            {
                loger.AppendLine(__wRecord.domain + " seed page selected -> " + seedPage.url);
            }

            foreach (indexPage indexed in indexedPages)
            {
                link pageLink = new link(indexed.url);
                __wRecord.context.processLink(pageLink, seedPage, false);
            }
        }
 /// <summary>
 /// Relays the DLC-finished notification to every plug-in held in <c>allPlugins</c>.
 /// </summary>
 /// <param name="__parent">Caller context; forwarded as <c>spiderEvaluatorBase</c> (null if it is of another type).</param>
 /// <param name="__task">Domain task passed through to each plug-in.</param>
 /// <param name="__wRecord">Site record passed through to each plug-in.</param>
 public void eventDLCFinished(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
     spiderEvaluatorBase typedParent = __parent as spiderEvaluatorBase;

     foreach (IPlugInCommonBase <crawlerDomainTaskIterationPhase, spiderEvaluatorBase> handler in allPlugins)
     {
         handler.eventDLCFinished(typedParent, __task, __wRecord);
     }
 }
示例#4
0
 /// <summary>
 /// Relays the DLC-finished notification to every plug-in held in <c>allPlugins</c>.
 /// </summary>
 /// <param name="__parent">Caller context; forwarded as <c>directReporterBase</c> (null if it is of another type).</param>
 /// <param name="__task">Domain task passed through to each plug-in.</param>
 /// <param name="__wRecord">Site record passed through to each plug-in.</param>
 public void eventDLCFinished(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
     directReporterBase typedParent = __parent as directReporterBase;

     foreach (IPlugInCommonBase <crawlReportingStageEnum, directReporterBase> handler in allPlugins)
     {
         handler.eventDLCFinished(typedParent, __task, __wRecord);
     }
 }
 /// <summary>
 /// Relays the iteration notification to every <c>indexPlugIn_base</c> held in <c>allPlugins</c>.
 /// </summary>
 /// <param name="__parent">Caller context; forwarded as <c>experimentSessionEntry</c> (null if it is of another type).</param>
 /// <param name="__task">Domain task passed through to each plug-in.</param>
 /// <param name="__wRecord">Site record passed through to each plug-in.</param>
 public void eventIteration(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
     experimentSessionEntry typedParent = __parent as experimentSessionEntry;

     foreach (indexPlugIn_base handler in allPlugins)
     {
         handler.eventIteration(typedParent, __task, __wRecord);
     }
 }
示例#6
0
 /// <summary>
 /// Relays the DLC-initiated notification to every plug-in held in <c>allPlugins</c>.
 /// </summary>
 /// <param name="__spider">Spider evaluator; not forwarded - plug-ins receive <c>null</c> as their parent argument.</param>
 /// <param name="__task">Domain task passed through to each plug-in.</param>
 /// <param name="__wRecord">Site record passed through to each plug-in.</param>
 public void eventDLCInitiated(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
     foreach (IPlugInCommonBase <crawlReportingStageEnum, directReporterBase> handler in allPlugins)
     {
         // The parent argument is deliberately kept as null, exactly as before.
         handler.eventDLCInitiated(null, __task, __wRecord);
     }
 }
 /// <summary>
 /// Relays the DLC-initiated notification to every plug-in held in <c>allPlugins</c>.
 /// </summary>
 /// <param name="__parent">Caller context; forwarded as <c>crawlerDomainTaskMachine</c> (null if it is of another type).</param>
 /// <param name="__task">Domain task passed through to each plug-in.</param>
 /// <param name="__wRecord">Site record passed through to each plug-in.</param>
 public void eventDLCInitiated(object __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
     crawlerDomainTaskMachine typedParent = __parent as crawlerDomainTaskMachine;

     foreach (IPlugInCommonBase <crawlJobEngineStageEnum, crawlerDomainTaskMachine> handler in allPlugins)
     {
         handler.eventDLCInitiated(typedParent, __task, __wRecord);
     }
 }
        /// <summary>
        /// Dispatches a stage event to the plug-ins registered for <paramref name="stage"/>.
        /// Does nothing when this collection is disabled or when none of the stage's plug-ins is enabled.
        /// A plug-in that throws is logged and recorded as a <c>crawlerErrorLog</c>; the remaining plug-ins still run.
        /// </summary>
        /// <param name="stage">Engine stage whose plug-ins are invoked.</param>
        /// <param name="__machine">Task machine passed through to each plug-in.</param>
        /// <param name="__task">Stage payload; also attached to the error log when it is a <c>crawlerDomainTask</c>.</param>
        /// <param name="__spider">Second stage payload passed through to each plug-in.</param>
        public override void eventUniversal <TTask, TBase>(crawlJobEngineStageEnum stage, crawlerDomainTaskMachine __machine, TTask __task, TBase __spider)
        {
            if (!IsEnabled)
            {
                return;
            }

            if (!plugins[stage].Any(x => x.IsEnabled))
            {
                return;
            }

            crawlerDomainTask domainTask = __task as crawlerDomainTask;

            foreach (IPlugInCommonBase <crawlJobEngineStageEnum, crawlerDomainTaskMachine> handler in plugins[stage])
            {
                try
                {
                    handler.eventUniversal(stage, __machine, __task, __spider);
                }
                catch (Exception ex)
                {
                    string message = "Engine Plugin [" + handler.name + "]:" + handler.GetType().Name + " at " + stage.ToString() + " execution crashed: " + ex.Message;
                    aceLog.log(message);

                    crawlerErrorLog errorLog = new crawlerErrorLog(ex, null, domainTask, crawlerErrorEnum.enginePlugin);
                    errorLog.SaveXML();
                }
            }
        }
        /// <summary>
        /// On DLC completion: registers the web site entry for the record's domain in the active
        /// repository's site table and saves the web site's data structure into the repository folder.
        /// </summary>
        /// <param name="__spider">Reporter (not used by this implementation).</param>
        /// <param name="__task">Domain task (not used by this implementation).</param>
        /// <param name="__wRecord">Site record; its <c>domain</c> is the key into <c>webSiteReposByDomain</c>.</param>
        public override void eventDLCFinished(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            imbMCRepository repository = mcm.activeRepository;
            imbMCWebSite    siteRepo   = webSiteReposByDomain[__wRecord.domain];

            repository.siteTable.AddOrUpdate(siteRepo.entry);

            siteRepo.SaveDataStructure(repository.folder, loger);
        }
示例#10
0
 /// <summary>
 /// When domain reporting is enabled, writes a markdown description of the task's evaluator
 /// to a <c>dlc_config_*.txt</c> file under the session crawler folder's "sites" sub-folder.
 /// </summary>
 /// <param name="__spider">Reporter (not used by this implementation).</param>
 /// <param name="__task">Domain task whose <c>evaluator</c> is described.</param>
 /// <param name="__wRecord">Site record supplying the domain root name for the file name.</param>
 public override void eventDLCInitiated(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
     if (!imbWEMManager.settings.directReportEngine.doDomainReport)
     {
         return;
     }

     string configPath = imbWEMManager.index.experimentEntry.sessionCrawlerFolder["sites"].pathFor("dlc_config_" + __wRecord.domainInfo.domainRootName.getFilename(".txt"));

     builderForMarkdown markdown = new builderForMarkdown();
     spiderTools.Describe(__task.evaluator, markdown);

     string description = markdown.ToString();
     description.saveStringToFile(configPath);
 }
        /// <summary>
        /// On DLC start: if a TF-IDF DLC file already exists for the domain and
        /// <c>TFIDF.doSchedulePagesWithDLCTable</c> is off, only logs the cached file. Otherwise pushes every
        /// indexed page of the domain into the record's context via <c>processLink</c>.
        /// Page URLs that do not contain the domain root name are logged and signalled with a console beep,
        /// but are still scheduled.
        /// </summary>
        /// <param name="__session">Experiment session providing the TF-IDF DLC file location.</param>
        /// <param name="__task">Domain task (not used by this implementation).</param>
        /// <param name="__wRecord">Site record whose context receives the links.</param>
        public override void eventDLCInitiated(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            // Read but not used further in this method.
            var sessionState = __session.state;

            indexDomain      domainEntry  = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
            List <indexPage> indexedPages = imbWEMManager.index.pageIndexTable.GetPagesForDomain(__wRecord.domainInfo.domainName);

            var firstLoaded = __wRecord.context.targets.GetLoaded().FirstOrDefault();
            var seedPage    = firstLoaded?.page;

            if (seedPage != null)
            {
                loger.AppendLine(__wRecord.domain + " seed page selected -> " + seedPage.url);
            }

            FileInfo cacheFile = __session.GetTFIDF_DLC_File(domainEntry);

            // Cached DLC table present and re-scheduling not requested: nothing to schedule.
            if (cacheFile.Exists && !imbWEMManager.settings.TFIDF.doSchedulePagesWithDLCTable)
            {
                loger.AppendLine(__wRecord.domain + " -> DLC cache found: " + cacheFile.FullName);
                return;
            }

            foreach (indexPage indexed in indexedPages)
            {
                link pageLink = new link(indexed.url);

                bool belongsToDomain = indexed.url.Contains(__wRecord.domainInfo.domainRootName);
                if (!belongsToDomain)
                {
                    loger.AppendLine(__wRecord.domain + " -X-> " + indexed.url + " Wrong link association?");
                    aceTerminalInput.doBeepViaConsole(1600, 200, 3);
                }

                __wRecord.context.processLink(pageLink, seedPage, false);
            }

            loger.AppendLine(__wRecord.domain + " -> " + __wRecord.web.webActiveLinks.Count + " targets set for load");
        }
示例#12
0
        /// <summary>
        /// On DLC completion: updates the side-index entry for the site from the first and last rows of its
        /// iteration table, then saves the whole side index once the configured number of DLCs has been counted.
        /// Aborted tasks and records without iteration rows are ignored.
        /// </summary>
        /// <param name="__spider">Reporter (not used by this implementation).</param>
        /// <param name="__task">Finished domain task; its status decides whether the record is processed.</param>
        /// <param name="__wRecord">Site record supplying iteration rows, links and terms.</param>
        public override void eventDLCFinished(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            if (__task.status == crawlerDomainTaskStatusEnum.aborted)
            {
                return;
            }

            if (__wRecord.iterationTableRecord == null || __wRecord.iterationTableRecord.Count == 0)
            {
                return;
            }

            indexDomain entry = records.GetOrCreate(__wRecord.instanceID);

            entry.url    = __wRecord.domainInfo.urlProper;
            entry.domain = __wRecord.domain;

            var finalIteration   = __wRecord.iterationTableRecord.LastOrDefault();
            var openingIteration = __wRecord.iterationTableRecord.FirstOrDefault();

            // Page counts come from the last iteration row.
            entry.relevantPages    = finalIteration.relevantPageCount;
            entry.notRelevantPages = finalIteration.irrelevantPageCount;
            entry.detected         = __wRecord.web.webActiveLinks.Count();
            entry.Words            = __wRecord.context.targets.termsAll.Count();
            // Landing information comes from the first iteration row.
            entry.LandingLanguage  = openingIteration.targetLanguage;
            entry.LandingRelevant  = openingIteration.relevantPageCount > 0;

            records.AddOrUpdate(entry, objectTableUpdatePolicy.updateIfHigher);
            DLCCount++;

            if (DLCCount < imbWEMManager.settings.supportEngine.reportPlugIn_sideIndexer_DLCToSave)
            {
                return;
            }

            DLCCount = 0;
            SaveAll();
            output.log("Side Index save and publish triggered on [" + __task.parent.parent.taskDone + "] DLC completed");
        }
示例#13
0
        /// <summary>
        /// Relays the iteration notification to every <c>reportPlugIn_base</c> in <c>allPlugins</c>,
        /// unless this collection is disabled. A plug-in that throws is logged and recorded as a
        /// <c>crawlerErrorLog</c>; the remaining plug-ins still run.
        /// </summary>
        /// <param name="__spider">Spider evaluator passed through to each plug-in.</param>
        /// <param name="__task">Domain task passed through to each plug-in.</param>
        /// <param name="__wRecord">Site record passed through to each plug-in.</param>
        public void eventIteration(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            if (IsEnabled)
            {
                foreach (reportPlugIn_base handler in allPlugins)
                {
                    try
                    {
                        handler.eventIteration(__spider, __task, __wRecord);
                    }
                    catch (Exception ex)
                    {
                        string message = "Reporting Plugin [" + handler.name + "]:" + handler.GetType().Name + " at status report execution crashed: " + ex.Message;
                        aceLog.log(message);

                        crawlerErrorLog errorLog = new crawlerErrorLog(ex, null, null, crawlerErrorEnum.indexPlugin);
                        errorLog.SaveXML();
                    }
                }
            }
        }
示例#14
0
        /// <summary>
        /// On DLC completion: runs the session's domain evaluation for the record, then logs the index save
        /// status and saves the index once <c>wRecordsDeployed</c> reaches <c>settings.doIndexAutoSaveOnDLCs</c>.
        /// With <c>plugIn_indexDBUpdater_TFIDF_per_DLC</c> set, a new <c>weightTableCompiled</c> is built from the
        /// master file for this call; otherwise the table returned by <c>GetTFIDF_Master</c> is used.
        /// </summary>
        /// <param name="__session">Session argument (this implementation works on the <c>session</c> member instead).</param>
        /// <param name="__task">Domain task (not used by this implementation).</param>
        /// <param name="__wRecord">Site record to evaluate.</param>
        public override void eventDLCFinished(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            if (!settings.plugIn_indexDBUpdater_TFIDF_per_DLC)
            {
                session.doDomainEvaluation(settings, loger, __wRecord, evaluator, session.GetTFIDF_Master(loger, true, false));
            }
            else
            {
                FileInfo masterFile = session.GetTFIDF_Master_File();

                session.doDomainEvaluation(settings, loger, __wRecord, evaluator, new weightTableCompiled(masterFile.FullName, true, session.SessionID));
            }

            string saveStatus = "Last index save: " + imbWEMManager.index.lastIndexSave.ToShortTimeString()
                                + " [" + imbWEMManager.index.wRecordsDeployed + " / " + settings.doIndexAutoSaveOnDLCs + " ] ";
            loger.AppendLine(saveStatus);

            bool saveDue = imbWEMManager.index.wRecordsDeployed >= settings.doIndexAutoSaveOnDLCs;
            if (saveDue)
            {
                imbWEMManager.index.Save();
            }
        }
        /// <summary>
        /// Handles only the <c>DLCPreinitiation</c> stage: subscribes <c>onTargetPageAttached</c> to the record's
        /// context, obtains the web site repository for the domain, remembers it in <c>webSiteReposByDomain</c>
        /// (logging when the domain was already registered), and stores the site entry and data structure.
        /// </summary>
        /// <param name="stage">Reporting stage being signalled.</param>
        /// <param name="__parent">Reporter (not used by this implementation).</param>
        /// <param name="__task">Domain task (not used by this implementation).</param>
        /// <param name="wRecord">Site record of the DLC.</param>
        public override void eventUniversal(crawlReportingStageEnum stage, directReporterBase __parent, crawlerDomainTask __task, modelSpiderSiteRecord wRecord)
        {
            if (stage != crawlReportingStageEnum.DLCPreinitiation)
            {
                return;
            }

            wRecord.context.OnTargetPageAttached += new modelSpiderSiteRecordEvent(onTargetPageAttached);

            imbMCRepository repository = mcm.activeRepository;
            imbMCWebSite    siteRepo   = repository.GetWebSite(wRecord.domainInfo, true, loger);

            if (webSiteReposByDomain.ContainsKey(wRecord.domain))
            {
                loger.log("DLC sent to CrawlToMC plugin second time: " + wRecord.domain);
            }
            else
            {
                webSiteReposByDomain.Add(wRecord.domain, siteRepo);
            }

            repository.siteTable.AddOrUpdate(siteRepo.entry);
            siteRepo.SaveDataStructure(repository.folder, loger);
        }
 /// <summary>
 /// Called just when a new DLC thread has been prepared to run.
 /// </summary>
 /// <param name="__spider">The spider evaluator.</param>
 /// <param name="__task">The domain task.</param>
 /// <param name="__wRecord">The site record.</param>
 public abstract void eventDLCInitiated(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord);
示例#17
0
 /// <summary>
 /// Iteration hook; this implementation performs no action.
 /// </summary>
 /// <param name="__session">Experiment session (unused).</param>
 /// <param name="__task">Domain task (unused).</param>
 /// <param name="__wRecord">Site record (unused).</param>
 public override void eventIteration(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
 }
示例#18
0
 /// <summary>
 /// DLC-finished hook to be implemented by derived plug-ins.
 /// </summary>
 /// <param name="__spider">The reporter instance.</param>
 /// <param name="__task">The domain task.</param>
 /// <param name="__wRecord">The site record.</param>
 public abstract void eventDLCFinished(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord);
示例#19
0
        /// <summary>
        /// Creates an error log through <c>crawlerErrorLog.CreateAndSave</c> and additionally saves it as XML
        /// into the logs folder under a <c>DLC_crash_*</c> file name derived from the domain root name.
        /// </summary>
        /// <param name="ex">The exception being recorded.</param>
        /// <param name="wRecord">
        /// Site record the error relates to. May be null or lack domain info; the file name then falls back to
        /// "unknown" so that recording an error does not itself throw a <see cref="NullReferenceException"/>.
        /// </param>
        /// <param name="crawlerDomainTask">The domain task the error relates to.</param>
        /// <param name="errorType">Category of the error.</param>
        /// <returns>The created error log.</returns>
        internal crawlerErrorLog CreateAndSaveError(Exception ex, modelSpiderSiteRecord wRecord, crawlerDomainTask crawlerDomainTask, crawlerErrorEnum errorType)
        {
            crawlerErrorLog clog = crawlerErrorLog.CreateAndSave(ex, wRecord, crawlerDomainTask, errorType);

            // Null-safe: this runs on error paths, where the record may be missing.
            string domainRootName = wRecord?.domainInfo?.domainRootName ?? "unknown";

            clog.SaveXML(folder[DRFolderEnum.logs].pathFor("DLC_crash_" + domainRootName.getFilename()));
            return clog;
        }
示例#20
0
 /// <summary>
 /// Iteration hook to be implemented by derived plug-ins.
 /// </summary>
 /// <param name="__spider">The spider evaluator.</param>
 /// <param name="__task">The domain task.</param>
 /// <param name="__wRecord">The site record.</param>
 public abstract void eventIteration(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord);
示例#21
0
        //  public abstract void eventPluginInstalled(directReporterBase __spider);


        /// <summary>
        /// Stage-keyed hook to be implemented by derived plug-ins.
        /// </summary>
        /// <param name="stage">The reporting stage being signalled.</param>
        /// <param name="__parent">The reporter instance.</param>
        /// <param name="__task">The domain task.</param>
        /// <param name="wRecord">The site record.</param>
        public abstract void eventUniversal(crawlReportingStageEnum stage, directReporterBase __parent, crawlerDomainTask __task, modelSpiderSiteRecord wRecord);
示例#22
0
 /// <summary>
 /// Generic entry point; forwards to the <c>crawlerDomainTaskMachine</c> overload
 /// (the parent is passed as null when it is not of that type).
 /// </summary>
 public override void eventDLCInitiated <TParent>(TParent __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
     eventDLCInitiated(__parent as crawlerDomainTaskMachine, __task, __wRecord);
 }
示例#23
0
 /// <summary>
 /// DLC-initiated hook with a generic parent, to be implemented by derived plug-ins.
 /// </summary>
 /// <typeparam name="TParent">Type of the parent object supplied by the caller.</typeparam>
 /// <param name="__parent">The parent object.</param>
 /// <param name="__task">The domain task.</param>
 /// <param name="__wRecord">The site record.</param>
 public abstract void eventDLCInitiated <TParent>(TParent __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord);
示例#24
0
 /// <summary>
 /// DLC-initiated hook; this implementation performs no action.
 /// </summary>
 /// <param name="__parent">Task machine (unused).</param>
 /// <param name="__task">Domain task (unused).</param>
 /// <param name="__wRecord">Site record (unused).</param>
 public void eventDLCInitiated(crawlerDomainTaskMachine __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
 }
示例#25
0
        /// <summary>
        /// Engine plug-in hook; only <c>crawlJobEngineStageEnum.statusReport</c> is handled.
        /// For that stage the method:
        /// 1. scans the running tasks for the highest iteration latency and the oldest task, and logs both;
        /// 2. if <c>doTaskTimeOutPrevention</c> is set, disables loaders when the job or the oldest DLC exceeds
        ///    90% of its time limit and adjusts thread priorities according to the latency ratio;
        /// 3. if <c>doAutoAdjustTC</c> is set, derives a new thread count from the CPU samples (when boosting)
        ///    or from the latency-driven change;
        /// 4. writes the new thread count to <c>__machine.maxThreads</c> and logs the CPU summary.
        /// </summary>
        /// <remarks>
        /// Differences from the previous revision:
        /// the latency thresholds are tested from highest to lowest (the 0.70 and 0.90 branches used to be
        /// unreachable behind the 0.5 test); tasks without a <c>crawlerDomainTask</c> state or execution thread
        /// no longer cause a <see cref="NullReferenceException"/>; the reported thread id always belongs to the
        /// reported domain; the thread count is never lowered below 1.
        /// </remarks>
        /// <param name="stage">Engine stage being signalled.</param>
        /// <param name="__machine">Task machine whose running tasks and thread limit are inspected and adjusted.</param>
        /// <param name="__task">Stage payload (not used by this implementation).</param>
        /// <param name="__resource">Stage payload (not used by this implementation).</param>
        public override void eventUniversal <TFirst, TSecond>(crawlJobEngineStageEnum stage, crawlerDomainTaskMachine __machine, TFirst __task, TSecond __resource)
        {
            switch (stage)
            {
            case crawlJobEngineStageEnum.statusReport:

                int    change          = 0;
                int    newTC           = __machine.maxThreads;
                double maxLatency      = 0;
                bool   doBoost         = false;
                string domainThatLates = "";
                string threadId        = "";
                Thread criticalThread  = null;
                double average         = CPUAverageLast;
                double avgChange       = average - CPUAverageLast;

                double            maxAge     = 0;
                crawlerDomainTask taskOldest = null;

                // Snapshot, so the collection can change while we iterate.
                var tasks = __machine.task_running.ToList();

                foreach (Task task in tasks)
                {
                    crawlerDomainTask taskInRun = task.AsyncState as crawlerDomainTask;
                    if (taskInRun == null)
                    {
                        // Task state is not a crawler domain task: nothing to measure.
                        continue;
                    }

                    double since = taskInRun.sinceLastIterationStart;
                    double tage  = DateTime.Now.Subtract(taskInRun.startTime).TotalMinutes;

                    if (maxAge <= tage)
                    {
                        maxAge     = tage;
                        taskOldest = taskInRun;
                    }

                    if (maxLatency <= since)
                    {
                        maxLatency      = since;
                        domainThatLates = taskInRun.wRecord.domain;
                        criticalThread  = taskInRun.executionThread;

                        // Always refresh, so the id never refers to a previously inspected task.
                        threadId = criticalThread != null
                            ? criticalThread.ManagedThreadId.ToString() + " [" + criticalThread.Priority.ToString() + "]"
                            : "";
                    }
                }

                double maxLatencyToLimit = maxLatency.GetRatio(__machine.TimeLimitForTask);
                double maxAgeLimit       = maxAge.GetRatio(__machine._timeLimitForDLC);
                double totalAgeLimit     = DateTime.Now.Subtract(__machine.startTime).TotalMinutes.GetRatio(__machine.TimeLimitForCompleteJob);

                loger.log("Max. latency:    [" + maxLatency.ToString("F2") + " min][" + maxLatencyToLimit.ToString("P2") + "] " + domainThatLates + " Thread: " + threadId);

                if (taskOldest != null)
                {
                    Thread oldestThread     = taskOldest.executionThread;
                    string oldestThreadInfo = oldestThread != null
                        ? oldestThread.ManagedThreadId.ToString() + " [" + oldestThread.Priority.ToString() + "]"
                        : "";

                    loger.log("Oldest DLC:      [" + maxAge.ToString("F2") + " min][" + maxAgeLimit.ToString("P2") + "] " + taskOldest.wRecord.domain + " Thread: " + oldestThreadInfo);
                }

                #region TIMEOUT PREVENTION -----------------------------------------
                if (imbWEMManager.settings.crawlerJobEngine.doTaskTimeOutPrevention)
                {
                    // Whole job is close to its time limit: stop loading in every running task.
                    if (totalAgeLimit > 0.9)
                    {
                        bool newDisable = false;
                        foreach (Task task in tasks)
                        {
                            crawlerDomainTask t = task.AsyncState as crawlerDomainTask;
                            if (t != null && !t.isLoaderDisabled)
                            {
                                t.isLoaderDisabled = true;
                                newDisable         = true;
                                loger.log("Time Limit Critical: loader is disabled for: " + t.wRecord.domain + " due execution time limit for Thread: " + t.executionThread?.ManagedThreadId.ToString());
                            }
                        }

                        if (newDisable)
                        {
                            aceTerminalInput.doBeepViaConsole(1200, 250, 5);
                        }
                    }

                    // Oldest DLC is close to its own time limit: stop loading for that task only.
                    if (maxAgeLimit > 0.9 && taskOldest != null && !taskOldest.isLoaderDisabled)
                    {
                        taskOldest.isLoaderDisabled = true;
                        loger.consoleAltColorToggle();
                        loger.log("DLC Time Limit Critical: loader is disabled for: " + taskOldest.wRecord.domain + " due execution time limit for Thread: " + taskOldest.executionThread?.ManagedThreadId.ToString());
                        loger.consoleAltColorToggle();
                        aceTerminalInput.doBeepViaConsole();
                    }

                    doBoost = false;

                    // Thresholds must be tested from the strictest down, otherwise the lower
                    // threshold swallows every case and the stronger reactions never run.
                    if (maxLatencyToLimit > 0.90)
                    {
                        loger.log("Max. latency critical :: REDUCING TO SINGLE THREAD : ");

                        foreach (Task task in tasks)
                        {
                            crawlerDomainTask taskInRun = task.AsyncState as crawlerDomainTask;
                            if (taskInRun?.executionThread != null)
                            {
                                taskInRun.executionThread.Priority = ThreadPriority.BelowNormal;
                            }
                        }

                        if (criticalThread != null)
                        {
                            criticalThread.Priority = ThreadPriority.Highest;
                        }

                        newTC = 1;
                    }
                    else if (maxLatencyToLimit > 0.70)
                    {
                        if (criticalThread != null)
                        {
                            criticalThread.Priority = ThreadPriority.Highest;
                        }
                        change = -4;
                    }
                    else if (maxLatencyToLimit > 0.5)
                    {
                        if (criticalThread != null)
                        {
                            criticalThread.Priority = ThreadPriority.AboveNormal;
                        }
                        change = -2;
                    }
                    else
                    {
                        // Latency is fine: the oldest task gets a mild priority lift, all others run at Normal.
                        foreach (Task task in tasks)
                        {
                            crawlerDomainTask taskInRun = task.AsyncState as crawlerDomainTask;
                            if (taskInRun?.executionThread == null)
                            {
                                continue;
                            }

                            taskInRun.executionThread.Priority = (taskOldest == taskInRun)
                                ? ThreadPriority.AboveNormal
                                : ThreadPriority.Normal;
                        }

                        doBoost = true;
                    }
                }
                #endregion --------------------------- ^ timeout prevention ^^

                if (imbWEMManager.settings.crawlerJobEngine.doAutoAdjustTC)
                {
                    if (doBoost)     // <------ TC adjust
                    {
                        var takes = __machine.cpuTaker.GetLastSamples(imbWEMManager.settings.crawlerJobEngine.CPUSampleForAutoAdjustMax);

                        // Not enough CPU samples yet: leave the thread count untouched (and skip the summary log).
                        if (takes.Count < imbWEMManager.settings.crawlerJobEngine.CPUSampleForAutoAdjust)
                        {
                            return;
                        }

                        average   = (takes.Average(x => x.reading) / 100);
                        avgChange = average - CPUAverageLast;

                        double CPUMargin = imbWEMManager.settings.crawlerJobEngine.CPUMargin;
                        int    dlc       = __machine.taskRunning;
                        CPUAverageDefendLine = Math.Max(average, CPUAverageLast);

                        // The machine is not running at (nearly) full thread capacity: no adjustment.
                        if (dlc < (__machine.maxThreads - 1))
                        {
                            return;
                        }

                        if (average < imbWEMManager.settings.crawlerJobEngine.CPUTarget)
                        {
                            if (average < (CPUAverageDefendLine - CPUMargin))
                            {
                                change = -1;
                            }
                            else
                            {
                                change = 1;
                            }
                        }
                        else if (average > imbWEMManager.settings.crawlerJobEngine.CPULimit)
                        {
                            change = -1;
                        }

                        newTC = Math.Min(__machine.maxThreads + change, imbWEMManager.settings.crawlerJobEngine.TCAutoLimit);

                        CPUAverageLast = average;
                    }
                    else if (change != 0)
                    {
                        newTC = Math.Min(__machine.maxThreads + change, imbWEMManager.settings.crawlerJobEngine.TCAutoLimit);
                    }

                    // Never drop below one thread; zero would be passed on as the machine's thread limit.
                    if (newTC < 1)
                    {
                        newTC = 1;
                    }
                }

                int e_change = newTC - __machine.maxThreads;
                __machine.maxThreads = newTC;

                loger.log("CPU average [" + average.ToString("P2") + "][" + avgChange.ToString("P2") + "] - (change: " + e_change + ") TC: " + __machine.maxThreads.ToString("D3") + " DLC:[" + __machine.taskRunning.ToString("D3") + "]");

                break;
            }
        }
示例#26
0
 /// <summary>
 /// Generic entry point; forwards to the <c>directReporterBase</c> overload
 /// (the parent is passed as null when it is not of that type).
 /// </summary>
 public override void eventDLCInitiated <TParent>(TParent __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
     eventDLCInitiated(__parent as directReporterBase, __task, __wRecord);
 }
示例#27
0
 /// <summary>
 /// DLC-finished hook to be implemented by derived plug-ins.
 /// </summary>
 /// <param name="__spider">The experiment session entry (the parameter is named <c>__spider</c>).</param>
 /// <param name="__task">The domain task.</param>
 /// <param name="__wRecord">The site record.</param>
 public abstract void eventDLCFinished(experimentSessionEntry __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord);
 /// <summary>
 /// DLC-initiated hook; this implementation performs no action.
 /// </summary>
 /// <param name="__spider">Reporter (unused).</param>
 /// <param name="__task">Domain task (unused).</param>
 /// <param name="__wRecord">Site record (unused).</param>
 public override void eventDLCInitiated(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
 }
示例#29
0
        //public override void eventCrawlJobFinished(analyticJob aJob, crawlerDomainTaskMachine __machine, modelSpiderTestRecord __tRecord)
        //{
        //    throw new NotImplementedException();
        //}

        /// <summary>
        /// Generic entry point; forwards to the <c>experimentSessionEntry</c> overload
        /// (the parent is passed as null when it is not of that type).
        /// </summary>
        public override void eventDLCInitiated <TParent>(TParent __parent, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            eventDLCInitiated(__parent as experimentSessionEntry, __task, __wRecord);
        }
示例#30
0
 /// <summary>
 /// On DLC completion, asks <c>plugin_state</c> to check its criteria, passing the task's grandparent,
 /// the record's test record, this plug-in and the current experiment entry.
 /// </summary>
 /// <param name="__spider">Reporter (not used by this implementation).</param>
 /// <param name="__task">Finished domain task; <c>__task.parent.parent</c> is handed to the check.</param>
 /// <param name="__wRecord">Site record; its <c>tRecord</c> is handed to the check.</param>
 public override void eventDLCFinished(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
     plugin_state.doCheckCriteria(
         __task.parent.parent,
         __wRecord.tRecord,
         this,
         imbWEMManager.index.experimentEntry);
 }