/// <summary>
/// Reacts to crawl reporting stage notifications. Only <see cref="crawlReportingStageEnum.DLCPreinitiation"/> is handled:
/// the page-attached handler is hooked to the site record context, the web site entry is obtained from the active
/// MC repository, remembered by domain, written into the repository site table and saved.
/// </summary>
/// <param name="stage">Reporting stage being signalled; every stage other than DLCPreinitiation is ignored.</param>
/// <param name="__parent">Reporter that raised the event (not used here).</param>
/// <param name="__task">Domain task the event relates to (not used here).</param>
/// <param name="wRecord">Site record whose context, domain and domainInfo are used to register the web site.</param>
public override void eventUniversal(crawlReportingStageEnum stage, directReporterBase __parent, crawlerDomainTask __task, modelSpiderSiteRecord wRecord)
        {
            switch (stage)
            {
                case crawlReportingStageEnum.DLCPreinitiation:
                {
                    // Detach before attaching: if the same record is announced more than once
                    // (see the "second time" log below) the handler must still be registered only once,
                    // otherwise every attached page would be built and saved repeatedly.
                    modelSpiderSiteRecordEvent pageAttachedHandler = new modelSpiderSiteRecordEvent(onTargetPageAttached);
                    wRecord.context.OnTargetPageAttached -= pageAttachedHandler;
                    wRecord.context.OnTargetPageAttached += pageAttachedHandler;

                    imbMCRepository mcRepo = mcm.activeRepository;
                    imbMCWebSite wRepo = mcRepo.GetWebSite(wRecord.domainInfo, true, loger);

                    if (!webSiteReposByDomain.ContainsKey(wRecord.domain))
                    {
                        webSiteReposByDomain.Add(wRecord.domain, wRepo);
                    }
                    else
                    {
                        // The instance registered first stays in the lookup; this call is only logged.
                        loger.log("DLC sent to CrawlToMC plugin second time: " + wRecord.domain);
                    }

                    mcRepo.siteTable.AddOrUpdate(wRepo.entry);
                    wRepo.SaveDataStructure(mcRepo.folder, loger);
                    break;
                }
            }
        }
        /// <summary>
        /// Called when a domain level crawl has finished: refreshes the site entry in the active MC repository
        /// site table and saves the web site data structure into the repository folder.
        /// </summary>
        /// <param name="__spider">Reporter that raised the event (not used here).</param>
        /// <param name="__task">Domain task that finished (not used here).</param>
        /// <param name="__wRecord">Site record whose domain identifies the web site registered earlier.</param>
        public override void eventDLCFinished(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            imbMCRepository mcRepo = mcm.activeRepository;

            // A domain that was never registered during pre-initiation has nothing to save;
            // report it instead of failing with KeyNotFoundException.
            imbMCWebSite wRepo;
            if (!webSiteReposByDomain.TryGetValue(__wRecord.domain, out wRepo))
            {
                loger.log("DLC finished for a domain that is not registered with CrawlToMC plugin: " + __wRecord.domain);
                return;
            }

            mcRepo.siteTable.AddOrUpdate(wRepo.entry);
            wRepo.SaveDataStructure(mcRepo.folder, loger);
        }
        /// <summary>
        /// Handler for the page-attached event of a site record context. When the active MC repository accepts the
        /// target, a web page entry is built for it, filled with the index entry and the HTML/XML source carried by
        /// the event arguments, and saved into the web site folder.
        /// </summary>
        /// <param name="__wRecord">Site record that raised the event; its domain selects the registered web site.</param>
        /// <param name="__args">Event arguments providing the target and its HTML and XML source.</param>
        private void onTargetPageAttached(modelSpiderSiteRecord __wRecord, modelSpiderSiteRecordEventArgs __args)
        {
            imbMCRepository mcRepo = mcm.activeRepository;

            // Without a registered web site there is no folder to save the page into;
            // report it instead of failing with KeyNotFoundException inside the event handler.
            imbMCWebSite wRepo;
            if (!webSiteReposByDomain.TryGetValue(__wRecord.domain, out wRepo))
            {
                loger.log("Target page attached for a domain that is not registered with CrawlToMC plugin: " + __wRecord.domain);
                return;
            }

            ISpiderTarget target = __args.Target;

            if (!mcRepo.isTargetProper(target))
            {
                return;
            }

            imbMCWebPage pRepo = mcRepo.BuildWebPage(target, wRepo, loger);
            pRepo.indexEntry     = imbWEMManager.index.pageIndexTable.GetPageForUrl(target.url);
            pRepo.HtmlSourceCode = __args.sourceHtml;
            pRepo.XmlSourceCode  = __args.sourceXml;

            pRepo.SaveDataStructure(wRepo.folder, loger);
        }
        /// <summary>Opens or creates new MCRepository, starts a MC session. Call this before any other MC operation.</summary>
        /// <remarks>
        /// <para>It initiates specified MCRepository and sets it as current/selected.</para>
        /// <para>If a directory for <paramref name="repo"/> exists the repository is loaded from it, otherwise a new one is created.
        /// When no instance could be obtained, a failure message is written to the output and the active repository is left unchanged.</para>
        /// </remarks>
        /// <param name="repo">Name of repository to start work with</param>
        /// <param name="log_msg">A message to be written into repository log after it is initiated, e.g. adding new MCWebSites, or running Data Mining procedure XXXX</param>
        /// <param name="debug">If true, it will print out short report on content of the repository (if any)</param>
        /// <seealso cref="aceOperationSetExecutorBase"/>
        public void aceOperation_mcOpen(
            [Description("Name of repository to start working with")] string repo = "word",
            [Description("A message to be written into repository log after it is initiated, e.g. adding new MCWebSites, or running Data Mining procedure XXXX")] string log_msg = "",
            [Description("If true, it will print out short report on content of the repository (if any)")] bool debug = false)
        {
            imbMCRepository instance;

            string path = folder.pathFor("\\" + repo);

            if (Directory.Exists(path))
            {
                instance = repo.LoadDataStructure <imbMCRepository>(folder, output);

                // NOTE(review): presumably LoadDataStructure can yield null on failure - confirm.
                // Guarding here keeps the failure report below reachable instead of throwing.
                if (instance != null)
                {
                    instance.loger.log("Repository loaded ".add(log_msg, ". "));
                }
            }
            else
            {
                // Take the clock once so the date and time parts always describe the same instant.
                DateTime createdAt = DateTime.Now;
                string descriptionForNew = "MC Repository created [" + createdAt.ToLongDateString() + " " + createdAt.ToLongTimeString() + "]. " + log_msg;
                instance = new imbMCRepository(repo, descriptionForNew, folder);
                instance.loger.log("Repository created ".add(log_msg, ". "));
            }

            if (instance == null)
            {
                output.log("MC Repository [" + repo + "] failed to initiate");
                return;
            }

            if (debug)
            {
                instance.debugReport(output);
            }

            output.log("MC Repository [" + repo + "] initiated");
            activeRepository = instance;
        }
Beispiel #5
0
        /// <summary>
        /// Task builder for <see cref="imbMCRepository"/> level of subject. Sends to next if task is not with <see cref="pipelineTaskMCRepoSubject"/>
        /// </summary>
        /// <remarks>
        /// Loads the repository named by the subject, and for each of its web sites (optionally restricted to the
        /// subject's MCSiteTargets, and taking each clean site name only once) creates a site-level subject with its
        /// document set and pushes a task for it onto the scheduled tasks of the pipeline context.
        /// </remarks>
        /// <param name="task">The task.</param>
        /// <returns>The <c>next</c> node when the task is not a repository-level task; otherwise the <c>forward</c> node.</returns>
        /// <exception cref="aceScienceException">Thrown when one or more of the requested MCSiteTargets were not found among the web sites of the repository.</exception>
        public override IPipelineNode process(IPipelineTask task)
        {
            pipelineTask <pipelineTaskMCRepoSubject> realTask = task as pipelineTask <pipelineTaskMCRepoSubject>;

            if (realTask == null)
            {
                return(next);
            }

            pipelineTaskMCRepoSubject realSubject = realTask.subject;

            folderNode repoFolder = appManager.Application.folder_resources.Add(imbMCManager.MCRepo_DefaultDirectoryName, "MC Repositories", "Root directory with repositories of Crawled Web content");

            imbMCRepository           repo    = realSubject.MCRepoName.LoadDataStructure <imbMCRepository>(repoFolder, task.context.logger);
            imbMCDocumentRepositorium docRepo = new imbMCDocumentRepositorium();

            docRepo.webRepository = repo;
            realSubject.mcElement = docRepo;
            realSubject.MCRepo    = repo;

            List <imbMCWebSite> websites = repo.GetAllWebSites(task.context.logger, takeSetup);

            if (!websites.Any())
            {
                task.context.logger.log(this.name + " Failed --- no web sites loaded");
            }

            // Requested site names, cleaned and de-duplicated (first occurrence order is kept for reporting).
            // De-duplication matters: a target listed twice must not be reported as missing once it was found.
            List <String>    needle       = new List <string>();
            HashSet <String> needleLookup = new HashSet <string>();

            foreach (var siteTarget in realSubject.MCSiteTargets)
            {
                String cleanTarget = pipelineSubjectTools.GetCleanCaseName(siteTarget);
                if (needleLookup.Add(cleanTarget))
                {
                    needle.Add(cleanTarget);
                }
            }

            // An empty target list means "no restriction": every web site of the repository is taken.
            Boolean restrictToTargets = needle.Count > 0;

            // Names of the sites already scheduled, used both to skip repeats and to detect missing targets.
            HashSet <String> urls = new HashSet <string>();

            foreach (imbMCWebSite site in websites)
            {
                String sName = pipelineSubjectTools.GetCleanCaseName(site.domainInfo.urlProper);

                if (restrictToTargets && !needleLookup.Contains(sName))
                {
                    continue;
                }

                if (urls.Contains(sName))
                {
                    continue;
                }

                pipelineTaskMCSiteSubject mCSiteSubject = new pipelineTaskMCSiteSubject();
                mCSiteSubject.MCSite = site;

                imbMCDocumentSet docSet = new imbMCDocumentSet();

                docRepo.Add(docSet);
                mCSiteSubject.mcElement        = docSet;
                mCSiteSubject.contentLevelType = flags.token.cnt_level.mcSite;
                mCSiteSubject.name             = sName;
                mCSiteSubject.parent           = realSubject;
                realSubject.Add(mCSiteSubject);

                urls.Add(mCSiteSubject.name);

                pipelineTask <pipelineTaskMCSiteSubject> taskForSite = new pipelineTask <pipelineTaskMCSiteSubject>(mCSiteSubject);

                task.context.scheduledTasks.Push(taskForSite);
            }

            // Collect requested names for which no site was scheduled.
            String nd = "";
            Boolean anyMissing = false;

            foreach (String requested in needle)
            {
                if (!urls.Contains(requested))
                {
                    anyMissing = true;
                    nd += requested + " ";
                }
            }

            if (anyMissing)
            {
                throw new aceScienceException("Some sites are not found in the MC Repository!! [" + nd + "]", null, realSubject, "Sites not loaded [" + nd + "]", this);
            }

            return(forward);
        }