示例#1
0
        /// <summary>
        /// Task builder for the page level of subject: loads the page HTML, lets the block composer split it into
        /// <see cref="imbMCBlock"/> items and schedules one task per block.
        /// Sends to next if the task does not carry a <see cref="pipelineTaskMCPageSubject"/>.
        /// </summary>
        /// <param name="task">The task.</param>
        /// <returns>
        /// <c>next</c> when the task is not a page-subject task; otherwise <c>forward</c>, after a task for every
        /// composed block was pushed to <c>task.context.scheduledTasks</c>.
        /// </returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            pipelineTask <pipelineTaskMCPageSubject> realTask = task as pipelineTask <pipelineTaskMCPageSubject>;

            if (realTask == null)
            {
                return(next);
            }

            pipelineTaskMCPageSubject realSubject = realTask.subject;

            // Parse the stored page source and keep the parsed document on the subject.
            HtmlDocument html = new HtmlDocument();
            html.LoadHtml(realSubject.MCPage.HtmlSourceCode);
            realSubject.htmlDocument = html;

            List <imbMCBlock> blocks = blockComposer.process(html, realSubject.name);

            if (!blocks.Any())
            {
                // The parent is not guaranteed to be a site subject; fall back to the page name
                // instead of dereferencing a failed "as" cast.
                pipelineTaskMCSiteSubject siteSubject = realSubject.parent as pipelineTaskMCSiteSubject;
                String ownerName = (siteSubject != null) ? siteSubject.name : realSubject.name;

                task.context.logger.log("Block composer returned zero blocks for [" + ownerName + "]");
            }

            foreach (imbMCBlock block in blocks)
            {
                // Wrap the block into a token subject at mcBlock level.
                pipelineTaskSubjectContentToken tokenSubject = new pipelineTaskSubjectContentToken();
                tokenSubject.name             = block.name;
                tokenSubject.contentLevelType = flags.token.cnt_level.mcBlock;
                tokenSubject.mcElement        = block;
                tokenSubject.currentForm      = block.content;

                // Register the block with the page subject, both as element and as child subject.
                realSubject.mcElement.Add(tokenSubject.mcElement);
                realSubject.Add(tokenSubject);

                pipelineTask <pipelineTaskSubjectContentToken> taskForElement = new pipelineTask <pipelineTaskSubjectContentToken>(tokenSubject);

                task.context.scheduledTasks.Push(taskForElement);
            }

            return(forward);
        }
        /// <summary>
        /// Fills this record's name and counters from the given site subject.
        /// <c>Sites</c> and <c>Pages</c> are assigned; <c>PagesValid</c>, <c>Blocks</c> and <c>Streams</c>
        /// are incremented on top of whatever value they already hold.
        /// </summary>
        /// <param name="site">Site subject to summarize; its items are enumerated two levels deep.</param>
        public void deploySite(pipelineTaskMCSiteSubject site)
        {
            name = site.name;

            Sites = 1;
            Pages = site.Count();

            foreach (var page in site)
            {
                // Items without any children do not contribute to the remaining counters.
                if (!page.Any())
                {
                    continue;
                }

                PagesValid++;
                Blocks += page.Count();

                foreach (var block in page)
                {
                    Streams += block.Count();
                }
            }
        }
示例#3
0
        /// <summary>
        /// Task builder for <see cref="imbMCRepository"/> level of subject. Sends to next if task is not with <see cref="pipelineTaskMCRepoSubject"/>
        /// </summary>
        /// <remarks>
        /// Loads the repository named by the subject, then creates and schedules one
        /// <see cref="pipelineTaskMCSiteSubject"/> task per accepted web site. When the subject lists
        /// <c>MCSiteTargets</c>, only sites whose clean case name matches a target are accepted; a site name is
        /// never scheduled twice.
        /// </remarks>
        /// <param name="task">The task.</param>
        /// <returns><c>next</c> when the task is not a repository-subject task; otherwise <c>forward</c>.</returns>
        /// <exception cref="aceScienceException">At least one requested target has no matching site in the repository.</exception>
        public override IPipelineNode process(IPipelineTask task)
        {
            pipelineTask <pipelineTaskMCRepoSubject> realTask = task as pipelineTask <pipelineTaskMCRepoSubject>;

            if (realTask == null)
            {
                return(next);
            }

            pipelineTaskMCRepoSubject realSubject = realTask.subject;

            folderNode repoFolder = appManager.Application.folder_resources.Add(imbMCManager.MCRepo_DefaultDirectoryName, "MC Repositories", "Root directory with repositories of Crawled Web content");

            imbMCRepository           repo    = realSubject.MCRepoName.LoadDataStructure <imbMCRepository>(repoFolder, task.context.logger);
            imbMCDocumentRepositorium docRepo = new imbMCDocumentRepositorium();

            docRepo.webRepository = repo;
            realSubject.mcElement = docRepo;
            realSubject.MCRepo    = repo;

            List <imbMCWebSite> websites = repo.GetAllWebSites(task.context.logger, takeSetup);

            if (!websites.Any())
            {
                task.context.logger.log(this.name + " Failed --- no web sites loaded");
            }

            // Requested targets, normalized the same way as the site names they are compared with.
            List <String> needle = new List <string>();
            realSubject.MCSiteTargets.ForEach(x => needle.Add(pipelineSubjectTools.GetCleanCaseName(x)));

            // Names of the sites that were accepted and scheduled.
            List <String> urls = new List <string>();

            foreach (imbMCWebSite site in websites)
            {
                String sName = pipelineSubjectTools.GetCleanCaseName(site.domainInfo.urlProper);

                // Without explicit targets every site is accepted.
                Boolean isTargeted = !realSubject.MCSiteTargets.Any() || needle.Contains(sName);

                if (!isTargeted || urls.Contains(sName))
                {
                    continue;
                }

                pipelineTaskMCSiteSubject mCSiteSubject = new pipelineTaskMCSiteSubject();
                mCSiteSubject.MCSite = site;

                imbMCDocumentSet docSet = new imbMCDocumentSet();

                docRepo.Add(docSet);
                mCSiteSubject.mcElement        = docSet;
                mCSiteSubject.contentLevelType = flags.token.cnt_level.mcSite;
                mCSiteSubject.name             = sName;
                mCSiteSubject.parent           = realSubject;
                realSubject.Add(mCSiteSubject);

                urls.Add(mCSiteSubject.name);

                pipelineTask <pipelineTaskMCSiteSubject> taskForSite = new pipelineTask <pipelineTaskMCSiteSubject>(mCSiteSubject);

                task.context.scheduledTasks.Push(taskForSite);
            }

            // Targets without a scheduled site. Distinct() keeps a target that was listed twice from being
            // reported as missing once its site has been scheduled.
            List <String> missing = needle.Where(x => !urls.Contains(x)).Distinct().ToList();

            if (missing.Any())
            {
                // Each name is followed by a single space, matching the established message format.
                String nd = String.Concat(missing.Select(x => x + " "));

                throw new aceScienceException("Some sites are not found in the MC Repository!! [" + nd + "]", null, realSubject, "Sites not loaded [" + nd + "]", this);
            }

            return(forward);
        }
示例#4
0
 /// <summary>
 /// Looks up the entry stored for <paramref name="site"/> in <c>tokenBySite</c> and returns it as a list.
 /// </summary>
 /// <param name="site">Site subject used as the lookup key.</param>
 /// <returns>The result of calling <c>ToList()</c> on the entry found for the site.</returns>
 public List <pipelineTaskSubjectContentToken> GetTokensForSite(pipelineTaskMCSiteSubject site)
 {
     var siteTokens = this.tokenBySite[site];

     return siteTokens.ToList();
 }
示例#5
0
 /// <summary>
 /// Creates the instance for the given site subject and class set.
 /// </summary>
 /// <param name="_MCSiteSubject">Site subject handed to <c>SetSiteSubject</c>.</param>
 /// <param name="classSet">Class set stored in <c>industry</c>.</param>
 public webCaseKnowledge(pipelineTaskMCSiteSubject _MCSiteSubject, IDocumentSetClass classSet)
 {
     // Has to run before the name is read from MCSite below.
     SetSiteSubject(_MCSiteSubject);

     var domain = MCSite.domainInfo;

     name     = domain.domainRootName;
     industry = classSet;
 }
示例#6
0
 /// <summary>
 /// Stores the site subject and copies its <c>MCSite</c> and <c>mcElement</c> references onto this instance.
 /// </summary>
 /// <param name="_MCSiteSubject">Site subject to attach.</param>
 public void SetSiteSubject(pipelineTaskMCSiteSubject _MCSiteSubject)
 {
     MCSiteSubject = _MCSiteSubject;

     // Read back through the member that was just assigned, then fan out its parts.
     var subject = MCSiteSubject;

     MCSite        = subject.MCSite;
     MCSiteElement = subject.mcElement;
 }