/// <summary>
        /// Language filter for page-level tasks. Sends the task to <c>next</c> when it is not a
        /// <c>pipelineTask&lt;pipelineTaskMCPageSubject&gt;</c>.
        /// </summary>
        /// <remarks>
        /// The page's <c>TextContent</c> is tokenized and evaluated by <c>mLanguageEval</c>. Pages whose
        /// detected language differs from <c>languagePrimary</c> go to the trash bin. For matching pages a
        /// counter stored in the task context under <c>"validPageCount_" + parent name</c> is updated; once
        /// the returned value exceeds <c>limitValidPageCount</c> the page is trashed as well.
        /// </remarks>
        /// <param name="task">The task.</param>
        /// <returns><c>next</c>, <c>forward</c>, or the model's trash bin node, as described above.</returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            var pageTask = task as pipelineTask<pipelineTaskMCPageSubject>;
            if (pageTask == null)
            {
                // Not a page-level task: this node does not handle it.
                return next;
            }

            pipelineTaskMCPageSubject pageSubject = pageTask.subject;

            var properTokens = mLanguageEval.GetAllProperTokensSortedByFrequency(pageSubject.MCPage.TextContent, settings.tokenLengthMin);
            var evaluation = mLanguageEval.evaluate(settings, properTokens);

            bool isPrimaryLanguage = evaluation.result_language == languagePrimary;
            if (!isPrimaryLanguage)
            {
                return task.model.trashBin;
            }

            // NOTE(review): presumably adds 1 to the per-parent counter and returns the resulting
            // value -- confirm against GetAndChangeCustomDataProperty.
            int validPageCount = task.context.GetAndChangeCustomDataProperty("validPageCount_" + pageSubject.parent.name, 1);

            if (validPageCount > limitValidPageCount)
            {
                return task.model.trashBin;
            }

            return forward;
        }
Beispiel #2
0
        /// <summary>
        /// Task builder for the site level of subject: creates and schedules one page-level task for each
        /// web page of the site. Sends the task to <c>next</c> when it is not a
        /// <c>pipelineTask&lt;pipelineTaskMCSiteSubject&gt;</c>.
        /// </summary>
        /// <remarks>
        /// The repository is taken from the parent <see cref="pipelineTaskMCRepoSubject"/>. When the parent
        /// is not a repository subject, or its <c>MCRepo</c> is null, the problem is logged and the task is
        /// routed to the model's trash bin, because no pages can be loaded without a repository.
        /// </remarks>
        /// <param name="task">The task.</param>
        /// <returns>
        /// <c>next</c> for tasks of another subject type, the model's trash bin when no repository is
        /// available, otherwise <c>forward</c>.
        /// </returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            var realTask = task as pipelineTask<pipelineTaskMCSiteSubject>;

            if (realTask == null)
            {
                return(next);
            }

            pipelineTaskMCSiteSubject realSubject = realTask.subject;

            var repoSubject = realSubject.parent as pipelineTaskMCRepoSubject;

            // The "as" cast yields null when the parent is missing or of another type.
            var repo = (repoSubject != null) ? repoSubject.MCRepo : null;

            if (repo == null)
            {
                task.context.logger.log("MCRepo is null at [" + task.GetStringInfo() + "]");

                // Previously execution continued and dereferenced the null repository.
                return(task.model.trashBin);
            }

            List<imbMCWebPage> listPages = repo.GetAllWebPages(realSubject.MCSite, null, takeSetup);

            if (doFilterOutDuplicates)
            {
                listPages = listPages.GetUniquePages();
            }

            if (doSortPagesByTextSize)
            {
                listPages.Sort(SortByPageSize);
            }

            foreach (imbMCWebPage page in listPages)
            {
                // Wrap the page in a document and attach it to the site's element.
                imbMCDocument doc = new imbMCDocument();
                doc.webPage = page;
                realSubject.mcElement.Add(doc);

                // Build the page-level subject and link it into the subject tree.
                var mCPageSubject = new pipelineTaskMCPageSubject();
                mCPageSubject.mcElement = doc;
                mCPageSubject.MCPage    = page;
                mCPageSubject.parent    = realSubject;

                realSubject.Add(mCPageSubject);

                var taskForPage = new pipelineTask<pipelineTaskMCPageSubject>(mCPageSubject);

                task.context.scheduledTasks.Push(taskForPage);
            }

            return(forward);
        }
Beispiel #3
0
        /// <summary>
        /// Task builder for the page level of subject: parses the page's HTML source, splits it into blocks
        /// with <c>blockComposer</c>, and schedules one content-token task per block. Sends the task to
        /// <c>next</c> when it is not a <c>pipelineTask&lt;pipelineTaskMCPageSubject&gt;</c>.
        /// </summary>
        /// <param name="task">The task.</param>
        /// <returns><c>next</c> for tasks of another subject type, otherwise <c>forward</c>.</returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            var realTask = task as pipelineTask<pipelineTaskMCPageSubject>;

            if (realTask == null)
            {
                return(next);
            }

            pipelineTaskMCPageSubject realSubject = realTask.subject;

            HtmlDocument html = new HtmlDocument();
            html.LoadHtml(realSubject.MCPage.HtmlSourceCode);

            pipelineTaskMCSiteSubject siteSubject = realSubject.parent as pipelineTaskMCSiteSubject;

            realSubject.htmlDocument = html;

            List<imbMCBlock> blocks = blockComposer.process(html, realSubject.name);

            if (!blocks.Any())
            {
                // The "as" cast above yields null when the parent is not a site subject; fall back to
                // the page subject's own name so that the diagnostic itself cannot throw.
                if (siteSubject != null)
                {
                    task.context.logger.log("Block composer returned zero blocks for [" + siteSubject.name + "]");
                }
                else
                {
                    task.context.logger.log("Block composer returned zero blocks for [" + realSubject.name + "]");
                }
            }

            foreach (imbMCBlock block in blocks)
            {
                // Describe the block as a content-token subject at the mcBlock level.
                pipelineTaskSubjectContentToken tokenSubject = new pipelineTaskSubjectContentToken();
                tokenSubject.name             = block.name;
                tokenSubject.contentLevelType = flags.token.cnt_level.mcBlock;
                tokenSubject.mcElement        = block;
                tokenSubject.currentForm      = block.content;

                // Attach the block to the page's element and the subject to the page subject.
                realSubject.mcElement.Add(tokenSubject.mcElement);
                realSubject.Add(tokenSubject);

                var taskForElement = new pipelineTask<pipelineTaskSubjectContentToken>(tokenSubject);

                task.context.scheduledTasks.Push(taskForElement);
            }

            return(forward);
        }