/// <summary>
/// Routes a page-level task according to the language evaluated for the page text.
/// Tasks that do not carry a <see cref="pipelineTaskMCPageSubject"/> are sent to <c>next</c>.
/// </summary>
/// <remarks>
/// The page's <c>TextContent</c> is tokenized and evaluated through <c>mLanguageEval</c>. When the
/// evaluated language equals <c>languagePrimary</c>, the custom data property
/// <c>"validPageCount_" + parent name</c> is updated on the task context
/// (presumably incremented by 1 and returned - confirm against <c>GetAndChangeCustomDataProperty</c>).
/// </remarks>
/// <param name="task">The task.</param>
/// <returns>
/// <c>next</c> when the task is not a page-subject task; the model's trash bin when the evaluated
/// language differs from <c>languagePrimary</c> or the returned counter value exceeds
/// <c>limitValidPageCount</c>; otherwise <c>forward</c>.
/// </returns>
public override IPipelineNode process(IPipelineTask task)
{
    var pageTask = task as pipelineTask<pipelineTaskMCPageSubject>;
    if (pageTask == null)
    {
        return next;
    }

    pipelineTaskMCPageSubject pageSubject = pageTask.subject;

    var properTokens = mLanguageEval.GetAllProperTokensSortedByFrequency(
        pageSubject.MCPage.TextContent,
        settings.tokenLengthMin);
    var evaluation = mLanguageEval.evaluate(settings, properTokens);

    bool isPrimaryLanguage = evaluation.result_language == languagePrimary;
    if (!isPrimaryLanguage)
    {
        return task.model.trashBin;
    }

    // The counter is keyed by the parent subject's name, so it is shared by all pages with the same parent.
    int validPages = task.context.GetAndChangeCustomDataProperty("validPageCount_" + pageSubject.parent.name, 1);
    if (validPages > limitValidPageCount)
    {
        return task.model.trashBin;
    }

    return forward;
}
/// <summary>
/// Expands a site-level task into one page-level task per web page of the site.
/// Tasks that do not carry a <see cref="pipelineTaskMCSiteSubject"/> are sent to <c>next</c>.
/// </summary>
/// <remarks>
/// Pages are read from the repository exposed by the parent <see cref="pipelineTaskMCRepoSubject"/>.
/// For every page a new <see cref="pipelineTaskMCPageSubject"/> and document element are created,
/// attached to the site subject, and a task for the page subject is pushed onto the context's
/// scheduled tasks.
/// </remarks>
/// <param name="task">The task.</param>
/// <returns>
/// <c>next</c> when the task is not a site-subject task; the model's trash bin when no repository
/// can be resolved from the subject's parent; otherwise <c>forward</c>.
/// </returns>
public override IPipelineNode process(IPipelineTask task)
{
    var realTask = task as pipelineTask<pipelineTaskMCSiteSubject>;
    if (realTask == null)
    {
        return next;
    }

    pipelineTaskMCSiteSubject realSubject = realTask.subject;

    // The "as" cast yields null when the parent is not a repo subject, so both the
    // subject and its repository have to be checked before the repository is used.
    var repoSubject = realSubject.parent as pipelineTaskMCRepoSubject;
    if (repoSubject == null || repoSubject.MCRepo == null)
    {
        task.context.logger.log("MCRepo is null at [" + task.GetStringInfo() + "]");

        // Without a repository there are no pages to load; discard the task instead of
        // falling through to a NullReferenceException.
        return task.model.trashBin;
    }

    var repo = repoSubject.MCRepo;

    List<imbMCWebPage> listPages = repo.GetAllWebPages(realSubject.MCSite, null, takeSetup);

    if (doFilterOutDuplicates)
    {
        listPages = listPages.GetUniquePages();
    }

    if (doSortPagesByTextSize)
    {
        listPages.Sort(SortByPageSize);
    }

    foreach (imbMCWebPage page in listPages)
    {
        var mCPageSubject = new pipelineTaskMCPageSubject();

        imbMCDocument doc = new imbMCDocument();
        doc.webPage = page;
        realSubject.mcElement.Add(doc);

        mCPageSubject.mcElement = doc;
        mCPageSubject.MCPage = page;
        // NOTE(review): the page subject's name is not assigned here; an earlier assignment
        // from page.entry.HashCode is disabled.
        mCPageSubject.parent = realSubject;
        realSubject.Add(mCPageSubject);

        var taskForPage = new pipelineTask<pipelineTaskMCPageSubject>(mCPageSubject);
        task.context.scheduledTasks.Push(taskForPage);
    }

    return forward;
}
/// <summary>
/// Splits the HTML of a page-level task into blocks and schedules one content-token task per block.
/// Tasks that do not carry a <see cref="pipelineTaskMCPageSubject"/> are sent to <c>next</c>.
/// </summary>
/// <remarks>
/// The page's <c>HtmlSourceCode</c> is loaded into a new <c>HtmlDocument</c>, which is stored on the
/// subject and passed to <c>blockComposer</c>. Each returned <see cref="imbMCBlock"/> is wrapped in a
/// <see cref="pipelineTaskSubjectContentToken"/>, attached to the page subject, and a task for it is
/// pushed onto the context's scheduled tasks. When no blocks are returned, a message is logged and
/// the task is still forwarded.
/// </remarks>
/// <param name="task">The task.</param>
/// <returns>
/// <c>next</c> when the task is not a page-subject task; otherwise <c>forward</c>.
/// </returns>
public override IPipelineNode process(IPipelineTask task)
{
    var realTask = task as pipelineTask<pipelineTaskMCPageSubject>;
    if (realTask == null)
    {
        return next;
    }

    pipelineTaskMCPageSubject realSubject = realTask.subject;

    HtmlDocument html = new HtmlDocument();
    html.LoadHtml(realSubject.MCPage.HtmlSourceCode);
    realSubject.htmlDocument = html;

    List<imbMCBlock> blocks = blockComposer.process(html, realSubject.name);

    if (blocks.Count == 0)
    {
        // The parent is only needed for this diagnostic. The "as" cast yields null when the
        // parent is not a site subject, so fall back to the page subject's own name rather
        // than letting the log statement throw.
        var siteSubject = realSubject.parent as pipelineTaskMCSiteSubject;
        if (siteSubject != null)
        {
            task.context.logger.log("Block composer returned zero blocks for [" + siteSubject.name + "]");
        }
        else
        {
            task.context.logger.log("Block composer returned zero blocks for [" + realSubject.name + "]");
        }
    }

    foreach (imbMCBlock block in blocks)
    {
        var tokenSubject = new pipelineTaskSubjectContentToken();
        tokenSubject.name = block.name;
        tokenSubject.contentLevelType = flags.token.cnt_level.mcBlock;
        tokenSubject.mcElement = block;
        tokenSubject.currentForm = block.content;

        realSubject.mcElement.Add(tokenSubject.mcElement);
        realSubject.Add(tokenSubject);

        var taskForElement = new pipelineTask<pipelineTaskSubjectContentToken>(tokenSubject);
        task.context.scheduledTasks.Push(taskForElement);
    }

    return forward;
}