/// <summary>
/// Task builder for the page level of subject: parses the page HTML, runs the block composer over it and
/// schedules one task per composed block. Sends to next if task is not with <see cref="pipelineTaskMCPageSubject"/>
/// </summary>
/// <param name="task">The task.</param>
/// <returns><c>next</c> when the task does not carry a page subject; otherwise <c>forward</c>.</returns>
public override IPipelineNode process(IPipelineTask task)
{
    pipelineTask<pipelineTaskMCPageSubject> realTask = task as pipelineTask<pipelineTaskMCPageSubject>;
    if (realTask == null)
    {
        return next;
    }

    pipelineTaskMCPageSubject realSubject = realTask.subject;

    // Parse the stored page source and keep the parsed document on the subject.
    HtmlDocument html = new HtmlDocument();
    html.LoadHtml(realSubject.MCPage.HtmlSourceCode);
    realSubject.htmlDocument = html;

    List<imbMCBlock> blocks = blockComposer.process(html, realSubject.name);

    if (!blocks.Any())
    {
        // The parent is only needed for this log message. It is obtained with an "as" cast,
        // so it may be null; fall back to the page subject name instead of throwing.
        pipelineTaskMCSiteSubject siteSubject = realSubject.parent as pipelineTaskMCSiteSubject;
        String ownerName = (siteSubject != null) ? siteSubject.name : realSubject.name;

        task.context.logger.log("Block composer returned zero blocks for [" + ownerName + "]");
    }

    foreach (imbMCBlock block in blocks)
    {
        // Wrap every composed block into a token subject attached to the page subject.
        pipelineTaskSubjectContentToken tokenSubject = new pipelineTaskSubjectContentToken();
        tokenSubject.name = block.name;
        tokenSubject.contentLevelType = flags.token.cnt_level.mcBlock;
        tokenSubject.mcElement = block;
        tokenSubject.currentForm = block.content;

        realSubject.mcElement.Add(tokenSubject.mcElement);
        realSubject.Add(tokenSubject);

        // Queue a follow-up task for the new block subject.
        pipelineTask<pipelineTaskSubjectContentToken> taskForElement = new pipelineTask<pipelineTaskSubjectContentToken>(tokenSubject);
        task.context.scheduledTasks.Push(taskForElement);
    }

    return forward;
}
/// <summary>
/// Fills this record from the given site subject: copies its name, sets <c>Sites</c> to 1 and <c>Pages</c> to the
/// number of children of the site, then adds to <c>PagesValid</c>, <c>Blocks</c> and <c>Streams</c>.
/// </summary>
/// <remarks>
/// <c>PagesValid</c>, <c>Blocks</c> and <c>Streams</c> are incremented, not reset, so any value they
/// already hold is kept and added to.
/// </remarks>
/// <param name="site">The site subject whose children are counted.</param>
public void deploySite(pipelineTaskMCSiteSubject site)
{
    name = site.name;
    Sites = 1;
    Pages = site.Count();

    foreach (var page in site)
    {
        // Children without any content of their own do not count as valid.
        if (!page.Any())
        {
            continue;
        }

        PagesValid++;
        Blocks += page.Count();

        foreach (var block in page)
        {
            Streams += block.Count();
        }
    }
}
/// <summary>
/// Task builder for <see cref="imbMCRepository"/> level of subject. Sends to next if task is not with <see cref="pipelineTaskMCRepoSubject"/>
/// </summary>
/// <remarks>
/// Loads the repository named by the subject, creates one <see cref="pipelineTaskMCSiteSubject"/> per accepted
/// web site and pushes a task for each onto the scheduled tasks of the context. When the subject lists
/// site targets, only web sites whose clean case name matches a target are accepted; a clean case name is
/// never accepted twice. A previously commented-out filter based on <c>WebSiteSample</c> was removed.
/// </remarks>
/// <param name="task">The task.</param>
/// <returns><c>next</c> when the task does not carry a repository subject; otherwise <c>forward</c>.</returns>
/// <exception cref="aceScienceException">Thrown when at least one requested site target was not found among the loaded web sites.</exception>
public override IPipelineNode process(IPipelineTask task)
{
    pipelineTask<pipelineTaskMCRepoSubject> realTask = task as pipelineTask<pipelineTaskMCRepoSubject>;
    if (realTask == null)
    {
        return next;
    }

    pipelineTaskMCRepoSubject realSubject = realTask.subject;

    folderNode repoFolder = appManager.Application.folder_resources.Add(imbMCManager.MCRepo_DefaultDirectoryName, "MC Repositories", "Root directory with repositories of Crawled Web content");
    imbMCRepository repo = realSubject.MCRepoName.LoadDataStructure<imbMCRepository>(repoFolder, task.context.logger);

    imbMCDocumentRepositorium docRepo = new imbMCDocumentRepositorium();
    docRepo.webRepository = repo;
    realSubject.mcElement = docRepo;
    realSubject.MCRepo = repo;

    List<imbMCWebSite> websites = repo.GetAllWebSites(task.context.logger, takeSetup);

    if (!websites.Any())
    {
        task.context.logger.log(this.name + " Failed --- no web sites loaded");
    }

    // Requested targets, normalized the same way as the site names they are compared with.
    List<String> needle = new List<String>();
    realSubject.MCSiteTargets.ForEach(x => needle.Add(pipelineSubjectTools.GetCleanCaseName(x)));

    // The target list is not modified below, so the "is filtering active" test is loop-invariant.
    Boolean filterByTargets = realSubject.MCSiteTargets.Any();

    // Names of sites already accepted; a set keeps the duplicate check cheap.
    HashSet<String> urls = new HashSet<String>();

    foreach (imbMCWebSite site in websites)
    {
        String sName = pipelineSubjectTools.GetCleanCaseName(site.domainInfo.urlProper);

        if (filterByTargets && !needle.Contains(sName))
        {
            continue; // not among the requested targets
        }

        if (urls.Contains(sName))
        {
            continue; // a site with the same clean name was already accepted
        }

        pipelineTaskMCSiteSubject mCSiteSubject = new pipelineTaskMCSiteSubject();
        mCSiteSubject.MCSite = site;

        imbMCDocumentSet docSet = new imbMCDocumentSet();
        docRepo.Add(docSet);

        mCSiteSubject.mcElement = docSet;
        mCSiteSubject.contentLevelType = flags.token.cnt_level.mcSite;
        mCSiteSubject.name = sName;
        mCSiteSubject.parent = realSubject;

        realSubject.Add(mCSiteSubject);
        urls.Add(mCSiteSubject.name);

        pipelineTask<pipelineTaskMCSiteSubject> taskForSite = new pipelineTask<pipelineTaskMCSiteSubject>(mCSiteSubject);
        task.context.scheduledTasks.Push(taskForSite);
    }

    // A target is missing only if no accepted site carries its name. Filtering by name (instead of
    // removing one list entry per accepted site) keeps duplicated targets from being reported as
    // missing when the site was in fact found.
    List<String> missing = needle.Where(x => !urls.Contains(x)).ToList();

    if (missing.Any())
    {
        String nd = "";
        foreach (String missingName in missing)
        {
            nd += missingName + " ";
        }

        throw new aceScienceException("Some sites are not found in the MC Repository!! [" + nd + "]", null, realSubject, "Sites not loaded [" + nd + "]", this);
    }

    return forward;
}
/// <summary>
/// Looks up the tokens stored in <c>tokenBySite</c> under the given site subject and returns them as a list.
/// </summary>
/// <param name="site">The site subject used as the lookup key.</param>
/// <returns>The result of calling <c>ToList()</c> on the entry found for <paramref name="site"/>.</returns>
public List<pipelineTaskSubjectContentToken> GetTokensForSite(pipelineTaskMCSiteSubject site)
{
    // NOTE(review): behaviour for a site with no entry depends on the type of tokenBySite - confirm.
    var siteTokens = this.tokenBySite[site];
    return siteTokens.ToList();
}
/// <summary>
/// Creates the knowledge instance for a site subject: binds the subject via <c>SetSiteSubject</c>, names the
/// instance after the domain root name of the bound site and stores the given class set as <c>industry</c>.
/// </summary>
/// <param name="_MCSiteSubject">The site subject to bind to this instance.</param>
/// <param name="classSet">The class set assigned to <c>industry</c>.</param>
public webCaseKnowledge(pipelineTaskMCSiteSubject _MCSiteSubject, IDocumentSetClass classSet)
{
    // Must run first: it populates MCSite, which the name is read from.
    SetSiteSubject(_MCSiteSubject);

    var domain = MCSite.domainInfo;
    name = domain.domainRootName;

    industry = classSet;
}
/// <summary>
/// Binds the given site subject to this instance and refreshes <c>MCSite</c> and <c>MCSiteElement</c>
/// from the subject that was stored.
/// </summary>
/// <param name="_MCSiteSubject">The site subject to bind.</param>
public void SetSiteSubject(pipelineTaskMCSiteSubject _MCSiteSubject)
{
    MCSiteSubject = _MCSiteSubject;

    // Read the dependent values back from the stored subject, not from the parameter.
    var boundSubject = MCSiteSubject;
    MCSite = boundSubject.MCSite;
    MCSiteElement = boundSubject.mcElement;
}