/// <summary>
/// Language filter for page-level tasks. Sends to next if task is not with <see cref="pipelineTaskMCPageSubject"/>.
/// Pages whose detected language is not the primary language go to the trash bin of the model, as do
/// pages arriving after the per-parent counter of valid pages exceeded <c>limitValidPageCount</c>.
/// </summary>
/// <param name="task">The task.</param>
/// <returns><c>next</c> for other task types, <c>forward</c> for an accepted page, otherwise the trash bin of the task model</returns>
public override IPipelineNode process(IPipelineTask task)
{
    var pageTask = task as pipelineTask<pipelineTaskMCPageSubject>;
    if (pageTask == null)
    {
        return next;
    }

    pipelineTaskMCPageSubject pageSubject = pageTask.subject;

    // Evaluate the language of the page text content
    var tokens = mLanguageEval.GetAllProperTokensSortedByFrequency(pageSubject.MCPage.TextContent, settings.tokenLengthMin);
    var evaluation = mLanguageEval.evaluate(settings, tokens);

    if (evaluation.result_language != languagePrimary)
    {
        return task.model.trashBin;
    }

    // The counter is kept in the task context, under a key derived from the name of the parent subject.
    // NOTE(review): presumably the call changes the stored counter by 1 and returns its value - confirm
    // whether the returned value is the one before or after the change.
    Int32 validPages = task.context.GetAndChangeCustomDataProperty("validPageCount_" + pageSubject.parent.name, 1);

    if (validPages > limitValidPageCount)
    {
        return task.model.trashBin;
    }

    return forward;
}
/// <summary>
/// It will be called by <see cref="M:imbNLP.PartOfSpeech.pipeline.machine.pipelineMachine.run(imbNLP.PartOfSpeech.pipeline.core.IPipelineModel)" /> method to get initial tasks to run.
/// Builds a single repository-level task: the repository name is the first <see cref="String"/> found in the resources,
/// and - when an <see cref="IDocumentSetClass"/> is found - its web site sample is copied both into the sample and into the site targets of the subject.
/// </summary>
/// <param name="resources">Arbitrary resources that might be used for task creation</param>
/// <returns>List with exactly one task, wrapping the <see cref="pipelineTaskMCRepoSubject"/></returns>
public override List<IPipelineTask> createPrimaryTasks(object[] resources)
{
    String repositoryName = resources.getFirstOfType<String>();

    // The result is not used below; the lookup itself is kept as it was
    List<String> targetNames = resources.getFirstOfType<List<String>>();

    var repoSubject = new pipelineTaskMCRepoSubject
    {
        MCRepoName = repositoryName
    };

    var documentSetClass = resources.getFirstOfType<IDocumentSetClass>(false, null, true);
    if (documentSetClass != null)
    {
        repoSubject.WebSiteSample.AddRange(documentSetClass.WebSiteSample);
        repoSubject.MCSiteTargets.AddRange(documentSetClass.WebSiteSample);
    }

    var primaryTask = new pipelineTask<pipelineTaskMCRepoSubject>(repoSubject);

    return new List<IPipelineTask> { primaryTask };
}
/// <summary>
/// Task builder for the web site level of subject: takes the pages of the site from the repository of the parent subject
/// and schedules one <see cref="pipelineTaskMCPageSubject"/> task per page. Sends to next if task is not with <see cref="pipelineTaskMCSiteSubject"/>
/// </summary>
/// <param name="task">The task.</param>
/// <returns><c>next</c> for other task types, the trash bin of the task model when the repository is not available, otherwise <c>forward</c></returns>
public override IPipelineNode process(IPipelineTask task)
{
    var realTask = task as pipelineTask<pipelineTaskMCSiteSubject>;
    if (realTask == null)
    {
        return next;
    }

    pipelineTaskMCSiteSubject realSubject = realTask.subject;

    // The parent may be missing or of another subject type, so the cast result has to be checked
    var repoSubject = realSubject.parent as pipelineTaskMCRepoSubject;
    var repo = repoSubject == null ? null : repoSubject.MCRepo;

    if (repo == null)
    {
        // Without the repository there are no pages to load: report and discard the task,
        // instead of failing with a NullReferenceException on the call below
        task.context.logger.log("MCRepo is null at [" + task.GetStringInfo() + "]");
        return task.model.trashBin;
    }

    List<imbMCWebPage> listPages = repo.GetAllWebPages(realSubject.MCSite, null, takeSetup);

    if (doFilterOutDuplicates)
    {
        listPages = listPages.GetUniquePages();
    }

    if (doSortPagesByTextSize)
    {
        listPages.Sort(SortByPageSize);
    }

    foreach (imbMCWebPage page in listPages)
    {
        // Document element of the page, registered under the element of the site
        imbMCDocument doc = new imbMCDocument();
        doc.webPage = page;
        realSubject.mcElement.Add(doc);

        // Page-level subject, linked to the site subject
        var mCPageSubject = new pipelineTaskMCPageSubject();
        mCPageSubject.mcElement = doc;
        mCPageSubject.MCPage = page;
        mCPageSubject.parent = realSubject;
        realSubject.Add(mCPageSubject);

        pipelineTask<pipelineTaskMCPageSubject> taskForPage = new pipelineTask<pipelineTaskMCPageSubject>(mCPageSubject);
        task.context.scheduledTasks.Push(taskForPage);
    }

    return forward;
}
/// <summary>
/// Routes the task according to the flags of its subject: the flag bag of the subject is queried with the flags of this node and <c>queryType</c>.
/// Sends to next if task is not a <c>pipelineTask</c> of <typeparamref name="T"/>.
/// </summary>
/// <param name="task">The task.</param>
/// <returns><c>forward</c> when the flag query is satisfied, otherwise <c>next</c></returns>
public override IPipelineNode process(IPipelineTask task)
{
    pipelineTask<T> realTask = task as pipelineTask<T>;

    // A task of another subject type cannot be evaluated here - pass it on instead of dereferencing null
    if (realTask == null)
    {
        return next;
    }

    if (realTask.subject.flagBag.ContainsByEnum(flags.ToArray(), queryType))
    {
        return forward;
    }

    return next;
}
/// <summary>
/// Adds container type flags to the subject, according to the tag names reported for the HTML node of its element:
/// <c>a</c> marks a link, while <c>title</c> and the heading tags (<c>h</c>, <c>h1</c> - <c>h6</c>) mark a title.
/// Sends to next if task is not with <see cref="pipelineTaskSubjectContentToken"/>.
/// </summary>
/// <param name="task">The task.</param>
/// <returns><c>next</c> for other task types, otherwise <c>forward</c> - also when there is no element or no HTML node to inspect</returns>
public override IPipelineNode process(IPipelineTask task)
{
    var tokenTask = task as pipelineTask<pipelineTaskSubjectContentToken>;
    if (tokenTask == null)
    {
        return next;
    }

    pipelineTaskSubjectContentToken subject = tokenTask.subject;

    if (subject.mcElement == null)
    {
        return forward;
    }

    HtmlNode htmlNode = subject.mcElement.htmlNode;
    if (htmlNode == null)
    {
        return forward;
    }

    List<String> tagNames = htmlNode.GetTagNames();

    foreach (var tagName in tagNames)
    {
        switch (tagName)
        {
            case "a":
                subject.flagBag.AddUnique(cnt_containerType.link);
                break;

            case "title":
            case "h":
            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
                subject.flagBag.AddUnique(cnt_containerType.title);
                break;
        }
    }

    return forward;
}
/// <summary>
/// Task builder for the page level of subject: parses the HTML source of the page, runs the block composer over it
/// and schedules one <see cref="pipelineTaskSubjectContentToken"/> task per composed block. Sends to next if task is not with <see cref="pipelineTaskMCPageSubject"/>
/// </summary>
/// <param name="task">The task.</param>
/// <returns><c>next</c> for other task types, otherwise <c>forward</c></returns>
public override IPipelineNode process(IPipelineTask task)
{
    pipelineTask<pipelineTaskMCPageSubject> realTask = task as pipelineTask<pipelineTaskMCPageSubject>;
    if (realTask == null)
    {
        return next;
    }

    pipelineTaskMCPageSubject realSubject = realTask.subject;

    HtmlDocument html = new HtmlDocument();
    html.LoadHtml(realSubject.MCPage.HtmlSourceCode);
    realSubject.htmlDocument = html;

    List<imbMCBlock> blocks = blockComposer.process(html, realSubject.name);

    if (!blocks.Any())
    {
        // The parent is not guaranteed to be a site subject: fall back to the name of the page subject,
        // so the diagnostic message never fails with a NullReferenceException
        pipelineTaskMCSiteSubject siteSubject = realSubject.parent as pipelineTaskMCSiteSubject;
        String reportName = siteSubject != null ? siteSubject.name : realSubject.name;

        task.context.logger.log("Block composer returned zero blocks for [" + reportName + "]");
    }

    foreach (imbMCBlock block in blocks)
    {
        pipelineTaskSubjectContentToken tokenSubject = new pipelineTaskSubjectContentToken();
        tokenSubject.name = block.name;
        tokenSubject.contentLevelType = flags.token.cnt_level.mcBlock;
        tokenSubject.mcElement = block;
        tokenSubject.currentForm = block.content;

        // Register the block under the element of the page and the block subject under the page subject.
        // NOTE(review): parent of the block subject is not assigned explicitly here - presumably Add() links it; confirm
        realSubject.mcElement.Add(tokenSubject.mcElement);
        realSubject.Add(tokenSubject);

        pipelineTask<pipelineTaskSubjectContentToken> taskForElement = new pipelineTask<pipelineTaskSubjectContentToken>(tokenSubject);
        task.context.scheduledTasks.Push(taskForElement);
    }

    return forward;
}
/// <summary>
/// Task builder for the token stream level of subject: runs the token composer over the stream element
/// and schedules one token-level task per composed token.
/// Sends to next if task is not with <see cref="pipelineTaskSubjectContentToken"/> or the subject is not at the token stream level.
/// </summary>
/// <param name="task">The task.</param>
/// <returns><c>next</c> when the task is not handled by this node, otherwise <c>forward</c></returns>
public override IPipelineNode process(IPipelineTask task)
{
    var streamTask = task as pipelineTask<pipelineTaskSubjectContentToken>;
    if (streamTask == null)
    {
        return next;
    }

    pipelineTaskSubjectContentToken streamSubject = streamTask.subject;
    if (streamSubject.contentLevelType != flags.token.cnt_level.mcTokenStream)
    {
        return next;
    }

    List<imbMCToken> composedTokens = tokenComposer.process(streamSubject.mcElement as imbMCStream);

    foreach (imbMCToken composedToken in composedTokens)
    {
        var childSubject = new pipelineTaskSubjectContentToken();
        childSubject.mcElement = composedToken;

        // The token element is registered under the element of the stream
        streamSubject.mcElement.Add(composedToken);

        childSubject.name = composedToken.name;
        childSubject.contentLevelType = flags.token.cnt_level.mcToken;
        childSubject.parent = streamSubject;
        childSubject.currentForm = composedToken.content;

        streamSubject.Add(childSubject);

        var childTask = new pipelineTask<pipelineTaskSubjectContentToken>(childSubject);
        task.context.scheduledTasks.Push(childTask);
    }

    return forward;
}
/// <summary>
/// Passes the current form of the subject to the resolver and adds the returned flags to the flag bag of the subject.
/// Sends to next if task is not with <see cref="pipelineTaskSubjectContentToken"/>.
/// </summary>
/// <param name="task">The task.</param>
/// <returns><c>next</c> for other task types, otherwise <c>forward</c></returns>
public override IPipelineNode process(IPipelineTask task)
{
    var tokenTask = task as pipelineTask<pipelineTaskSubjectContentToken>;
    if (tokenTask == null)
    {
        return next;
    }

    pipelineTaskSubjectContentToken subject = tokenTask.subject;

    // Tagging: whatever the resolver reports for the current form goes into the flag bag
    var resolvedFlags = resolver.process(subject.currentForm);
    subject.flagBag.AddRange(resolvedFlags, true);

    return forward;
}
/// <summary>
/// Language filter for token-level subjects: evaluates the language of the current form; when the primary language is among
/// the detected languages, all detected languages are added to the flag bag of the subject.
/// Sends to next if task is not with <see cref="pipelineTaskSubjectContentToken"/> or the subject is not at the token level.
/// </summary>
/// <param name="task">The task.</param>
/// <returns><c>next</c> when the task is not handled by this node, <c>forward</c> when the primary language is detected, otherwise the trash bin of the task model</returns>
public override IPipelineNode process(IPipelineTask task)
{
    pipelineTask<pipelineTaskSubjectContentToken> realTask = task as pipelineTask<pipelineTaskSubjectContentToken>;
    if (realTask == null)
    {
        return next;
    }

    pipelineTaskSubjectContentToken realSubject = realTask.subject;
    if (realSubject.contentLevelType != cnt_level.mcToken)
    {
        return next;
    }

    var tkns = mLanguageEval.GetAllProperTokensSortedByFrequency(realSubject.currentForm, settings.tokenLengthMin);
    var mle = mLanguageEval.evaluate(settings, tkns);

    if (!mle.languageEnums.Contains(languagePrimary))
    {
        return task.model.trashBin;
    }

    // The language values are copied into a list of objects, as passed to AddRange of the flag bag
    List<Object> detectedLanguages = new List<object>();
    mle.languageEnums.ForEach(x => detectedLanguages.Add(x));
    realSubject.flagBag.AddRange(detectedLanguages);

    return forward;
}
/// <summary>
/// Task builder for the block level of subject: runs the stream composer over the block element
/// and schedules one token stream task per composed stream.
/// Sends to next if task is not with <see cref="pipelineTaskSubjectContentToken"/> or the subject is not at the block level.
/// </summary>
/// <param name="task">The task.</param>
/// <returns><c>next</c> when the task is not handled by this node, otherwise <c>forward</c></returns>
public override IPipelineNode process(IPipelineTask task)
{
    var blockTask = task as pipelineTask<pipelineTaskSubjectContentToken>;
    if (blockTask == null)
    {
        return next;
    }

    pipelineTaskSubjectContentToken blockSubject = blockTask.subject;
    if (blockSubject.contentLevelType != flags.token.cnt_level.mcBlock)
    {
        return next;
    }

    imbMCBlock block = blockSubject.mcElement as imbMCBlock;
    var composedStreams = streamComposer.process(block);

    foreach (imbMCStream composedStream in composedStreams)
    {
        // Content of the stream is both the initial and the current form of the new subject
        var childSubject = new pipelineTaskSubjectContentToken
        {
            contentLevelType = flags.token.cnt_level.mcTokenStream,
            mcElement = composedStream,
            name = composedStream.name,
            currentForm = composedStream.content,
            initialForm = composedStream.content,
            parent = blockSubject
        };

        blockSubject.Add(childSubject);

        var childTask = new pipelineTask<pipelineTaskSubjectContentToken>(childSubject);
        task.context.scheduledTasks.Push(childTask);
    }

    return forward;
}
/// <summary>
/// Task builder for <see cref="imbMCRepository"/> level of subject: loads the repository named by the subject and schedules
/// one <see cref="pipelineTaskMCSiteSubject"/> task per accepted web site. When the subject has site targets, only the web sites
/// matching a target are accepted; a web site name is never accepted twice.
/// Sends to next if task is not with <see cref="pipelineTaskMCRepoSubject"/>
/// </summary>
/// <param name="task">The task.</param>
/// <returns><c>next</c> for other task types, otherwise <c>forward</c></returns>
/// <exception cref="aceScienceException">Thrown when one or more of the site targets was not found among the web sites of the repository</exception>
public override IPipelineNode process(IPipelineTask task)
{
    pipelineTask<pipelineTaskMCRepoSubject> realTask = task as pipelineTask<pipelineTaskMCRepoSubject>;
    if (realTask == null)
    {
        return next;
    }

    pipelineTaskMCRepoSubject realSubject = realTask.subject;

    folderNode repoFolder = appManager.Application.folder_resources.Add(imbMCManager.MCRepo_DefaultDirectoryName, "MC Repositories", "Root directory with repositories of Crawled Web content");

    imbMCRepository repo = realSubject.MCRepoName.LoadDataStructure<imbMCRepository>(repoFolder, task.context.logger);

    imbMCDocumentRepositorium docRepo = new imbMCDocumentRepositorium();
    docRepo.webRepository = repo;
    realSubject.mcElement = docRepo;
    realSubject.MCRepo = repo;

    List<imbMCWebSite> websites = repo.GetAllWebSites(task.context.logger, takeSetup);

    if (!websites.Any())
    {
        task.context.logger.log(this.name + " Failed --- no web sites loaded");
    }

    // Clean names of the requested sites: the list keeps the requested order, the set gives fast lookup
    List<String> needle = new List<string>();
    realSubject.MCSiteTargets.ForEach(x => needle.Add(pipelineSubjectTools.GetCleanCaseName(x)));

    HashSet<String> requested = new HashSet<String>(needle);
    Boolean restrictToTargets = needle.Count > 0;

    // Names of the sites accepted so far
    HashSet<String> urls = new HashSet<String>();

    foreach (imbMCWebSite site in websites)
    {
        String sName = pipelineSubjectTools.GetCleanCaseName(site.domainInfo.urlProper);

        if (restrictToTargets && !requested.Contains(sName))
        {
            continue; // not among the requested sites
        }

        if (urls.Contains(sName))
        {
            continue; // site with the same name is already scheduled
        }

        pipelineTaskMCSiteSubject mCSiteSubject = new pipelineTaskMCSiteSubject();
        mCSiteSubject.MCSite = site;

        imbMCDocumentSet docSet = new imbMCDocumentSet();
        docRepo.Add(docSet);

        mCSiteSubject.mcElement = docSet;
        mCSiteSubject.contentLevelType = flags.token.cnt_level.mcSite;
        mCSiteSubject.name = sName;
        mCSiteSubject.parent = realSubject;
        realSubject.Add(mCSiteSubject);

        urls.Add(mCSiteSubject.name);

        pipelineTask<pipelineTaskMCSiteSubject> taskForSite = new pipelineTask<pipelineTaskMCSiteSubject>(mCSiteSubject);
        task.context.scheduledTasks.Push(taskForSite);
    }

    // Requested sites that were not accepted. Each name is reported once, so a target that was requested
    // more than once and found is not reported as missing.
    List<String> missing = new List<String>();
    foreach (String requestedName in needle)
    {
        if (!urls.Contains(requestedName) && !missing.Contains(requestedName))
        {
            missing.Add(requestedName);
        }
    }

    if (missing.Count > 0)
    {
        // Every name is followed by a single space
        String nd = String.Join(" ", missing.ToArray()) + " ";

        throw new aceScienceException("Some sites are not found in the MC Repository!! [" + nd + "]", null, realSubject, "Sites not loaded [" + nd + "]", this);
    }

    return forward;
}
/// <summary>
/// Adds flags describing the shape of the current form of the subject: token stream properties (letter case and ending punctuation),
/// numeric content, letter content and letter case, presence of symbols and punctuation. For a token-level subject whose parent
/// stream is flagged as a proper case sentence, potential company name part / person name flags are added as well.
/// Sends to next if task is not with <see cref="pipelineTaskSubjectContentToken"/>.
/// </summary>
/// <param name="task">The task.</param>
/// <returns><c>next</c> for other task types, otherwise <c>forward</c></returns>
public override IPipelineNode process(IPipelineTask task)
{
    pipelineTask<pipelineTaskSubjectContentToken> realTask = task as pipelineTask<pipelineTaskSubjectContentToken>;
    if (realTask == null)
    {
        return next;
    }

    pipelineTaskSubjectContentToken realSubject = realTask.subject;
    var form = realSubject.currentForm;

    // --- token stream flags
    if (form.isTokenStream() && form.isWithLetterChars())
    {
        if (form.isStrictSentenceCase())
        {
            realSubject.flagBag.AddUnique(tkn_stream.sentenceProperCase);
        }

        if (form.isNoLowerCaseTokenStream())
        {
            realSubject.flagBag.AddUnique(tkn_stream.titleAllCaps);
        }

        if (form.isEndsWithEnumerationPunctation())
        {
            realSubject.flagBag.AddUnique(tkn_stream.titleForEnumeration);
        }

        if (form.isEndsWithExclamationPunctation())
        {
            realSubject.flagBag.AddUnique(tkn_stream.sentenceEclamationEnd);
        }

        if (form.isEndsWithQuestionPunctation())
        {
            realSubject.flagBag.AddUnique(tkn_stream.sentenceQuestionEnd);
        }
    }

    // --- numeric flags: the number format flags exclude each other, the percentage flag is independent
    if (form.isWithNumericChars())
    {
        realSubject.flagBag.AddUnique(tkn_contains.number);

        if (form.isNumber())
        {
            realSubject.flagBag.AddUnique(tkn_numeric.numberClean);
        }
        else if (form.isNumberFormatted())
        {
            realSubject.flagBag.AddUnique(tkn_numeric.numberInFormat);
        }
        else if (form.isDecimalNumber())
        {
            realSubject.flagBag.AddUnique(tkn_numeric.numberDecimal);
        }
        else if (form.isOrdinalNumber())
        {
            realSubject.flagBag.AddUnique(tkn_numeric.numberOrdinal);
        }

        if (form.isPercentageNumber())
        {
            realSubject.flagBag.AddUnique(tkn_numeric.numberInPercentage);
        }
    }

    // --- letter flags: exactly one letter case flag is set for a form containing letters
    if (form.isWithLetterChars())
    {
        realSubject.flagBag.AddUnique(tkn_contains.letter);

        if (form.isCleanWord())
        {
            realSubject.flagBag.AddUnique(tkn_contains.onlyLetters);
        }

        if (form.isAllLowerLetterCaseWord())
        {
            realSubject.flagBag.AddUnique(tkn_letterword.lowerCase);
        }
        else if (form.isFirstCapitalRestLowerCase())
        {
            realSubject.flagBag.AddUnique(tkn_letterword.firstCapitalRestLower);
        }
        else if (form.isAllCapitalLetterCaseWord())
        {
            realSubject.flagBag.AddUnique(tkn_letterword.upperCase);
        }
        else
        {
            realSubject.flagBag.AddUnique(tkn_letterword.inproperCase);
        }
    }

    // --- symbols and punctuation, by Unicode category
    if (form.isRegexMatch(@"\p{S}"))
    {
        realSubject.flagBag.AddUnique(tkn_contains.symbols);
    }

    if (form.isRegexMatch(@"\p{P}"))
    {
        realSubject.flagBag.AddUnique(tkn_contains.punctation);
    }

    // --- potential name flags: only for tokens, and only when the parent stream is available
    if (realSubject.contentLevelType == cnt_level.mcToken)
    {
        // The parent may be missing or of another subject type - in that case there is no stream context to test
        var streamSubject = realSubject.parent as pipelineTaskSubjectContentToken;

        if (streamSubject != null && streamSubject.flagBag.ContainsAny(new Object[] { tkn_stream.sentenceProperCase }))
        {
            if (realSubject.flagBag.ContainsAll(new Object[] { tkn_letterword.upperCase, tkn_contains.letter }))
            {
                realSubject.flagBag.Add(tkn_potential.companyNamePart);
            }

            if (realSubject.flagBag.ContainsAll(new Object[] { tkn_letterword.firstCapitalRestLower, tkn_contains.onlyLetters }))
            {
                realSubject.flagBag.Add(tkn_potential.personName);
            }
        }
    }

    return forward;
}