/// <summary>
        /// Language filter for page-level tasks. Sends to next if task is not with <see cref="pipelineTaskMCPageSubject"/>.
        /// The text content of the page is tokenized and evaluated with <c>mLanguageEval</c>; the task goes to the
        /// trash bin when the evaluated language is not <c>languagePrimary</c>, or when the per-parent value
        /// returned for "validPageCount_" + parent name is above <c>limitValidPageCount</c>.
        /// </summary>
        /// <param name="task">The task.</param>
        /// <returns><c>next</c> for foreign tasks; otherwise <c>forward</c> or the model's trash bin.</returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            var pageTask = task as pipelineTask <pipelineTaskMCPageSubject>;

            if (pageTask == null)
            {
                return next;
            }

            pipelineTaskMCPageSubject pageSubject = pageTask.subject;

            var properTokens = mLanguageEval.GetAllProperTokensSortedByFrequency(pageSubject.MCPage.TextContent, settings.tokenLengthMin);
            var evaluation   = mLanguageEval.evaluate(settings, properTokens);

            // Pages in a foreign language are rejected before the counter is touched.
            if (evaluation.result_language != languagePrimary)
            {
                return task.model.trashBin;
            }

            // NOTE(review): presumably increments the counter by 1 and returns the new value — confirm against GetAndChangeCustomDataProperty.
            Int32 validCount = task.context.GetAndChangeCustomDataProperty("validPageCount_" + pageSubject.parent.name, 1);

            if (validCount > limitValidPageCount)
            {
                return task.model.trashBin;
            }

            return forward;
        }
Example #2
0
        /// <summary>
        /// It will be called by <see cref="M:imbNLP.PartOfSpeech.pipeline.machine.pipelineMachine.run(imbNLP.PartOfSpeech.pipeline.core.IPipelineModel)" /> method to get initial tasks to run.
        /// Builds one repository-level task: the repository name is the first <see cref="String"/> in
        /// <paramref name="resources"/>, and, when an <c>IDocumentSetClass</c> is found there, its <c>WebSiteSample</c>
        /// is copied into both <c>WebSiteSample</c> and <c>MCSiteTargets</c> of the subject.
        /// </summary>
        /// <param name="resources">Arbitrary resources that might be used for task creation</param>
        /// <returns>List with the single task created for the repository subject.</returns>
        public override List <IPipelineTask> createPrimaryTasks(object[] resources)
        {
            String repoName = resources.getFirstOfType <String>();

            // NOTE(review): the list is looked up but never used below; MCSiteTargets is filled from
            // WebSiteSample instead — confirm whether this lookup was meant to feed MCSiteTargets.
            List <String> targetNames = resources.getFirstOfType <List <String> >();

            var repoSubject = new pipelineTaskMCRepoSubject
            {
                MCRepoName = repoName
            };

            var documentSet = resources.getFirstOfType <IDocumentSetClass>(false, null, true);

            if (documentSet != null)
            {
                repoSubject.WebSiteSample.AddRange(documentSet.WebSiteSample);
                repoSubject.MCSiteTargets.AddRange(documentSet.WebSiteSample);
            }

            var repoTask = new pipelineTask <pipelineTaskMCRepoSubject>(repoSubject);

            return new List <IPipelineTask> { repoTask };
        }
Example #3
0
        /// <summary>
        /// Task builder for web site level of subject. Sends to next if task is not with <see cref="pipelineTaskMCSiteSubject"/>.
        /// Loads the pages of the site from the repository held by the parent <see cref="pipelineTaskMCRepoSubject"/>
        /// and schedules one <see cref="pipelineTaskMCPageSubject"/> task per page.
        /// </summary>
        /// <param name="task">The task.</param>
        /// <returns>
        /// <c>next</c> for foreign tasks; the model's trash bin when no repository is reachable from the subject
        /// (the problem is logged); otherwise <c>forward</c>.
        /// </returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            var realTask = task as pipelineTask <pipelineTaskMCSiteSubject>;

            if (realTask == null)
            {
                return(next);
            }

            pipelineTaskMCSiteSubject realSubject = realTask.subject;

            var repoSubject = realSubject.parent as pipelineTaskMCRepoSubject;

            // Without a repository there is nothing to load pages from. Previously this case was only
            // logged and the code went on to dereference the null reference.
            if (repoSubject == null || repoSubject.MCRepo == null)
            {
                task.context.logger.log("MCRepo is null at [" + task.GetStringInfo() + "]");
                return(task.model.trashBin);
            }

            var repo = repoSubject.MCRepo;

            List <imbMCWebPage> listPages = repo.GetAllWebPages(realSubject.MCSite, null, takeSetup);

            if (doFilterOutDuplicates)
            {
                listPages = listPages.GetUniquePages();
            }

            if (doSortPagesByTextSize)
            {
                listPages.Sort(SortByPageSize);
            }

            foreach (imbMCWebPage page in listPages)
            {
                var mCPageSubject = new pipelineTaskMCPageSubject();

                // Each page gets its own document element, registered under the site's element.
                imbMCDocument doc = new imbMCDocument();
                doc.webPage = page;
                realSubject.mcElement.Add(doc);

                mCPageSubject.mcElement = doc;
                mCPageSubject.MCPage    = page;
                // mCPageSubject.name = page.entry.HashCode;
                mCPageSubject.parent = realSubject;

                realSubject.Add(mCPageSubject);

                pipelineTask <pipelineTaskMCPageSubject> taskForPage = new pipelineTask <pipelineTaskMCPageSubject>(mCPageSubject);

                task.context.scheduledTasks.Push(taskForPage);
            }

            return(forward);
        }
        /// <summary>
        /// Flag query node. Sends to next if task is not a <c>pipelineTask</c> of <typeparamref name="T"/>;
        /// otherwise forwards the task when the subject's flag bag satisfies the <c>flags</c> / <c>queryType</c> query.
        /// </summary>
        /// <param name="task">The task.</param>
        /// <returns><c>forward</c> when the flag query matches, <c>next</c> in all other cases.</returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            pipelineTask <T> realTask = task as pipelineTask <T>;

            // A task of another subject type cannot be queried; pass it on instead of
            // dereferencing the failed cast.
            if (realTask == null)
            {
                return(next);
            }

            if (realTask.subject.flagBag.ContainsByEnum(flags.ToArray(), queryType))
            {
                return(forward);
            }

            return(next);
        }
        /// <summary>
        /// Flags a content subject according to the HTML tag names reported for its node: "a" adds
        /// <c>cnt_containerType.link</c>; "title", "h" and "h1".."h6" add <c>cnt_containerType.title</c>.
        /// Sends to next if task is not with <see cref="pipelineTaskSubjectContentToken"/>.
        /// </summary>
        /// <param name="task">The task.</param>
        /// <returns><c>next</c> for foreign tasks, otherwise <c>forward</c> (also when there is no element or HTML node).</returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            var contentTask = task as pipelineTask <pipelineTaskSubjectContentToken>;

            if (contentTask == null)
            {
                return next;
            }

            pipelineTaskSubjectContentToken subject = contentTask.subject;

            // Nothing to inspect: the subject is forwarded without any flag being added.
            if (subject.mcElement == null)
            {
                return forward;
            }

            HtmlNode htmlNode = subject.mcElement.htmlNode;

            if (htmlNode == null)
            {
                return forward;
            }

            List <String> tagNames = htmlNode.GetTagNames();

            foreach (String tagName in tagNames)
            {
                switch (tagName)
                {
                case "a":
                    subject.flagBag.AddUnique(cnt_containerType.link);
                    break;

                // Document title and all heading levels share the same container flag.
                case "title":
                case "h":
                case "h1":
                case "h2":
                case "h3":
                case "h4":
                case "h5":
                case "h6":
                    subject.flagBag.AddUnique(cnt_containerType.title);
                    break;
                }
            }

            return forward;
        }
Example #6
0
        /// <summary>
        /// Task builder for content blocks of a web page. Sends to next if task is not with <see cref="pipelineTaskMCPageSubject"/>.
        /// Parses the HTML source of the page, lets <c>blockComposer</c> split it into blocks and schedules one
        /// <see cref="pipelineTaskSubjectContentToken"/> task (content level <c>mcBlock</c>) per block.
        /// </summary>
        /// <param name="task">The task.</param>
        /// <returns><c>next</c> for foreign tasks, otherwise <c>forward</c>.</returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            pipelineTask <pipelineTaskMCPageSubject> realTask = task as pipelineTask <pipelineTaskMCPageSubject>;

            if (realTask == null)
            {
                return(next);
            }

            pipelineTaskMCPageSubject realSubject = realTask.subject;

            HtmlDocument html = new HtmlDocument();

            html.LoadHtml(realSubject.MCPage.HtmlSourceCode);

            realSubject.htmlDocument = html;

            List <imbMCBlock> blocks = blockComposer.process(html, realSubject.name);

            if (!blocks.Any())
            {
                // The message names the parent site; when the parent is not a site subject the cast
                // yields null, so fall back to the page's own name instead of dereferencing it.
                pipelineTaskMCSiteSubject siteSubject = realSubject.parent as pipelineTaskMCSiteSubject;
                String ownerName = (siteSubject != null) ? siteSubject.name : realSubject.name;

                task.context.logger.log("Block composer returned zero blocks for [" + ownerName + "]");
            }

            foreach (imbMCBlock block in blocks)
            {
                pipelineTaskSubjectContentToken tokenSubject = new pipelineTaskSubjectContentToken();
                tokenSubject.name             = block.name;
                tokenSubject.contentLevelType = flags.token.cnt_level.mcBlock;
                tokenSubject.mcElement        = block;
                tokenSubject.currentForm      = block.content;

                // NOTE(review): unlike the stream and token builders, parent is not assigned here —
                // confirm that realSubject.Add() links the child to its parent.
                realSubject.mcElement.Add(tokenSubject.mcElement);
                realSubject.Add(tokenSubject);

                pipelineTask <pipelineTaskSubjectContentToken> taskForElement = new pipelineTask <pipelineTaskSubjectContentToken>(tokenSubject);

                task.context.scheduledTasks.Push(taskForElement);
            }

            return(forward);
        }
        /// <summary>
        /// Token builder for stream-level subjects. Sends to next if task is not with
        /// <see cref="pipelineTaskSubjectContentToken"/> or if the subject is not at <c>mcTokenStream</c> level.
        /// Each token produced by <c>tokenComposer</c> is attached to the stream and scheduled as its own task
        /// at <c>mcToken</c> level.
        /// </summary>
        /// <param name="task">The task.</param>
        /// <returns><c>next</c> when the task is not handled here, otherwise <c>forward</c>.</returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            var streamTask = task as pipelineTask <pipelineTaskSubjectContentToken>;

            if (streamTask == null)
            {
                return next;
            }

            pipelineTaskSubjectContentToken streamSubject = streamTask.subject;

            if (streamSubject.contentLevelType != flags.token.cnt_level.mcTokenStream)
            {
                return next;
            }

            // NOTE(review): the composer receives null when mcElement is not an imbMCStream — confirm it tolerates that.
            List <imbMCToken> composedTokens = tokenComposer.process(streamSubject.mcElement as imbMCStream);

            foreach (imbMCToken composed in composedTokens)
            {
                var childSubject = new pipelineTaskSubjectContentToken();
                childSubject.mcElement = composed;

                // The token is registered with the stream element before its name and content are read.
                streamSubject.mcElement.Add(composed);

                childSubject.name             = composed.name;
                childSubject.contentLevelType = flags.token.cnt_level.mcToken;
                childSubject.parent           = streamSubject;
                childSubject.currentForm      = composed.content;

                streamSubject.Add(childSubject);

                task.context.scheduledTasks.Push(new pipelineTask <pipelineTaskSubjectContentToken>(childSubject));
            }

            return forward;
        }
Example #8
0
        /// <summary>
        /// Runs <c>resolver</c> on the current form of the subject and adds the returned flags to its flag bag.
        /// Sends to next if task is not with <see cref="pipelineTaskSubjectContentToken"/>.
        /// </summary>
        /// <param name="task">The task.</param>
        /// <returns><c>next</c> for foreign tasks, otherwise <c>forward</c>.</returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            var contentTask = task as pipelineTask <pipelineTaskSubjectContentToken>;

            if (contentTask == null)
            {
                return next;
            }

            pipelineTaskSubjectContentToken subject = contentTask.subject;

            var resolvedFlags = resolver.process(subject.currentForm);
            subject.flagBag.AddRange(resolvedFlags, true);

            return forward;
        }
Example #9
0
        /// <summary>
        /// Language filter for token-level subjects. Sends to next if task is not with
        /// <see cref="pipelineTaskSubjectContentToken"/> or if the subject is not at <c>mcToken</c> level.
        /// The current form is evaluated with <c>mLanguageEval</c>; when the detected languages contain
        /// <c>languagePrimary</c>, all detected languages are added to the flag bag and the task is forwarded,
        /// otherwise it is sent to the trash bin.
        /// </summary>
        /// <param name="task">The task.</param>
        /// <returns><c>next</c>, <c>forward</c> or the model's trash bin, as described above.</returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            pipelineTask <pipelineTaskSubjectContentToken> realTask = task as pipelineTask <pipelineTaskSubjectContentToken>;

            if (realTask == null)
            {
                return(next);
            }

            pipelineTaskSubjectContentToken realSubject = realTask.subject;

            if (realSubject.contentLevelType != cnt_level.mcToken)
            {
                return(next);
            }

            var tkns = mLanguageEval.GetAllProperTokensSortedByFrequency(realSubject.currentForm, settings.tokenLengthMin);

            var mle = mLanguageEval.evaluate(settings, tkns);

            if (!mle.languageEnums.Contains(languagePrimary))
            {
                return(task.model.trashBin);
            }

            // The flag bag is fed through a list of Object, so the language values are copied into one.
            List <Object> languageFlags = new List <object>();

            foreach (var language in mle.languageEnums)
            {
                languageFlags.Add(language);
            }

            realSubject.flagBag.AddRange(languageFlags);

            return(forward);
        }
        /// <summary>
        /// Stream builder for block-level subjects. Sends to next if task is not with
        /// <see cref="pipelineTaskSubjectContentToken"/> or if the subject is not at <c>mcBlock</c> level.
        /// Each stream produced by <c>streamComposer</c> becomes a child subject at <c>mcTokenStream</c> level
        /// and is scheduled as its own task.
        /// </summary>
        /// <param name="task">The task.</param>
        /// <returns><c>next</c> when the task is not handled here, otherwise <c>forward</c>.</returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            var blockTask = task as pipelineTask <pipelineTaskSubjectContentToken>;

            if (blockTask == null)
            {
                return next;
            }

            pipelineTaskSubjectContentToken blockSubject = blockTask.subject;

            if (blockSubject.contentLevelType != flags.token.cnt_level.mcBlock)
            {
                return next;
            }

            // NOTE(review): the composer receives null when mcElement is not an imbMCBlock — confirm it tolerates that.
            var composedStreams = streamComposer.process(blockSubject.mcElement as imbMCBlock);

            foreach (imbMCStream composed in composedStreams)
            {
                // Both forms start out as the raw stream content.
                var childSubject = new pipelineTaskSubjectContentToken
                {
                    contentLevelType = flags.token.cnt_level.mcTokenStream,
                    mcElement        = composed,
                    name             = composed.name,
                    currentForm      = composed.content,
                    initialForm      = composed.content,
                    parent           = blockSubject
                };

                blockSubject.Add(childSubject);

                task.context.scheduledTasks.Push(new pipelineTask <pipelineTaskSubjectContentToken>(childSubject));
            }

            return forward;
        }
Example #11
0
        /// <summary>
        /// Task builder for <see cref="imbMCRepository"/> level of subject. Sends to next if task is not with <see cref="pipelineTaskMCRepoSubject"/>.
        /// Loads the repository named by the subject and schedules one <see cref="pipelineTaskMCSiteSubject"/> task
        /// for every accepted web site of the repository.
        /// </summary>
        /// <remarks>
        /// A site is accepted when its clean case name was not scheduled already and, if the subject has any
        /// <c>MCSiteTargets</c>, the name is among the cleaned target names.
        /// </remarks>
        /// <param name="task">The task.</param>
        /// <returns><c>next</c> for foreign tasks, otherwise <c>forward</c>.</returns>
        /// <exception cref="aceScienceException">Some of the target sites were not matched by any loaded web site.</exception>
        public override IPipelineNode process(IPipelineTask task)
        {
            pipelineTask <pipelineTaskMCRepoSubject> realTask = task as pipelineTask <pipelineTaskMCRepoSubject>;

            if (realTask == null)
            {
                return(next);
            }

            pipelineTaskMCRepoSubject realSubject = realTask.subject;

            folderNode repoFolder = appManager.Application.folder_resources.Add(imbMCManager.MCRepo_DefaultDirectoryName, "MC Repositories", "Root directory with repositories of Crawled Web content");

            imbMCRepository           repo    = realSubject.MCRepoName.LoadDataStructure <imbMCRepository>(repoFolder, task.context.logger);
            imbMCDocumentRepositorium docRepo = new imbMCDocumentRepositorium();

            docRepo.webRepository = repo;
            realSubject.mcElement = docRepo;
            realSubject.MCRepo    = repo;

            List <imbMCWebSite> websites = repo.GetAllWebSites(task.context.logger, takeSetup);

            if (!websites.Any())
            {
                task.context.logger.log(this.name + " Failed --- no web sites loaded");
            }

            // Target names in the same "clean case" form that site names are compared in.
            List <String> needle = new List <string>();

            realSubject.MCSiteTargets.ForEach(x => needle.Add(pipelineSubjectTools.GetCleanCaseName(x)));

            // Loop-invariant: targets are not modified while sites are being scheduled.
            Boolean restrictToTargets = realSubject.MCSiteTargets.Any();

            // Names of the sites scheduled so far; used to skip duplicates.
            List <String> urls = new List <string>();

            foreach (imbMCWebSite site in websites)
            {
                String sName = pipelineSubjectTools.GetCleanCaseName(site.domainInfo.urlProper);

                if (restrictToTargets && !needle.Contains(sName))
                {
                    continue;
                }

                if (urls.Contains(sName))
                {
                    continue;
                }

                pipelineTaskMCSiteSubject mCSiteSubject = new pipelineTaskMCSiteSubject();
                mCSiteSubject.MCSite = site;

                imbMCDocumentSet docSet = new imbMCDocumentSet();

                docRepo.Add(docSet);
                mCSiteSubject.mcElement        = docSet;
                mCSiteSubject.contentLevelType = flags.token.cnt_level.mcSite;
                mCSiteSubject.name             = sName;
                mCSiteSubject.parent           = realSubject;
                realSubject.Add(mCSiteSubject);

                urls.Add(mCSiteSubject.name);

                pipelineTask <pipelineTaskMCSiteSubject> taskForSite = new pipelineTask <pipelineTaskMCSiteSubject>(mCSiteSubject);

                task.context.scheduledTasks.Push(taskForSite);
            }

            if (urls.Count < needle.Count)
            {
                // Whatever is left in needle after removing the scheduled names was requested but not found.
                urls.ForEach(x => needle.Remove(x));

                if (needle.Any())
                {
                    String nd = "";

                    foreach (String missing in needle)
                    {
                        nd += missing + " ";
                    }

                    throw new aceScienceException("Some sites are not found in the MC Repository!! [" + nd + "]", null, realSubject, "Sites not loaded [" + nd + "]", this);
                }
            }

            return(forward);
        }
        /// <summary>
        /// Adds descriptive flags to the flag bag of a content subject, based on tests of its current form:
        /// stream/sentence shape, numeric content, letter case, symbols and punctuation. For subjects at
        /// <c>mcToken</c> level whose parent stream is flagged <c>tkn_stream.sentenceProperCase</c>, potential
        /// company-name and person-name flags are added as well.
        /// Sends to next if task is not with <see cref="pipelineTaskSubjectContentToken"/>.
        /// </summary>
        /// <param name="task">The task.</param>
        /// <returns><c>next</c> for foreign tasks, otherwise <c>forward</c>.</returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            pipelineTask <pipelineTaskSubjectContentToken> realTask = task as pipelineTask <pipelineTaskSubjectContentToken>;

            if (realTask == null)
            {
                return(next);
            }

            pipelineTaskSubjectContentToken realSubject = realTask.subject;

            var form = realSubject.currentForm;
            var bag  = realSubject.flagBag;

            // --- stream / sentence shape (only for token streams that contain letters)
            if (form.isTokenStream() && form.isWithLetterChars())
            {
                if (form.isStrictSentenceCase())
                {
                    bag.AddUnique(tkn_stream.sentenceProperCase);
                }

                if (form.isNoLowerCaseTokenStream())
                {
                    bag.AddUnique(tkn_stream.titleAllCaps);
                }

                if (form.isEndsWithEnumerationPunctation())
                {
                    bag.AddUnique(tkn_stream.titleForEnumeration);
                }

                if (form.isEndsWithExclamationPunctation())
                {
                    bag.AddUnique(tkn_stream.sentenceEclamationEnd);
                }

                if (form.isEndsWithQuestionPunctation())
                {
                    bag.AddUnique(tkn_stream.sentenceQuestionEnd);
                }
            }

            // --- numeric content: the first matching number format wins, percentage is tested independently
            if (form.isWithNumericChars())
            {
                bag.AddUnique(tkn_contains.number);

                if (form.isNumber())
                {
                    bag.AddUnique(tkn_numeric.numberClean);
                }
                else if (form.isNumberFormatted())
                {
                    bag.AddUnique(tkn_numeric.numberInFormat);
                }
                else if (form.isDecimalNumber())
                {
                    bag.AddUnique(tkn_numeric.numberDecimal);
                }
                else if (form.isOrdinalNumber())
                {
                    bag.AddUnique(tkn_numeric.numberOrdinal);
                }

                if (form.isPercentageNumber())
                {
                    bag.AddUnique(tkn_numeric.numberInPercentage);
                }
            }

            // --- letter content and letter case
            if (form.isWithLetterChars())
            {
                bag.AddUnique(tkn_contains.letter);

                if (form.isCleanWord())
                {
                    bag.AddUnique(tkn_contains.onlyLetters);
                }

                if (form.isAllLowerLetterCaseWord())
                {
                    bag.AddUnique(tkn_letterword.lowerCase);
                }
                else if (form.isFirstCapitalRestLowerCase())
                {
                    bag.AddUnique(tkn_letterword.firstCapitalRestLower);
                }
                else if (form.isAllCapitalLetterCaseWord())
                {
                    bag.AddUnique(tkn_letterword.upperCase);
                }
                else
                {
                    bag.AddUnique(tkn_letterword.inproperCase);
                }
            }

            // --- Unicode symbol and punctuation categories
            if (form.isRegexMatch(@"\p{S}"))
            {
                bag.AddUnique(tkn_contains.symbols);
            }

            if (form.isRegexMatch(@"\p{P}"))
            {
                bag.AddUnique(tkn_contains.punctation);
            }

            // --- token-level guesses that depend on the flags of the parent stream
            if (realSubject.contentLevelType == cnt_level.mcToken)
            {
                var streamSubject = realSubject.parent as pipelineTaskSubjectContentToken;

                // The parent may be missing or of another subject type; then there is no stream context to use.
                if (streamSubject != null && streamSubject.flagBag.ContainsAny(new Object[] { tkn_stream.sentenceProperCase }))
                {
                    if (bag.ContainsAll(new Object[] { tkn_letterword.upperCase, tkn_contains.letter }))
                    {
                        bag.Add(tkn_potential.companyNamePart);
                    }

                    if (bag.ContainsAll(new Object[] { tkn_letterword.firstCapitalRestLower, tkn_contains.onlyLetters }))
                    {
                        bag.Add(tkn_potential.personName);
                    }
                }
            }

            return(forward);
        }