示例#1
0
        /// <summary>
        /// Creates a crawler of the requested class, applies the given limits and languages, and registers it with the current job record.
        /// </summary>
        /// <remarks>
        /// <para>If <c>context.aRecord</c> is not set, an error is logged and the command returns without doing anything.</para>
        /// <para>If the crawler class cannot be resolved, a "not recognized" message is logged and the command returns.</para>
        /// <para>The crawler name is prefixed with a two-digit number (existing child record count + 1) and suffixed with <c>"_"</c> plus <paramref name="instanceNameSufix"/>; the underscore is appended even when the suffix is empty.</para>
        /// <para>On success the crawler is stored in <c>context.crawler</c>, added to the job's spider list, and a child record is created and initialized for it.</para>
        /// </remarks>
        /// <param name="classname">Name of the crawler class, resolved through <c>wemTypesManager.wemTypes.crawlerTypes</c></param>
        /// <param name="LT_t">Load take; stored in <c>settings.limitIterationNewLinks</c></param>
        /// <param name="I_max">Iteration number limit; stored in <c>settings.limitIterations</c></param>
        /// <param name="PL_max">Page loads limit; stored in <c>settings.limitTotalPageLoad</c></param>
        /// <param name="instanceNameSufix">Crawler name suffix, appended after an underscore</param>
        /// <param name="primLanguage">Primary language; applied only when the crawler implements <see cref="ISpiderWithLanguageModule"/></param>
        /// <param name="secLanguage">Secondary language; applied only when the crawler implements <see cref="ISpiderWithLanguageModule"/></param>
        /// <seealso cref="aceOperationSetExecutorBase"/>
        public void aceOperation_addCrawler(
            [Description("Name of the crawler class")] String classname      = "SM_LTS",
            [Description("Load take - number of parallel loads")] Int32 LT_t = 1,
            [Description("Iteration number limit")] Int32 I_max               = 100,
            [Description("Page Loads limit")] Int32 PL_max                    = 50,
            [Description("Crawler name sufix")] String instanceNameSufix      = "",
            [Description("Primary language")] basicLanguageEnum primLanguage  = basicLanguageEnum.serbian,
            [Description("Secondary language")] basicLanguageEnum secLanguage = basicLanguageEnum.english
            )
        {
            // A job record must exist before any crawler can be attached to it.
            var jobRecord = context.aRecord;
            if (jobRecord == null)
            {
                output.log("Error: define Job before calling this command.");
                return;
            }

            var crawler = wemTypesManager.wemTypes.crawlerTypes.GetInstance(classname, output);
            if (crawler == null)
            {
                output.log("Crawler class [" + classname + "] not recognized!!!");
                return;
            }

            // Two-digit ordinal prefix derived from the number of child records already present.
            String ordinalPrefix = (jobRecord.children.Count + 1).ToString("D2");
            crawler.name = ordinalPrefix + crawler.name;

            crawler.settings.limitIterations        = I_max;
            crawler.settings.limitTotalPageLoad     = PL_max;
            crawler.settings.limitIterationNewLinks = LT_t;

            // Languages are only meaningful for crawlers exposing the language module.
            var languageAware = crawler as ISpiderWithLanguageModule;
            if (languageAware != null)
            {
                languageAware.primaryLanguage   = primLanguage;
                languageAware.secondaryLanguage = secLanguage;
            }

            crawler.name += "_" + instanceNameSufix;
            output.log("Crawler [" + crawler.name
                + "] iteration limit set [" + crawler.settings.limitIterations
                + "], total page load limit set [" + crawler.settings.limitTotalPageLoad
                + "], links take limit set [" + crawler.settings.limitIterationNewLinks + "]");

            // Make the crawler current and register it with the job.
            context.crawler = crawler;
            jobRecord.spiderList.Add(crawler);

            var childRecord = jobRecord.children.Add(crawler);
            childRecord.parent       = jobRecord;
            childRecord.instanceID   = crawler.name;
            childRecord.testRunStamp = jobRecord.testRunStamp;
            childRecord.initialize(context.stageControl);

            output.log("Crawler [" + crawler.name
                + "] assigned to job [" + jobRecord.job.name
                + "] on slot [" + (jobRecord.spiderList.Count - 1) + "]");
            output.log("Crawler [" + crawler.name + "] set as the current crawler in the console state");
        }
 /// <summary>
 /// Looks up the language object registered for <paramref name="languageID"/> and runs its Hunspell check before returning it.
 /// </summary>
 /// <param name="languageID">The language identifier used as the registry key.</param>
 /// <returns>The registered language object, or <c>null</c> when the registry has no entry for the identifier.</returns>
 public static basicLanguage GetBasicLanguage(basicLanguageEnum languageID)
 {
     // Unregistered languages yield null rather than an exception.
     if (!basicLanguageRegistry.ContainsKey(languageID))
     {
         return null;
     }

     basicLanguage registered = basicLanguageRegistry[languageID];
     registered.checkHuspell();
     return registered;
 }
        /// <summary>
        /// Creates a <see cref="pipelinePageLanguageFilterNode" /> configured as a distributor node with its own multi-language evaluator.
        /// </summary>
        /// <param name="evaluationSettings">Evaluation task settings, stored in <c>settings</c>.</param>
        /// <param name="__languages">Languages the internal evaluator is set up with.</param>
        /// <param name="__primLanguage">Primary language, stored in <c>languagePrimary</c>.</param>
        /// <param name="limitPageCount">Stored in <c>limitValidPageCount</c>; intended as the cap on positively evaluated pages allowed through.</param>
        public pipelinePageLanguageFilterNode(multiLanguageEvaluationTask evaluationSettings, List <basicLanguageEnum> __languages, basicLanguageEnum __primLanguage, Int32 limitPageCount)
        {
            this._nodeType = pipelineNodeTypeEnum.distributor;

            // Language configuration must be in place before the evaluator is set up.
            this.languages       = __languages;
            this.languagePrimary = __primLanguage;

            this.mLanguageEval = new multiLanguageEvaluator();
            this.mLanguageEval.setup(this.languages);

            this.limitValidPageCount = limitPageCount;
            this.settings            = evaluationSettings;
        }
示例#4
0
        /// <summary>
        /// Creates a <see cref="pipelineTokenLanguageFilterNode"/> configured as a distributor node with its own multi-language evaluator.
        /// </summary>
        /// <param name="evaluationSettings">Evaluation task settings, stored in <c>settings</c>.</param>
        /// <param name="__languages">Languages the internal evaluator is set up with.</param>
        /// <param name="__primLanguage">Primary language, stored in <c>languagePrimary</c>.</param>
        public pipelineTokenLanguageFilterNode(multiLanguageEvaluationTask evaluationSettings, List <basicLanguageEnum> __languages, basicLanguageEnum __primLanguage)
        {
            this._nodeType = pipelineNodeTypeEnum.distributor;

            // Language configuration must be in place before the evaluator is set up.
            this.languages       = __languages;
            this.languagePrimary = __primLanguage;

            this.mLanguageEval = new multiLanguageEvaluator();
            this.mLanguageEval.setup(this.languages);

            this.settings = evaluationSettings;
        }
示例#5
0
        /// <summary>
        /// Builds a new language object from the parameters stored in <c>languageDataSet</c> for the given identifier, then runs its Hunspell check.
        /// </summary>
        /// <param name="languageID">The language identifier used to select the parameter set.</param>
        /// <returns>A freshly created language object populated with file paths, names, ISO code and identification needles.</returns>
        public static basicLanguage GetBasicLanguage(basicLanguageEnum languageID)
        {
            basicLanguage result = new basicLanguage();

            // All parameters for this language live under a single data set entry.
            var parameters = languageDataSet[languageID];

            result.affixFilePath       = parameters[basicLanguageParameterEnum.affixPath];
            result.dictFilePath        = parameters[basicLanguageParameterEnum.dictPath];
            result.languageNativeName  = parameters[basicLanguageParameterEnum.nativeName];
            result.languageEnglishName = parameters[basicLanguageParameterEnum.englishName];
            result.iso2Code            = parameters[basicLanguageParameterEnum.iso2code];

            // Needles are stored as one comma-separated string; empty entries are dropped.
            char[] needleSeparators = new[] { ',' };
            string[] needles = parameters[basicLanguageParameterEnum.needles].Split(needleSeparators, StringSplitOptions.RemoveEmptyEntries);
            result.langIDNeedles.AddRange(needles);

            result.checkHuspell(true);

            return result;
        }
示例#6
0
        /// <summary>
        /// Runs the language evaluation described by <paramref name="task"/> and returns the collected result.
        /// </summary>
        /// <remarks>
        /// <para>Each token is tested against every configured language. A token known to exactly one language counts as a valid test and scores for that language; a token known to more than one language is recorded as multi-language; a token known to none is recorded as no-language and remembered in <c>unknownWords</c> so later calls skip the language checks for it.</para>
        /// <para>A <c>serbianCyr</c> match is counted as <c>serbian</c>.</para>
        /// <para>Testing stops once <c>task.testTokenLimit</c> tokens have been tested or <c>task.validTokenTarget</c> valid tests have been reached; either condition adds a line to the evaluation comment.</para>
        /// </remarks>
        /// <param name="task">The evaluation task providing input tokens and limits.</param>
        /// <param name="processedTokens">Externally preprocessed tokens - if not supplied, <see cref="GetAllProperTokensSortedByFrequency(IEnumerable{string}, int, List{string})"/> is called on the task input.</param>
        /// <returns>The evaluation holding token classifications, language scores, the winning language and its ratio among single-language tokens.</returns>
        public multiLanguageEvaluation evaluate(multiLanguageEvaluationTask task, List <string> processedTokens = null)
        {
            multiLanguageEvaluation result = new multiLanguageEvaluation();
            result.task = task;

            if (processedTokens == null)
            {
                result.allContentTokens = GetAllProperTokensSortedByFrequency(task.input_contentTokens, task.tokenLengthMin, task.input_ignoredTokens);
            }
            else
            {
                result.allContentTokens = processedTokens;
            }

            // ---- token test cycle
            int testedCount = 0;
            int validCount  = 0;

            foreach (string word in result.allContentTokens)
            {
                basicLanguageEnum detected = basicLanguageEnum.unknown;
                bool ambiguous             = false;
                bool alreadyUnknown        = unknownWords.Contains(word);

                // Words already known to be unrecognized skip the language checks.
                if (!alreadyUnknown)
                {
                    foreach (var entry in languages)
                    {
                        if (!entry.Value.isKnownWord(word))
                        {
                            continue;
                        }

                        if (detected != basicLanguageEnum.unknown)
                        {
                            // A second language recognizes the word: stop looking further.
                            ambiguous = true;
                            break;
                        }

                        detected = entry.Key;
                    }
                }

                // Cyrillic Serbian is scored together with Serbian.
                if (detected == basicLanguageEnum.serbianCyr)
                {
                    detected = basicLanguageEnum.serbian;
                }

                if (detected == basicLanguageEnum.unknown)
                {
                    if (!alreadyUnknown)
                    {
                        unknownWords.Add(word);
                    }

                    result.noLanguageTokens.Add(word);
                }
                else if (ambiguous)
                {
                    result.multiLanguageTokens.Add(word);
                }
                else
                {
                    validCount++;
                    result.singleLanguageTokens.Add(word);
                    result.languageScore.AddInstance(detected);
                }

                result.allTestedTokens.Add(word);
                testedCount++;

                // Both stop conditions are checked so each can leave its own comment line.
                bool stopTesting = false;

                if (testedCount >= task.testTokenLimit)
                {
                    stopTesting    = true;
                    result.comment = result.comment.addLine("Test limit was reached: " + testedCount.ToString());
                }

                if (validCount >= task.validTokenTarget)
                {
                    stopTesting    = true;
                    result.comment = result.comment.addLine("Valid tokens target reached: " + validCount.ToString() + " after " + testedCount.ToString() + " tests");
                }

                if (stopTesting)
                {
                    break;
                }
            }

            // ---- result evaluation
            if (result.languageScore.Count == 0)
            {
                result.comment         = result.comment.addLine("None of tokens were recognized by languages used.");
                result.result_language = basicLanguageEnum.unknown;
            }
            else
            {
                var ranked = result.languageScore.getSorted();
                result.result_language = ranked.First();

                // Append one "language : score" line per language to the existing score list.
                var scoreText = new System.Text.StringBuilder();
                scoreText.Append(result.languageScoreList);

                foreach (basicLanguageEnum id in ranked)
                {
                    scoreText.AppendFormat("{0,20} : {1}", id.ToString(), result.languageScore[id].ToString());
                    scoreText.AppendLine();
                }

                result.languageScoreList = scoreText.ToString();

                // Share of single-language tokens attributed to the winning language.
                double winnerScore = (double)result.languageScore[result.result_language];
                double singleTotal = (double)result.singleLanguageTokens.Count;
                result.result_ratio = winnerScore / singleTotal;
            }

            foreach (basicLanguageEnum scored in result.languageScore.Keys)
            {
                result.languageEnums.AddUnique(scored);
            }

            return result;
        }