/// <summary>
/// Creates a crawler of the requested class, applies the supplied limits to its settings and registers it with the current job record.
/// </summary>
/// <remarks>
/// <para>The new crawler is attached to the AnalyticJobRecord and set as current on the state level.</para>
/// <para>When no job record is present on the context, or the class name cannot be resolved, a message is logged and nothing is registered.</para>
/// </remarks>
/// <param name="classname">Name of the crawler class</param>
/// <param name="LT_t">Load take - stored as the crawler's <c>limitIterationNewLinks</c> setting</param>
/// <param name="I_max">Iteration number limit</param>
/// <param name="PL_max">Page Loads limit</param>
/// <param name="instanceNameSufix">Crawler name suffix, appended to the crawler name after an underscore</param>
/// <param name="primLanguage">Primary language - applied only when the crawler implements <see cref="ISpiderWithLanguageModule"/></param>
/// <param name="secLanguage">Secondary language - applied only when the crawler implements <see cref="ISpiderWithLanguageModule"/></param>
/// <seealso cref="aceOperationSetExecutorBase"/>
public void aceOperation_addCrawler(
    [Description("Name of the crawler class")] String classname = "SM_LTS",
    [Description("Load take - number of parallel loads")] Int32 LT_t = 1,
    [Description("Iteration number limit")] Int32 I_max = 100,
    [Description("Page Loads limit")] Int32 PL_max = 50,
    [Description("Crawler name sufix")] String instanceNameSufix = "",
    [Description("Primary language")] basicLanguageEnum primLanguage = basicLanguageEnum.serbian,
    [Description("Secondary language")] basicLanguageEnum secLanguage = basicLanguageEnum.english
    )
{
    // A job record must exist before any crawler can be attached to it.
    if (context.aRecord == null)
    {
        output.log("Error: define Job before calling this command.");
        return;
    }

    var crawler = wemTypesManager.wemTypes.crawlerTypes.GetInstance(classname, output);
    if (crawler == null)
    {
        output.log("Crawler class [" + classname + "] not recognized!!!");
        return;
    }

    // Prefix the name with a two-digit ordinal derived from the number of children already on the job record.
    var ordinal = context.aRecord.children.Count + 1;
    crawler.name = ordinal.ToString("D2") + crawler.name;

    crawler.settings.limitIterations = I_max;
    crawler.settings.limitTotalPageLoad = PL_max;
    crawler.settings.limitIterationNewLinks = LT_t;

    // Language preferences are only meaningful for crawlers exposing the language module interface.
    ISpiderWithLanguageModule languageAwareCrawler = crawler as ISpiderWithLanguageModule;
    if (languageAwareCrawler != null)
    {
        languageAwareCrawler.primaryLanguage = primLanguage;
        languageAwareCrawler.secondaryLanguage = secLanguage;
    }

    crawler.name = crawler.name + "_" + instanceNameSufix;

    output.log(
        "Crawler [" + crawler.name
        + "] iteration limit set [" + crawler.settings.limitIterations
        + "], total page load limit set [" + crawler.settings.limitTotalPageLoad
        + "], links take limit set [" + crawler.settings.limitIterationNewLinks
        + "]");

    // Make the crawler current and register it in both collections of the job record.
    context.crawler = crawler;
    context.aRecord.spiderList.Add(crawler);

    var crawlerRecord = context.aRecord.children.Add(crawler);
    crawlerRecord.parent = context.aRecord;
    crawlerRecord.instanceID = crawler.name;
    crawlerRecord.testRunStamp = context.aRecord.testRunStamp;
    crawlerRecord.initialize(context.stageControl);

    output.log(
        "Crawler [" + crawler.name
        + "] assigned to job [" + context.aRecord.job.name
        + "] on slot [" + (context.aRecord.spiderList.Count - 1)
        + "]");
    output.log("Crawler [" + crawler.name + "] set as the current crawler in the console state");
}
/// <summary>
/// Looks up the basic language object registered for the given identifier.
/// </summary>
/// <param name="languageID">The language identifier.</param>
/// <returns>
/// The registered <see cref="basicLanguage"/> instance, after <c>checkHuspell()</c> was invoked on it,
/// or <c>null</c> when the identifier is not present in the registry.
/// </returns>
public static basicLanguage GetBasicLanguage(basicLanguageEnum languageID)
{
    if (!basicLanguageRegistry.ContainsKey(languageID))
    {
        return null;
    }

    basicLanguage registered = basicLanguageRegistry[languageID];

    // NOTE(review): checkHuspell() presumably makes sure the dictionary is loaded - confirm against basicLanguage.
    registered.checkHuspell();
    return registered;
}
/// <summary>
/// Initializes a new instance of the <see cref="pipelinePageLanguageFilterNode" /> class.
/// </summary>
/// <remarks>The node type is set to <c>pipelineNodeTypeEnum.distributor</c> and a fresh language evaluator is set up for the given languages.</remarks>
/// <param name="evaluationSettings">The evaluation settings, stored as the node's settings.</param>
/// <param name="__languages">The languages passed to the language evaluator setup.</param>
/// <param name="__primLanguage">The primary language.</param>
/// <param name="limitPageCount">The limit page count - it will only allow positively evaluated pages to reach the specified count.</param>
public pipelinePageLanguageFilterNode(multiLanguageEvaluationTask evaluationSettings, List<basicLanguageEnum> __languages, basicLanguageEnum __primLanguage, Int32 limitPageCount)
{
    _nodeType = pipelineNodeTypeEnum.distributor;

    // Plain configuration values taken over from the caller.
    settings = evaluationSettings;
    limitValidPageCount = limitPageCount;
    languagePrimary = __primLanguage;
    languages = __languages;

    // The evaluator is prepared once, for the languages assigned above.
    mLanguageEval = new multiLanguageEvaluator();
    mLanguageEval.setup(languages);
}
/// <summary>
/// Initializes a new instance of the <see cref="pipelineTokenLanguageFilterNode"/> class.
/// </summary>
/// <remarks>The node type is set to <c>pipelineNodeTypeEnum.distributor</c> and a fresh language evaluator is set up for the given languages.</remarks>
/// <param name="evaluationSettings">The evaluation settings, stored as the node's settings.</param>
/// <param name="__languages">The languages passed to the language evaluator setup.</param>
/// <param name="__primLanguage">The primary language.</param>
public pipelineTokenLanguageFilterNode(multiLanguageEvaluationTask evaluationSettings, List<basicLanguageEnum> __languages, basicLanguageEnum __primLanguage)
{
    _nodeType = pipelineNodeTypeEnum.distributor;

    // Plain configuration values taken over from the caller.
    settings = evaluationSettings;
    languagePrimary = __primLanguage;
    languages = __languages;

    // The evaluator is prepared once, for the languages assigned above.
    mLanguageEval = new multiLanguageEvaluator();
    mLanguageEval.setup(languages);
}
/// <summary>
/// Builds a new basic language object from the parameters stored in the language data set.
/// </summary>
/// <remarks>
/// File paths, names and the ISO code are copied from the data set entry; the needles entry is split on commas
/// (empty items dropped) and appended to the language's needle list. Finally <c>checkHuspell(true)</c> is invoked.
/// </remarks>
/// <param name="languageID">The language identifier.</param>
/// <returns>A newly created <see cref="basicLanguage"/> populated from the data set entry of <paramref name="languageID"/>.</returns>
public static basicLanguage GetBasicLanguage(basicLanguageEnum languageID)
{
    // All parameters come from the same data set entry, so it is fetched once.
    var parameters = languageDataSet[languageID];

    basicLanguage language = new basicLanguage
    {
        affixFilePath = parameters[basicLanguageParameterEnum.affixPath],
        dictFilePath = parameters[basicLanguageParameterEnum.dictPath],
        languageNativeName = parameters[basicLanguageParameterEnum.nativeName],
        languageEnglishName = parameters[basicLanguageParameterEnum.englishName],
        iso2Code = parameters[basicLanguageParameterEnum.iso2code]
    };

    // Needles are stored as one comma separated value.
    var needles = parameters[basicLanguageParameterEnum.needles]
        .Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
    language.langIDNeedles.AddRange(needles);

    // NOTE(review): the 'true' argument presumably forces the dictionary (re)load - confirm against basicLanguage.
    language.checkHuspell(true);

    return language;
}
/// <summary>
/// Evaluates an already defined evaluation task.
/// </summary>
/// <remarks>
/// Tokens are tested one by one until either <c>task.testTokenLimit</c> tokens were tested or
/// <c>task.validTokenTarget</c> single-language matches were collected. A token matching exactly one language
/// scores for that language (<c>serbianCyr</c> is counted as <c>serbian</c>); tokens matching several languages
/// or none are only recorded in their respective lists.
/// </remarks>
/// <param name="task">The task.</param>
/// <param name="processedTokens">Externally preprocessed tokens - if not supplied it will call <see cref="GetAllProperTokensSortedByFrequency(IEnumerable{string}, int, List{string})"/> automatically</param>
/// <returns>
/// The evaluation holding the token lists, the language scores, the winning language and its ratio
/// among single-language tokens; the result language is <c>unknown</c> when no token scored.
/// </returns>
public multiLanguageEvaluation evaluate(multiLanguageEvaluationTask task, List<string> processedTokens = null)
{
    multiLanguageEvaluation evaluation = new multiLanguageEvaluation();
    evaluation.task = task;

    if (processedTokens == null)
    {
        evaluation.allContentTokens = GetAllProperTokensSortedByFrequency(task.input_contentTokens, task.tokenLengthMin, task.input_ignoredTokens);
    }
    else
    {
        evaluation.allContentTokens = processedTokens;
    }

    // ---- test cycle
    int validCount = 0;
    int testedCount = 0;

    foreach (string token in evaluation.allContentTokens)
    {
        basicLanguageEnum matched = basicLanguageEnum.unknown;
        bool matchesSeveralLanguages = false;

        // Tokens already recorded as unknown are not checked against the languages again.
        bool alreadyUnknown = unknownWords.Contains(token);

        if (!alreadyUnknown)
        {
            foreach (var entry in languages)
            {
                if (!entry.Value.isKnownWord(token))
                {
                    continue;
                }

                if (matched != basicLanguageEnum.unknown)
                {
                    // A second language knows the token as well - no need to look further.
                    matchesSeveralLanguages = true;
                    break;
                }

                matched = entry.Key;
            }
        }

        if (matched == basicLanguageEnum.serbianCyr)
        {
            matched = basicLanguageEnum.serbian;
        }

        if (matched == basicLanguageEnum.unknown)
        {
            // No language match.
            if (!alreadyUnknown)
            {
                unknownWords.Add(token);
            }
            evaluation.noLanguageTokens.Add(token);
        }
        else if (matchesSeveralLanguages)
        {
            evaluation.multiLanguageTokens.Add(token);
        }
        else
        {
            validCount++;
            evaluation.singleLanguageTokens.Add(token);
            evaluation.languageScore.AddInstance(matched);
        }

        evaluation.allTestedTokens.Add(token);
        testedCount++;

        // Both stop conditions are checked (and commented) before leaving the loop.
        bool limitReached = testedCount >= task.testTokenLimit;
        if (limitReached)
        {
            evaluation.comment = evaluation.comment.addLine("Test limit was reached: " + testedCount.ToString());
        }

        bool targetReached = validCount >= task.validTokenTarget;
        if (targetReached)
        {
            evaluation.comment = evaluation.comment.addLine("Valid tokens target reached: " + validCount.ToString() + " after " + testedCount.ToString() + " tests");
        }

        if (limitReached || targetReached)
        {
            break;
        }
    }
    // ---- end of test cycle

    // ---- eval results
    if (evaluation.languageScore.Count == 0)
    {
        evaluation.comment = evaluation.comment.addLine("None of tokens were recognized by languages used.");
        evaluation.result_language = basicLanguageEnum.unknown;
    }
    else
    {
        var ranking = evaluation.languageScore.getSorted();
        evaluation.result_language = ranking.First();

        // One line per language is appended to the existing score list text.
        var scoreText = new System.Text.StringBuilder();
        scoreText.Append(evaluation.languageScoreList);
        foreach (basicLanguageEnum language in ranking)
        {
            scoreText.AppendFormat("{0,20} : {1}", language.ToString(), evaluation.languageScore[language].ToString()).AppendLine();
        }
        evaluation.languageScoreList = scoreText.ToString();

        double winnerScore = (double)evaluation.languageScore[evaluation.result_language];
        double singleLanguageTotal = (double)evaluation.singleLanguageTokens.Count;
        evaluation.result_ratio = winnerScore / singleLanguageTotal;
    }

    foreach (basicLanguageEnum scored in evaluation.languageScore.Keys)
    {
        evaluation.languageEnums.AddUnique(scored);
    }

    return evaluation;
}