/// <summary>
/// Convenience overload: wraps raw (non-unique, unfiltered) content tokens into a
/// <see cref="multiLanguageEvaluationTask"/> configured from this instance's settings
/// and runs the task-based evaluation.
/// </summary>
/// <param name="contentTokens">The content tokens to evaluate</param>
/// <param name="ignoreTokens">Tokens to skip from evaluation; when <c>null</c>, an empty list is used</param>
/// <param name="processedTokens">Optional externally preprocessed tokens; forwarded unchanged to the task-based overload</param>
/// <returns>Results of evaluation</returns>
public multiLanguageEvaluation evaluate(List<string> contentTokens, List<string> ignoreTokens = null, List<string> processedTokens = null)
{
    // Never hand a null ignore list to the task.
    List<string> tokensToIgnore = ignoreTokens ?? new List<string>();

    var evaluationTask = new multiLanguageEvaluationTask();

    // Test against every language currently registered on this instance.
    evaluationTask.testLanguages.AddRange(languages.Keys);

    // Copy this instance's thresholds and the caller's inputs onto the task.
    evaluationTask.tokenLengthMin = tokenLengthMin;
    evaluationTask.validTokenTarget = validTokenTarget;
    evaluationTask.input_contentTokens = contentTokens;
    evaluationTask.input_ignoredTokens = tokensToIgnore;
    evaluationTask.testTokenLimit = testTokenLimit;

    return evaluate(evaluationTask, processedTokens);
}
/// <summary>
/// Runs an already defined evaluation task: tests tokens against the registered language
/// dictionaries until a limit from the task is hit, then derives the dominant language.
/// </summary>
/// <param name="task">The task holding the input tokens and the test limits.</param>
/// <param name="processedTokens">Externally preprocessed tokens - if not supplied it will call <see cref="GetAllProperTokensSortedByFrequency(IEnumerable{string}, int, List{string})"/> automatically</param>
/// <returns>
/// The evaluation holding the tested tokens split into no-language, multi-language and
/// single-language lists, the per-language score, and the resulting language
/// (<c>basicLanguageEnum.unknown</c> when no token scored).
/// </returns>
public multiLanguageEvaluation evaluate(multiLanguageEvaluationTask task, List<string> processedTokens = null)
{
    var evaluation = new multiLanguageEvaluation();
    evaluation.task = task;

    if (processedTokens == null)
    {
        evaluation.allContentTokens = GetAllProperTokensSortedByFrequency(task.input_contentTokens, task.tokenLengthMin, task.input_ignoredTokens);
    }
    else
    {
        evaluation.allContentTokens = processedTokens;
    }

    // ---- test cycle ----
    int testedCount = 0;
    int validCount = 0;

    foreach (string token in evaluation.allContentTokens)
    {
        basicLanguageEnum matched = basicLanguageEnum.unknown;
        bool matchedSeveralLanguages = false;

        // Tokens already recorded as unknown skip the dictionary lookups entirely.
        bool alreadyUnknown = unknownWords.Contains(token);

        if (!alreadyUnknown)
        {
            foreach (var entry in languages)
            {
                if (!entry.Value.isKnownWord(token))
                {
                    continue;
                }

                if (matched != basicLanguageEnum.unknown)
                {
                    // Second language recognizing the same token: ambiguous, stop looking.
                    matchedSeveralLanguages = true;
                    break;
                }

                matched = entry.Key;
            }
        }

        // Cyrillic Serbian is scored under the common Serbian entry.
        if (matched == basicLanguageEnum.serbianCyr)
        {
            matched = basicLanguageEnum.serbian;
        }

        if (matched == basicLanguageEnum.unknown)
        {
            // No language match; remember the token so later lookups can be skipped.
            if (!alreadyUnknown)
            {
                unknownWords.Add(token);
            }

            evaluation.noLanguageTokens.Add(token);
        }
        else if (matchedSeveralLanguages)
        {
            evaluation.multiLanguageTokens.Add(token);
        }
        else
        {
            // Only tokens recognized by exactly one language contribute to the score.
            validCount++;
            evaluation.singleLanguageTokens.Add(token);
            evaluation.languageScore.AddInstance(matched);
        }

        evaluation.allTestedTokens.Add(token);
        testedCount++;

        // Both stop conditions are checked (and logged) before leaving the loop.
        bool limitReached = testedCount >= task.testTokenLimit;
        if (limitReached)
        {
            evaluation.comment = evaluation.comment.addLine("Test limit was reached: " + testedCount.ToString());
        }

        bool targetReached = validCount >= task.validTokenTarget;
        if (targetReached)
        {
            evaluation.comment = evaluation.comment.addLine("Valid tokens target reached: " + validCount.ToString() + " after " + testedCount.ToString() + " tests");
        }

        if (limitReached || targetReached)
        {
            break;
        }
    }

    // ---- evaluation of results ----
    if (evaluation.languageScore.Count != 0)
    {
        var ranking = evaluation.languageScore.getSorted();
        evaluation.result_language = ranking.First();

        // Append one "language : score" line per scored language to the existing list text.
        var report = new System.Text.StringBuilder();
        report.Append(evaluation.languageScoreList);

        foreach (basicLanguageEnum language in ranking)
        {
            report.AppendLine(string.Format("{0,20} : {1}", language.ToString(), evaluation.languageScore[language].ToString()));
        }

        evaluation.languageScoreList = report.ToString();

        // Ratio of the winning language's score to the count of single-language tokens.
        double winnerScore = (double)evaluation.languageScore[evaluation.result_language];
        double singleLanguageTotal = (double)evaluation.singleLanguageTokens.Count;
        evaluation.result_ratio = winnerScore / singleLanguageTotal;
    }
    else
    {
        evaluation.comment = evaluation.comment.addLine("None of tokens were recognized by languages used.");
        evaluation.result_language = basicLanguageEnum.unknown;
    }

    foreach (basicLanguageEnum scoredLanguage in evaluation.languageScore.Keys)
    {
        evaluation.languageEnums.AddUnique(scoredLanguage);
    }

    return evaluation;
}