Esempio n. 1
0
        /// <summary>
        /// Builds an evaluation task from the supplied raw content tokens and the current settings of this instance,
        /// then runs <see cref="evaluate(multiLanguageEvaluationTask, List{string})"/> on it.
        /// </summary>
        /// <param name="contentTokens">Content tokens to evaluate (non-unique, not filtered); stored as the task's input content tokens.</param>
        /// <param name="ignoreTokens">Tokens to skip from evaluation; when <c>null</c>, an empty list is stored in the task instead.</param>
        /// <param name="processedTokens">Optional externally preprocessed tokens, forwarded unchanged to the task-based overload.</param>
        /// <returns>Results of the evaluation, as returned by the task-based overload.</returns>
        public multiLanguageEvaluation evaluate(List <string> contentTokens, List <string> ignoreTokens = null, List <string> processedTokens = null)
        {
            var evaluationTask = new multiLanguageEvaluationTask();

            // All keys of the languages collection are registered as languages to test.
            evaluationTask.testLanguages.AddRange(languages.Keys);

            // Copy the current settings and inputs into the task.
            evaluationTask.tokenLengthMin      = tokenLengthMin;
            evaluationTask.validTokenTarget    = validTokenTarget;
            evaluationTask.input_contentTokens = contentTokens;
            // A missing ignore list is replaced by an empty one so the task never receives null here.
            evaluationTask.input_ignoredTokens = ignoreTokens ?? new List <string>();
            evaluationTask.testTokenLimit      = testTokenLimit;

            return evaluate(evaluationTask, processedTokens);
        }
Esempio n. 2
0
        /// <summary>
        /// Runs an already defined evaluation task: tests tokens one by one against every entry in the languages
        /// collection, sorts each tested token into the "no language", "multi language" or "single language" list,
        /// and derives the resulting language and ratio from the single-language scores.
        /// </summary>
        /// <param name="task">The task describing inputs and limits (token limit, valid token target).</param>
        /// <param name="processedTokens">Externally preprocessed tokens - if not supplied, <see cref="GetAllProperTokensSortedByFrequency(IEnumerable{string}, int, List{string})"/> is called with the task inputs.</param>
        /// <returns>The populated evaluation, including tested tokens, language scores, comments and the resulting language.</returns>
        public multiLanguageEvaluation evaluate(multiLanguageEvaluationTask task, List <string> processedTokens = null)
        {
            var result = new multiLanguageEvaluation();
            result.task = task;

            if (processedTokens == null)
            {
                // No preprocessed tokens supplied: derive them from the task inputs.
                result.allContentTokens = GetAllProperTokensSortedByFrequency(task.input_contentTokens, task.tokenLengthMin, task.input_ignoredTokens);
            }
            else
            {
                result.allContentTokens = processedTokens;
            }

            // ----- test cycle
            int singleLanguageHits = 0;
            int testedCount        = 0;

            foreach (string word in result.allContentTokens)
            {
                basicLanguageEnum detected = basicLanguageEnum.unknown;
                bool matchedSeveral        = false;

                // Words already recorded as unknown skip the language lookup entirely.
                bool alreadyUnknown = unknownWords.Contains(word);

                if (!alreadyUnknown)
                {
                    foreach (var entry in languages)
                    {
                        if (!entry.Value.isKnownWord(word))
                        {
                            continue;
                        }

                        if (detected != basicLanguageEnum.unknown)
                        {
                            // A second language knows the word too: stop looking, the first match is kept.
                            matchedSeveral = true;
                            break;
                        }

                        detected = entry.Key;
                    }
                }

                // serbianCyr matches are counted under serbian.
                if (detected == basicLanguageEnum.serbianCyr)
                {
                    detected = basicLanguageEnum.serbian;
                }

                if (detected == basicLanguageEnum.unknown)
                {
                    // No language match: remember the word unless it was already recorded.
                    if (!alreadyUnknown)
                    {
                        unknownWords.Add(word);
                    }

                    result.noLanguageTokens.Add(word);
                }
                else if (matchedSeveral)
                {
                    result.multiLanguageTokens.Add(word);
                }
                else
                {
                    // Only words known to exactly one language contribute to the score.
                    singleLanguageHits++;
                    result.singleLanguageTokens.Add(word);
                    result.languageScore.AddInstance(detected);
                }

                result.allTestedTokens.Add(word);
                testedCount++;

                // Both stop conditions are checked so that both comments are written when they coincide.
                bool limitReached = testedCount >= task.testTokenLimit;
                if (limitReached)
                {
                    result.comment = result.comment.addLine("Test limit was reached: " + testedCount.ToString());
                }

                bool targetReached = singleLanguageHits >= task.validTokenTarget;
                if (targetReached)
                {
                    result.comment = result.comment.addLine("Valid tokens target reached: " + singleLanguageHits.ToString() + " after " + testedCount.ToString() + " tests");
                }

                if (limitReached || targetReached)
                {
                    break;
                }
            }
            // ----- end of test cycle

            // ----- eval results
            if (result.languageScore.Count == 0)
            {
                result.comment         = result.comment.addLine("None of tokens were recognized by languages used.");
                result.result_language = basicLanguageEnum.unknown;
            }
            else
            {
                var ranked = result.languageScore.getSorted();
                result.result_language = ranked.First();

                // Extend the existing score listing with one line per language, in sorted order.
                var scoreText = new System.Text.StringBuilder();
                scoreText.Append(result.languageScoreList);

                foreach (basicLanguageEnum language in ranked)
                {
                    scoreText.AppendFormat("{0,20} : {1}", language.ToString(), result.languageScore[language].ToString());
                    scoreText.AppendLine();
                }

                result.languageScoreList = scoreText.ToString();

                // Ratio of the winning language's score to the number of single-language tokens.
                double winnerScore = (double)result.languageScore[result.result_language];
                double singleCount = (double)result.singleLanguageTokens.Count;
                result.result_ratio = winnerScore / singleCount;
            }

            foreach (basicLanguageEnum scored in result.languageScore.Keys)
            {
                result.languageEnums.AddUnique(scored);
            }

            return result;
        }