Example #1
        public TokenizationTestResults RunAgainstTestData(List<TokenizerTestData> dataPoints)
        {
            var result = new TokenizationTestResults();

            foreach (var dataPoint in dataPoints)
            {
                var sentence = dataPoint.GetCleanSentence();
                var computedPositions = TokenizePositions(sentence);
                var correctPositions = dataPoint.GetSpans();

                // A tokenization is correct when a computed span exactly matches a gold span,
                // and incorrect when a gold span was not produced by the tokenizer
                var nbOfCorrectTokenizations = computedPositions.Intersect(correctPositions).Count();
                var nbOfIncorrectTokenizations = correctPositions.Except(computedPositions).Count();

                // Discount tokens produced purely by whitespace splitting (a sentence with N
                // whitespace occurrences yields N + 1 trivial tokens); they say nothing about
                // the accuracy of the model
                var nbOfWhiteSpaceTokens = dataPoint.GetNumberOfWhitespaceOccurencesInSentence() + 1;
                result.NbOfCorrectTokenizations += Math.Max(nbOfCorrectTokenizations - nbOfWhiteSpaceTokens, 0);
                result.NbOfIncorrectTokenizations += nbOfIncorrectTokenizations;
            }

            return result;
        }
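
A minimal usage sketch, assuming the surrounding class exposes the method as shown above. The tokenizer instance and the LoadTestDataPoints helper below are hypothetical names introduced for illustration; only the result properties (NbOfCorrectTokenizations, NbOfIncorrectTokenizations) come from the original code:

        // Hypothetical harness: run the tokenizer against gold-annotated test data
        List<TokenizerTestData> dataPoints = LoadTestDataPoints(); // assumed helper, not part of the original API
        var results = tokenizer.RunAgainstTestData(dataPoints);

        // Accuracy over the non-whitespace tokens accumulated above
        var total = results.NbOfCorrectTokenizations + results.NbOfIncorrectTokenizations;
        var accuracy = total > 0 ? (double)results.NbOfCorrectTokenizations / total : 0.0;
        Console.WriteLine($"Tokenizer accuracy: {accuracy:P2}");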