/// <summary>
/// Runs the tokenizer over every labelled test sentence and accumulates
/// correct / incorrect tokenization counts into a single result object.
/// </summary>
/// <param name="dataPoints">Labelled sentences with their expected token spans.</param>
/// <returns>Aggregated tokenization counts across all data points.</returns>
public TokenizationTestResults RunAgainstTestData(List<TokenizerTestData> dataPoints)
{
    var results = new TokenizationTestResults();

    foreach (var testCase in dataPoints)
    {
        var cleanSentence = testCase.GetCleanSentence();
        var predictedSpans = TokenizePositions(cleanSentence);
        var expectedSpans = testCase.GetSpans();

        // NOTE(review): Intersect/Except are set operations, so duplicate spans
        // (if GetSpans can ever yield them) would collapse — assumes spans are unique.
        var matchedCount = predictedSpans.Intersect(expectedSpans).Count();
        var missedCount = expectedSpans.Except(predictedSpans).Count();

        // Tokens produced purely by whitespace are not relevant for the accuracy
        // of the model; a sentence with k whitespace occurrences yields k + 1 of them.
        var whitespaceTokenCount = testCase.GetNumberOfWhitespaceOccurencesInSentence() + 1;

        results.NbOfCorrectTokenizations += Math.Max(matchedCount - whitespaceTokenCount, 0);
        results.NbOfIncorrectTokenizations += missedCount;
    }

    return results;
}
/// <summary>
/// Evaluates tokenizer output against the expected spans of each test data
/// point, tallying how many tokenizations matched and how many were missed.
/// </summary>
/// <param name="dataPoints">Test sentences annotated with their correct token spans.</param>
/// <returns>The combined correct/incorrect tokenization tally.</returns>
public TokenizationTestResults RunAgainstTestData(List<TokenizerTestData> dataPoints)
{
    var aggregate = new TokenizationTestResults();

    foreach (var dataPoint in dataPoints)
    {
        var sentence = dataPoint.GetCleanSentence();
        var computed = TokenizePositions(sentence);
        var expected = dataPoint.GetSpans();

        // Spans present in both sets count as correct; expected spans the
        // tokenizer failed to produce count as incorrect. NOTE(review): set
        // semantics of Intersect/Except assume spans are unique — confirm.
        var correctCount = computed.Intersect(expected).Count();
        var incorrectCount = expected.Except(computed).Count();

        // Count the number of tokens due to whitespaces (not relevant for the
        // accuracy of the model): k whitespace occurrences imply k + 1 tokens.
        var whitespaceTokens = dataPoint.GetNumberOfWhitespaceOccurencesInSentence() + 1;

        aggregate.NbOfCorrectTokenizations += Math.Max(correctCount - whitespaceTokens, 0);
        aggregate.NbOfIncorrectTokenizations += incorrectCount;
    }

    return aggregate;
}