/// <summary>
/// Matches entities by first running a coarse, index-free match to shrink the candidate set,
/// then re-matching the input sentence's tokens (with positional info) against only those candidates.
/// </summary>
/// <param name="processedDataset">Preprocessed dataset holding the TFIDF matrix and its absolute values.</param>
/// <param name="dataset">Raw dataset sentences, row-aligned with the TFIDF matrix.</param>
/// <param name="inputSentence">Sentence to match against the dataset.</param>
/// <param name="threshold">Final score threshold applied after post-processing.</param>
/// <param name="ngramSize">Character n-gram size used for tokenization and TFIDF (default 3).</param>
/// <param name="tfidfThreshold">Intermediate cosine-similarity threshold for the token-level pass (default 0.5).</param>
/// <returns>Filtered match results; empty list when the initial coarse match finds nothing.</returns>
private List <MatchResult> MatchEntitiesWithIndicesPostTokenizeApproach(ProcessedDataset processedDataset, List <string> dataset, string inputSentence, float threshold, int ngramSize = 3, float tfidfThreshold = 0.5f)
{
    // initial match (reduce database) — cheap pass that discards clearly irrelevant rows
    var initialMatchResult = MatchEntitiesWithoutIndices(processedDataset, dataset, inputSentence, ngramSize);
    if (initialMatchResult.Count == 0)
    {
        return(initialMatchResult);
    }

    // HashSet gives O(1) membership tests; the previous List<int>.Contains made each of the
    // three row filters below O(rows * matches).
    var matchingSentencesIndices = new HashSet<int>(initialMatchResult.Select(m => m.DatabaseMatchInfo.MatchIndex));

    // slice the TFIDF data and raw sentences down to the surviving rows (Where preserves row order,
    // so the three projections stay aligned with each other)
    var initialMatchTFIDFMatrix = processedDataset.TFIDFMatrix.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToArray();
    var initialMatchTFIDFAbsoluteValues = processedDataset.TFIDFMatrixAbsoluteValues.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToArray();
    var initialMatchAsDataset = dataset.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToList();

    // get all possible tokens of input sentence and their TFIDF vectors
    var sentenceTokens = StringTokenizer.GetAllPossibleTokens(inputSentence, processedDataset.MaximumWordCount, ngramSize);
    var inputTokensTFIDFMatrix = TFIDFController.CalculateInputSenenteceTokensTFIDFMatrix(sentenceTokens, processedDataset, ngramSize);

    // re-matching (with resolution): cosine similarity of every token against every surviving row
    var similarityValuesMatrix = DotProductCalculator.GetDotProduct(inputTokensTFIDFMatrix, initialMatchTFIDFMatrix, matrix2Abs: initialMatchTFIDFAbsoluteValues);

    // re-filter at the intermediate TFIDF threshold
    var tfidfMatches = MatchFilter.FilterByThresholdBatch(similarityValuesMatrix, initialMatchAsDataset, sentenceTokens, tfidfThreshold);

    // post processing, then apply the caller-supplied final threshold
    var updatedScoresMatches = PostprocessingController.UpdateMatchScores(tfidfMatches);
    return(MatchFilter.FilterByThreshold(updatedScoresMatches, threshold));
}
/// <summary>
/// Diagnostic test: tokenizes every word list in the test set and prints each generated
/// token's text and index range to the console, separated by divider lines.
/// </summary>
/// <param name="wordListTestset">Word lists to tokenize, one per test sentence.</param>
/// <param name="expected">Expected token lists (currently unused by this dump routine).</param>
/// <param name="maxWordCount">Maximum word count passed to the tokenizer.</param>
/// <param name="ngrams">N-gram size passed to the tokenizer.</param>
public void TokenGenerationTest(List <List <TokenMatchInfo> > wordListTestset, List <List <TokenMatchInfo> > expected, int maxWordCount, int ngrams)
{
    Console.WriteLine("======================");

    // walk every sentence's word list in the testing set
    foreach (var words in wordListTestset)
    {
        var generatedTokens = StringTokenizer.GetAllPossibleTokens(words, maxWordCount, ngrams);

        // dump each token's text and span, followed by a divider
        foreach (var generated in generatedTokens)
        {
            Console.WriteLine(generated.TokenText);
            Console.WriteLine(generated.StartIndex);
            Console.WriteLine(generated.EndIndex);
            Console.WriteLine("======================");
        }
    }
}
/// <summary>
/// Matches entities by tokenizing the input sentence up front and comparing every possible
/// token against the full dataset's TFIDF matrix in a single pass (no candidate reduction).
/// </summary>
/// <param name="processedDataset">Preprocessed dataset holding the TFIDF matrix and its absolute values.</param>
/// <param name="dataset">Raw dataset sentences, row-aligned with the TFIDF matrix.</param>
/// <param name="inputSentence">Sentence to match against the dataset.</param>
/// <param name="threshold">Final score threshold applied after post-processing.</param>
/// <param name="ngramSize">Character n-gram size used for tokenization and TFIDF (default 3).</param>
/// <param name="tfidfThreshold">Intermediate cosine-similarity threshold for the token-level pass (default 0.5).</param>
/// <returns>Filtered match results after score post-processing.</returns>
private List <MatchResult> MatchEntitiesWithIndicesPreTokenizeApproach(ProcessedDataset processedDataset, List <string> dataset, string inputSentence, float threshold, int ngramSize = 3, float tfidfThreshold = 0.5f)
{
    // get all input sentence possible tokens
    var sentenceTokens = StringTokenizer.GetAllPossibleTokens(inputSentence, processedDataset.MaximumWordCount, ngramSize);

    // calculate tokens TFIDF matrix
    var inputTokensTFIDFMatrix = TFIDFController.CalculateInputSenenteceTokensTFIDFMatrix(sentenceTokens, processedDataset, ngramSize);

    // calculate tokens cosine similarity against every dataset row
    var similarityValuesMatrix = DotProductCalculator.GetDotProduct(inputTokensTFIDFMatrix, processedDataset.TFIDFMatrix, matrix2Abs: processedDataset.TFIDFMatrixAbsoluteValues);

    // filter results at the intermediate TFIDF threshold
    var tfidfMatches = MatchFilter.FilterByThresholdBatch(similarityValuesMatrix, dataset, sentenceTokens, tfidfThreshold);

    // post processing, then apply the caller-supplied final threshold
    var updatedScoresMatches = PostprocessingController.UpdateMatchScores(tfidfMatches);
    return(MatchFilter.FilterByThreshold(updatedScoresMatches, threshold));
}