/// <summary>
/// Two-stage entity matching: a coarse TFIDF match over the whole dataset first shrinks the
/// candidate set, then every possible token (n-gram window) of the input sentence is re-matched
/// against only those candidates to resolve match positions.
/// </summary>
/// <param name="processedDataset">Precomputed TFIDF artifacts for the dataset.</param>
/// <param name="dataset">The raw sentences, row-aligned with <paramref name="processedDataset"/>.</param>
/// <param name="inputSentence">The sentence to match against the dataset.</param>
/// <param name="threshold">Final score threshold applied after post-processing.</param>
/// <param name="ngramSize">Character n-gram size used throughout the TFIDF pipeline.</param>
/// <returns>Matches whose post-processed score meets <paramref name="threshold"/>; empty list when the initial pass finds nothing.</returns>
private List<MatchResult> MatchEntitiesWithIndicesPostTokenizeApproach(ProcessedDataset processedDataset, List<string> dataset, string inputSentence, float threshold, int ngramSize = 3)
{
    // Stage 1: coarse match against the full dataset to reduce the search space.
    var initialMatchResult = MatchEntitiesWithoutIndices(processedDataset, dataset, inputSentence, ngramSize);
    if (initialMatchResult.Count == 0)
    {
        return initialMatchResult;
    }

    // Collect the surviving row indices. A HashSet makes each membership test O(1);
    // the previous List<int>.Contains was O(k) per row, i.e. O(n*k) across the dataset.
    var matchingSentencesIndices = new HashSet<int>(initialMatchResult.Select(m => m.DatabaseMatchInfo.MatchIndex));

    // Slice the TFIDF matrix, its precomputed vector magnitudes, and the raw sentences
    // down to the initially-matched rows (all three stay row-aligned with each other).
    // NOTE(review): match indices produced downstream refer to positions in this reduced
    // dataset, not the original one — confirm callers expect that.
    var initialMatchTFIDFMatrix = processedDataset.TFIDFMatrix.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToArray();
    var initialMatchTFIDFAbsoluteValues = processedDataset.TFIDFMatrixAbsoluteValues.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToArray();
    var initialMatchAsDataset = dataset.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToList();

    // Stage 2: tokenize the input sentence into all possible windows and compute their TFIDF vectors.
    var sentenceTokens = StringTokenizer.GetAllPossibleTokens(inputSentence, processedDataset.MaximumWordCount, ngramSize);
    var inputTokensTFIDFMatrix = TFIDFController.CalculateInputSenenteceTokensTFIDFMatrix(sentenceTokens, processedDataset, ngramSize);

    // Re-match every token against the reduced candidate set (cosine similarity via dot product).
    var similarityValuesMatrix = DotProductCalculator.GetDotProduct(inputTokensTFIDFMatrix, initialMatchTFIDFMatrix, matrix2Abs: initialMatchTFIDFAbsoluteValues);

    // Filter token-level matches with a fixed intermediate TFIDF threshold,
    // then post-process scores and apply the caller-supplied final threshold.
    var tfidfThreshold = 0.5f;
    var tfidfMatches = MatchFilter.FilterByThresholdBatch(similarityValuesMatrix, initialMatchAsDataset, sentenceTokens, tfidfThreshold);
    var updatedScoresMatches = PostprocessingController.UpdateMatchScores(tfidfMatches);
    return MatchFilter.FilterByThreshold(updatedScoresMatches, threshold);
}
/// <summary>
/// Coarse whole-sentence match: computes the TFIDF vector of the input sentence,
/// scores it against every dataset row by cosine similarity, and keeps rows above
/// a fixed intermediate threshold. Used as the reduction pass before positional matching.
/// </summary>
/// <param name="processedDataset">Precomputed TFIDF artifacts for the dataset.</param>
/// <param name="dataset">The raw sentences, row-aligned with <paramref name="processedDataset"/>.</param>
/// <param name="inputSentence">The sentence to match.</param>
/// <param name="ngramSize">Character n-gram size used by the TFIDF pipeline.</param>
/// <returns>Dataset rows whose cosine similarity to the input sentence exceeds the fixed threshold.</returns>
private List<MatchResult> MatchEntitiesWithoutIndices(ProcessedDataset processedDataset, List<string> dataset, string inputSentence, int ngramSize)
{
    const float intermediateThreshold = 0.4f;

    // TFIDF vector of the whole input sentence.
    var sentenceVector = TFIDFController.CalculateInputSentenceTFIDFVector(inputSentence, processedDataset, ngramSize);

    // Cosine similarity against every dataset row (dot product normalized by precomputed magnitudes).
    var similarities = DotProductCalculator.GetDotProduct(sentenceVector, processedDataset.TFIDFMatrix, matrixAbs: processedDataset.TFIDFMatrixAbsoluteValues);

    // Keep only rows above the fixed intermediate threshold.
    return MatchFilter.FilterByThreshold(similarities, dataset, intermediateThreshold);
}
/// <summary>
/// Single-stage positional matching: tokenizes the input sentence into every possible
/// window up front, scores all token vectors against the full dataset at once, then
/// filters, post-processes, and applies the caller's final threshold.
/// </summary>
/// <param name="processedDataset">Precomputed TFIDF artifacts for the dataset.</param>
/// <param name="dataset">The raw sentences, row-aligned with <paramref name="processedDataset"/>.</param>
/// <param name="inputSentence">The sentence to match.</param>
/// <param name="threshold">Final score threshold applied after post-processing.</param>
/// <param name="ngramSize">Character n-gram size used by the TFIDF pipeline.</param>
/// <returns>Matches whose post-processed score meets <paramref name="threshold"/>.</returns>
private List<MatchResult> MatchEntitiesWithIndicesPreTokenizeApproach(ProcessedDataset processedDataset, List<string> dataset, string inputSentence, float threshold, int ngramSize = 3)
{
    const float intermediateThreshold = 0.5f;

    // Every possible token window of the input sentence, and its TFIDF matrix.
    var tokenWindows = StringTokenizer.GetAllPossibleTokens(inputSentence, processedDataset.MaximumWordCount, ngramSize);
    var tokenVectors = TFIDFController.CalculateInputSenenteceTokensTFIDFMatrix(tokenWindows, processedDataset, ngramSize);

    // Cosine similarity of every token window against every dataset row.
    var similarityMatrix = DotProductCalculator.GetDotProduct(tokenVectors, processedDataset.TFIDFMatrix, matrix2Abs: processedDataset.TFIDFMatrixAbsoluteValues);

    // Intermediate filtering, score post-processing, then the caller-supplied threshold.
    var intermediateMatches = MatchFilter.FilterByThresholdBatch(similarityMatrix, dataset, tokenWindows, intermediateThreshold);
    var rescoredMatches = PostprocessingController.UpdateMatchScores(intermediateMatches);
    return MatchFilter.FilterByThreshold(rescoredMatches, threshold);
}
/// <summary>
/// Precomputes all per-dataset TFIDF artifacts needed for matching: the unique n-gram
/// vocabulary, IDF vector, TFIDF matrix, per-row vector magnitudes, and the maximum
/// word count found in any datapoint.
/// </summary>
/// <param name="dataset">The raw sentences to process.</param>
/// <param name="ngramSize">Character n-gram size; <c>default</c> defers to the TFIDF controller's own default.</param>
/// <returns>A <see cref="ProcessedDataset"/> bundling the computed artifacts, row-aligned with <paramref name="dataset"/>.</returns>
public ProcessedDataset ProcessDataset(List<string> dataset, int ngramSize = default)
{
    // TF, IDF, and TFIDF values for the whole dataset in one pass.
    TFIDFController.CalculateDatasetTFIDFValues(dataset, out string[] uniqueNGramsVector, out float[] datasetIDFVector, out float[][] datasetTFIDFMatrix, ngramSize: ngramSize);

    // Per-row vector magnitudes, cached so cosine similarity can skip renormalizing later.
    var rowMagnitudes = ScalarValueCalculator.CalculateVectorAbsoluteValueBatch(datasetTFIDFMatrix);

    // Longest datapoint (in words) — bounds the token windows generated at match time.
    var maximumWordCount = StringTokenizer.FindMaxWordCount(dataset);

    return new ProcessedDataset()
    {
        TFIDFMatrixAbsoluteValues = rowMagnitudes,
        TFIDFMatrix = datasetTFIDFMatrix,
        IDFVector = datasetIDFVector,
        UniqueNGramsVector = uniqueNGramsVector,
        MaximumWordCount = maximumWordCount,
    };
}