/// <summary>
/// Two-stage entity match: first reduces the dataset with a whole-sentence TFIDF match,
/// then re-matches every possible token of the input sentence against the reduced
/// dataset so matches carry index (position) resolution.
/// </summary>
/// <param name="processedDataset">Precomputed TFIDF data; rows align with <paramref name="dataset"/>.</param>
/// <param name="dataset">Raw dataset sentences.</param>
/// <param name="inputSentence">Sentence to match against the dataset.</param>
/// <param name="threshold">Final score cutoff applied after post-processing.</param>
/// <param name="ngramSize">N-gram size used throughout the TFIDF pipeline.</param>
/// <returns>Matches whose post-processed score meets <paramref name="threshold"/>; empty list when the initial pass finds nothing.</returns>
private List <MatchResult> MatchEntitiesWithIndicesPostTokenizeApproach(ProcessedDataset processedDataset, List <string> dataset, string inputSentence, float threshold, int ngramSize = 3)
        {
            // initial match (reduce database)
            var initialMatchResult = MatchEntitiesWithoutIndices(processedDataset, dataset, inputSentence, ngramSize);

            if (initialMatchResult.Count == 0)
            {
                return initialMatchResult;
            }

            // get initial match sentences TFIDF values.
            // HashSet makes the three Contains(rowIndex) scans below O(1) per row
            // instead of O(matches) with a List.
            var matchingSentencesIndices        = initialMatchResult.Select(m => m.DatabaseMatchInfo.MatchIndex).ToHashSet();
            var initialMatchTFIDFMatrix         = processedDataset.TFIDFMatrix.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToArray();
            var initialMatchTFIDFAbsoluteValues = processedDataset.TFIDFMatrixAbsoluteValues.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToArray();
            var initialMatchAsDataset           = dataset.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToList();

            // get all possible tokens of input sentence
            var sentenceTokens         = StringTokenizer.GetAllPossibleTokens(inputSentence, processedDataset.MaximumWordCount, ngramSize);
            var inputTokensTFIDFMatrix = TFIDFController.CalculateInputSenenteceTokensTFIDFMatrix(sentenceTokens, processedDataset, ngramSize);

            // re-matching (with resolution): token-by-row cosine similarity against the reduced dataset
            var similarityValuesMatrix = DotProductCalculator.GetDotProduct(inputTokensTFIDFMatrix, initialMatchTFIDFMatrix, matrix2Abs: initialMatchTFIDFAbsoluteValues);

            // re-filter at the token level before rescoring
            var tfidfThreshold = 0.5f;
            var tfidfMatches   = MatchFilter.FilterByThresholdBatch(similarityValuesMatrix, initialMatchAsDataset, sentenceTokens, tfidfThreshold);

            // post processing, then the caller-supplied threshold decides the final result set
            var updatedScoresMatches = PostprocessingController.UpdateMatchScores(tfidfMatches);

            return MatchFilter.FilterByThreshold(updatedScoresMatches, threshold);
        }
        /// <summary>
        /// Single-pass entity match: scores the whole input sentence against every dataset
        /// row by TFIDF cosine similarity and keeps rows above the cutoff. No index
        /// (position) resolution is performed.
        /// </summary>
        /// <param name="processedDataset">Precomputed TFIDF data; rows align with <paramref name="dataset"/>.</param>
        /// <param name="dataset">Raw dataset sentences.</param>
        /// <param name="inputSentence">Sentence to match against the dataset.</param>
        /// <param name="ngramSize">N-gram size used to build the input TFIDF vector.</param>
        /// <param name="tfidfThreshold">Similarity cutoff; default preserves the original hard-coded 0.4f.</param>
        /// <returns>Dataset rows whose cosine similarity meets <paramref name="tfidfThreshold"/>.</returns>
        private List <MatchResult> MatchEntitiesWithoutIndices(ProcessedDataset processedDataset, List <string> dataset, string inputSentence, int ngramSize, float tfidfThreshold = 0.4f)
        {
            // calculate input sentence TFIDF vector
            var inputSentenceTFIDFVector = TFIDFController.CalculateInputSentenceTFIDFVector(inputSentence, processedDataset, ngramSize);

            // calculate cosine similarity against every dataset row
            var cosineSimilarityValues = DotProductCalculator.GetDotProduct(inputSentenceTFIDFVector, processedDataset.TFIDFMatrix, matrixAbs: processedDataset.TFIDFMatrixAbsoluteValues);

            // filter result by the similarity cutoff
            return MatchFilter.FilterByThreshold(cosineSimilarityValues, dataset, tfidfThreshold);
        }
        /// <summary>
        /// Entity match with index resolution, pre-tokenize variant: every possible token
        /// of the input sentence is scored against the FULL dataset (no initial reduction
        /// pass), results are rescored, then filtered by the caller's threshold.
        /// </summary>
        /// <param name="processedDataset">Precomputed TFIDF data; rows align with <paramref name="dataset"/>.</param>
        /// <param name="dataset">Raw dataset sentences.</param>
        /// <param name="inputSentence">Sentence to match against the dataset.</param>
        /// <param name="threshold">Final score cutoff applied after post-processing.</param>
        /// <param name="ngramSize">N-gram size used throughout the TFIDF pipeline.</param>
        /// <returns>Matches whose post-processed score meets <paramref name="threshold"/>.</returns>
        private List <MatchResult> MatchEntitiesWithIndicesPreTokenizeApproach(ProcessedDataset processedDataset, List <string> dataset, string inputSentence, float threshold, int ngramSize = 3)
        {
            const float tokenMatchCutoff = 0.5f;

            // tokenize the input sentence up front and build one TFIDF row per token
            var tokens      = StringTokenizer.GetAllPossibleTokens(inputSentence, processedDataset.MaximumWordCount, ngramSize);
            var tokensTFIDF = TFIDFController.CalculateInputSenenteceTokensTFIDFMatrix(tokens, processedDataset, ngramSize);

            // cosine similarity of every token vector against every dataset row
            var similarities = DotProductCalculator.GetDotProduct(tokensTFIDF, processedDataset.TFIDFMatrix, matrix2Abs: processedDataset.TFIDFMatrixAbsoluteValues);

            // keep token/row pairs above the cutoff, rescore, then apply the caller's threshold
            var candidates = MatchFilter.FilterByThresholdBatch(similarities, dataset, tokens, tokenMatchCutoff);
            var rescored   = PostprocessingController.UpdateMatchScores(candidates);

            return MatchFilter.FilterByThreshold(rescored, threshold);
        }
        /// <summary>
        /// Precomputes everything the matchers need from a dataset: the unique n-gram
        /// vocabulary, IDF vector, TFIDF matrix, per-row TFIDF magnitudes, and the
        /// maximum word count across all datapoints.
        /// </summary>
        /// <param name="dataset">Raw dataset sentences; output matrix rows align with this list.</param>
        /// <param name="ngramSize">
        /// N-gram size forwarded to the TFIDF calculation. NOTE(review): `default` is 0
        /// here while the matching methods default to 3 — presumably the controller
        /// treats 0 as "use its own default"; confirm against CalculateDatasetTFIDFValues.
        /// </param>
        /// <returns>A <see cref="ProcessedDataset"/> bundling all precomputed values.</returns>
        public ProcessedDataset ProcessDataset(List <string> dataset, int ngramSize = default)
        {
            // calculate TF, IDF, and TFIDF values
            TFIDFController.CalculateDatasetTFIDFValues(dataset, out string[] uniqueNGramsVector, out float[] datasetIDFVector, out float[][] datasetTFIDFMatrix, ngramSize: ngramSize);

            // get TFIDF scalar (magnitude) values for each sentence row
            var datasetTFIDFMatrixAbsoluteValues = ScalarValueCalculator.CalculateVectorAbsoluteValueBatch(datasetTFIDFMatrix);

            // calculate maximum number of words in a datapoint within the dataset
            var maximumWordCount = StringTokenizer.FindMaxWordCount(dataset);

            return new ProcessedDataset()
            {
                TFIDFMatrixAbsoluteValues = datasetTFIDFMatrixAbsoluteValues,
                TFIDFMatrix = datasetTFIDFMatrix,
                IDFVector = datasetIDFVector,
                UniqueNGramsVector = uniqueNGramsVector,
                MaximumWordCount = maximumWordCount,
            };
        }