public static float[][] CalculateInputSenenteceTokensTFIDFMatrix(List <TokenMatchInfo> sentenceTokens, ProcessedDataset processedDataset, int ngramSize) { // CAN BE MADE MORE EFFICIENT SINCE SOME TOKENS ARE SUBTOKENS OF OTHERS // TODO: Implement more efficient solution that makes use of Sub-tokens // calculate ngrams for tokens var ngSize = ngramSize == default ? NGramSize : ngramSize; var tokensNgrams = NGramsController.GetSentenceNGramsBatch(sentenceTokens, ngSize); // calculate N-Gram Frequencies var inputTokensNGramFrequencies = NGramFrequencyController.GetNGramFrequencyBatch(tokensNgrams); // calculate TF Matrix var inputTokensTFMaxtrixDataset = TFController.CalculateTFVectorBatch(inputTokensNGramFrequencies, processedDataset.UniqueNGramsVector); // calculate TF-IDF Matrix var inputTokensTFIDFMatrixDataset = MultiplicationCalculator.MultiplyVectorsByCellBatch(inputTokensTFMaxtrixDataset, processedDataset.IDFVector); return(inputTokensTFIDFMatrixDataset); }
public static void CalculateDatasetTFIDFValues(List <string> dataset, out string[] uniqueNGramsVector, out float[] datasetIDFVector, out float[][] datasetTFIDFMatrix, int ngramSize = default) { // calculate ngrams for each sentence var ngSize = ngramSize == default ? NGramSize : ngramSize; var datasetNGrams = NGramsController.GetSentenceNGramsBatch(dataset, ngSize); // calculate ngram frequencies var ngramFrequencies = NGramFrequencyController.GetNGramFrequencyBatch(datasetNGrams); var overallNgramFrequencies = NGramFrequencyController.GetOverallNGramFrequency(datasetNGrams).GetAwaiter().GetResult(); // get ngrams feature vector uniqueNGramsVector = overallNgramFrequencies.Keys.ToArray(); // calculate TF var datasetTFMatrix = TFController.CalculateTFVectorBatch(ngramFrequencies, uniqueNGramsVector); // calculate IDF int dataLength = dataset.Count + 1; datasetIDFVector = IDFController.CalculateIDFVector(uniqueNGramsVector, overallNgramFrequencies, dataLength); // calculate TF-IDF datasetTFIDFMatrix = MultiplicationCalculator.MultiplyVectorsByCellBatch(datasetTFMatrix, datasetIDFVector); }