/// <summary>
/// Matches entities by first running a coarse whole-sentence match to shrink the
/// candidate set, then re-matching every possible token of the input sentence
/// against only the surviving dataset rows (tokenization happens AFTER the
/// initial filter, hence "post-tokenize").
/// </summary>
/// <param name="processedDataset">Precomputed TF-IDF data for the dataset.</param>
/// <param name="dataset">The raw dataset sentences, parallel to the TF-IDF matrix rows.</param>
/// <param name="inputSentence">Sentence to match against the dataset.</param>
/// <param name="threshold">Final post-processing score threshold.</param>
/// <param name="ngramSize">Character n-gram size used throughout (default 3).</param>
/// <returns>Matches whose post-processed score clears <paramref name="threshold"/>.</returns>
private List<MatchResult> MatchEntitiesWithIndicesPostTokenizeApproach(ProcessedDataset processedDataset, List<string> dataset, string inputSentence, float threshold, int ngramSize = 3)
{
    // Initial coarse match over the whole sentence (reduces the database).
    var initialMatchResult = MatchEntitiesWithoutIndices(processedDataset, dataset, inputSentence, ngramSize);
    if (initialMatchResult.Count == 0)
    {
        return initialMatchResult;
    }

    // Row indices of the initial matches. A HashSet gives O(1) membership checks;
    // the previous List<int>.Contains made each indexed Where() pass O(rows * matches).
    var matchingSentencesIndices = new HashSet<int>(initialMatchResult.Select(m => m.DatabaseMatchInfo.MatchIndex));

    // Slice the TF-IDF matrix, its per-row absolute values, and the raw dataset
    // down to the matched rows only (all three stay index-aligned because the
    // same predicate in dataset order is applied to each).
    var initialMatchTFIDFMatrix = processedDataset.TFIDFMatrix.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToArray();
    var initialMatchTFIDFAbsoluteValues = processedDataset.TFIDFMatrixAbsoluteValues.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToArray();
    var initialMatchAsDataset = dataset.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToList();

    // All possible tokens of the input sentence, and one TF-IDF row per token.
    var sentenceTokens = StringTokenizer.GetAllPossibleTokens(inputSentence, processedDataset.MaximumWordCount, ngramSize);
    var inputTokensTFIDFMatrix = TFIDFController.CalculateInputSenenteceTokensTFIDFMatrix(sentenceTokens, processedDataset, ngramSize);

    // Re-match each token against the reduced dataset (with index resolution).
    var similarityValuesMatrix = DotProductCalculator.GetDotProduct(inputTokensTFIDFMatrix, initialMatchTFIDFMatrix, matrix2Abs: initialMatchTFIDFAbsoluteValues);

    // Re-filter against the fixed token-level TF-IDF threshold.
    const float tfidfThreshold = 0.5f;
    var tfidfMatches = MatchFilter.FilterByThresholdBatch(similarityValuesMatrix, initialMatchAsDataset, sentenceTokens, tfidfThreshold);

    // Post-processing: rescore, then apply the caller-supplied threshold.
    var updatedScoresMatches = PostprocessingController.UpdateMatchScores(tfidfMatches);
    return MatchFilter.FilterByThreshold(updatedScoresMatches, threshold);
}
/// <summary>
/// Matches the whole input sentence (no token indices) against every dataset
/// row using cosine similarity of TF-IDF vectors.
/// </summary>
/// <param name="processedDataset">Precomputed TF-IDF data for the dataset.</param>
/// <param name="dataset">The raw dataset sentences, parallel to the TF-IDF matrix rows.</param>
/// <param name="inputSentence">Sentence to match against the dataset.</param>
/// <param name="ngramSize">Character n-gram size to use.</param>
/// <returns>Rows whose similarity clears the fixed TF-IDF cut-off.</returns>
private List<MatchResult> MatchEntitiesWithoutIndices(ProcessedDataset processedDataset, List<string> dataset, string inputSentence, int ngramSize)
{
    // Represent the entire input sentence as a single TF-IDF vector.
    var sentenceVector = TFIDFController.CalculateInputSentenceTFIDFVector(inputSentence, processedDataset, ngramSize);

    // Cosine similarity of that vector against every dataset row.
    var similarities = DotProductCalculator.GetDotProduct(sentenceVector, processedDataset.TFIDFMatrix, matrixAbs: processedDataset.TFIDFMatrixAbsoluteValues);

    // Keep only rows above the fixed sentence-level threshold.
    const float tfidfThreshold = 0.4f;
    return MatchFilter.FilterByThreshold(similarities, dataset, tfidfThreshold);
}
/// <summary>
/// Builds the TF-IDF vector of a single sentence over the dataset's unique
/// n-gram vocabulary.
/// </summary>
/// <param name="sentence">Input sentence.</param>
/// <param name="processedDataset">Supplies the vocabulary and IDF weights.</param>
/// <param name="ngramSize">N-gram size; 0 (default) falls back to <c>NGramSize</c>.</param>
/// <returns>TF-IDF vector aligned with <c>processedDataset.UniqueNGramsVector</c>.</returns>
public static float[] CalculateInputSentenceTFIDFVector(string sentence, ProcessedDataset processedDataset, int ngramSize)
{
    // Substitute the controller-wide default when no n-gram size was supplied.
    var effectiveNGramSize = ngramSize == default ? NGramSize : ngramSize;

    // Sentence -> n-grams -> frequency counts.
    var nGrams = NGramsController.GetSentenceNGrams(sentence, effectiveNGramSize);
    var frequencies = NGramFrequencyController.GetNGramFrequency(nGrams);

    // Project the counts onto the dataset vocabulary (TF), then weight each
    // cell by the dataset IDF to obtain the TF-IDF vector.
    var tfVector = TFController.CalculateTFVector(frequencies, processedDataset.UniqueNGramsVector);
    return MultiplicationCalculator.MultiplyVectorsByCell(tfVector, processedDataset.IDFVector);
}
/// <summary>
/// Entry point for entity matching: dispatches to the implementation selected
/// by <paramref name="matchingMethod"/>.
/// </summary>
/// <param name="processedDataset">Precomputed TF-IDF data for the dataset.</param>
/// <param name="dataset">The raw dataset sentences, parallel to the TF-IDF matrix rows.</param>
/// <param name="inputSentence">Sentence to match against the dataset.</param>
/// <param name="matchingMethod">Which matching strategy to run.</param>
/// <param name="threshold">Final score threshold; 0 (default) falls back to <c>PostProcessingThreshold</c>.</param>
/// <param name="ngramSize">N-gram size; 0 (default) lets downstream controllers substitute their default.</param>
/// <returns>The matches produced by the selected strategy.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="matchingMethod"/> is not a supported value.
/// </exception>
public List<MatchResult> MatchEntities(ProcessedDataset processedDataset, List<string> dataset, string inputSentence, MatchingMethod matchingMethod, float threshold = default, int ngramSize = default)
{
    // Fall back to the class-wide default threshold when none was supplied.
    var matchingThreshold = threshold == default ? PostProcessingThreshold : threshold;

    switch (matchingMethod)
    {
        case MatchingMethod.NoMatchIndices:
            return MatchEntitiesWithoutIndices(processedDataset, dataset, inputSentence, ngramSize);
        case MatchingMethod.PreprocessInputSentence:
            return MatchEntitiesWithIndicesPreTokenizeApproach(processedDataset, dataset, inputSentence, matchingThreshold, ngramSize);
        case MatchingMethod.PostprocessInputSentence:
            return MatchEntitiesWithIndicesPostTokenizeApproach(processedDataset, dataset, inputSentence, matchingThreshold, ngramSize);
        default:
            // Specific BCL exception instead of the bare Exception the original
            // threw; it still derives from Exception, so existing catch clauses
            // continue to work.
            throw new ArgumentOutOfRangeException(nameof(matchingMethod), matchingMethod, "Matching method not supported!");
    }
}
/// <summary>
/// Matches entities by tokenizing the input sentence up front (every possible
/// token span) and comparing each token against the full dataset (tokenization
/// happens BEFORE any filtering, hence "pre-tokenize").
/// </summary>
/// <param name="processedDataset">Precomputed TF-IDF data for the dataset.</param>
/// <param name="dataset">The raw dataset sentences, parallel to the TF-IDF matrix rows.</param>
/// <param name="inputSentence">Sentence to match against the dataset.</param>
/// <param name="threshold">Final post-processing score threshold.</param>
/// <param name="ngramSize">Character n-gram size (default 3). NOTE(review): the
/// dispatcher always passes a value, so this default may never apply — and a
/// passed-through 0 reaches GetAllPossibleTokens unchanged; confirm that helper
/// tolerates it.</param>
/// <returns>Matches whose post-processed score clears <paramref name="threshold"/>.</returns>
private List<MatchResult> MatchEntitiesWithIndicesPreTokenizeApproach(ProcessedDataset processedDataset, List<string> dataset, string inputSentence, float threshold, int ngramSize = 3)
{
    // Enumerate every candidate token span of the input sentence.
    var candidateTokens = StringTokenizer.GetAllPossibleTokens(inputSentence, processedDataset.MaximumWordCount, ngramSize);

    // One TF-IDF row per candidate token.
    var tokenMatrix = TFIDFController.CalculateInputSenenteceTokensTFIDFMatrix(candidateTokens, processedDataset, ngramSize);

    // Cosine similarity of every token against every dataset row.
    var similarityMatrix = DotProductCalculator.GetDotProduct(tokenMatrix, processedDataset.TFIDFMatrix, matrix2Abs: processedDataset.TFIDFMatrixAbsoluteValues);

    // First cut: drop token/row pairs below the fixed TF-IDF threshold.
    const float tfidfThreshold = 0.5f;
    var rawMatches = MatchFilter.FilterByThresholdBatch(similarityMatrix, dataset, candidateTokens, tfidfThreshold);

    // Re-score the survivors, then apply the caller-supplied threshold.
    var rescoredMatches = PostprocessingController.UpdateMatchScores(rawMatches);
    return MatchFilter.FilterByThreshold(rescoredMatches, threshold);
}
/// <summary>
/// Builds one TF-IDF row per input-sentence token over the dataset's unique
/// n-gram vocabulary.
/// </summary>
/// <param name="sentenceTokens">Candidate tokens of the input sentence.</param>
/// <param name="processedDataset">Supplies the vocabulary and IDF weights.</param>
/// <param name="ngramSize">N-gram size; 0 (default) falls back to <c>NGramSize</c>.</param>
/// <returns>A matrix with one TF-IDF row per token, vocabulary-aligned.</returns>
/// <remarks>
/// NOTE(review): the public name contains a typo ("Senentece"); kept as-is
/// because callers depend on it.
/// TODO: exploit the fact that some tokens are sub-tokens of others to avoid
/// recomputing shared n-grams (see original "CAN BE MADE MORE EFFICIENT" note).
/// </remarks>
public static float[][] CalculateInputSenenteceTokensTFIDFMatrix(List<TokenMatchInfo> sentenceTokens, ProcessedDataset processedDataset, int ngramSize)
{
    // Substitute the controller-wide default when no n-gram size was supplied.
    var effectiveNGramSize = ngramSize == default ? NGramSize : ngramSize;

    // Tokens -> n-grams -> per-token frequency tables.
    var tokenNGrams = NGramsController.GetSentenceNGramsBatch(sentenceTokens, effectiveNGramSize);
    var tokenFrequencies = NGramFrequencyController.GetNGramFrequencyBatch(tokenNGrams);

    // TF rows over the dataset vocabulary, then IDF-weighted into TF-IDF rows.
    var tfMatrix = TFController.CalculateTFVectorBatch(tokenFrequencies, processedDataset.UniqueNGramsVector);
    return MultiplicationCalculator.MultiplyVectorsByCellBatch(tfMatrix, processedDataset.IDFVector);
}
/// <summary>
/// Separate inputs columns from output column (from ExtractedDataset
/// table) and convert strings into numeric values.
/// </summary>
/// <param name="attributeToPredict">Name of the output column.</param>
/// <param name="codeBook">Codebook to be used (by default null). Pass one only
/// for testing datasets so training and testing share the same codification.</param>
/// <returns>Bool value indicating whether the dataset was
/// processed correctly or not.</returns>
public bool ProcessDataset(string attributeToPredict, Codification codeBook = null)
{
    // ProcessedDataset will have the same structure of ExtractedDataset.
    ProcessedDataset = ExtractedDataset.Clone();

    // Inputs and outputs must preserve ExtractedDataset's dimensions.
    InputData = new double[ExtractedDataset.Rows.Count][];
    OutputData = new int[ExtractedDataset.Rows.Count];

    // Except for the output column, columns' types are changed to
    // double type (classifiers work with numbers, not with strings).
    foreach (DataColumn column in ExtractedDataset.Columns)
    {
        if (column.ColumnName != attributeToPredict)
        {
            InputColumnNames.Add(column.ColumnName);
            ProcessedDataset.Columns[column.Ordinal].DataType = typeof(double);
        }
        else
        {
            OutputColumnName = column.ColumnName;
        }
    }

    try
    {
        // Temporary variables.
        double tempValue = 0;
        DataRow processedRow = null;
        List<double> tempInput = null;

        for (int row = 0; row < ExtractedDataset.Rows.Count; ++row)
        {
            // Process one row at time.
            processedRow = ProcessedDataset.NewRow();
            tempInput = new List<double>();
            foreach (DataColumn column in ExtractedDataset.Columns)
            {
                if (column.ColumnName != attributeToPredict)
                {
                    // NOTE(review): the TryParse result is ignored — on failure
                    // tempValue is set to 0, so unparsable cells silently become
                    // 0 in both ProcessedDataset and InputData. Confirm this is
                    // the intended behavior for missing/dirty values.
                    Double.TryParse(
                        ExtractedDataset.Rows[row][column.Ordinal] as string,
                        System.Globalization.NumberStyles.Any,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out tempValue);

                    // Create a row of numeric values to be
                    // added to ProcessedDataset and InputData.
                    processedRow[column.Ordinal] = tempValue;
                    tempInput.Add(tempValue);
                }
                else
                {
                    // Don't convert the output column to a number yet, just copy the string
                    // value (conversion to number will be done later by CodeBook).
                    processedRow[column.Ordinal] = ExtractedDataset.Rows[row][column.Ordinal];
                }
            }

            // Add/fill a row in ProcessedDataset and InputData
            // before going to next row.
            ProcessedDataset.Rows.Add(processedRow);
            InputData[row] = tempInput.ToArray();
        }

        if (codeBook != null)
        {
            // Use given codebook (codebook should be given only when dealing
            // with testing datasets, in order to have the same codebook both
            // for training and for testing data).
            this.CodeBook = codeBook;
        }
        else
        {
            // If no codebook is given, create one for the output column.
            CodeBook = new Codification(ExtractedDataset, attributeToPredict);
        }

        // Apply codebook to the ProcessedDataset.
        ProcessedDataset = CodeBook.Apply(ProcessedDataset);

        // InputData is already set, OutputData is set to be the output
        // column codified with the codebook.
        OutputData = ProcessedDataset.ToArray<int>(attributeToPredict);

        // Number of input columns (assumes exactly one output column).
        InputAttributeNumber = ExtractedDataset.Columns.Count - 1;

        // Number of possible values the output column may assume.
        OutputPossibleValues = CodeBook[attributeToPredict].Symbols;
    }
    catch
    {
        // Best-effort contract: any failure (bad data, codebook mismatch) is
        // reported as false rather than propagated. NOTE(review): the exception
        // detail is lost here — consider logging it before returning.
        return (false);
    }

    return (true);
}