        private List<MatchResult> MatchEntitiesWithIndicesPostTokenizeApproach(ProcessedDataset processedDataset, List<string> dataset, string inputSentence, float threshold, int ngramSize = 3)
        {
            // initial coarse match to reduce the candidate database
            var initialMatchResult = MatchEntitiesWithoutIndices(processedDataset, dataset, inputSentence, ngramSize);

            if (initialMatchResult.Count == 0)
            {
                return initialMatchResult;
            }

            // get the TF-IDF values of the initially matched sentences
            var matchingSentencesIndices        = new HashSet<int>(initialMatchResult.Select(m => m.DatabaseMatchInfo.MatchIndex)); // HashSet for O(1) lookups below
            var initialMatchTFIDFMatrix         = processedDataset.TFIDFMatrix.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToArray();
            var initialMatchTFIDFAbsoluteValues = processedDataset.TFIDFMatrixAbsoluteValues.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToArray();
            var initialMatchAsDataset           = dataset.Where((rowValue, rowIndex) => matchingSentencesIndices.Contains(rowIndex)).ToList();

            // get all possible tokens of the input sentence
            var sentenceTokens         = StringTokenizer.GetAllPossibleTokens(inputSentence, processedDataset.MaximumWordCount, ngramSize);
            var inputTokensTFIDFMatrix = TFIDFController.CalculateInputSentenceTokensTFIDFMatrix(sentenceTokens, processedDataset, ngramSize);

            // re-match at token resolution against the reduced dataset
            var similarityValuesMatrix = DotProductCalculator.GetDotProduct(inputTokensTFIDFMatrix, initialMatchTFIDFMatrix, matrix2Abs: initialMatchTFIDFAbsoluteValues);

            // re-filter at token resolution with a fixed TF-IDF threshold
            var tfidfThreshold = 0.5f;
            var tfidfMatches   = MatchFilter.FilterByThresholdBatch(similarityValuesMatrix, initialMatchAsDataset, sentenceTokens, tfidfThreshold);

            // post-processing
            var updatedScoresMatches = PostprocessingController.UpdateMatchScores(tfidfMatches);

            return MatchFilter.FilterByThreshold(updatedScoresMatches, threshold);
        }
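MatchFilter is not part of this snippet. Judging from the call above, the final FilterByThreshold overload plausibly just drops matches whose updated score falls below the caller's threshold. A minimal sketch under that assumption (MatchScore is a guessed property name; only DatabaseMatchInfo.MatchIndex is visible in this snippet):

        // Sketch only: final filtering pass over already-built MatchResult objects.
        // MatchScore is an assumed property name, not confirmed by this snippet.
        public static List<MatchResult> FilterByThreshold(List<MatchResult> matches, float threshold)
        {
            return matches.Where(m => m.MatchScore >= threshold).ToList();
        }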
        private List<MatchResult> MatchEntitiesWithoutIndices(ProcessedDataset processedDataset, List<string> dataset, string inputSentence, int ngramSize)
        {
            // calculate input sentence TFIDF vector
            var inputSentenceTFIDFVector = TFIDFController.CalculateInputSentenceTFIDFVector(inputSentence, processedDataset, ngramSize);

            // calculate cosine similarity
            var cosineSimilarityValues = DotProductCalculator.GetDotProduct(inputSentenceTFIDFVector, processedDataset.TFIDFMatrix, matrixAbs: processedDataset.TFIDFMatrixAbsoluteValues);

            // filter results with a fixed first-pass TF-IDF threshold
            var tfidfThreshold = 0.4f;

            return MatchFilter.FilterByThreshold(cosineSimilarityValues, dataset, tfidfThreshold);
        }
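DotProductCalculator is also external. The precomputed TFIDFMatrixAbsoluteValues passed as matrixAbs suggest that GetDotProduct actually returns cosine similarities, using the stored row norms to avoid recomputing them per query. A sketch of the vector-against-matrix overload under that assumption:

        // Sketch only: cosine similarity of one TF-IDF vector against every matrix row,
        // assuming matrixAbs holds each row's precomputed Euclidean (L2) norm.
        public static float[] GetDotProduct(float[] vector, float[][] matrix, float[] matrixAbs)
        {
            var vectorNorm = (float)Math.Sqrt(vector.Sum(v => v * v));
            var result     = new float[matrix.Length];

            for (int row = 0; row < matrix.Length; row++)
            {
                var dot = 0f;
                for (int col = 0; col < vector.Length; col++)
                {
                    dot += vector[col] * matrix[row][col];
                }

                // guard against zero norms to avoid division by zero
                var denominator = vectorNorm * matrixAbs[row];
                result[row]     = denominator == 0f ? 0f : dot / denominator;
            }

            return result;
        }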
Example n. 3
        public static float[] CalculateInputSentenceTFIDFVector(string sentence, ProcessedDataset processedDataset, int ngramSize)
        {
            // calculate n-grams for the sentence (fall back to the default size when none is given)
            var ngSize         = ngramSize == default ? NGramSize : ngramSize;
            var sentenceNGrams = NGramsController.GetSentenceNGrams(sentence, ngSize);

            // calculate ngrams frequencies
            var sentenceNGramFrequencies = NGramFrequencyController.GetNGramFrequency(sentenceNGrams);

            // calculate TF vector
            var sentenceTFVectorDataset = TFController.CalculateTFVector(sentenceNGramFrequencies, processedDataset.UniqueNGramsVector);

            // calculate TF-IDF vector
            return MultiplicationCalculator.MultiplyVectorsByCell(sentenceTFVectorDataset, processedDataset.IDFVector);
        }
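TFController is not shown either. A plausible sketch of CalculateTFVector, assuming UniqueNGramsVector is the dataset's n-gram vocabulary in column order and the frequencies come back as a dictionary (both shapes are assumptions):

        // Sketch only: project a sentence's n-gram counts onto the dataset vocabulary.
        // N-grams missing from the sentence keep a TF of 0.
        public static float[] CalculateTFVector(Dictionary<string, int> ngramFrequencies, string[] uniqueNGrams)
        {
            var tfVector   = new float[uniqueNGrams.Length];
            var totalCount = (float)ngramFrequencies.Values.Sum();

            if (totalCount == 0)
            {
                return tfVector;
            }

            for (int i = 0; i < uniqueNGrams.Length; i++)
            {
                if (ngramFrequencies.TryGetValue(uniqueNGrams[i], out var count))
                {
                    tfVector[i] = count / totalCount; // count-normalized term frequency
                }
            }

            return tfVector;
        }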
        public List<MatchResult> MatchEntities(ProcessedDataset processedDataset, List<string> dataset, string inputSentence, MatchingMethod matchingMethod, float threshold = default, int ngramSize = default)
        {
            var matchingThreshold = threshold == default ? PostProcessingThreshold : threshold; // fall back to the configured default

            switch (matchingMethod)
            {
            case MatchingMethod.NoMatchIndices:
                return MatchEntitiesWithoutIndices(processedDataset, dataset, inputSentence, ngramSize);

            case MatchingMethod.PreprocessInputSentence:
                return MatchEntitiesWithIndicesPreTokenizeApproach(processedDataset, dataset, inputSentence, matchingThreshold, ngramSize);

            case MatchingMethod.PostprocessInputSentence:
                return MatchEntitiesWithIndicesPostTokenizeApproach(processedDataset, dataset, inputSentence, matchingThreshold, ngramSize);

            default:
                throw new ArgumentOutOfRangeException(nameof(matchingMethod), "Matching method not supported!");
            }
        }
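For context, a hypothetical call site. BuildProcessedDataset stands in for whatever preprocessing step produces the TF-IDF matrices, and EntityMatcher for the class hosting MatchEntities; neither name appears in this snippet:

        // Hypothetical usage of the matching entry point.
        var dataset          = new List<string> { "john smith", "acme corporation", "jane doe" };
        var processedDataset = BuildProcessedDataset(dataset); // assumed preprocessing helper

        var matcher = new EntityMatcher();                     // assumed host class
        var matches = matcher.MatchEntities(
            processedDataset,
            dataset,
            "invoice issued to acme corp",
            MatchingMethod.PostprocessInputSentence,
            threshold: 0.6f,
            ngramSize: 3);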
        private List<MatchResult> MatchEntitiesWithIndicesPreTokenizeApproach(ProcessedDataset processedDataset, List<string> dataset, string inputSentence, float threshold, int ngramSize = 3)
        {
            // get all possible tokens of the input sentence
            var sentenceTokens = StringTokenizer.GetAllPossibleTokens(inputSentence, processedDataset.MaximumWordCount, ngramSize);

            // calculate tokens TFIDF matrix
            var inputTokensTFIDFMatrix = TFIDFController.CalculateInputSentenceTokensTFIDFMatrix(sentenceTokens, processedDataset, ngramSize);

            // calculate tokens cosine similarity
            var similarityValuesMatrix = DotProductCalculator.GetDotProduct(inputTokensTFIDFMatrix, processedDataset.TFIDFMatrix, matrix2Abs: processedDataset.TFIDFMatrixAbsoluteValues);

            // filter results with a fixed token-level TF-IDF threshold
            var tfidfThreshold = 0.5f;
            var tfidfMatches   = MatchFilter.FilterByThresholdBatch(similarityValuesMatrix, dataset, sentenceTokens, tfidfThreshold);

            // post processing
            var updatedScoresMatches = PostprocessingController.UpdateMatchScores(tfidfMatches);

            return MatchFilter.FilterByThreshold(updatedScoresMatches, threshold);
        }
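StringTokenizer.GetAllPossibleTokens is not included; the MaximumWordCount argument suggests it enumerates every contiguous word window of the input up to that length. A sketch on that basis (TokenMatchInfo's members are guessed; ngramSize is kept only for signature parity):

        // Sketch only: emit every contiguous word window of length 1..maxWordCount.
        // TokenText, StartIndex and EndIndex are assumed member names.
        public static List<TokenMatchInfo> GetAllPossibleTokens(string sentence, int maxWordCount, int ngramSize)
        {
            var words  = sentence.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            var tokens = new List<TokenMatchInfo>();

            for (int start = 0; start < words.Length; start++)
            {
                for (int length = 1; length <= maxWordCount && start + length <= words.Length; length++)
                {
                    tokens.Add(new TokenMatchInfo
                    {
                        TokenText  = string.Join(" ", words.Skip(start).Take(length)),
                        StartIndex = start,
                        EndIndex   = start + length - 1
                    });
                }
            }

            return tokens;
        }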
Example n. 6
        public static float[][] CalculateInputSentenceTokensTFIDFMatrix(List<TokenMatchInfo> sentenceTokens, ProcessedDataset processedDataset, int ngramSize)
        {
            // NOTE: this can be made more efficient, since some tokens are sub-tokens of others
            // TODO: implement a more efficient solution that reuses sub-token results
            // calculate n-grams for the tokens (fall back to the default size when none is given)
            var ngSize       = ngramSize == default ? NGramSize : ngramSize;
            var tokensNgrams = NGramsController.GetSentenceNGramsBatch(sentenceTokens, ngSize);

            // calculate N-Gram Frequencies
            var inputTokensNGramFrequencies = NGramFrequencyController.GetNGramFrequencyBatch(tokensNgrams);

            // calculate TF Matrix
            var inputTokensTFMatrixDataset = TFController.CalculateTFVectorBatch(inputTokensNGramFrequencies, processedDataset.UniqueNGramsVector);

            // calculate TF-IDF Matrix
            var inputTokensTFIDFMatrixDataset = MultiplicationCalculator.MultiplyVectorsByCellBatch(inputTokensTFMatrixDataset, processedDataset.IDFVector);

            return inputTokensTFIDFMatrixDataset;
        }
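NGramsController is external as well. Given the default ngramSize of 3 used throughout, a character-level n-gram reading is a natural assumption; a sketch of GetSentenceNGrams on that basis:

        // Sketch only: sliding character n-grams over the sentence,
        // assuming character-level (not word-level) n-grams.
        public static List<string> GetSentenceNGrams(string sentence, int ngramSize)
        {
            var ngrams = new List<string>();

            if (string.IsNullOrEmpty(sentence) || sentence.Length < ngramSize)
            {
                return ngrams;
            }

            for (int i = 0; i <= sentence.Length - ngramSize; i++)
            {
                ngrams.Add(sentence.Substring(i, ngramSize));
            }

            return ngrams;
        }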
Example n. 7
        /// <summary>
        /// Separate the input columns from the output column (from the
        /// ExtractedDataset table) and convert strings into numeric values.
        /// </summary>
        /// <param name="attributeToPredict">Name of the output column.</param>
        /// <param name="codeBook">Codebook to be used (null by default).</param>
        /// <returns>True if the dataset was processed correctly,
        /// false otherwise.</returns>
        public bool ProcessDataset(string attributeToPredict, Codification codeBook = null)
        {
            // ProcessedDataset will have the same structure as ExtractedDataset.
            ProcessedDataset = ExtractedDataset.Clone();

            // Inputs and outputs must preserve ExtractedDataset's dimensions.
            InputData  = new double[ExtractedDataset.Rows.Count][];
            OutputData = new int[ExtractedDataset.Rows.Count];

            // Except for the output column, columns' types are changed to
            // double type (classifiers work with numbers, not with strings).
            foreach (DataColumn column in ExtractedDataset.Columns)
            {
                if (column.ColumnName != attributeToPredict)
                {
                    InputColumnNames.Add(column.ColumnName);
                    ProcessedDataset.Columns[column.Ordinal].DataType = typeof(double);
                }
                else
                {
                    OutputColumnName = column.ColumnName;
                }
            }

            try
            {
                // Temporary variables.
                double       tempValue    = 0;
                DataRow      processedRow = null;
                List<double> tempInput    = null;

                for (int row = 0; row < ExtractedDataset.Rows.Count; ++row)
                {
                    // Process one row at a time.
                    processedRow = ProcessedDataset.NewRow();
                    tempInput    = new List<double>();
                    foreach (DataColumn column in ExtractedDataset.Columns)
                    {
                        if (column.ColumnName != attributeToPredict)
                        {
                            // A failed parse leaves tempValue at 0, so
                            // non-numeric cells are stored as 0.0.
                            Double.TryParse(
                                ExtractedDataset.Rows[row][column.Ordinal] as string,
                                System.Globalization.NumberStyles.Any,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out tempValue);
                            // Create a row of numeric values to be
                            // added to ProcessedDataset and InputData.
                            processedRow[column.Ordinal] = tempValue;
                            tempInput.Add(tempValue);
                        }
                        else
                        {
                            // Don't convert the output column to a number yet, just copy the string
                            // value (conversion to number will be done later by CodeBook).
                            processedRow[column.Ordinal] = ExtractedDataset.Rows[row][column.Ordinal];
                        }
                    }
                    // Add/fill a row in ProcessedDataset and InputData
                    // before moving to the next row.
                    ProcessedDataset.Rows.Add(processedRow);
                    InputData[row] = tempInput.ToArray();
                }

                if (codeBook != null)
                {
                    // Use given codebook (codebook should be given only when dealing
                    // with testing datasets, in order to have the same codebook both
                    // for training and for testing data).
                    this.CodeBook = codeBook;
                }
                else
                {
                    // If no codebook is given, create one for the output column.
                    CodeBook = new Codification(ExtractedDataset, attributeToPredict);
                }

                // Apply codebook to the ProcessedDataset.
                ProcessedDataset = CodeBook.Apply(ProcessedDataset);

                // InputData is already set, OutputData is set to be the output
                // column codified with the codebook.
                OutputData = ProcessedDataset.ToArray <int>(attributeToPredict);

                // Number of input columns.
                InputAttributeNumber = ExtractedDataset.Columns.Count - 1;

                // Number of possible values the output column may assume.
                OutputPossibleValues = CodeBook[attributeToPredict].Symbols;
            }
            catch
            {
                // Report any failure as an unprocessed dataset.
                return false;
            }
            return true;
        }
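A typical usage of ProcessDataset, reusing the training codebook for the test set so both share the same label encoding. DatasetProcessor stands in for the class hosting this method and LoadTable for whatever fills ExtractedDataset; both names (and the settable ExtractedDataset property) are assumptions:

        // Hypothetical usage: process training data first, then apply
        // the same codebook to the test data.
        var trainProcessor = new DatasetProcessor { ExtractedDataset = LoadTable("train.csv") };
        var testProcessor  = new DatasetProcessor { ExtractedDataset = LoadTable("test.csv") };

        if (trainProcessor.ProcessDataset("Label"))
        {
            // The shared codebook maps test labels to the same integer codes.
            testProcessor.ProcessDataset("Label", trainProcessor.CodeBook);
        }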