Exemple #1
0
        /// <summary>
        /// Builds one weighted vector for the document as a whole from its per-term frequencies.
        /// </summary>
        /// <param name="termFrequencyInDocument">Frequency of each term, indexed by term position.</param>
        /// <param name="myTDMParameters">Supplies the weighting scheme (<c>TheTFIDFWeight</c>).</param>
        /// <param name="averageLengthOfPhrases">Forwarded unchanged to <c>MultiplyTFbyIDFUsingWeights</c>.</param>
        /// <param name="vectorIDF">IDF value of each term, indexed by term position.</param>
        /// <returns>An array with one weighted value per entry of <paramref name="termFrequencyInDocument"/>.</returns>
        private static double[] CalculateDocumentQueryVector(int[] termFrequencyInDocument,
                                                             TDMParameters myTDMParameters, double averageLengthOfPhrases, double[] vectorIDF)
        {
            var termCount = termFrequencyInDocument.Length;

            // A single pass collects the sum of all frequencies (used as the document
            // length) and the largest individual frequency (never below zero).
            var frequencySum     = 0;
            var highestFrequency = 0.0;

            foreach (var frequency in termFrequencyInDocument)
            {
                frequencySum    += frequency;
                highestFrequency = Math.Max(highestFrequency, frequency);
            }

            // Every term is weighted against the same document-wide maximum and length.
            var weightedVector = new double[termCount];

            for (var position = 0; position < termCount; position++)
            {
                var rawFrequency = termFrequencyInDocument[position];
                var idf          = vectorIDF[position];

                weightedVector[position] = MultiplyTFbyIDFUsingWeights(rawFrequency, idf,
                                                                       myTDMParameters.TheTFIDFWeight,
                                                                       highestFrequency, frequencySum,
                                                                       averageLengthOfPhrases);
            }

            return(weightedVector);
        }
Exemple #2
0
        /// <summary>
        /// Builds the weighted term vector of the document title.
        /// </summary>
        /// <param name="processedTitle">
        /// Title text; it is split on single spaces into terms. A null or empty title
        /// produces an all-zero vector.
        /// </param>
        /// <param name="termsTable">Maps each known term to its position in the returned vector.</param>
        /// <param name="myTDMParameters">Supplies the weighting scheme (<c>TheTFIDFWeight</c>).</param>
        /// <param name="averageLengthOfPhrases">Forwarded unchanged to <c>MultiplyTFbyIDFUsingWeights</c>.</param>
        /// <param name="vectorIDF">IDF value of each term, indexed by term position.</param>
        /// <returns>
        /// An array of <c>termsTable.Count</c> values; positions of terms that do not
        /// occur in the title stay at zero.
        /// </returns>
        private static double[] CalculateTitleWeights(string processedTitle,
                                                      Dictionary <string, int> termsTable, TDMParameters myTDMParameters,
                                                      double averageLengthOfPhrases, double[] vectorIDF)
        {
            var titleTFVector = new double[termsTable.Count];

            if (string.IsNullOrEmpty(processedTitle))
            {
                return(titleTFVector);
            }

            var terms            = processedTitle.Split(' ');
            var maximumFrequency = 0.0;

            // Positions of the known terms seen in the title. A set keeps the
            // "already seen" check O(1) and lets the weighting pass below work
            // directly on positions, with no second dictionary lookup.
            var positionsInTitle = new HashSet <int>();

            foreach (var term in terms)
            {
                // Tokens that are not in the term dictionary are ignored.
                if (!termsTable.TryGetValue(term, out var posTerm))
                {
                    continue;
                }

                titleTFVector[posTerm] += 1;
                positionsInTitle.Add(posTerm);

                if (titleTFVector[posTerm] > maximumFrequency)
                {
                    maximumFrequency = titleTFVector[posTerm];
                }
            }

            // The title length counts every token produced by the split, including
            // tokens that are not in the term dictionary.
            var titleLength = terms.Length;

            var titleTFIDFVector = new double[termsTable.Count];

            foreach (var position in positionsInTitle)
            {
                titleTFIDFVector[position] = MultiplyTFbyIDFUsingWeights(titleTFVector[position],
                                                                         vectorIDF[position], myTDMParameters.TheTFIDFWeight,
                                                                         maximumFrequency, titleLength, averageLengthOfPhrases);
            }

            return(titleTFIDFVector);
        }
Exemple #3
0
        /// <summary>
        /// Builds the term-document matrix for a document, or loads it from a cache
        /// file when one already exists.
        /// </summary>
        /// <param name="documentFile">Document handed to the Lucene indexer.</param>
        /// <param name="myTDMParameters">
        /// Weighting scheme and thresholds; also stored in <c>MyTDMParameters</c>.
        /// </param>
        /// <param name="cacheFileName">
        /// Cache path without extension; ".d2v" (Doc2Vec) or ".tdm" (any other weight)
        /// is appended. An empty string disables caching: nothing is read and nothing
        /// is written.
        /// </param>
        public TDM(string documentFile, TDMParameters myTDMParameters, string cacheFileName)
        {
            var extension = myTDMParameters.TheTFIDFWeight == TFIDFWeight.Doc2Vec ? ".d2v" : ".tdm";

            MyTDMParameters = myTDMParameters;

            // The same guard protects both the read and the write side of the cache,
            // so an empty name never produces a stray file called just ".tdm"/".d2v".
            var useCache  = cacheFileName.Length != 0;
            var cachePath = cacheFileName + extension;

            if (useCache && File.Exists(cachePath))
            {
                ReadFromFile(cachePath);
                return;
            }

            // Sentences are indexed in RAM using Lucene
            var theIndexer = new LuceneIndexer();

            theIndexer.InMemory(documentFile);
            PhrasesList = theIndexer.PhraseList;
            var processedTitle = theIndexer.DocumentTitle;

            if (MyTDMParameters.TheTFIDFWeight == TFIDFWeight.Doc2Vec)
            {
                // Doc2Vec fills the matrix and both vectors itself; the term
                // dictionary and the significance filters below are skipped.
                CalculateDoc2Vec(PhrasesList, processedTitle,
                                 cacheFileName, out TFIDFMatrix, out _documentTFIDFVector,
                                 out _titleTFIDFVector);

                if (useCache)
                {
                    SaveToFile(cachePath);
                }
                return;
            }

            // Organize the dictionary of direct and inverted terms after processing with LUCENE
            var counter = 0;

            foreach (var term in theIndexer.TermsList)
            {
                // A term consisting of a single space is not indexed.
                if (term == " ")
                {
                    continue;
                }
                _termsTable.Add(term, counter);
                _invertedTermsTable.Add(counter, term);
                counter++;
            }

            Debug.WriteLine("Original TDM is: " + PhrasesList.Count + " x " + _termsTable.Count);

            CreateTDMMatrix(PhrasesList, _termsTable, myTDMParameters,
                            out TFIDFMatrix, out var termFrequencyInCollection,
                            out _, out _idfVector, out _documentTFIDFVector, out _averageLengthOfPhrases);

            // If you change the term dictionary (TermsTable) for terms that do not exceed
            // the threshold, all structures are recalculated
            if (RemoveTermsThatAreNotSignificant(termFrequencyInCollection, MyTDMParameters.MinimumFrequencyThresholdOfTermsForPhrase))
            {
                CreateTDMMatrix(PhrasesList, _termsTable, myTDMParameters,
                                out TFIDFMatrix, out termFrequencyInCollection,
                                out _, out _idfVector, out _documentTFIDFVector, out _averageLengthOfPhrases);
            }

            // If you change the list of phrases by sentences that do not exceed
            // the threshold, all structures are recalculated
            if (RemovePhrasesThatAreNotSignificant())
            {
                UpdateTermsTable(PhrasesList);
                CreateTDMMatrix(PhrasesList, _termsTable, myTDMParameters, out TFIDFMatrix,
                                out termFrequencyInCollection,
                                out _, out _idfVector, out _documentTFIDFVector, out _averageLengthOfPhrases);
            }
            Debug.WriteLine("TDM final de : " + PhrasesList.Count + " x " + _termsTable.Count);

            _titleTFIDFVector = CalculateTitleWeights(processedTitle, _termsTable, myTDMParameters, _averageLengthOfPhrases, _idfVector);

            SortSimilaritiesToTitle();

            if (useCache)
            {
                SaveToFile(cachePath);
            }
        }
Exemple #4
0
        /// <summary>
        /// Fills the weighted term-document matrix and its companion vectors for a list of phrases.
        /// </summary>
        /// <remarks>
        /// Side effect: any term of a phrase that is missing from <paramref name="termsTable"/>
        /// is removed from that phrase through <c>RemoveTerm</c>.
        /// </remarks>
        /// <param name="phrasesList">Phrases of the document; one matrix row per phrase.</param>
        /// <param name="termsTable">Maps each term to its column in the matrix.</param>
        /// <param name="myTDMParameters">Supplies the weighting scheme and the document representation.</param>
        /// <param name="matrixTFIDF">Receives the weighted matrix, indexed [phrase][term].</param>
        /// <param name="termFrequencyInCollection">Receives the summed frequency of each term over all phrases.</param>
        /// <param name="observedIDFVector">Receives, per term, the number of phrases that contain it.</param>
        /// <param name="vectorIDF">Receives the IDF value of each term.</param>
        /// <param name="documentTFIDFVector">
        /// Receives the centroid or the query vector of the document, depending on
        /// <c>TheDocumentRepresentation</c>; null for any other representation.
        /// </param>
        /// <param name="averageLengthOfPhrases">Receives the mean <c>ProcessedLength</c> of the phrases.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// The weighting scheme is not Simple, Complete, BM25 or Best and at least one term exists.
        /// </exception>
        private static void CreateTDMMatrix(List <PhraseData> phrasesList, Dictionary <string, int> termsTable,
                                            TDMParameters myTDMParameters, out double[][] matrixTFIDF, out int[] termFrequencyInCollection,
                                            out int[] observedIDFVector, out double[] vectorIDF, out double[] documentTFIDFVector,
                                            out double averageLengthOfPhrases)
        {
            // The observed frequency of each term i is calculated in each phrase (TF or Fi),
            // and the frequency of each term is stored in the entire collection and the
            // number of sentences in which a term appears in the entire document
            matrixTFIDF = new double[phrasesList.Count][];
            termFrequencyInCollection = new int[termsTable.Count];
            observedIDFVector         = new int[termsTable.Count];
            vectorIDF = new double[termsTable.Count];

            // Floating-point division: an empty phrase list gives NaN, not an exception.
            averageLengthOfPhrases  = phrasesList.Aggregate(0.0, (current, phrase) => current + phrase.ProcessedLength);
            averageLengthOfPhrases /= phrasesList.Count;

            for (var posPhrase = 0; posPhrase < phrasesList.Count; posPhrase++)
            {
                matrixTFIDF[posPhrase] = new double[termsTable.Count];

                // Removal is deferred so the phrase's term collection is not modified
                // while it is being enumerated.
                var termsToRemove = new List <string>();
                foreach (var term in phrasesList[posPhrase].UniqueTermsInProcessedText)
                {
                    if (termsTable.TryGetValue(term.Key, out var posTerm))
                    {
                        matrixTFIDF[posPhrase][posTerm]     = term.Value;
                        termFrequencyInCollection[posTerm] += term.Value;
                        observedIDFVector[posTerm]         += 1;
                    }
                    else
                    {
                        // The terms that should be eliminated in the sentence are marked because they are not in the collection
                        termsToRemove.Add(term.Key);
                    }
                }

                // The terms that are not included in the collection are eliminated
                foreach (var term in termsToRemove)
                {
                    phrasesList[posPhrase].RemoveTerm(term);
                }
            }

            // The IDFVector is calculated. It stores the IDF value for each term in the corpus
            //    Simple Weight   IDF i = ln ( N / ni ) = LexRank
            //    Complete Weight IDF i = ln ( N / (ni+1) )
            //    BM25 Weight     IDF i = ln ( N / ni )
            //    Best Weight     IDF i = log10 ( N / ni )
            //    where N is the total number of phrases and ni is the number of phrases in which the term i appears
            // NOTE(review): a term listed in termsTable that occurs in no phrase has ni = 0, so
            // N / ni is infinite (NaN when N is also 0) for every weight except Complete.
            // Confirm callers never rely on such an entry.
            for (var posTerm = 0; posTerm < termsTable.Count; posTerm++)
            {
                // Held as double so the divisions below are floating-point divisions.
                double phrasesWithTerm = observedIDFVector[posTerm];

                switch (myTDMParameters.TheTFIDFWeight)
                {
                case TFIDFWeight.Simple:
                case TFIDFWeight.BM25:
                    vectorIDF[posTerm] = Math.Log(phrasesList.Count / phrasesWithTerm);
                    break;

                case TFIDFWeight.Complete:
                    vectorIDF[posTerm] = Math.Log(phrasesList.Count / (phrasesWithTerm + 1));
                    break;

                case TFIDFWeight.Best:
                    vectorIDF[posTerm] = Math.Log10(phrasesList.Count / phrasesWithTerm);
                    break;

                default:
                    throw new ArgumentOutOfRangeException(nameof(myTDMParameters),
                                                          myTDMParameters.TheTFIDFWeight,
                                                          "No IDF formula is defined for this TF-IDF weight.");
                }
            }

            // 4 Weight Matrix (W) - [TF-IDF] for each term and phrase ... IDF value is stored in a separate vector
            for (var posPhrase = 0; posPhrase < phrasesList.Count; posPhrase++)
            {
                var lengthThisPhrase = phrasesList[posPhrase].ProcessedLength;
                foreach (var term in phrasesList[posPhrase].UniqueTermsInProcessedText)
                {
                    // The indexer is used on purpose: it fails loudly if a term that is
                    // not in termsTable is still present in the phrase at this point.
                    var posTerm = termsTable[term.Key];
                    matrixTFIDF[posPhrase][posTerm] = MultiplyTFbyIDFUsingWeights(matrixTFIDF[posPhrase][posTerm],
                                                                                  vectorIDF[posTerm], myTDMParameters.TheTFIDFWeight, phrasesList[posPhrase].MaximumFrequency,
                                                                                  lengthThisPhrase, averageLengthOfPhrases);
                }
            }

            // 5. Calculate the centroid or query vector that represents the document
            documentTFIDFVector = null;
            switch (myTDMParameters.TheDocumentRepresentation)
            {
            case DocumentRepresentation.Centroid:
                documentTFIDFVector = CalculateCentroidOfDocument(matrixTFIDF);
                break;

            case DocumentRepresentation.Vector:
                documentTFIDFVector = CalculateDocumentQueryVector(termFrequencyInCollection, myTDMParameters, averageLengthOfPhrases, vectorIDF);
                break;
            }
        }