/// <summary>
/// Builds a weighted vector for the document as a whole from per-term frequencies.
/// </summary>
/// <param name="termFrequencyInDocument">Frequency of each term, indexed by term position.</param>
/// <param name="myTDMParameters">Parameters; only <c>TheTFIDFWeight</c> is read here.</param>
/// <param name="averageLengthOfPhrases">Forwarded unchanged to <c>MultiplyTFbyIDFUsingWeights</c>.</param>
/// <param name="vectorIDF">IDF value per term, indexed like <paramref name="termFrequencyInDocument"/>.</param>
/// <returns>A new array holding one weight per term position.</returns>
private static double[] CalculateDocumentQueryVector(int[] termFrequencyInDocument, TDMParameters myTDMParameters, double averageLengthOfPhrases, double[] vectorIDF)
{
    // One pass over the frequencies: their sum is used as the document length,
    // the largest one as the maximum frequency.
    var lengthOfDocument = 0;
    var peakFrequency = 0.0;
    foreach (var frequency in termFrequencyInDocument)
    {
        lengthOfDocument += frequency;
        peakFrequency = frequency > peakFrequency ? frequency : peakFrequency;
    }

    var weights = new double[termFrequencyInDocument.Length];
    for (var position = 0; position < weights.Length; position++)
    {
        weights[position] = MultiplyTFbyIDFUsingWeights(
            termFrequencyInDocument[position],
            vectorIDF[position],
            myTDMParameters.TheTFIDFWeight,
            peakFrequency,
            lengthOfDocument,
            averageLengthOfPhrases);
    }

    return weights;
}
/// <summary>
/// Computes the weighted vector of the document title over the term dictionary.
/// </summary>
/// <param name="processedTitle">Title text whose tokens are separated by single spaces.</param>
/// <param name="termsTable">Maps each known term to its position in the vectors.</param>
/// <param name="myTDMParameters">Parameters; only <c>TheTFIDFWeight</c> is read here.</param>
/// <param name="averageLengthOfPhrases">Forwarded unchanged to <c>MultiplyTFbyIDFUsingWeights</c>.</param>
/// <param name="vectorIDF">IDF value per term position.</param>
/// <returns>
/// A new array with one entry per term in <paramref name="termsTable"/>. Positions of terms that
/// do not occur in the title stay at zero; a null or empty title yields an all-zero vector.
/// </returns>
private static double[] CalculateTitleWeights(string processedTitle, Dictionary<string, int> termsTable, TDMParameters myTDMParameters, double averageLengthOfPhrases, double[] vectorIDF)
{
    var titleTFVector = new double[termsTable.Count];

    // A null title is treated like an empty one instead of failing on .Length.
    if (string.IsNullOrEmpty(processedTitle))
    {
        return titleTFVector;
    }

    var terms = processedTitle.Split(' ');
    var maximumFrequency = 0.0;

    // Count occurrences of every title token that is present in the dictionary.
    foreach (var term in terms)
    {
        // Single lookup instead of ContainsKey followed by the indexer.
        if (!termsTable.TryGetValue(term, out var posTerm))
        {
            continue;
        }

        titleTFVector[posTerm] += 1;
        if (titleTFVector[posTerm] > maximumFrequency)
        {
            maximumFrequency = titleTFVector[posTerm];
        }
    }

    // The title length is the number of tokens produced by the split,
    // including tokens that are not in the dictionary.
    var titleLength = terms.Length;
    var titleTFIDFVector = new double[termsTable.Count];

    // Only positions that were counted above are weighted; a non-zero count marks
    // exactly the distinct title terms, so no separate "already seen" list is needed.
    for (var position = 0; position < titleTFVector.Length; position++)
    {
        if (titleTFVector[position] > 0)
        {
            titleTFIDFVector[position] = MultiplyTFbyIDFUsingWeights(
                titleTFVector[position],
                vectorIDF[position],
                myTDMParameters.TheTFIDFWeight,
                maximumFrequency,
                titleLength,
                averageLengthOfPhrases);
        }
    }

    return titleTFIDFVector;
}
/// <summary>
/// Builds the term-document structures for a document, or restores them from a cache file
/// when one already exists.
/// </summary>
/// <param name="documentFile">Document handed to the in-memory Lucene indexer.</param>
/// <param name="myTDMParameters">Weighting and threshold parameters; stored in <c>MyTDMParameters</c>.</param>
/// <param name="cacheFileName">
/// Cache file name without extension. ".d2v" is appended for the Doc2Vec weight and ".tdm" otherwise.
/// </param>
public TDM(string documentFile, TDMParameters myTDMParameters, string cacheFileName)
{
    string extension;
    if (myTDMParameters.TheTFIDFWeight == TFIDFWeight.Doc2Vec)
    {
        extension = ".d2v";
    }
    else
    {
        extension = ".tdm";
    }

    MyTDMParameters = myTDMParameters;

    // The cache is only consulted when a cache name was supplied.
    var cachePath = cacheFileName + extension;
    if (cacheFileName.Length != 0 && File.Exists(cachePath))
    {
        ReadFromFile(cachePath);
        return;
    }

    // Sentences are indexed in RAM using Lucene.
    var indexer = new LuceneIndexer();
    indexer.InMemory(documentFile);
    PhrasesList = indexer.PhraseList;
    var processedTitle = indexer.DocumentTitle;

    // NOTE(review): both SaveToFile calls below are not guarded by the empty-name check used
    // for the read above, so an empty cacheFileName saves to a path that is just the
    // extension. Confirm this is intended.
    if (MyTDMParameters.TheTFIDFWeight == TFIDFWeight.Doc2Vec)
    {
        CalculateDoc2Vec(PhrasesList, indexer.DocumentTitle, cacheFileName,
            out TFIDFMatrix, out _documentTFIDFVector, out _titleTFIDFVector);
        SaveToFile(cachePath);
        return;
    }

    // Fill the direct (term -> position) and inverted (position -> term) dictionaries
    // from the indexer's term list, skipping the single-space term.
    var nextPosition = 0;
    foreach (var term in indexer.TermsList)
    {
        if (term != " ")
        {
            _termsTable.Add(term, nextPosition);
            _invertedTermsTable.Add(nextPosition, term);
            nextPosition++;
        }
    }

    Debug.WriteLine("Original TDM is: " + PhrasesList.Count + " x " + _termsTable.Count);

    CreateTDMMatrix(PhrasesList, _termsTable, myTDMParameters,
        out TFIDFMatrix, out var termFrequencyInCollection, out _,
        out _idfVector, out _documentTFIDFVector, out _averageLengthOfPhrases);

    // If the term dictionary changed because some terms did not reach the frequency
    // threshold, every structure is recalculated.
    var termsWereRemoved = RemoveTermsThatAreNotSignificant(
        termFrequencyInCollection, MyTDMParameters.MinimumFrequencyThresholdOfTermsForPhrase);
    if (termsWereRemoved)
    {
        CreateTDMMatrix(PhrasesList, _termsTable, myTDMParameters,
            out TFIDFMatrix, out termFrequencyInCollection, out _,
            out _idfVector, out _documentTFIDFVector, out _averageLengthOfPhrases);
    }

    // If the phrase list changed because some phrases did not reach the threshold,
    // the term table is updated and every structure is recalculated.
    var phrasesWereRemoved = RemovePhrasesThatAreNotSignificant();
    if (phrasesWereRemoved)
    {
        UpdateTermsTable(PhrasesList);
        CreateTDMMatrix(PhrasesList, _termsTable, myTDMParameters,
            out TFIDFMatrix, out termFrequencyInCollection, out _,
            out _idfVector, out _documentTFIDFVector, out _averageLengthOfPhrases);
    }

    Debug.WriteLine("TDM final de : " + PhrasesList.Count + " x " + _termsTable.Count);

    _titleTFIDFVector = CalculateTitleWeights(
        processedTitle, _termsTable, myTDMParameters, _averageLengthOfPhrases, _idfVector);
    SortSimilaritiesToTitle();
    SaveToFile(cachePath);
}
/// <summary>
/// Builds the weighted phrase-by-term matrix together with its supporting vectors.
/// </summary>
/// <param name="phrasesList">
/// Phrases of the document. Terms of a phrase that are not in <paramref name="termsTable"/>
/// are removed from that phrase through <c>RemoveTerm</c>.
/// </param>
/// <param name="termsTable">Maps each term to its column position.</param>
/// <param name="myTDMParameters">Supplies the weighting scheme and the document representation.</param>
/// <param name="matrixTFIDF">One row per phrase, one column per term, holding the weighted values.</param>
/// <param name="termFrequencyInCollection">Total frequency of each term over all phrases.</param>
/// <param name="observedIDFVector">Number of phrases in which each term appears.</param>
/// <param name="vectorIDF">IDF value of each term.</param>
/// <param name="documentTFIDFVector">
/// Centroid or query vector of the document, or null when the configured representation
/// is neither <c>Centroid</c> nor <c>Vector</c>.
/// </param>
/// <param name="averageLengthOfPhrases">Mean of <c>ProcessedLength</c> over the phrases; 0 when there are none.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// The weighting scheme is not Simple, Complete, BM25 or Best (raised only when there is at least one term).
/// </exception>
private static void CreateTDMMatrix(List<PhraseData> phrasesList, Dictionary<string, int> termsTable, TDMParameters myTDMParameters, out double[][] matrixTFIDF, out int[] termFrequencyInCollection, out int[] observedIDFVector, out double[] vectorIDF, out double[] documentTFIDFVector, out double averageLengthOfPhrases)
{
    var phraseCount = phrasesList.Count;
    var termCount = termsTable.Count;

    matrixTFIDF = new double[phraseCount][];
    termFrequencyInCollection = new int[termCount];
    observedIDFVector = new int[termCount];
    vectorIDF = new double[termCount];

    // Average processed length of the phrases. With no phrases the average is left at 0
    // instead of becoming NaN through 0.0 / 0.
    var totalLength = 0.0;
    foreach (var phrase in phrasesList)
    {
        totalLength += phrase.ProcessedLength;
    }
    averageLengthOfPhrases = phraseCount > 0 ? totalLength / phraseCount : 0.0;

    // 1. Store the observed frequency of each term in each phrase (TF), accumulate the
    //    frequency of each term over the whole collection, and count the phrases in
    //    which each term appears.
    for (var posPhrase = 0; posPhrase < phraseCount; posPhrase++)
    {
        var phrase = phrasesList[posPhrase];
        var row = new double[termCount];
        matrixTFIDF[posPhrase] = row;

        // Removal is deferred so the term collection is not modified while it is enumerated.
        var termsToRemove = new List<string>();
        foreach (var term in phrase.UniqueTermsInProcessedText)
        {
            // Single lookup instead of ContainsKey followed by the indexer.
            if (termsTable.TryGetValue(term.Key, out var posTerm))
            {
                row[posTerm] = term.Value;
                termFrequencyInCollection[posTerm] += term.Value;
                observedIDFVector[posTerm] += 1;
            }
            else
            {
                // The term is not in the collection dictionary, so it is dropped from the phrase.
                termsToRemove.Add(term.Key);
            }
        }

        foreach (var term in termsToRemove)
        {
            phrase.RemoveTerm(term);
        }
    }

    // 2. The IDF vector stores the IDF value of each term in the corpus:
    //      Simple   IDF i = ln    ( N / ni )        (LexRank)
    //      Complete IDF i = ln    ( N / (ni + 1) )
    //      BM25     IDF i = ln    ( N / ni )
    //      Best     IDF i = log10 ( N / ni )
    //    where N is the total number of phrases and ni is the number of phrases in which
    //    term i appears. A term with ni = 0 gets IDF 0: dividing by zero would give an
    //    infinite IDF, which turns into NaN as soon as it is multiplied by a zero frequency.
    for (var posTerm = 0; posTerm < termCount; posTerm++)
    {
        double phrasesWithTerm = observedIDFVector[posTerm];
        switch (myTDMParameters.TheTFIDFWeight)
        {
            case TFIDFWeight.Simple:
            case TFIDFWeight.BM25:
                vectorIDF[posTerm] = phrasesWithTerm > 0 ? Math.Log(phraseCount / phrasesWithTerm) : 0.0;
                break;
            case TFIDFWeight.Complete:
                vectorIDF[posTerm] = Math.Log(phraseCount / (phrasesWithTerm + 1));
                break;
            case TFIDFWeight.Best:
                vectorIDF[posTerm] = phrasesWithTerm > 0 ? Math.Log10(phraseCount / phrasesWithTerm) : 0.0;
                break;
            default:
                throw new ArgumentOutOfRangeException(
                    nameof(myTDMParameters),
                    myTDMParameters.TheTFIDFWeight,
                    "Unsupported TF-IDF weighting scheme.");
        }
    }

    // 3. Weight matrix (W): the raw frequency of every term of every phrase is replaced by
    //    its weighted value. After step 1 every remaining term of a phrase is in termsTable.
    for (var posPhrase = 0; posPhrase < phraseCount; posPhrase++)
    {
        var phrase = phrasesList[posPhrase];
        var row = matrixTFIDF[posPhrase];
        var lengthThisPhrase = phrase.ProcessedLength;
        foreach (var term in phrase.UniqueTermsInProcessedText)
        {
            var posTerm = termsTable[term.Key];
            row[posTerm] = MultiplyTFbyIDFUsingWeights(
                row[posTerm],
                vectorIDF[posTerm],
                myTDMParameters.TheTFIDFWeight,
                phrase.MaximumFrequency,
                lengthThisPhrase,
                averageLengthOfPhrases);
        }
    }

    // 4. Centroid or query vector that represents the document. Any other representation
    //    leaves the vector null.
    documentTFIDFVector = null;
    switch (myTDMParameters.TheDocumentRepresentation)
    {
        case DocumentRepresentation.Centroid:
            documentTFIDFVector = CalculateCentroidOfDocument(matrixTFIDF);
            break;
        case DocumentRepresentation.Vector:
            documentTFIDFVector = CalculateDocumentQueryVector(
                termFrequencyInCollection, myTDMParameters, averageLengthOfPhrases, vectorIDF);
            break;
    }
}