/**
 * Input is a map of artifact.Id and the processed text of each artifact.
 */
public static TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
{
    // Variables
    TLTermEntry termEntry;
    TLPosting posting;
    double vectorLength;

    // Stores the vector length of each document - this is used to normalize the term weights.
    // The vector length is the square root of the sum of the squares of all the term weights.
    Dictionary<string, double> documentVectorLength = new Dictionary<string, double>();

    // Creates the dictionary
    TLDictionaryIndex dict = new TLDictionaryIndex();

    // Iterates over all the documents
    foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
    {
        string[] terms = artifact.Text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
        if (terms.Length == 0)
        {
            logger.Warn(String.Format("Artifact Id {0} is empty.", artifact.Id));
        }

        // Iterates over all the terms
        foreach (string t in terms)
        {
            // Checks if the dictionary already has an entry for that term
            if (!dict.ContainsTermEntry(t))
            {
                // New term
                termEntry = dict.AddTermEntry(t, 1, 1, 1.0);
                posting = termEntry.AddPosting(artifact.Id, 1, 1.0);
            }
            else
            {
                // Existing term
                termEntry = dict.GetTermEntry(t);
                termEntry.TotalFrequencyAcrossArtifacts += 1;
                termEntry.Weight = 1.0;

                // Checks if there is already a posting for this document
                if (!termEntry.ContainsPosting(artifact.Id))
                {
                    // New posting
                    termEntry.NumberOfArtifactsContainingTerm += 1;
                    posting = termEntry.AddPosting(artifact.Id, 1, 1.0);
                }
                else
                {
                    // Existing posting
                    posting = termEntry.GetPosting(artifact.Id);
                    posting.Frequency += 1;
                    posting.Weight += 1.0;
                }
            }
        }
    }

    string artId;
    // Now that all the counts are in, it calculates the document vector weights
    foreach (TLTermEntry t in dict.TermEntries)
    {
        foreach (TLPosting p in t.Postings)
        {
            artId = p.ArtifactId;
            vectorLength = Math.Pow(p.Frequency, 2);
            if (documentVectorLength.ContainsKey(artId))
            {
                // The document has other terms; accumulate their squared weights
                vectorLength += documentVectorLength[artId];
            }
            documentVectorLength[artId] = vectorLength;
        }
    }

    // Finally, take the square root of all entries in the document vector length
    foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
    {
        if (documentVectorLength.ContainsKey(artifact.Id))
        {
            vectorLength = Math.Sqrt(documentVectorLength[artifact.Id]);
            // Here we update the document vector length on the dictionary itself - not the internal structure anymore
            dict.SetDocumentVectorWeight(artifact.Id, vectorLength);
        }
    }

    return dict;
}
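The method above builds a raw term-frequency index: one posting per (term, document) pair, plus a precomputed vector length per document (the square root of the sum of squared frequencies), which later stages use for cosine normalization. Below is a minimal, self-contained sketch of that same counting logic using plain .NET collections; the class and method names (FrequencyIndexSketch, BuildFrequencies, VectorLengths) are hypothetical and are not part of the TraceLab API.

// Hypothetical standalone sketch - not the TraceLab types.
using System;
using System.Collections.Generic;

static class FrequencyIndexSketch
{
    // Maps term -> (documentId -> raw frequency).
    public static Dictionary<string, Dictionary<string, int>> BuildFrequencies(
        IDictionary<string, string> processedDocuments)
    {
        var index = new Dictionary<string, Dictionary<string, int>>();
        foreach (var doc in processedDocuments)
        {
            string[] terms = doc.Value.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string term in terms)
            {
                if (!index.TryGetValue(term, out var postings))
                {
                    postings = new Dictionary<string, int>();
                    index[term] = postings;
                }
                postings.TryGetValue(doc.Key, out int freq);
                postings[doc.Key] = freq + 1;
            }
        }
        return index;
    }

    // Vector length of each document: sqrt of the sum of squared term frequencies.
    public static Dictionary<string, double> VectorLengths(
        Dictionary<string, Dictionary<string, int>> index)
    {
        var lengths = new Dictionary<string, double>();
        foreach (var postings in index.Values)
        {
            foreach (var posting in postings)
            {
                lengths.TryGetValue(posting.Key, out double sum);
                lengths[posting.Key] = sum + Math.Pow(posting.Value, 2);
            }
        }
        // Take the square root once all squared frequencies are accumulated.
        foreach (string docId in new List<string>(lengths.Keys))
        {
            lengths[docId] = Math.Sqrt(lengths[docId]);
        }
        return lengths;
    }
}

The two-pass structure mirrors the builder above: counts first, then vector lengths, since the length of a document's vector is only known after every term in it has been counted.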
/**
 * Input is a map of artifact.Id and the processed text of each artifact.
 */
public static new TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
{
    // Variables
    int N = setOfProcessedDocuments.Count; // Total number of documents
    double idf;
    string docId;
    double vectorLength;

    // Stores the vector length of each document - this is used to normalize the term weights.
    // The vector length is the square root of the sum of the squares of all the term weights.
    Dictionary<string, double> documentVectorLength = new Dictionary<string, double>();

    // It starts off by calling the parent method, which calculates the basic frequencies
    TLDictionaryIndex dict = BasicIndexBuilder.build(setOfProcessedDocuments, logger);

    // Iterates over all the terms
    foreach (TLTermEntry term in dict.TermEntries)
    {
        // Calculates the idf for each term - and stores it in the weight of the term, for weighting queries later
        idf = Math.Log10(N / ((double)term.NumberOfArtifactsContainingTerm));
        term.Weight = idf;

        // Iterates over all the postings
        foreach (TLPosting posting in term.Postings)
        {
            // Multiplies each term frequency by the idf
            double newWeight = posting.Frequency * idf;
            posting.Weight = newWeight;

            // Updates the document vector length
            docId = posting.ArtifactId;
            vectorLength = Math.Pow(newWeight, 2);
            if (documentVectorLength.ContainsKey(docId))
            {
                // The document has other terms; accumulate their squared weights
                vectorLength += documentVectorLength[docId];
            }
            documentVectorLength[docId] = vectorLength;
        }
    }

    // Now, take the square root of all entries in the document vector length
    foreach (TLArtifact d in setOfProcessedDocuments.Values)
    {
        docId = d.Id;
        if (documentVectorLength.ContainsKey(docId))
        {
            vectorLength = Math.Sqrt(documentVectorLength[docId]);
            documentVectorLength[docId] = vectorLength;
            // Here we update the document vector length on the dictionary itself - not the internal structure anymore
            dict.SetDocumentVectorWeight(docId, vectorLength);
        }
    }

    // Lastly, normalize all the term weights (cosine normalization)
    foreach (TLTermEntry term in dict.TermEntries)
    {
        foreach (TLPosting posting in term.Postings)
        {
            vectorLength = documentVectorLength[posting.ArtifactId];
            // Guard against a zero-length vector, which occurs when every term
            // in a document appears in all N documents (idf = 0 for all of them);
            // dividing by zero here would produce NaN weights.
            if (vectorLength > 0)
            {
                posting.Weight = posting.Weight / vectorLength;
            }
        }
    }

    return dict;
}
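This variant layers tf-idf on top of the basic frequency index: each term's idf is log10(N / df), each posting's weight becomes tf * idf, and the weights are then cosine-normalized by the document's vector length. The sketch below works that math on a toy corpus using plain .NET collections; the names (TfIdfSketch, the "req1"..."req3" documents) are hypothetical and independent of the TraceLab types.

// Hypothetical standalone sketch - not the TraceLab types.
using System;
using System.Collections.Generic;

static class TfIdfSketch
{
    public static void Main()
    {
        var docs = new Dictionary<string, string>
        {
            { "req1", "user login password" },
            { "req2", "user profile page" },
            { "req3", "password reset email" }
        };
        int N = docs.Count;

        // Pass 1: term -> (docId -> raw frequency).
        var tf = new Dictionary<string, Dictionary<string, int>>();
        foreach (var doc in docs)
        {
            foreach (string term in doc.Value.Split(' '))
            {
                if (!tf.TryGetValue(term, out var postings))
                    tf[term] = postings = new Dictionary<string, int>();
                postings.TryGetValue(doc.Key, out int f);
                postings[doc.Key] = f + 1;
            }
        }

        // Pass 2: weight = tf * log10(N / df); accumulate squared weights per document.
        var weights = new Dictionary<string, Dictionary<string, double>>();
        var squaredLengths = new Dictionary<string, double>();
        foreach (var entry in tf)
        {
            double idf = Math.Log10(N / (double)entry.Value.Count);
            weights[entry.Key] = new Dictionary<string, double>();
            foreach (var posting in entry.Value)
            {
                double w = posting.Value * idf;
                weights[entry.Key][posting.Key] = w;
                squaredLengths.TryGetValue(posting.Key, out double sum);
                squaredLengths[posting.Key] = sum + w * w;
            }
        }

        // Pass 3: cosine-normalize each posting by its document's vector length.
        foreach (var term in weights.Keys)
        {
            foreach (var docId in new List<string>(weights[term].Keys))
            {
                double len = Math.Sqrt(squaredLengths[docId]);
                if (len > 0) weights[term][docId] /= len;
            }
        }

        // "user" appears in 2 of 3 docs, so its idf is log10(3/2), roughly 0.176.
        Console.WriteLine(weights["user"]["req1"]);
    }
}

The three passes correspond directly to the three loops of the builder above; keeping the squared lengths in a separate dictionary is what lets the normalization pass run in a single sweep over the postings.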