Example #1
0
        /// <summary>
        /// Builds a term-frequency dictionary index from a collection of artifacts.
        /// Input is a map of artifact.Id to the processed (space-tokenized) text of each artifact.
        /// Accumulates, per term, the total frequency across artifacts and the number of
        /// artifacts containing the term, plus one posting (frequency/weight) per document.
        /// Each document's vector length - the square root of the sum of squared term
        /// frequencies - is stored on the returned dictionary via SetDocumentVectorWeight.
        /// </summary>
        /// <param name="setOfProcessedDocuments">Collection of artifacts; each supplies an Id and processed Text.</param>
        /// <param name="logger">Logger used to warn when an artifact has no terms.</param>
        /// <returns>The populated <c>TLDictionaryIndex</c>.</returns>
        public static TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
        {
            // Stores the vector length of each document - this is used to normalize the term weights.
            // The vector length is the square root of the sum of the squares of all the term weights.
            Dictionary<string, double> documentVectorLength = new Dictionary<string, double>();

            // Creates the dictionary
            TLDictionaryIndex dict = new TLDictionaryIndex();

            // First pass: accumulate term and posting frequencies per document.
            foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
            {
                string[] terms = artifact.Text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                if (terms.Length == 0)
                {
                    logger.Warn(String.Format("Artifact Id {0} is empty.", artifact.Id));
                }

                foreach (string t in terms)
                {
                    if (!dict.ContainsTermEntry(t))
                    {
                        // First occurrence of this term anywhere: create the entry and its posting.
                        TLTermEntry termEntry = dict.AddTermEntry(t, 1, 1, 1.0);
                        termEntry.AddPosting(artifact.Id, 1, 1.0);
                    }
                    else
                    {
                        TLTermEntry termEntry = dict.GetTermEntry(t);
                        termEntry.TotalFrequencyAcrossArtifacts += 1;
                        termEntry.Weight = 1.0;

                        if (!termEntry.ContainsPosting(artifact.Id))
                        {
                            // Term seen before, but first time in this document.
                            termEntry.NumberOfArtifactsContainingTerm += 1;
                            termEntry.AddPosting(artifact.Id, 1, 1.0);
                        }
                        else
                        {
                            // Term already posted for this document: bump its counters.
                            TLPosting posting = termEntry.GetPosting(artifact.Id);
                            posting.Frequency += 1;
                            posting.Weight    += 1.0;
                        }
                    }
                }
            }

            // Second pass: now that all the counts are in, sum the squared frequencies per document.
            foreach (TLTermEntry t in dict.TermEntries)
            {
                foreach (TLPosting p in t.Postings)
                {
                    string artId = p.ArtifactId;
                    // Square via multiplication (no need for Math.Pow), and use TryGetValue
                    // to avoid the ContainsKey + indexer double lookup.
                    double squared = (double)p.Frequency * p.Frequency;
                    documentVectorLength.TryGetValue(artId, out double sumOfSquares);
                    documentVectorLength[artId] = sumOfSquares + squared;
                }
            }

            // Finally, take the square root of each accumulated sum and update the
            // document vector length on the dictionary - not the internal structure any more.
            foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
            {
                if (documentVectorLength.TryGetValue(artifact.Id, out double sumOfSquares))
                {
                    dict.SetDocumentVectorWeight(artifact.Id, Math.Sqrt(sumOfSquares));
                }
            }

            return dict;
        }
Example #2
0
        /// <summary>
        /// Builds a TF-IDF weighted dictionary index from a collection of artifacts.
        /// Input is a map of artifact.Id to the processed text of each artifact.
        /// Delegates to <c>BasicIndexBuilder.build</c> for raw frequencies, then stores
        /// idf on each term's Weight, sets each posting's weight to tf * idf, and finally
        /// normalizes posting weights by the document vector length (which is also pushed
        /// into the dictionary via SetDocumentVectorWeight).
        /// </summary>
        /// <param name="setOfProcessedDocuments">Collection of artifacts; each supplies an Id and processed Text.</param>
        /// <param name="logger">Logger forwarded to the basic index builder.</param>
        /// <returns>The populated, TF-IDF weighted <c>TLDictionaryIndex</c>.</returns>
        public static new TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
        {
            int N = setOfProcessedDocuments.Count;          // Total number of documents

            // Stores the vector length of each document - this is used to normalize the term weights.
            // The vector length is the square root of the sum of the squares of all the term weights.
            Dictionary<string, double> documentVectorLength = new Dictionary<string, double>();

            // It starts off by calling the parent method, which will calculate the basic frequencies
            TLDictionaryIndex dict = BasicIndexBuilder.build(setOfProcessedDocuments, logger);

            // Iterates over all the terms
            foreach (TLTermEntry term in dict.TermEntries)
            {
                // Calculates the idf for each term - and stores this in the weight of the term,
                // for weighing queries later.
                double idf = Math.Log10(N / ((double)term.NumberOfArtifactsContainingTerm));
                term.Weight = idf;

                // Iterates over all the postings
                foreach (TLPosting posting in term.Postings)
                {
                    // Multiplies each term frequency by the idf
                    double newWeight = posting.Frequency * idf;
                    posting.Weight = newWeight;

                    // Accumulates the squared weight into the document vector length.
                    // TryGetValue avoids the ContainsKey + indexer double lookup.
                    string docId = posting.ArtifactId;
                    documentVectorLength.TryGetValue(docId, out double sumOfSquares);
                    documentVectorLength[docId] = sumOfSquares + newWeight * newWeight;
                }
            }

            // Now, we need to get the square root of all entries in the document vector length
            foreach (TLArtifact d in setOfProcessedDocuments.Values)
            {
                if (documentVectorLength.TryGetValue(d.Id, out double sumOfSquares))
                {
                    double vectorLength = Math.Sqrt(sumOfSquares);
                    documentVectorLength[d.Id] = vectorLength;
                    // Here we update the document vector length of the dictionary - not the internal structure anymore
                    dict.SetDocumentVectorWeight(d.Id, vectorLength);
                }
            }

            // Lastly, we normalize all the term weights.
            // Guard against a zero vector length: when every term of a document occurs in
            // all N documents, idf == 0, so all its posting weights are 0 and the original
            // 0/0 division produced NaN. Skipping the division leaves the weights at 0.
            foreach (TLTermEntry term in dict.TermEntries)
            {
                foreach (TLPosting posting in term.Postings)
                {
                    double vectorLength = documentVectorLength[posting.ArtifactId];
                    if (vectorLength > 0.0)
                    {
                        posting.Weight = posting.Weight / vectorLength;
                    }
                }
            }

            return dict;
        }