Example #1
        /**
         * Builds a dictionary index from a collection that maps each
         * artifact Id to the processed text of that artifact.
         */
        public static TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
        {
            // Variables
            TLTermEntry termEntry;
            TLPosting   posting;

            double vectorLength;

            // Stores the vector length of each document - this is used to normalize the term weights.
            // The vector length is the square root of the sum of the squares of all the term weights.
            Dictionary <string, double> documentVectorLength = new Dictionary <string, double>();
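            // For example, a document whose term weights are { 2, 1, 2 } has
            // vector length sqrt(2^2 + 1^2 + 2^2) = 3.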

            // Creates the dictionary
            TLDictionaryIndex dict = new TLDictionaryIndex();

            // Iterates over all the documents
            foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
            {
                string[] terms = artifact.Text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                if (terms.Length == 0)
                {
                    logger.Warn(String.Format("Artifact Id {0} is empty.", artifact.Id));
                }

                // Iterates over all the terms
                foreach (string t in terms)
                {
                    // Checks whether the term already has an entry in the dictionary
                    if (!dict.ContainsTermEntry(t))
                    {
                        // New term
                        termEntry = dict.AddTermEntry(t, 1, 1, 1.0);
                        posting   = termEntry.AddPosting(artifact.Id, 1, 1.0);
                    }
                    else
                    {
                        // Existing term
                        termEntry = dict.GetTermEntry(t);
                        termEntry.TotalFrequencyAcrossArtifacts += 1;
                        termEntry.Weight = 1.0;

                        // Checks if there is already a posting for this document
                        if (!termEntry.ContainsPosting(artifact.Id))
                        {
                            // New posting
                            termEntry.NumberOfArtifactsContainingTerm += 1;
                            posting = termEntry.AddPosting(artifact.Id, 1, 1.0);
                        }
                        else
                        {
                            // Existing posting
                            posting            = termEntry.GetPosting(artifact.Id);
                            posting.Frequency += 1;
                            posting.Weight    += 1.0;
                        }
                    }
                }
            }

            string artId;

            // Now that all the counts are in, it calculates the document vector weights
            foreach (TLTermEntry t in dict.TermEntries)
            {
                foreach (TLPosting p in t.Postings)
                {
                    artId        = p.ArtifactId;
                    vectorLength = Math.Pow(p.Frequency, 2);
                    if (documentVectorLength.ContainsKey(artId))
                    {
                        // The document has other terms
                        vectorLength += documentVectorLength[artId];
                    }
                    documentVectorLength[artId] = vectorLength;
                }
            }

            // Finally, we need to get the square root of all entries in the document vector length
            foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
            {
                if (documentVectorLength.ContainsKey(artifact.Id))
                {
                    vectorLength = Math.Sqrt(documentVectorLength[artifact.Id]);
                    // Stores the document vector length in the dictionary itself - the local map is no longer needed
                    dict.SetDocumentVectorWeight(artifact.Id, vectorLength);
                }
            }

            return dict;
        }
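
A minimal usage sketch for the build step above. It assumes TLArtifactsCollection exposes an Add(TLArtifact) method keyed by artifact Id, that TLArtifact has an (id, text) constructor, and that the hosting component supplies the ComponentLogger; the artifact Ids and text are made up for illustration.

        public static TLDictionaryIndex BuildExample(ComponentLogger logger)
        {
            // Two hypothetical artifacts with already-processed (tokenized, stemmed) text
            TLArtifactsCollection artifacts = new TLArtifactsCollection();
            artifacts.Add(new TLArtifact("REQ-1", "user login password"));
            artifacts.Add(new TLArtifact("REQ-2", "password reset email"));

            // Every term weight starts at 1.0, so this produces the basic
            // (raw frequency) dictionary; tf-idf weights would be set later.
            return build(artifacts, logger);
        }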
Example #2
        public List <Result> search(String query, TLDictionaryIndex dict)
        {
            // Variables
            List <Result> results;
            Dictionary <string, double> intermediateResults;
            Dictionary <string, int>    queryTermFrequency;
            Dictionary <string, double> queryTermWeight;
            double queryVectorLength;

            // Initializes the data structures
            results             = new List <Result>();                  // Result array
            intermediateResults = new Dictionary <string, double>();    // Where the intermediate results of the query are kept.
            queryTermFrequency  = new Dictionary <string, int>();       // Keeps track of term frequencies
            queryTermWeight     = new Dictionary <string, double>();    // Keeps track of term weights
            queryVectorLength   = 0.0;                                  // The document vector length of the query

            // The query is broken down into tokens (empty tokens are dropped,
            // matching how documents are split in the build step)
            string[] queryTerms = query.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // Iterates over each query term to compute the term frequency
            foreach (string qterm in queryTerms)
            {
                // It only cares about those words that are in the dictionary
                if (dict.ContainsTermEntry(qterm))
                {
                    if (!queryTermFrequency.ContainsKey(qterm))
                    {
                        // First time the query word is encountered
                        queryTermFrequency.Add(qterm, 1);
                    }
                    else
                    {
                        // The query word is already there, so the frequency gets increased
                        queryTermFrequency[qterm] += 1;
                    }
                }
            }
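            // At this point queryTermFrequency holds the raw counts; e.g. the
            // (hypothetical) query "user login user" yields { "user": 2, "login": 1 },
            // provided both terms exist in the dictionary.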

            // Iterates over the resulting query terms to compute their weights and the dot product of the query terms x the documents terms
            foreach (string qterm in queryTermFrequency.Keys)
            {
                // Gets the Term from the dictionary
                TLTermEntry term = dict.GetTermEntry(qterm);

                // It computes the weight of a term, i.e. the query frequency TIMES the term's specificity.
                // Note: the specificity of the term is stored in the weight.
                //      For the basic dictionary this is just 1
                //      For the tf-idf dictionary this is the idf
                //      For the signal-noise dictionary this is the signal
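                // Hypothetical example: a term that appears twice in the query
                // with an idf weight of 0.7 gets a query weight of 2 * 0.7 = 1.4.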
                double weight = queryTermFrequency[qterm] * term.Weight;
                queryTermWeight.Add(qterm, weight);

                // Updates the document vector length of the query
                queryVectorLength += Math.Pow(weight, 2);

                // It now iterates over all the postings that have this term
                foreach (TLPosting posting in term.Postings)
                {
                    string docId = posting.ArtifactId;

                    // Calculates the product of the query times the posting for this particular term
                    double r = queryTermWeight[qterm] * posting.Weight;
                    if (intermediateResults.ContainsKey(docId))
                    {
                        intermediateResults[docId] += r;
                    }
                    else
                    {
                        intermediateResults.Add(docId, r);
                    }
                }
            }

            // The document vector length for the query is the square root of the sum of the squares of the term weights
            queryVectorLength = Math.Sqrt(queryVectorLength);


            // It iterates over the intermediate results to create the final array that is returned to the user
            foreach (string docId in intermediateResults.Keys)
            {
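                // For a cosine metric this would evaluate to
                // dotProduct / (queryVectorLength * documentVectorLength);
                // the exact formula depends on the configured _similarityMetric.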
                double similarity = _similarityMetric.ComputeSimilarity(intermediateResults[docId], queryVectorLength, dict.GetDocumentVectorWeight(docId));
                Result r          = new Result(docId, similarity);
                results.Add(r);
            }

            // Sorts the results
            results.Sort();
            return results;
        }
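
A minimal sketch tying the two examples together. Searcher and CosineSimilarityMetric are hypothetical stand-ins for whatever class hosts the search method and its _similarityMetric field, and the Result member names in the loop are also assumptions; only build and search come from the examples above.

        TLDictionaryIndex dict     = build(artifacts, logger);                   // the index from Example #1
        Searcher          searcher = new Searcher(new CosineSimilarityMetric()); // hypothetical host of _similarityMetric

        foreach (Result r in searcher.search("user login", dict))
        {
            // ArtifactId and Similarity are assumed member names on Result
            Console.WriteLine("{0}: {1:F3}", r.ArtifactId, r.Similarity);
        }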