Exemplo n.º 1
0
        public List <Result> search(String query, TLDictionaryIndex dict)
        {
            // Variables
            List <Result> results;
            Dictionary <string, double> intermediateResults;
            Dictionary <string, int>    queryTermFrequency;
            Dictionary <string, double> queryTermWeight;
            double queryVectorLength;

            // Initializes the data structures
            results             = new List <Result>();                  // Result array
            intermediateResults = new Dictionary <string, double>();    // Where the intermediate results of the query are kept.
            queryTermFrequency  = new Dictionary <string, int>();       // Keeps track of term frequencies
            queryTermWeight     = new Dictionary <string, double>();    // Keeps track of term weights
            queryVectorLength   = 0.0;                                  // The document vector length of the query

            // The query is broken down into tokens
            string[] queryTerms = query.Split(new char[] { ' ' });

            // Iterates over each query term to compute the term frequency
            foreach (string qterm in queryTerms)
            {
                // It only cares about those words that are in the dictionary
                if (dict.ContainsTermEntry(qterm))
                {
                    if (!queryTermFrequency.ContainsKey(qterm))
                    {
                        // First time the query word is encountered
                        queryTermFrequency.Add(qterm, 1);
                    }
                    else
                    {
                        // The query word is already there, so the frequency gets increased
                        queryTermFrequency[qterm] += 1;
                    }
                }
            }

            // Iterates over the resulting query terms to compute their weights and the dot product of the query terms x the documents terms
            foreach (string qterm in queryTermFrequency.Keys)
            {
                // Gets the Term from the dictionary
                TLTermEntry term = dict.GetTermEntry(qterm);

                // It computes the weight of a term -  IE the frequency TIMES the term's specificity.
                // Note: the specifity of the term is stored in the weight.
                //      For the basic dictionary this is just 1
                //		For the tf-idf dictionary this is the idf
                //      For the signal-noise this is the signal
                double weight = queryTermFrequency[qterm] * term.Weight;
                queryTermWeight.Add(qterm, weight);

                // Updates the document vector length of the query
                queryVectorLength += Math.Pow(weight, 2);

                // It now iterates over all the postings that have this term
                foreach (TLPosting posting in term.Postings)
                {
                    string docId = posting.ArtifactId;

                    // Calculates the product of the query times the posting for this particular term
                    double r = queryTermWeight[qterm] * posting.Weight;
                    if (intermediateResults.ContainsKey(docId))
                    {
                        intermediateResults[docId] += r;
                    }
                    else
                    {
                        intermediateResults.Add(docId, r);
                    }
                }
            }

            // The document vector lenght for the query is the square root of the sum of the squares of the term weights
            queryVectorLength = Math.Sqrt(queryVectorLength);


            // It iterates over the intermediate results to create the final array that is returned to the user
            foreach (string docId in intermediateResults.Keys)
            {
                // Result r = new ResultObj(docId, intermediateResults.get(docId));
                double similarity = _similarityMetric.ComputeSimilarity(intermediateResults[docId], queryVectorLength, dict.GetDocumentVectorWeight(docId));
                Result r          = new Result(docId, similarity);
                results.Add(r);
            }

            // Sorts the results
            results.Sort();
            return(results);
        }