Exemplo n.º 1
0
        /// <summary>
        /// Scores every indexed artifact that shares at least one term with the query and
        /// returns the scored artifacts in sorted order.
        /// </summary>
        /// <remarks>
        /// The weight of a query term is its number of occurrences in the query multiplied by
        /// the weight found in <paramref name="ancTermsWeights"/> (1.0 when the term has no entry).
        /// The weight stored on the dictionary term entry is NOT used for the query side.
        /// For each artifact the dot product (query term weight x posting weight) is accumulated and
        /// then handed to the similarity metric together with the query vector length and the
        /// artifact vector weight stored in the index.
        /// </remarks>
        /// <param name="query">Query text; terms are separated by spaces. Terms unknown to <paramref name="dict"/> are ignored.</param>
        /// <param name="dict">Index providing term entries, their postings and the per-artifact vector weights.</param>
        /// <param name="ancTermsWeights">Per-term weights for the query terms. May be null, in which case every term weighs 1.0.</param>
        /// <returns>One <c>Result</c> per matching artifact, ordered by <c>List.Sort()</c> (i.e. by the default comparison of <c>Result</c>).</returns>
        public List<Result> search(String query, TLDictionaryIndex dict, Dictionary<string, double> ancTermsWeights)
        {
            List<Result> results = new List<Result>();
            // Dot product of query weights x posting weights, keyed by artifact id.
            Dictionary<string, double> intermediateResults = new Dictionary<string, double>();
            // Occurrences of each known term inside the query.
            Dictionary<string, int> queryTermFrequency = new Dictionary<string, int>();
            // Sum of squared query term weights; its square root is taken once all terms are processed.
            double queryVectorLength = 0.0;

            // Empty tokens (produced by consecutive spaces) are dropped, the same way build() tokenizes artifacts.
            string[] queryTerms = query.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // Counts how often each indexed term occurs in the query.
            foreach (string qterm in queryTerms)
            {
                if (!dict.ContainsTermEntry(qterm))
                {
                    // Terms that are not in the dictionary cannot match any artifact.
                    continue;
                }

                int frequency;
                if (queryTermFrequency.TryGetValue(qterm, out frequency))
                {
                    queryTermFrequency[qterm] = frequency + 1;
                }
                else
                {
                    queryTermFrequency.Add(qterm, 1);
                }
            }

            // Computes the weight of every query term and accumulates the dot products.
            foreach (KeyValuePair<string, int> queryTerm in queryTermFrequency)
            {
                string qterm = queryTerm.Key;
                TLTermEntry term = dict.GetTermEntry(qterm);

                // A missing weight table or a missing entry both fall back to the neutral weight 1.0.
                double ancWeight;
                if (ancTermsWeights == null || !ancTermsWeights.TryGetValue(qterm, out ancWeight))
                {
                    ancWeight = 1.0;
                }

                double weight = queryTerm.Value * ancWeight;
                queryVectorLength += Math.Pow(weight, 2);

                // Every posting of the term contributes to the score of its artifact.
                foreach (TLPosting posting in term.Postings)
                {
                    string docId = posting.ArtifactId;
                    double contribution = weight * posting.Weight;

                    double accumulated;
                    if (intermediateResults.TryGetValue(docId, out accumulated))
                    {
                        intermediateResults[docId] = accumulated + contribution;
                    }
                    else
                    {
                        intermediateResults.Add(docId, contribution);
                    }
                }
            }

            // The query vector length is the square root of the sum of the squared term weights.
            queryVectorLength = Math.Sqrt(queryVectorLength);

            // Turns the accumulated dot products into similarity scores.
            foreach (KeyValuePair<string, double> entry in intermediateResults)
            {
                double similarity = _similarityMetric.ComputeSimilarity(entry.Value, queryVectorLength, dict.GetDocumentVectorWeight(entry.Key));
                results.Add(new Result(entry.Key, similarity));
            }

            results.Sort();
            return results;
        }
		/// <summary>
		/// Builds a dictionary index out of a collection of already processed artifacts.
		/// </summary>
		/// <remarks>
		/// The text of every artifact is split on spaces (empty tokens are discarded). Each token
		/// occurrence increases the total frequency of its term entry and the frequency and weight
		/// of the posting for the current artifact. Once all counts are in, the vector weight of each
		/// artifact is set to the square root of the sum of its squared posting frequencies.
		/// Artifacts without any term are reported as a warning and get no vector weight.
		/// </remarks>
		/// <param name="setOfProcessedDocuments">Collection whose values are the artifacts (id and processed text) to index.</param>
		/// <param name="logger">Receives a warning for every artifact whose text yields no terms.</param>
		/// <returns>The populated dictionary index.</returns>
		public static TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
		{
			TLDictionaryIndex index = new TLDictionaryIndex();
			char[] separators = new char[] { ' ' };

			// Pass 1: register every token occurrence of every artifact.
			foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
			{
				string[] tokens = artifact.Text.Split(separators, StringSplitOptions.RemoveEmptyEntries);

				if (tokens.Length == 0)
				{
					logger.Warn(String.Format("Artifact Id {0} is empty.", artifact.Id));
				}

				foreach (string token in tokens)
				{
					if (index.ContainsTermEntry(token))
					{
						// Known term: one more occurrence across the whole collection.
						TLTermEntry knownTerm = index.GetTermEntry(token);
						knownTerm.TotalFrequencyAcrossArtifacts += 1;
						knownTerm.Weight = 1.0;

						if (knownTerm.ContainsPosting(artifact.Id))
						{
							// The artifact already contains the term: bump its posting.
							TLPosting knownPosting = knownTerm.GetPosting(artifact.Id);
							knownPosting.Frequency += 1;
							knownPosting.Weight += 1.0;
						}
						else
						{
							// First occurrence of the term inside this artifact.
							knownTerm.NumberOfArtifactsContainingTerm += 1;
							knownTerm.AddPosting(artifact.Id, 1, 1.0);
						}
					}
					else
					{
						// Brand new term: create its entry together with its first posting.
						TLTermEntry newTerm = index.AddTermEntry(token, 1, 1, 1.0);
						newTerm.AddPosting(artifact.Id, 1, 1.0);
					}
				}
			}

			// Pass 2: per artifact, add up the squared frequencies of all its postings.
			Dictionary<string, double> squaredLengths = new Dictionary<string, double>();
			foreach (TLTermEntry entry in index.TermEntries)
			{
				foreach (TLPosting p in entry.Postings)
				{
					string artifactId = p.ArtifactId;
					double sum = Math.Pow(p.Frequency, 2);

					double previous;
					if (squaredLengths.TryGetValue(artifactId, out previous))
					{
						// The artifact has other terms that were already accounted for.
						sum += previous;
					}

					squaredLengths[artifactId] = sum;
				}
			}

			// Pass 3: the vector weight stored in the index is the square root of that sum.
			foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
			{
				double squared;
				if (squaredLengths.TryGetValue(artifact.Id, out squared))
				{
					index.SetDocumentVectorWeight(artifact.Id, Math.Sqrt(squared));
				}
			}

			return index;
		}