/**
 * input is a map of artifact.Id and the processed text of each artifact
 */
public static TLDictionaryIndex build(TLArtifactsCollection setOfProcessedDocuments, ComponentLogger logger)
{
    // Variables
    TLTermEntry termEntry;
    TLPosting posting;
    double vectorLength;

    // Stores the vector length of each document - this is used to normalize the term weights.
    // The vector length is the square root of the sum of the squares of all the term weights.
    Dictionary<string, double> documentVectorLength = new Dictionary<string, double>();

    // Creates the dictionary
    TLDictionaryIndex dict = new TLDictionaryIndex();

    // Iterates over all the documents
    foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
    {
        string[] terms = artifact.Text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

        if (terms.Length == 0)
        {
            logger.Warn(String.Format("Artifact Id {0} is empty.", artifact.Id));
        }

        // Iterates over all the terms
        foreach (string t in terms)
        {
            // Checks if the term already has a term entry
            if (!dict.ContainsTermEntry(t))
            {
                // New term
                termEntry = dict.AddTermEntry(t, 1, 1, 1.0);
                posting = termEntry.AddPosting(artifact.Id, 1, 1.0);
            }
            else
            {
                // Existing term
                termEntry = dict.GetTermEntry(t);
                termEntry.TotalFrequencyAcrossArtifacts += 1;
                termEntry.Weight = 1.0;

                // Checks if there is already a posting for this document
                if (!termEntry.ContainsPosting(artifact.Id))
                {
                    // New posting
                    termEntry.NumberOfArtifactsContainingTerm += 1;
                    posting = termEntry.AddPosting(artifact.Id, 1, 1.0);
                }
                else
                {
                    // Existing posting
                    posting = termEntry.GetPosting(artifact.Id);
                    posting.Frequency += 1;
                    posting.Weight += 1.0;
                }
            }
        }
    }

    string artId;
    // Now that all the counts are in, it calculates the document vector weights
    foreach (TLTermEntry t in dict.TermEntries)
    {
        foreach (TLPosting p in t.Postings)
        {
            artId = p.ArtifactId;
            vectorLength = Math.Pow(p.Frequency, 2);
            if (documentVectorLength.ContainsKey(artId))
            {
                // The document has other terms
                vectorLength += documentVectorLength[artId];
            }
            documentVectorLength[artId] = vectorLength;
        }
    }

    // Finally, we take the square root of all entries in the document vector length
    foreach (TLArtifact artifact in setOfProcessedDocuments.Values)
    {
        if (documentVectorLength.ContainsKey(artifact.Id))
        {
            vectorLength = Math.Sqrt(documentVectorLength[artifact.Id]);
            // Here we update the document vector length of the dictionary - not the internal structure any more
            dict.SetDocumentVectorWeight(artifact.Id, vectorLength);
        }
    }

    return dict;
}
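// A minimal, self-contained sketch (not part of the component above) that illustrates the
// vector-length computation build performs: with the plain term-frequency dictionary, the
// stored document vector weight is sqrt(sum of squared term frequencies). For the text
// "a b b c c c" this is sqrt(1^2 + 2^2 + 3^2) = sqrt(14). The helper name and its use of raw
// frequencies as weights are illustrative assumptions, not part of the TraceLab API.
private static double ComputeDocumentVectorLength(string processedText)
{
    // Count raw term frequencies, mirroring the posting frequencies accumulated in build
    Dictionary<string, int> frequencies = new Dictionary<string, int>();
    foreach (string term in processedText.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
    {
        if (frequencies.ContainsKey(term))
        {
            frequencies[term] += 1;
        }
        else
        {
            frequencies.Add(term, 1);
        }
    }

    // Sum of squared frequencies, then the square root - the same value build passes to SetDocumentVectorWeight
    double sumOfSquares = 0.0;
    foreach (int frequency in frequencies.Values)
    {
        sumOfSquares += Math.Pow(frequency, 2);
    }
    return Math.Sqrt(sumOfSquares);
}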
public List<Result> search(String query, TLDictionaryIndex dict)
{
    // Variables
    List<Result> results;
    Dictionary<string, double> intermediateResults;
    Dictionary<string, int> queryTermFrequency;
    Dictionary<string, double> queryTermWeight;
    double queryVectorLength;

    // Initializes the data structures
    results = new List<Result>();                            // Result array
    intermediateResults = new Dictionary<string, double>();  // Where the intermediate results of the query are kept
    queryTermFrequency = new Dictionary<string, int>();      // Keeps track of term frequencies
    queryTermWeight = new Dictionary<string, double>();      // Keeps track of term weights
    queryVectorLength = 0.0;                                 // The document vector length of the query

    // The query is broken down into tokens
    string[] queryTerms = query.Split(new char[] { ' ' });

    // Iterates over each query term to compute the term frequency
    foreach (string qterm in queryTerms)
    {
        // It only cares about those words that are in the dictionary
        if (dict.ContainsTermEntry(qterm))
        {
            if (!queryTermFrequency.ContainsKey(qterm))
            {
                // First time the query word is encountered
                queryTermFrequency.Add(qterm, 1);
            }
            else
            {
                // The query word is already there, so the frequency gets increased
                queryTermFrequency[qterm] += 1;
            }
        }
    }

    // Iterates over the resulting query terms to compute their weights and
    // the dot product of the query terms x the document terms
    foreach (string qterm in queryTermFrequency.Keys)
    {
        // Gets the term from the dictionary
        TLTermEntry term = dict.GetTermEntry(qterm);

        // Computes the weight of a term, i.e. the frequency TIMES the term's specificity.
        // Note: the specificity of the term is stored in the weight.
        //   For the basic dictionary this is just 1
        //   For the tf-idf dictionary this is the idf
        //   For the signal-noise dictionary this is the signal
        double weight = queryTermFrequency[qterm] * term.Weight;
        queryTermWeight.Add(qterm, weight);

        // Updates the document vector length of the query
        queryVectorLength += Math.Pow(weight, 2);

        // It now iterates over all the postings that have this term
        foreach (TLPosting posting in term.Postings)
        {
            string docId = posting.ArtifactId;
            // Calculates the product of the query weight times the posting weight for this particular term
            double r = queryTermWeight[qterm] * posting.Weight;
            if (intermediateResults.ContainsKey(docId))
            {
                intermediateResults[docId] += r;
            }
            else
            {
                intermediateResults.Add(docId, r);
            }
        }
    }

    // The document vector length for the query is the square root of the sum of the squares of the term weights
    queryVectorLength = Math.Sqrt(queryVectorLength);

    // It iterates over the intermediate results to create the final array that is returned to the user
    foreach (string docId in intermediateResults.Keys)
    {
        double similarity = _similarityMetric.ComputeSimilarity(intermediateResults[docId], queryVectorLength, dict.GetDocumentVectorWeight(docId));
        Result r = new Result(docId, similarity);
        results.Add(r);
    }

    // Sorts the results
    results.Sort();

    return results;
}
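// A minimal sketch of a similarity metric that would be consistent with the call to
// _similarityMetric.ComputeSimilarity(dotProduct, queryVectorLength, documentVectorLength) above,
// assuming the metric is cosine similarity. The class name is an illustrative assumption; only
// the three-argument shape of ComputeSimilarity is taken from the search method itself.
public class CosineSimilarityMetric
{
    public double ComputeSimilarity(double dotProduct, double queryVectorLength, double documentVectorLength)
    {
        // Guard against empty query or document vectors to avoid division by zero
        if (queryVectorLength == 0.0 || documentVectorLength == 0.0)
        {
            return 0.0;
        }
        // Cosine similarity: the dot product normalized by the product of the two vector lengths
        return dotProduct / (queryVectorLength * documentVectorLength);
    }
}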