/// <summary>
/// Ranks the documents that contain at least one query term by their TF-IDF relevance score.
/// </summary>
/// <param name="query">Raw query text. It is checked with <c>IsQueryValid</c> and split with <c>TokenizeQuery</c>.</param>
/// <param name="limit">Maximum number of results to yield (passed to <c>Take</c>).</param>
/// <returns>
/// At most <paramref name="limit"/> results ordered by descending relevance score.
/// The sequence is lazy: scoring and sorting run when the caller enumerates it.
/// </returns>
/// <exception cref="ArgumentException">Thrown when <c>IsQueryValid</c> rejects <paramref name="query"/>.</exception>
public IEnumerable<SearchResult> Search(string query, int limit = 10)
{
    if (!this.IsQueryValid(query))
    {
        throw new ArgumentException("The provided query is invalid.");
    }

    // Materialize once: the distinct terms are consumed by the bag-of-words,
    // the index lookup and the cache loop below, and a deferred Distinct()
    // query would re-run the tokenization for each of them.
    var queryTerms = this.TokenizeQuery(query).Distinct().ToList();
    var queryLM = new BagOfWords(queryTerms);
    var relevantDocuments = this.invertedIndex.GetDocumentsContainingTerms(queryTerms);

    // Make sure every query term has an inverse document frequency cached
    // before scoring (presumably read by CalculateTfIdfRelevanceScore).
    foreach (var term in queryTerms)
    {
        if (!idfCache.ContainsKey(term))
        {
            idfCache.Add(term, this.invertedIndex.GetInverseDocumentFrequency(term));
        }
    }

    // Alternative ranking that was previously sketched here: score with
    // CalculateKullbackLeiblerDivergence(queryLM, doc.BagOfWords) and sort ascending.
    return relevantDocuments
        .Select(doc => new SearchResult(doc, this.CalculateTfIdfRelevanceScore(queryLM.DistinctTerms, doc)))
        .OrderByDescending(sr => sr.RelevanceScore)
        .Take(limit);
}
/// <summary>
/// Loads the document corpus from the JSON files found directly inside
/// <paramref name="directoryPath"/> and rebuilds the corpus bag-of-words and the inverted index.
/// </summary>
/// <remarks>
/// Each file is deserialized as a collection of <c>Document</c>. Only documents whose
/// <c>Body</c> is non-null and longer than <c>MinDocBodyLength</c> are kept; each kept
/// document gets a <c>BagOfWords</c> built from its tokenized title and body.
/// Files that deserialize to <c>null</c> (for example an empty file) contribute no documents.
/// NOTE(review): the IDF cache used by <c>Search</c> is not reset here; confirm whether
/// values cached for a previous corpus can survive a reload.
/// </remarks>
/// <param name="directoryPath">Directory whose files are read; every file is treated as JSON.</param>
public void LoadDocuments(string directoryPath)
{
    var documents = new List<Document>();
    var corpusText = new List<string>();

    foreach (var filePath in Directory.EnumerateFiles(directoryPath))
    {
        var fileDocuments = JsonConvert.DeserializeObject<IEnumerable<Document>>(File.ReadAllText(filePath));
        if (fileDocuments == null)
        {
            // Empty file or a literal JSON null: nothing to index.
            continue;
        }

        foreach (var doc in fileDocuments)
        {
            var hasUsableBody = doc != null && doc.Body != null && doc.Body.Length > MinDocBodyLength;
            if (!hasUsableBody)
            {
                continue;
            }

            var content = $"{doc.Title} {doc.Body} ";

            // Materialize the tokens so a lazily-evaluated tokenizer runs only once
            // even though the result feeds both the document and the corpus models.
            var tokenizedDocumentContent = this.tokenizer.Tokenize(content).ToList();
            doc.BagOfWords = new BagOfWords(tokenizedDocumentContent);
            corpusText.AddRange(tokenizedDocumentContent);
            documents.Add(doc);
        }
    }

    // Publish the new state only after every file has been read successfully.
    this.documents = documents;
    this.corpusBagOfWords = new BagOfWords(corpusText);
    this.invertedIndex = new InvertedIndex(documents);
}
/// <summary>
/// Accumulates <c>p * log(p / q)</c> over the distinct terms of the query model, where
/// <c>p</c> is the term's value in <paramref name="queryLM"/> and <c>q</c> its value in
/// <paramref name="documentLM"/>.
/// </summary>
/// <remarks>
/// When the document model reports zero for a term, the value from the corpus
/// bag-of-words is used instead. Terms for which no positive value is available
/// contribute nothing to the sum.
/// NOTE(review): this treats <c>GetTermFrequency</c> as a probability estimate —
/// confirm it returns a relative frequency rather than a raw count.
/// </remarks>
/// <param name="queryLM">Bag-of-words built from the query terms.</param>
/// <param name="documentLM">Bag-of-words of the document being scored.</param>
/// <returns>The accumulated divergence; <c>0.0</c> when no term contributes.</returns>
private double CalculateKullbackLeiblerDivergence(BagOfWords queryLM, BagOfWords documentLM)
{
    double divergence = 0.0;

    foreach (var term in queryLM.DistinctTerms)
    {
        var queryProbability = queryLM.GetTermFrequency(term);
        var documentProbability = documentLM.GetTermFrequency(term);

        // Back off to the corpus-wide value for terms the document lacks.
        var referenceProbability = documentProbability == 0
            ? this.corpusBagOfWords.GetTermFrequency(term)
            : documentProbability;

        if (!(referenceProbability > 0))
        {
            continue;
        }

        divergence += queryProbability * Math.Log(queryProbability / referenceProbability);
    }

    return divergence;
}