/// <summary>
/// Performs a search in the index.
/// </summary>
/// <param name="query">The search query. It is lowercased and split on spaces before the words are looked up.</param>
/// <param name="documentTypeTags">The document type tags to include in the search.</param>
/// <param name="filterDocumentType"><c>true</c> to apply the filter on the document type.</param>
/// <param name="options">The search options.</param>
/// <param name="fetcher">An object that is able to fetch words.</param>
/// <returns>The results.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="query"/> or <paramref name="fetcher"/> are <c>null</c>.</exception>
/// <exception cref="ArgumentException">If <paramref name="query"/> is empty.</exception>
/// <exception cref="ArgumentNullException">If <paramref name="filterDocumentType"/> is <c>true</c> and <paramref name="documentTypeTags"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">If <paramref name="filterDocumentType"/> is <c>true</c> and <paramref name="documentTypeTags"/> is empty.</exception>
/// <exception cref="InvalidOperationException">If <paramref name="options"/> is not one of the supported values (AtLeastOneWord, AllWords, ExactPhrase).</exception>
public static SearchResultCollection SearchInternal(string query, string[] documentTypeTags, bool filterDocumentType, SearchOptions options, IWordFetcher fetcher) {
    if (query == null) {
        throw new ArgumentNullException("query");
    }
    if (query.Length == 0) {
        throw new ArgumentException("Query cannot be empty", "query");
    }

    if (filterDocumentType && documentTypeTags == null) {
        throw new ArgumentNullException("documentTypeTags");
    }
    if (filterDocumentType && documentTypeTags.Length == 0) {
        throw new ArgumentException("documentTypeTags cannot be empty", "documentTypeTags");
    }

    if (fetcher == null) {
        throw new ArgumentNullException("fetcher");
    }

    // Fail fast: reject unsupported options before scanning the index,
    // instead of discovering the problem after all the work has been done
    if (options != SearchOptions.AtLeastOneWord &&
        options != SearchOptions.AllWords &&
        options != SearchOptions.ExactPhrase) {
        throw new InvalidOperationException("Unsupported SearchOptions");
    }

    SearchResultCollection results = new SearchResultCollection();

    // A query made only of spaces produces no words and therefore no results
    string[] queryWords = query.ToLowerInvariant().Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // Sum of the relative relevance of every occurrence that was counted
    float totalRelevance = 0;

    Word word = null;
    foreach (string q in queryWords) {
        if (!fetcher.TryGetWord(q, out word)) {
            // Word not known to the fetcher: it contributes nothing
            continue;
        }

        foreach (IDocument doc in word.Occurrences.Keys) {
            // Skip documents with excluded tags
            if (filterDocumentType && !IsDocumentTypeTagIncluded(doc.TypeTag, documentTypeTags)) {
                continue;
            }

            // Look up the existing result once per document rather than once per occurrence;
            // it is created lazily on the first occurrence if the document has no result yet
            SearchResult res = results.GetSearchResult(doc);

            foreach (BasicWordInfo info in word.Occurrences[doc]) {
                WordInfo mi = new WordInfo(q, info.FirstCharIndex, info.WordIndex, info.Location);

                if (res == null) {
                    res = new SearchResult(doc);
                    res.Relevance.SetValue(info.Location.RelativeRelevance);
                    res.Matches.Add(mi);
                    results.Add(res);
                }
                else {
                    // Avoid adding duplicate matches (happens when query contains the same word multiple times);
                    // note that the relevance is accumulated even when the match itself is a duplicate
                    if (!res.Matches.ContainsOccurrence(mi.Text, mi.FirstCharIndex)) {
                        res.Matches.Add(mi);
                    }
                    res.Relevance.SetValue(res.Relevance.Value + info.Location.RelativeRelevance);
                }

                totalRelevance += info.Location.RelativeRelevance;
            }
        }
    }

    // The purge helpers return the relevance of the results they removed,
    // which must no longer count towards the total.
    // SearchOptions.AtLeastOneWord keeps every result, so there is nothing to purge
    if (options == SearchOptions.AllWords) {
        totalRelevance -= PurgeResultsForAllWords(results, queryWords);
    }
    else if (options == SearchOptions.ExactPhrase) {
        totalRelevance -= PurgeResultsForExactPhrase(results, queryWords);
    }

    // Finalize relevance values against the total of the surviving results
    for (int i = 0; i < results.Count; i++) {
        results[i].Relevance.Finalize(totalRelevance);
    }

    return results;
}
/// <summary>
/// Purges the invalid results when SearchOptions is ExactPhrase:
/// a result is kept only if its matches contain the query words
/// in query order at consecutive word indices.
/// </summary>
/// <param name="results">The results to purge (modified in place).</param>
/// <param name="queryWords">The query words, compared against the lowercased text of each match.</param>
/// <returns>The relevance value of the removed matches.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="results"/> or <paramref name="queryWords"/> are <c>null</c>.</exception>
public static float PurgeResultsForExactPhrase(SearchResultCollection results, string[] queryWords) {
    if (results == null) {
        throw new ArgumentNullException("results");
    }
    if (queryWords == null) {
        throw new ArgumentNullException("queryWords");
    }

    // An empty phrase is trivially contained in every result, so nothing is purged.
    // Returning here also avoids indexing into the matches of a result that has none
    if (queryWords.Length == 0) {
        return 0;
    }

    float relevanceToRemove = 0;

    // Results cannot be removed while enumerating the collection, so collect them first
    List<SearchResult> toRemove = new List<SearchResult>();

    foreach (SearchResult r in results) {
        // Shortcut: with fewer matches than query words the phrase cannot be present
        if (r.Matches.Count < queryWords.Length) {
            toRemove.Add(r);
            continue;
        }

        // Verify that all matches are in the same order as in the query
        // and that their indices make up contiguous words,
        // re-iterating from every word in the result, for example:
        // query = 'repeated content', result = 'content repeated content'
        // result must be tested with 'content repeated' (failing) and with 'repeated content' (succeeding)
        // NOTE(review): this walks r.Matches in stored order, so it presumably relies on
        // the matches being ordered by word index - confirm against the Matches collection
        int maxTestShift = r.Matches.Count - queryWords.Length; // Never negative thanks to the shortcut above

        bool sequenceFound = false;
        for (int shift = 0; shift <= maxTestShift && !sequenceFound; shift++) {
            int firstWordIndex = r.Matches[shift].WordIndex;

            bool allOk = true;
            for (int i = 0; i < queryWords.Length; i++) {
                if (queryWords[i] != r.Matches[i + shift].Text.ToLowerInvariant() ||
                    r.Matches[i + shift].WordIndex != firstWordIndex + i) {
                    allOk = false;
                    break;
                }
            }

            sequenceFound = allOk;
        }

        if (!sequenceFound) {
            toRemove.Add(r);
        }
    }

    foreach (SearchResult r in toRemove) {
        results.Remove(r);
        relevanceToRemove += r.Relevance.Value;
    }

    return relevanceToRemove;
}