/// <summary>
/// Initialises a new index generator from its collaborators. Every reference-type argument is required; captureSourceLocations is stored as given.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown if contentRetrievers, dataKeyComparer, sourceStringComparer, tokenBreaker, weightedEntryCombiner or logger is null</exception>
public IndexGenerator(
    NonNullImmutableList<ContentRetriever<TSource, TKey>> contentRetrievers,
    IEqualityComparer<TKey> dataKeyComparer,
    IStringNormaliser sourceStringComparer,
    ITokenBreaker tokenBreaker,
    IndexGenerator.WeightedEntryCombiner weightedEntryCombiner,
    bool captureSourceLocations,
    ILogger logger)
{
    // Validate all of the required arguments up front, in declaration order, so that the first null argument is the one that gets reported
    if (contentRetrievers is null)
    {
        throw new ArgumentNullException(nameof(contentRetrievers));
    }
    if (dataKeyComparer is null)
    {
        throw new ArgumentNullException(nameof(dataKeyComparer));
    }
    if (sourceStringComparer is null)
    {
        throw new ArgumentNullException(nameof(sourceStringComparer));
    }
    if (tokenBreaker is null)
    {
        throw new ArgumentNullException(nameof(tokenBreaker));
    }
    if (weightedEntryCombiner is null)
    {
        throw new ArgumentNullException(nameof(weightedEntryCombiner));
    }
    if (logger is null)
    {
        throw new ArgumentNullException(nameof(logger));
    }

    _contentRetrievers = contentRetrievers;
    _dataKeyComparer = dataKeyComparer;
    _sourceStringComparer = sourceStringComparer;
    _tokenBreaker = tokenBreaker;
    _weightedEntryCombiner = weightedEntryCombiner;
    _captureSourceLocations = captureSourceLocations;
    _logger = logger;
}
/// <summary>
/// Creates a query translator that uses one index for standard matches and a second index for precise matches (resolved through GetConsecutiveMatches). The three
/// "optional" arguments may be null; for each one that is null, the corresponding default from IndexData_Extensions_ConsecutiveMatches is substituted at the point
/// that the precise-match lookup is executed.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown if standardMatchIndexData, preciseMatchIndexData or matchCombiner is null</exception>
/// <exception cref="ArgumentException">Thrown if preciseMatchIndexData does not report that source locations are available</exception>
public QueryTranslator(
    IIndexData<TKey> standardMatchIndexData,
    IIndexData<TKey> preciseMatchIndexData,
    ITokenBreaker optionalQuotedValueConsecutiveTermTokenBreaker,
    IndexGenerator.WeightedEntryCombiner optionalQuotedValueConsecutiveWeightCombinerForConsecutiveRuns,
    IndexGenerator.WeightedEntryCombiner optionalQuotedValueConsecutiveWeightCombinerForFinalMatches,
    MatchCombiner matchCombiner)
{
    if (standardMatchIndexData == null)
    {
        throw new ArgumentNullException(nameof(standardMatchIndexData));
    }
    if (preciseMatchIndexData == null)
    {
        throw new ArgumentNullException(nameof(preciseMatchIndexData));
    }
    if (matchCombiner == null)
    {
        throw new ArgumentNullException(nameof(matchCombiner));
    }
    if (!preciseMatchIndexData.SourceLocationsAvailable)
    {
        throw new ArgumentException($"The {nameof(preciseMatchIndexData)} must include source location data in order to use the Query Translator");
    }

    // There is no reliable way to prove that the two indexes use equivalent KeyComparers (they may be separate instances of the same implementation, or even
    // different classes that happen to behave identically, so a reference comparison would tell us nothing). The caller is trusted to supply compatible indexes
    // and the KeyComparer of the standardMatchIndexData is the one used when combining keys, excluding keys or otherwise processing query segment requirements.
    _standardMatcher = new CachingResultMatcher(standardMatchIndexData.GetMatches);
    _preciseMatcher = new CachingResultMatcher(
        quotedValue => preciseMatchIndexData.GetConsecutiveMatches(
            quotedValue,
            // Each fallback is resolved inside the lambda, ie. when a lookup runs rather than when this constructor runs
            optionalQuotedValueConsecutiveTermTokenBreaker
                ?? IndexData_Extensions_ConsecutiveMatches.DefaultTokenBreaker,
            optionalQuotedValueConsecutiveWeightCombinerForConsecutiveRuns
                ?? IndexData_Extensions_ConsecutiveMatches.DefaultConsecutiveRunsWeightCombiner,
            optionalQuotedValueConsecutiveWeightCombinerForFinalMatches
                ?? IndexData_Extensions_ConsecutiveMatches.DefaultFinalMatchWeightCombiner
        )
    );
    _keyComparer = standardMatchIndexData.KeyComparer;
    _matchCombiner = matchCombiner;
}
/// <summary>
/// Initialises the factory with everything that it has been configured with. All reference-type arguments are required except for
/// optionalPropertyForFirstContentRetriever, which may be null and is stored as given (as is captureSourceLocations).
/// </summary>
/// <exception cref="ArgumentNullException">
/// Thrown if keyRetriever, keyComparer, stringNormaliser, tokenBreaker, weightedEntryCombiner, brokenTokenWeightDeterminerGenerator or logger is null
/// </exception>
public AutomatedIndexGeneratorFactory(
    Func<TSource, TKey> keyRetriever,
    IEqualityComparer<TKey> keyComparer,
    IStringNormaliser stringNormaliser,
    ITokenBreaker tokenBreaker,
    IndexGenerator.WeightedEntryCombiner weightedEntryCombiner,
    WeightDeterminerGenerator brokenTokenWeightDeterminerGenerator,
    PropertyInfo optionalPropertyForFirstContentRetriever,
    bool captureSourceLocations,
    ILogger logger)
{
    // Required arguments are checked first, in declaration order, so the first null argument is the one named in the exception
    if (keyRetriever is null)
    {
        throw new ArgumentNullException(nameof(keyRetriever));
    }
    if (keyComparer is null)
    {
        throw new ArgumentNullException(nameof(keyComparer));
    }
    if (stringNormaliser is null)
    {
        throw new ArgumentNullException(nameof(stringNormaliser));
    }
    if (tokenBreaker is null)
    {
        throw new ArgumentNullException(nameof(tokenBreaker));
    }
    if (weightedEntryCombiner is null)
    {
        throw new ArgumentNullException(nameof(weightedEntryCombiner));
    }
    if (brokenTokenWeightDeterminerGenerator is null)
    {
        throw new ArgumentNullException(nameof(brokenTokenWeightDeterminerGenerator));
    }
    if (logger is null)
    {
        throw new ArgumentNullException(nameof(logger));
    }

    _keyRetriever = keyRetriever;
    _keyComparer = keyComparer;
    _stringNormaliser = stringNormaliser;
    _tokenBreaker = tokenBreaker;
    _weightedEntryCombiner = weightedEntryCombiner;
    _brokenTokenWeightDeterminerGenerator = brokenTokenWeightDeterminerGenerator;
    _optionalPropertyForFirstContentRetriever = optionalPropertyForFirstContentRetriever; // No null check, this one is optional
    _captureSourceLocations = captureSourceLocations;
    _logger = logger;
}
/// <summary>
/// Private constructor that records the complete builder state. Only tokenBreaker and propertyWeightAppliers are validated here; the "override" arguments and
/// optionalPropertyForFirstContentRetriever are stored exactly as given, so any of them may be null.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown if tokenBreaker or propertyWeightAppliers is null</exception>
private AutomatedIndexGeneratorFactoryBuilder(
    Func<TSource, TKey> keyRetrieverOverride,
    IEqualityComparer<TKey> keyComparerOverride,
    IStringNormaliser stringNormaliserOverride,
    ITokenBreaker tokenBreaker,
    IndexGenerator.WeightedEntryCombiner weightedEntryCombinerOverride,
    NonNullImmutableList<IModifyMatchWeights> propertyWeightAppliers,
    AutomatedIndexGeneratorFactory<TSource, TKey>.WeightDeterminerGenerator tokenWeightDeterminerGeneratorOverride,
    PropertyInfo optionalPropertyForFirstContentRetriever,
    bool captureSourceLocations,
    ILogger loggerOverride)
{
    // The only two arguments that must always be present
    if (tokenBreaker is null)
    {
        throw new ArgumentNullException(nameof(tokenBreaker));
    }
    if (propertyWeightAppliers is null)
    {
        throw new ArgumentNullException(nameof(propertyWeightAppliers));
    }

    _keyRetrieverOverride = keyRetrieverOverride;
    _keyComparerOverride = keyComparerOverride;
    _stringNormaliserOverride = stringNormaliserOverride;
    _tokenBreaker = tokenBreaker;
    _weightedEntryCombinerOverride = weightedEntryCombinerOverride;
    _propertyWeightAppliers = propertyWeightAppliers;
    _tokenWeightDeterminerGeneratorOverride = tokenWeightDeterminerGeneratorOverride;
    _optionalPropertyForFirstContentRetriever = optionalPropertyForFirstContentRetriever;
    _captureSourceLocations = captureSourceLocations;
    _loggerOverride = loggerOverride;
}
/// <summary>
/// This will break down a source search term into words (according to the logic of the specified token breaker) and then return matches where the words were found in a run in a
/// content section. Unlike GetPartialMatches it is not possible for an entry to be considered a match because it contains all of the terms in its content, the terms must be
/// present in one content field, together, in the order in which they are present in the search term. This allows for similar behaviour to that intended for the
/// ConsecutiveTokenCombiningTokenBreaker, but this offers greater performance (constructing a TernarySearchTreeDictionary to back an IndexData instance can be expensive on
/// processing time to generate, and disk / memory space to store, the runs of tokens). This also has the benefit that there is no cap on the number of tokens that can be
/// matched consecutively (a limit on this had to be decided at index generation time when using the ConsecutiveTokenCombiningTokenBreaker). There are two sets of weight
/// combining calculations required; the first (handled by the weightCombinerForConsecutiveRuns) determines a weight for a run of consecutive tokens - each run is considered
/// a single match, effectively. Each call to the first weight combiner will have as many weights to combine as there are search terms, so if the "source" value is broken
/// down into three words by the tokenBreaker then the weightCombinerForConsecutiveRuns will always be called with sets of three weights. The second weight combination
/// is performed when multiple matches for a particular result must be combined to give a final match weight for that result.
///
/// Note: This requires the index to have been built with source location data recorded - if the index's SourceLocationsAvailable property returns false then an ArgumentException
/// will be thrown.
///
/// Note: If the token breaker produces no tokens at all for the source value then an empty set of matches is returned. If it produces precisely one token then the result of
/// index.GetMatches for the unaltered source value is returned.
/// </summary>
/// <param name="index">The index to search, it must report that SourceLocationsAvailable is true</param>
/// <param name="source">The search term, this will be passed to the tokenBreaker to split it into individual words</param>
/// <param name="tokenBreaker">Breaks the source value into the tokens that must appear consecutively</param>
/// <param name="weightCombinerForConsecutiveRuns">Combines the weights of the tokens in a single consecutive run into one weight for that run</param>
/// <param name="weightCombinerForFinalMatches">Combines the weights of all of the runs found for a single key into the final weight for that key</param>
/// <returns>One WeightedEntry per matched key, with one source location per consecutive run that was found for that key</returns>
/// <exception cref="ArgumentNullException">Thrown if any argument is null</exception>
/// <exception cref="ArgumentException">Thrown if the index does not report that source locations are available</exception>
public static NonNullImmutableList<WeightedEntry<TKey>> GetConsecutiveMatches<TKey>(
    this IIndexData<TKey> index,
    string source,
    ITokenBreaker tokenBreaker,
    IndexGenerator.WeightedEntryCombiner weightCombinerForConsecutiveRuns,
    IndexGenerator.WeightedEntryCombiner weightCombinerForFinalMatches)
{
    if (index == null)
    {
        throw new ArgumentNullException(nameof(index));
    }
    if (source == null)
    {
        throw new ArgumentNullException(nameof(source));
    }
    if (tokenBreaker == null)
    {
        throw new ArgumentNullException(nameof(tokenBreaker));
    }
    if (weightCombinerForConsecutiveRuns == null)
    {
        throw new ArgumentNullException(nameof(weightCombinerForConsecutiveRuns));
    }
    if (weightCombinerForFinalMatches == null)
    {
        throw new ArgumentNullException(nameof(weightCombinerForFinalMatches));
    }
    if (!index.SourceLocationsAvailable)
    {
        throw new ArgumentException($"The {nameof(index)} must include source location data in order to use identify Consecutive token matches");
    }

    var weightAdjustedTokens = tokenBreaker.Break(source);

    // If the token breaker finds no words at all in the source value then there are no search terms and so nothing can match. Returning early avoids the call to
    // matchesForSearchTerms.First() further down, which would throw an InvalidOperationException for an empty list.
    if (weightAdjustedTokens.Count == 0)
    {
        return NonNullImmutableList<WeightedEntry<TKey>>.Empty;
    }

    // If the token breaker won't actually translate the source value into multiple words then we can avoid all of the below work and just call index.GetMatches directly
    if (weightAdjustedTokens.Count == 1)
    {
        return index.GetMatches(source);
    }

    // The index of this list will correspond to the index of the broken-down search terms
    var matchesForSearchTerms = new List<WeightedEntry<TKey>[]>();
    foreach (var weightAdjustedToken in weightAdjustedTokens)
    {
        matchesForSearchTerms.Add(
            index.GetMatches(weightAdjustedToken.Token)
                .Select(w => new WeightedEntry<TKey>(
                    w.Key,
                    w.Weight * weightAdjustedToken.WeightMultiplier,
                    w.SourceLocationsIfRecorded
                ))
                .ToArray()
        );
    }

    // For each match of the first search term, try to identify a run of token matches for the same key and source field. Any such runs will be recorded in the consecutiveMatches
    // list - these represent content segments that match the entirety of the search term (the "source" argument).
    var consecutiveMatches = new List<WeightedEntry<TKey>>();
    // NOTE(review): searchTerms is never read after this line. It is kept in case the NonNullOrEmptyStringList constructor is being relied upon to validate the tokens -
    // confirm before removing.
    var searchTerms = new NonNullOrEmptyStringList(weightAdjustedTokens.Select(w => w.Token));
    foreach (var firstTermMatch in matchesForSearchTerms.First().SelectMany(m => BreakWeightedEntryIntoIndividualSourceLocations(m)))
    {
        var matchesForEntireTerm = NonNullImmutableList<WeightedEntry<TKey>>.Empty;
        matchesForEntireTerm = matchesForEntireTerm.Add(firstTermMatch);
        for (var termIndex = 1; termIndex < weightAdjustedTokens.Count; termIndex++)
        {
            // Note: SourceLocationsIfRecorded should never be null because we checked that the index reported that SourceLocationsAvailable was true (so not checking for null
            // source locations here). The nth term must be for the same key, in the same source field, and exactly n tokens after the first term.
            var nTermMatch = matchesForSearchTerms[termIndex]
                .SelectMany(m => BreakWeightedEntryIntoIndividualSourceLocations<TKey>(m))
                .FirstOrDefault(m =>
                    index.KeyComparer.Equals(m.Key, firstTermMatch.Key) &&
                    (m.SourceLocationsIfRecorded.First().SourceFieldIndex == firstTermMatch.SourceLocationsIfRecorded.First().SourceFieldIndex) &&
                    (m.SourceLocationsIfRecorded.First().TokenIndex == firstTermMatch.SourceLocationsIfRecorded.First().TokenIndex + termIndex)
                );
            if (nTermMatch == null)
            {
                break;
            }
            matchesForEntireTerm = matchesForEntireTerm.Add(nTermMatch);
        }
        if (matchesForEntireTerm.Count < weightAdjustedTokens.Count)
        {
            // If we didn't manage to get a full set of search terms then this isn't a full match
            continue;
        }

        // Combine the WeightedEntry instances that represent a run of individual matches (one for each word in the "source" argument) into a single WeightedEntry that represents
        // the entirety of the search term (each of the matchesForEntireTerm WeightedEntry instances will have only a single Source Location since the match data was split up
        // above by calling BreakWeightedEntryIntoIndividualSourceLocations before trying to find the consecutive matches). See notes above about not checking whether
        // SourceLocationsIfRecorded is null (it shouldn't be because we check index.SourceLocationsAvailable at the top of this method)
        var sourceLocationOfFirstTerm = matchesForEntireTerm.First().SourceLocationsIfRecorded.Single();
        var sourceLocationOfLastTerm = matchesForEntireTerm.Last().SourceLocationsIfRecorded.Single();
        var matchWeightForConsecutiveRunEntry = weightCombinerForConsecutiveRuns(
            matchesForEntireTerm.Select(m => m.Weight).ToImmutableList()
        );
        consecutiveMatches.Add(
            new WeightedEntry<TKey>(
                matchesForEntireTerm.First().Key,
                matchWeightForConsecutiveRunEntry,
                new NonNullImmutableList<SourceFieldLocation>(new[]
                {
                    // Since we're creating a new SourceFieldLocation instance that is derived from a run of multiple tokens, the TokenIndex is going to be an approximation -
                    // taking the TokenIndex from the first search term probably makes the most sense. The SourceIndex and SourceTokenLength will be taken such that the entire
                    // run is covered (from the start of the first search term to the end of the last). Since this is the only Source Location instance for the WeightedEntry,
                    // its MatchWeightContribution value is equal to the WeightedEntry's Weight.
                    new SourceFieldLocation(
                        sourceLocationOfFirstTerm.SourceFieldIndex,
                        sourceLocationOfFirstTerm.TokenIndex,
                        sourceLocationOfFirstTerm.SourceIndex,
                        (sourceLocationOfLastTerm.SourceIndex + sourceLocationOfLastTerm.SourceTokenLength) - sourceLocationOfFirstTerm.SourceIndex,
                        matchWeightForConsecutiveRunEntry
                    )
                })
            )
        );
    }

    // The matches need grouping by key before returning
    return consecutiveMatches
        .GroupBy(m => m.Key, index.KeyComparer)
        .Cast<IEnumerable<WeightedEntry<TKey>>>()
        .Select(matches => new WeightedEntry<TKey>(
            matches.First().Key,
            weightCombinerForFinalMatches(
                matches.Select(match => match.Weight).ToImmutableList()
            ),
            matches.SelectMany(m => m.SourceLocationsIfRecorded).ToNonNullImmutableList()
        ))
        .ToNonNullImmutableList();
}
/// <summary>
/// Returns a new builder that carries over all of this builder's current settings except for the weighted entry combiner, which is replaced by the one specified.
/// This instance is not altered.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown if weightedEntryCombiner is null</exception>
public AutomatedIndexGeneratorFactoryBuilder<TSource, TKey> SetWeightedEntryCombiner(IndexGenerator.WeightedEntryCombiner weightedEntryCombiner)
{
    if (weightedEntryCombiner == null)
    {
        throw new ArgumentNullException(nameof(weightedEntryCombiner));
    }

    var builderWithNewCombiner = new AutomatedIndexGeneratorFactoryBuilder<TSource, TKey>(
        _keyRetrieverOverride,
        _keyComparerOverride,
        _stringNormaliserOverride,
        _tokenBreaker,
        weightedEntryCombiner, // The only value that differs from the current configuration
        _propertyWeightAppliers,
        _tokenWeightDeterminerGeneratorOverride,
        _optionalPropertyForFirstContentRetriever,
        _captureSourceLocations,
        _loggerOverride
    );
    return builderWithNewCombiner;
}
// The following aren't required for testing and so aren't implemented..

/// <summary>
/// Not implemented - calling this always throws.
/// </summary>
/// <exception cref="NotImplementedException">Always thrown</exception>
public IIndexData<int> Combine(NonNullImmutableList<IIndexData<int>> indexesToAdd, IndexGenerator.WeightedEntryCombiner weightCombiner)
    => throw new NotImplementedException();