Esempio n. 1
0
 /// <summary>
 /// Stores the collaborators that this index generator will work with. Every reference-type argument is mandatory; captureSourceLocations is
 /// stored exactly as supplied.
 /// </summary>
 /// <exception cref="ArgumentNullException">
 /// Thrown if any reference-type argument is null. Arguments are checked in declaration order, so the first null one is the one reported.
 /// </exception>
 public IndexGenerator(
     NonNullImmutableList <ContentRetriever <TSource, TKey> > contentRetrievers,
     IEqualityComparer <TKey> dataKeyComparer,
     IStringNormaliser sourceStringComparer,
     ITokenBreaker tokenBreaker,
     IndexGenerator.WeightedEntryCombiner weightedEntryCombiner,
     bool captureSourceLocations,
     ILogger logger)
 {
     // Validate everything up front (same order as the parameter list) before touching any field
     if (contentRetrievers is null)
     {
         throw new ArgumentNullException(nameof(contentRetrievers));
     }
     if (dataKeyComparer is null)
     {
         throw new ArgumentNullException(nameof(dataKeyComparer));
     }
     if (sourceStringComparer is null)
     {
         throw new ArgumentNullException(nameof(sourceStringComparer));
     }
     if (tokenBreaker is null)
     {
         throw new ArgumentNullException(nameof(tokenBreaker));
     }
     if (weightedEntryCombiner is null)
     {
         throw new ArgumentNullException(nameof(weightedEntryCombiner));
     }
     if (logger is null)
     {
         throw new ArgumentNullException(nameof(logger));
     }

     _contentRetrievers = contentRetrievers;
     _dataKeyComparer = dataKeyComparer;
     _sourceStringComparer = sourceStringComparer;
     _tokenBreaker = tokenBreaker;
     _weightedEntryCombiner = weightedEntryCombiner;
     _captureSourceLocations = captureSourceLocations;
     _logger = logger;
 }
        /// <summary>
        /// Creates a query translator that answers standard lookups from one index and quoted (consecutive-term) lookups from another.
        /// </summary>
        /// <param name="standardMatchIndexData">Index queried through GetMatches; its KeyComparer is the one used by this instance. Must not be null.</param>
        /// <param name="preciseMatchIndexData">Index queried through GetConsecutiveMatches. Must not be null and must report SourceLocationsAvailable as true.</param>
        /// <param name="optionalQuotedValueConsecutiveTermTokenBreaker">Optional; when null, IndexData_Extensions_ConsecutiveMatches.DefaultTokenBreaker is used.</param>
        /// <param name="optionalQuotedValueConsecutiveWeightCombinerForConsecutiveRuns">Optional; when null, IndexData_Extensions_ConsecutiveMatches.DefaultConsecutiveRunsWeightCombiner is used.</param>
        /// <param name="optionalQuotedValueConsecutiveWeightCombinerForFinalMatches">Optional; when null, IndexData_Extensions_ConsecutiveMatches.DefaultFinalMatchWeightCombiner is used.</param>
        /// <param name="matchCombiner">Stored for later use by this instance. Must not be null.</param>
        /// <exception cref="ArgumentNullException">Thrown if standardMatchIndexData, preciseMatchIndexData or matchCombiner is null.</exception>
        /// <exception cref="ArgumentException">Thrown if preciseMatchIndexData does not have source location data available.</exception>
        public QueryTranslator(
            IIndexData <TKey> standardMatchIndexData,
            IIndexData <TKey> preciseMatchIndexData,
            ITokenBreaker optionalQuotedValueConsecutiveTermTokenBreaker,
            IndexGenerator.WeightedEntryCombiner optionalQuotedValueConsecutiveWeightCombinerForConsecutiveRuns,
            IndexGenerator.WeightedEntryCombiner optionalQuotedValueConsecutiveWeightCombinerForFinalMatches,
            MatchCombiner matchCombiner)
        {
            if (standardMatchIndexData == null)
            {
                throw new ArgumentNullException(nameof(standardMatchIndexData));
            }
            if (preciseMatchIndexData == null)
            {
                throw new ArgumentNullException(nameof(preciseMatchIndexData));
            }
            if (matchCombiner == null)
            {
                throw new ArgumentNullException(nameof(matchCombiner));
            }

            // Consecutive-term matching relies on token positions, so an index without source locations can not serve as the precise-match index
            if (!preciseMatchIndexData.SourceLocationsAvailable)
            {
                throw new ArgumentException($"The {nameof(preciseMatchIndexData)} must include source location data in order to use the Query Translator");
            }

            // Can't actually determine for sure that the KeyComparer of the standardMatchIndexData is equivalent to that of the preciseMatchIndexData
            // (can't do an instance comparison since they may be different instances of the same implementation, they could even feasibly be different
            // classes with identical functionality) so we'll have to assume that the caller is behaving themselves. We'll take the KeyComparer of the
            // standardMatchIndexData for use when combining keys, excluding keys or otherwise processing the query segment requirements.
            _standardMatcher = new CachingResultMatcher(standardMatchIndexData.GetMatches);
            _preciseMatcher  = new CachingResultMatcher(
                // The optional arguments fall back to the defaults published by IndexData_Extensions_ConsecutiveMatches when they were passed as null
                source => preciseMatchIndexData.GetConsecutiveMatches(
                    source,
                    optionalQuotedValueConsecutiveTermTokenBreaker ?? IndexData_Extensions_ConsecutiveMatches.DefaultTokenBreaker,
                    optionalQuotedValueConsecutiveWeightCombinerForConsecutiveRuns ?? IndexData_Extensions_ConsecutiveMatches.DefaultConsecutiveRunsWeightCombiner,
                    optionalQuotedValueConsecutiveWeightCombinerForFinalMatches ?? IndexData_Extensions_ConsecutiveMatches.DefaultFinalMatchWeightCombiner
                    )
                );
            _keyComparer   = standardMatchIndexData.KeyComparer;
            _matchCombiner = matchCombiner;
        }
Esempio n. 3
0
 /// <summary>
 /// Stores the configuration for this factory. All reference-type arguments are mandatory except optionalPropertyForFirstContentRetriever, which
 /// is stored without a null check; captureSourceLocations is stored exactly as supplied.
 /// </summary>
 /// <exception cref="ArgumentNullException">
 /// Thrown if any mandatory argument is null. Arguments are checked in declaration order, so the first null one is the one reported.
 /// </exception>
 public AutomatedIndexGeneratorFactory(
     Func <TSource, TKey> keyRetriever,
     IEqualityComparer <TKey> keyComparer,
     IStringNormaliser stringNormaliser,
     ITokenBreaker tokenBreaker,
     IndexGenerator.WeightedEntryCombiner weightedEntryCombiner,
     WeightDeterminerGenerator brokenTokenWeightDeterminerGenerator,
     PropertyInfo optionalPropertyForFirstContentRetriever,
     bool captureSourceLocations,
     ILogger logger)
 {
     // Mandatory arguments are validated first, in the same order as the parameter list
     if (keyRetriever is null)
     {
         throw new ArgumentNullException(nameof(keyRetriever));
     }
     if (keyComparer is null)
     {
         throw new ArgumentNullException(nameof(keyComparer));
     }
     if (stringNormaliser is null)
     {
         throw new ArgumentNullException(nameof(stringNormaliser));
     }
     if (tokenBreaker is null)
     {
         throw new ArgumentNullException(nameof(tokenBreaker));
     }
     if (weightedEntryCombiner is null)
     {
         throw new ArgumentNullException(nameof(weightedEntryCombiner));
     }
     if (brokenTokenWeightDeterminerGenerator is null)
     {
         throw new ArgumentNullException(nameof(brokenTokenWeightDeterminerGenerator));
     }
     if (logger is null)
     {
         throw new ArgumentNullException(nameof(logger));
     }

     _keyRetriever = keyRetriever;
     _keyComparer = keyComparer;
     _stringNormaliser = stringNormaliser;
     _tokenBreaker = tokenBreaker;
     _weightedEntryCombiner = weightedEntryCombiner;
     _brokenTokenWeightDeterminerGenerator = brokenTokenWeightDeterminerGenerator;
     _optionalPropertyForFirstContentRetriever = optionalPropertyForFirstContentRetriever; // may be null
     _captureSourceLocations = captureSourceLocations;
     _logger = logger;
 }
 /// <summary>
 /// Private constructor that captures the complete builder state. Only tokenBreaker and propertyWeightAppliers are validated; every other
 /// argument (the "override" values and optionalPropertyForFirstContentRetriever) is stored as given, without a null check.
 /// </summary>
 /// <exception cref="ArgumentNullException">Thrown if tokenBreaker or propertyWeightAppliers is null (tokenBreaker is checked first).</exception>
 private AutomatedIndexGeneratorFactoryBuilder(
     Func <TSource, TKey> keyRetrieverOverride,
     IEqualityComparer <TKey> keyComparerOverride,
     IStringNormaliser stringNormaliserOverride,
     ITokenBreaker tokenBreaker,
     IndexGenerator.WeightedEntryCombiner weightedEntryCombinerOverride,
     NonNullImmutableList <IModifyMatchWeights> propertyWeightAppliers,
     AutomatedIndexGeneratorFactory <TSource, TKey> .WeightDeterminerGenerator tokenWeightDeterminerGeneratorOverride,
     PropertyInfo optionalPropertyForFirstContentRetriever,
     bool captureSourceLocations,
     ILogger loggerOverride)
 {
     // The only two arguments that are required to be non-null
     if (tokenBreaker is null)
     {
         throw new ArgumentNullException(nameof(tokenBreaker));
     }
     if (propertyWeightAppliers is null)
     {
         throw new ArgumentNullException(nameof(propertyWeightAppliers));
     }

     // Validated values
     _tokenBreaker = tokenBreaker;
     _propertyWeightAppliers = propertyWeightAppliers;

     // Values stored exactly as supplied (null is accepted here)
     _keyRetrieverOverride = keyRetrieverOverride;
     _keyComparerOverride = keyComparerOverride;
     _stringNormaliserOverride = stringNormaliserOverride;
     _weightedEntryCombinerOverride = weightedEntryCombinerOverride;
     _tokenWeightDeterminerGeneratorOverride = tokenWeightDeterminerGeneratorOverride;
     _optionalPropertyForFirstContentRetriever = optionalPropertyForFirstContentRetriever;
     _loggerOverride = loggerOverride;
     _captureSourceLocations = captureSourceLocations;
 }
Esempio n. 5
0
        /// <summary>
        /// This will break down a source search term into words (according to the logic of the specified token breaker) and then return matches where the words were found in a run in a
        /// content section. Unlike GetPartialMatches it is not possible for an entry to be considered a match because it contains all of the terms in its content, the terms must be
        /// present in one content field, together, in the order in which they are present in the search term. This allows for similar behaviour to that intended for the
        /// ConsecutiveTokenCombiningTokenBreaker, but this offers greater performance (constructing a TernarySearchTreeDictionary to back an IndexData instance can be expensive on
        /// processing time to generate, and disk / memory space to store, the runs of tokens). This also has the benefit that there is no cap on the number of tokens that can be
        /// matched consecutively (a limit on this had to be decided at index generation time when using the ConsecutiveTokenCombiningTokenBreaker). There are two sets of weight
        /// combining calculations required; the first (handled by the weightCombinerForConsecutiveRuns) determines a weight for a run of consecutive tokens - each run is considered
        /// a single match, effectively. Each call to the first weight combiner will have as many weights to combine as there are search terms, so if the "source" value is broken
        /// down into three words by the tokenBreaker then the weightCombinerForConsecutiveRuns will always be called with sets of three weights. The second weight combination
        /// is performed when multiple matches for a particular result must be combined to give a final match weight for that result.
        ///
        /// Note: This requires the index to have been built with source location data recorded - if the index's SourceLocationsAvailable property returns false then an ArgumentException
        /// will be thrown.
        /// </summary>
        /// <typeparam name="TKey">The key type of the index.</typeparam>
        /// <param name="index">The index to query; must not be null and must report SourceLocationsAvailable as true.</param>
        /// <param name="source">The search term; must not be null.</param>
        /// <param name="tokenBreaker">Splits the search term into the individual tokens that must appear consecutively; must not be null.</param>
        /// <param name="weightCombinerForConsecutiveRuns">Combines the weights of the tokens in a single run into one weight; must not be null.</param>
        /// <param name="weightCombinerForFinalMatches">Combines the weights of all of the runs found for one key into the final weight for that key; must not be null.</param>
        /// <returns>
        /// One WeightedEntry per key (keys are grouped using the index's KeyComparer) for which at least one complete run was found. If the token breaker produces no tokens
        /// at all then an empty list is returned, and if it produces exactly one token then the result of index.GetMatches(source) is returned directly.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown if any argument is null.</exception>
        /// <exception cref="ArgumentException">Thrown if the index does not have source location data available.</exception>
        public static NonNullImmutableList <WeightedEntry <TKey> > GetConsecutiveMatches <TKey>(
            this IIndexData <TKey> index,
            string source,
            ITokenBreaker tokenBreaker,
            IndexGenerator.WeightedEntryCombiner weightCombinerForConsecutiveRuns,
            IndexGenerator.WeightedEntryCombiner weightCombinerForFinalMatches)
        {
            if (index == null)
            {
                throw new ArgumentNullException(nameof(index));
            }
            if (source == null)
            {
                throw new ArgumentNullException(nameof(source));
            }
            if (tokenBreaker == null)
            {
                throw new ArgumentNullException(nameof(tokenBreaker));
            }
            if (weightCombinerForConsecutiveRuns == null)
            {
                throw new ArgumentNullException(nameof(weightCombinerForConsecutiveRuns));
            }
            if (weightCombinerForFinalMatches == null)
            {
                throw new ArgumentNullException(nameof(weightCombinerForFinalMatches));
            }

            if (!index.SourceLocationsAvailable)
            {
                throw new ArgumentException($"The {nameof(index)} must include source location data in order to use identify Consecutive token matches");
            }

            var weightAdjustedTokens = tokenBreaker.Break(source);

            // If the token breaker produced nothing at all (eg. for a blank source value) then there is nothing that could be matched. Without this guard the code below would
            // fail when trying to read the matches for a first search term that does not exist.
            if (weightAdjustedTokens.Count == 0)
            {
                return(NonNullImmutableList <WeightedEntry <TKey> > .Empty);
            }

            // If the token breaker won't actually translate the source value into multiple words then we can avoid all of the below work and just call index.GetMatches directly
            if (weightAdjustedTokens.Count == 1)
            {
                return(index.GetMatches(source));
            }

            // The index of this list will correspond to the index of the broken-down search terms. The matches for each term are split into entries that each describe a single
            // Source Location once, here, rather than being split again for every candidate run that is considered in the loop below.
            var singleLocationMatchesForSearchTerms = new List <WeightedEntry <TKey>[]>();

            foreach (var weightAdjustedToken in weightAdjustedTokens)
            {
                singleLocationMatchesForSearchTerms.Add(
                    index.GetMatches(weightAdjustedToken.Token)
                    .Select(w => new WeightedEntry <TKey>(
                                w.Key,
                                w.Weight * weightAdjustedToken.WeightMultiplier,
                                w.SourceLocationsIfRecorded
                                ))
                    .SelectMany(m => BreakWeightedEntryIntoIndividualSourceLocations <TKey>(m))
                    .ToArray()
                    );
            }

            // For each match of the first search term, try to identify a run of token matches for the same key and source field. Any such runs will be recorded in the consecutiveMatches
            // list - these represent content segments that match the entirety of the search term (the "source" argument).
            var consecutiveMatches = new List <WeightedEntry <TKey> >();

            // NOTE(review): searchTerms is never read after it is constructed. It is retained only in case the NonNullOrEmptyStringList constructor validates the tokens - confirm
            // against that type and remove this line if it does not.
            var searchTerms = new NonNullOrEmptyStringList(weightAdjustedTokens.Select(w => w.Token));

            foreach (var firstTermMatch in singleLocationMatchesForSearchTerms[0])
            {
                // Note: SourceLocationsIfRecorded should never be null because we checked that the index reported that SourceLocationsAvailable was true (so not checking for null
                // source locations here)
                var firstTermLocation    = firstTermMatch.SourceLocationsIfRecorded.First();
                var matchesForEntireTerm = NonNullImmutableList <WeightedEntry <TKey> > .Empty.Add(firstTermMatch);
                for (var termIndex = 1; termIndex < weightAdjustedTokens.Count; termIndex++)
                {
                    // The nth term of the run must be for the same key, in the same source field, and exactly n tokens after the first term
                    var requiredTokenIndex = firstTermLocation.TokenIndex + termIndex;
                    var nTermMatch         = singleLocationMatchesForSearchTerms[termIndex].FirstOrDefault(m =>
                    {
                        if (!index.KeyComparer.Equals(m.Key, firstTermMatch.Key))
                        {
                            return false;
                        }
                        var candidateLocation = m.SourceLocationsIfRecorded.First();
                        return (candidateLocation.SourceFieldIndex == firstTermLocation.SourceFieldIndex) &&
                               (candidateLocation.TokenIndex == requiredTokenIndex);
                    });
                    if (nTermMatch == null)
                    {
                        break;
                    }
                    matchesForEntireTerm = matchesForEntireTerm.Add(nTermMatch);
                }
                if (matchesForEntireTerm.Count < weightAdjustedTokens.Count)
                {
                    // If we didn't manage to get a full set of search terms then this isn't a full match
                    continue;
                }

                // Combine the WeightedEntry instances that represent a run of individual matches (one for each word in the "source" argument) into a single WeightedEntry that represents
                // the entirety of the search term (each of the matchesForEntireTerm WeightedEntry instances will have only a single Source Location since the match data was split up
                // above by calling BreakWeightedEntryIntoIndividualSourceLocations before trying to find the consecutive matches). See notes above about not checking whether
                // SourceLocationsIfRecorded is null (it shouldn't be because we check index.SourceLocationsAvailable at the top of this method)
                var sourceLocationOfFirstTerm         = matchesForEntireTerm.First().SourceLocationsIfRecorded.Single();
                var sourceLocationOfLastTerm          = matchesForEntireTerm.Last().SourceLocationsIfRecorded.Single();
                var matchWeightForConsecutiveRunEntry = weightCombinerForConsecutiveRuns(
                    matchesForEntireTerm.Select(m => m.Weight).ToImmutableList()
                    );
                consecutiveMatches.Add(
                    new WeightedEntry <TKey>(
                        matchesForEntireTerm.First().Key,
                        matchWeightForConsecutiveRunEntry,
                        new NonNullImmutableList <SourceFieldLocation>(new[]
                        {
                            // Since we're creating a new SourceFieldLocation instance that is derived from a run of multiple tokens, the TokenIndex is going to be an approximation -
                            // taking the TokenIndex from the first search term probably makes the most sense. The SourceIndex and SourceTokenLength will be taken such that the entire
                            // run is covered (from the start of the first search term to the end of the last). Since this is the only Source Location instance for the WeightedEntry,
                            // its MatchWeightContribution value is equal to the WeightedEntry's Weight.
                            new SourceFieldLocation(
                                sourceLocationOfFirstTerm.SourceFieldIndex,
                                sourceLocationOfFirstTerm.TokenIndex,
                                sourceLocationOfFirstTerm.SourceIndex,
                                (sourceLocationOfLastTerm.SourceIndex + sourceLocationOfLastTerm.SourceTokenLength) - sourceLocationOfFirstTerm.SourceIndex,
                                matchWeightForConsecutiveRunEntry
                                )
                        })
                        )
                    );
            }

            // The matches need grouping by key before returning
            return(consecutiveMatches
                   .GroupBy(m => m.Key, index.KeyComparer)
                   .Cast <IEnumerable <WeightedEntry <TKey> > >()
                   .Select(matches => new WeightedEntry <TKey>(
                               matches.First().Key,
                               weightCombinerForFinalMatches(
                                   matches.Select(match => match.Weight).ToImmutableList()
                                   ),
                               matches.SelectMany(m => m.SourceLocationsIfRecorded).ToNonNullImmutableList()
                               ))
                   .ToNonNullImmutableList());
        }
        /// <summary>
        /// Returns a new builder that is a copy of this one except that it uses the given weighted entry combiner. This instance is not modified.
        /// </summary>
        /// <param name="weightedEntryCombiner">The combiner that the returned builder should use; must not be null.</param>
        /// <returns>A new builder carrying the specified combiner alongside all of this builder's other settings.</returns>
        /// <exception cref="ArgumentNullException">Thrown if weightedEntryCombiner is null.</exception>
        public AutomatedIndexGeneratorFactoryBuilder <TSource, TKey> SetWeightedEntryCombiner(IndexGenerator.WeightedEntryCombiner weightedEntryCombiner)
        {
            // The private constructor accepts a null combiner override, so the null check has to be made here
            if (weightedEntryCombiner == null)
            {
                throw new ArgumentNullException(nameof(weightedEntryCombiner));
            }

            return(new AutomatedIndexGeneratorFactoryBuilder <TSource, TKey>(
                       _keyRetrieverOverride,
                       _keyComparerOverride,
                       _stringNormaliserOverride,
                       _tokenBreaker,
                       weightedEntryCombiner,
                       _propertyWeightAppliers,
                       _tokenWeightDeterminerGeneratorOverride,
                       _optionalPropertyForFirstContentRetriever,
                       _captureSourceLocations,
                       _loggerOverride
                       ));
        }
Esempio n. 7
0
 // The following aren't required for testing and so aren't implemented..
 /// <summary>
 /// Not implemented - calling this always throws.
 /// </summary>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public IIndexData <int> Combine(NonNullImmutableList <IIndexData <int> > indexesToAdd, IndexGenerator.WeightedEntryCombiner weightCombiner)
     => throw new NotImplementedException();