Exemple #1
 /// <summary>
 /// Convenience overload: forwards every argument to the constructor that also accepts a fromStartOfTokenOnly flag,
 /// supplying false for that flag.
 /// </summary>
 public PartialMatchingTokenBreaker(
     int minLengthOfPartialMatches,
     int maxLengthOfPartialMatches,
     ITokenBreaker tokenBreaker,
     PartialMatchWeightDeterminer partialMatchWeightDeterminer)
     : this(
         minLengthOfPartialMatches,
         maxLengthOfPartialMatches,
         false, // fromStartOfTokenOnly
         tokenBreaker,
         partialMatchWeightDeterminer)
 {
 }
 /// <summary>
 /// Convenience overload: forwards to the six-argument constructor, passing null for both of the optional
 /// quoted-value weight combiners (the six-argument constructor substitutes defaults for null combiners).
 /// </summary>
 public QueryTranslator(
     IIndexData <TKey> standardMatchIndexData,
     IIndexData <TKey> preciseMatchIndexData,
     ITokenBreaker optionalQuotedValueConsecutiveTermTokenBreaker,
     MatchCombiner matchCombiner)
     : this(
         standardMatchIndexData,
         preciseMatchIndexData,
         optionalQuotedValueConsecutiveTermTokenBreaker,
         null, // optionalQuotedValueConsecutiveWeightCombinerForConsecutiveRuns
         null, // optionalQuotedValueConsecutiveWeightCombinerForFinalMatches
         matchCombiner)
 {
 }
Exemple #3
 /// <summary>
 /// Convenience overload: forwards to the constructor that also accepts an optionalPrePartialMatchTokenBreaker,
 /// supplying null for that argument.
 /// </summary>
 public PartialMatchingTokenBreaker(
     int minLengthOfPartialMatches,
     int maxLengthOfPartialMatches,
     bool fromStartOfTokenOnly,
     ITokenBreaker tokenBreaker,
     PartialMatchWeightDeterminer partialMatchWeightDeterminer)
     : this(
         minLengthOfPartialMatches,
         maxLengthOfPartialMatches,
         fromStartOfTokenOnly,
         tokenBreaker,
         null, // optionalPrePartialMatchTokenBreaker
         partialMatchWeightDeterminer)
 {
 }
Exemple #4
        /// <summary>
        /// Stores the set of characters to treat as whitespace together with the token breaker supplied by the caller.
        /// </summary>
        /// <exception cref="ArgumentNullException">Thrown if charsToTreatAsWhitespace or tokenBreaker is null</exception>
        public WhiteSpaceExtendingTokenBreaker(ImmutableList <char> charsToTreatAsWhitespace, ITokenBreaker tokenBreaker)
        {
            // Arguments are validated in declaration order, so a null charsToTreatAsWhitespace is reported first
            _charsToTreatAsWhitespace = charsToTreatAsWhitespace ?? throw new ArgumentNullException(nameof(charsToTreatAsWhitespace));
            _tokenBreaker = tokenBreaker ?? throw new ArgumentNullException(nameof(tokenBreaker));
        }
Exemple #5
 /// <summary>
 /// Records the dependencies supplied by the caller. Every reference-type argument is mandatory.
 /// </summary>
 /// <exception cref="ArgumentNullException">Thrown if any argument other than captureSourceLocations is null</exception>
 public IndexGenerator(
     NonNullImmutableList <ContentRetriever <TSource, TKey> > contentRetrievers,
     IEqualityComparer <TKey> dataKeyComparer,
     IStringNormaliser sourceStringComparer,
     ITokenBreaker tokenBreaker,
     IndexGenerator.WeightedEntryCombiner weightedEntryCombiner,
     bool captureSourceLocations,
     ILogger logger)
 {
     // Guards run in parameter order so that the first null argument is the one reported
     if (contentRetrievers is null)
     {
         throw new ArgumentNullException(nameof(contentRetrievers));
     }
     if (dataKeyComparer is null)
     {
         throw new ArgumentNullException(nameof(dataKeyComparer));
     }
     if (sourceStringComparer is null)
     {
         throw new ArgumentNullException(nameof(sourceStringComparer));
     }
     if (tokenBreaker is null)
     {
         throw new ArgumentNullException(nameof(tokenBreaker));
     }
     if (weightedEntryCombiner is null)
     {
         throw new ArgumentNullException(nameof(weightedEntryCombiner));
     }
     if (logger is null)
     {
         throw new ArgumentNullException(nameof(logger));
     }

     _contentRetrievers = contentRetrievers;
     _dataKeyComparer = dataKeyComparer;
     _sourceStringComparer = sourceStringComparer;
     _tokenBreaker = tokenBreaker;
     _weightedEntryCombiner = weightedEntryCombiner;
     _captureSourceLocations = captureSourceLocations;
     _logger = logger;
 }
        /// <summary>
        /// Sets up the standard and precise matchers (each wrapped in a CachingResultMatcher), the key comparer and the match combiner.
        /// The standardMatchIndexData, preciseMatchIndexData and matchCombiner arguments must not be null and the preciseMatchIndexData
        /// must report SourceLocationsAvailable as true. The three "optional" arguments may be null, in which case the corresponding
        /// default from IndexData_Extensions_ConsecutiveMatches is used.
        /// </summary>
        /// <exception cref="ArgumentNullException">Thrown if standardMatchIndexData, preciseMatchIndexData or matchCombiner is null</exception>
        /// <exception cref="ArgumentException">Thrown if preciseMatchIndexData.SourceLocationsAvailable is false</exception>
        public QueryTranslator(
            IIndexData <TKey> standardMatchIndexData,
            IIndexData <TKey> preciseMatchIndexData,
            ITokenBreaker optionalQuotedValueConsecutiveTermTokenBreaker,
            IndexGenerator.WeightedEntryCombiner optionalQuotedValueConsecutiveWeightCombinerForConsecutiveRuns,
            IndexGenerator.WeightedEntryCombiner optionalQuotedValueConsecutiveWeightCombinerForFinalMatches,
            MatchCombiner matchCombiner)
            if (standardMatchIndexData == null)
                throw new ArgumentNullException("standardMatchIndexData");
            if (preciseMatchIndexData == null)
                throw new ArgumentNullException("preciseMatchIndexData");
            if (matchCombiner == null)
                throw new ArgumentNullException("matchCombiner");

            // Precise matching relies on GetConsecutiveMatches (below), so index data without source locations is rejected up front
            if (!preciseMatchIndexData.SourceLocationsAvailable)
                throw new ArgumentException($"The {nameof(preciseMatchIndexData)} must include source location data in order to use the Query Translator");

            // Can't actually determine for sure that the KeyComparer of the standardMatchIndexData is equivalent to that of the preciseMatchIndexData
            // (can't do an instance comparison since they may be different instances of the same implementation, they could even feasibly be different
            // classes with identical functionality) so we'll have to assume that the caller is behaving themselves. We'll take the KeyComparer of the
            // standardMatchIndexData for use when combining keys, excluding keys or otherwise processing the query segment requirements.
            _standardMatcher = new CachingResultMatcher(standardMatchIndexData.GetMatches);
            // Each null "optional" argument is replaced by the matching default via the ?? operator.
            // NOTE(review): the lambda below appears truncated in this excerpt (no "source" argument is passed to GetConsecutiveMatches and
            // the closing parentheses are not visible) - confirm against the complete file.
            _preciseMatcher  = new CachingResultMatcher(
                source => preciseMatchIndexData.GetConsecutiveMatches(
                    optionalQuotedValueConsecutiveTermTokenBreaker ?? IndexData_Extensions_ConsecutiveMatches.DefaultTokenBreaker,
                    optionalQuotedValueConsecutiveWeightCombinerForConsecutiveRuns ?? IndexData_Extensions_ConsecutiveMatches.DefaultConsecutiveRunsWeightCombiner,
                    optionalQuotedValueConsecutiveWeightCombinerForFinalMatches ?? IndexData_Extensions_ConsecutiveMatches.DefaultFinalMatchWeightCombiner
            _keyComparer   = standardMatchIndexData.KeyComparer;
            _matchCombiner = matchCombiner;
        /// <summary>
        /// Returns a new AutomatedIndexGeneratorFactoryBuilder instance after verifying that the supplied token breaker is not null.
        /// NOTE(review): the constructor arguments are not visible in this excerpt - presumably the new builder carries the supplied
        /// tokenBreaker along with the current values of the other settings; confirm against the complete file.
        /// </summary>
        /// <exception cref="ArgumentNullException">Thrown if tokenBreaker is null</exception>
        public AutomatedIndexGeneratorFactoryBuilder <TSource, TKey> SetTokenBreaker(ITokenBreaker tokenBreaker)
            if (tokenBreaker == null)
                throw new ArgumentNullException("tokenBreaker");

            return(new AutomatedIndexGeneratorFactoryBuilder <TSource, TKey>(
        /// <summary>
        /// Stores the token breaker, string normaliser and logger, and builds the stop word lookup for "en".
        /// </summary>
        /// <exception cref="ArgumentNullException">Thrown if tokenBreaker, sourceStringComparer or logger is null</exception>
        public PostIndexGenerator(ITokenBreaker tokenBreaker, IStringNormaliser sourceStringComparer, ILogger logger)
        {
            // All three arguments are validated (in declaration order) before the stop words are retrieved
            _tokenBreaker = tokenBreaker ?? throw new ArgumentNullException(nameof(tokenBreaker));
            _sourceStringComparer = sourceStringComparer ?? throw new ArgumentNullException(nameof(sourceStringComparer));
            _logger = logger ?? throw new ArgumentNullException(nameof(logger));

            // The string normaliser doubles as the equality comparer for the set, so stop word membership is decided by it
            // TODO: Explain (if it helps)
            var englishStopWords = Constants.GetStopWords("en");
            _stopWordLookup = new HashSet <string>(englishStopWords, _sourceStringComparer);
        }
Exemple #9
 /// <summary>
 /// Records the configuration supplied by the caller. All reference-type arguments are mandatory except for
 /// optionalPropertyForFirstContentRetriever, which is stored without a null check.
 /// </summary>
 /// <exception cref="ArgumentNullException">Thrown if any mandatory reference-type argument is null</exception>
 public AutomatedIndexGeneratorFactory(
     Func <TSource, TKey> keyRetriever,
     IEqualityComparer <TKey> keyComparer,
     IStringNormaliser stringNormaliser,
     ITokenBreaker tokenBreaker,
     IndexGenerator.WeightedEntryCombiner weightedEntryCombiner,
     WeightDeterminerGenerator brokenTokenWeightDeterminerGenerator,
     PropertyInfo optionalPropertyForFirstContentRetriever,
     bool captureSourceLocations,
     ILogger logger)
 {
     // Guards run in parameter order so that the first null argument is the one reported
     if (keyRetriever is null)
     {
         throw new ArgumentNullException(nameof(keyRetriever));
     }
     if (keyComparer is null)
     {
         throw new ArgumentNullException(nameof(keyComparer));
     }
     if (stringNormaliser is null)
     {
         throw new ArgumentNullException(nameof(stringNormaliser));
     }
     if (tokenBreaker is null)
     {
         throw new ArgumentNullException(nameof(tokenBreaker));
     }
     if (weightedEntryCombiner is null)
     {
         throw new ArgumentNullException(nameof(weightedEntryCombiner));
     }
     if (brokenTokenWeightDeterminerGenerator is null)
     {
         throw new ArgumentNullException(nameof(brokenTokenWeightDeterminerGenerator));
     }
     if (logger is null)
     {
         throw new ArgumentNullException(nameof(logger));
     }

     _keyRetriever = keyRetriever;
     _keyComparer = keyComparer;
     _stringNormaliser = stringNormaliser;
     _tokenBreaker = tokenBreaker;
     _weightedEntryCombiner = weightedEntryCombiner;
     _brokenTokenWeightDeterminerGenerator = brokenTokenWeightDeterminerGenerator;
     _optionalPropertyForFirstContentRetriever = optionalPropertyForFirstContentRetriever;
     _captureSourceLocations = captureSourceLocations;
     _logger = logger;
 }
        /// <summary>
        /// Stores the wrapped token breaker, the maximum number of tokens and the weight multiplier determiner.
        /// </summary>
        /// <exception cref="ArgumentNullException">Thrown if tokenBreaker or weightMultiplierDeterminer is null</exception>
        /// <exception cref="ArgumentOutOfRangeException">Thrown if maxNumberOfTokens is less than one</exception>
        public ConsecutiveTokenCombiningTokenBreaker(
            ITokenBreaker tokenBreaker,
            int maxNumberOfTokens,
            WeightMultiplierDeterminer weightMultiplierDeterminer)
        {
            // Validation happens in parameter order: tokenBreaker, then maxNumberOfTokens, then weightMultiplierDeterminer
            _tokenBreaker = tokenBreaker ?? throw new ArgumentNullException(nameof(tokenBreaker));

            if (maxNumberOfTokens < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(maxNumberOfTokens), "must be >= 1");
            }
            _maxNumberOfTokens = maxNumberOfTokens;

            _weightMultiplierDeterminer = weightMultiplierDeterminer ?? throw new ArgumentNullException(nameof(weightMultiplierDeterminer));
        }
 /// <summary>
 /// Private constructor that captures every builder setting. Only tokenBreaker and propertyWeightAppliers are required to be
 /// non-null; the "override" arguments and optionalPropertyForFirstContentRetriever are stored without null checks.
 /// </summary>
 /// <exception cref="ArgumentNullException">Thrown if tokenBreaker or propertyWeightAppliers is null</exception>
 private AutomatedIndexGeneratorFactoryBuilder(
     Func <TSource, TKey> keyRetrieverOverride,
     IEqualityComparer <TKey> keyComparerOverride,
     IStringNormaliser stringNormaliserOverride,
     ITokenBreaker tokenBreaker,
     IndexGenerator.WeightedEntryCombiner weightedEntryCombinerOverride,
     NonNullImmutableList <IModifyMatchWeights> propertyWeightAppliers,
     AutomatedIndexGeneratorFactory <TSource, TKey> .WeightDeterminerGenerator tokenWeightDeterminerGeneratorOverride,
     PropertyInfo optionalPropertyForFirstContentRetriever,
     bool captureSourceLocations,
     ILogger loggerOverride)
 {
     // The two mandatory arguments are checked in parameter order (tokenBreaker before propertyWeightAppliers)
     if (tokenBreaker is null)
     {
         throw new ArgumentNullException(nameof(tokenBreaker));
     }
     if (propertyWeightAppliers is null)
     {
         throw new ArgumentNullException(nameof(propertyWeightAppliers));
     }

     _keyRetrieverOverride = keyRetrieverOverride;
     _keyComparerOverride = keyComparerOverride;
     _stringNormaliserOverride = stringNormaliserOverride;
     _tokenBreaker = tokenBreaker;
     _weightedEntryCombinerOverride = weightedEntryCombinerOverride;
     _propertyWeightAppliers = propertyWeightAppliers;
     _tokenWeightDeterminerGeneratorOverride = tokenWeightDeterminerGeneratorOverride;
     _optionalPropertyForFirstContentRetriever = optionalPropertyForFirstContentRetriever;
     _captureSourceLocations = captureSourceLocations;
     _loggerOverride = loggerOverride;
 }
Exemple #12
        /// <summary>
        /// Validates and stores the settings that control partial match generation.
        /// </summary>
        /// <param name="minLengthOfPartialMatches">Must be greater than zero</param>
        /// <param name="maxLengthOfPartialMatches">Must be greater than zero and must not be less than minLengthOfPartialMatches (the two may be equal)</param>
        /// <param name="fromStartOfTokenOnly">Stored as supplied; not validated here</param>
        /// <param name="tokenBreaker">Must not be null</param>
        /// <param name="optionalPrePartialMatchTokenBreaker">May be null (no null check is performed)</param>
        /// <param name="partialMatchWeightDeterminer">Must not be null</param>
        /// <exception cref="ArgumentOutOfRangeException">Thrown if either length is zero or negative, or if maxLengthOfPartialMatches is less than minLengthOfPartialMatches</exception>
        /// <exception cref="ArgumentNullException">Thrown if tokenBreaker or partialMatchWeightDeterminer is null</exception>
        public PartialMatchingTokenBreaker(
            int minLengthOfPartialMatches,
            int maxLengthOfPartialMatches,
            bool fromStartOfTokenOnly,
            ITokenBreaker tokenBreaker,
            ITokenBreaker optionalPrePartialMatchTokenBreaker,
            PartialMatchWeightDeterminer partialMatchWeightDeterminer)
        {
            if (minLengthOfPartialMatches <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(minLengthOfPartialMatches), "must be greater than zero");
            }
            if (maxLengthOfPartialMatches <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(maxLengthOfPartialMatches), "must be greater than zero");
            }
            if (maxLengthOfPartialMatches < minLengthOfPartialMatches)
            {
                // Equal min and max lengths are accepted, so the message states "greater than or equal to" to match the check
                throw new ArgumentOutOfRangeException(nameof(maxLengthOfPartialMatches), "must be greater than or equal to minLengthOfPartialMatches");
            }
            if (tokenBreaker == null)
            {
                throw new ArgumentNullException(nameof(tokenBreaker));
            }
            if (partialMatchWeightDeterminer == null)
            {
                throw new ArgumentNullException(nameof(partialMatchWeightDeterminer));
            }

            _minLengthOfPartialMatches = minLengthOfPartialMatches;
            _maxLengthOfPartialMatches = maxLengthOfPartialMatches;
            _fromStartOfTokenOnly = fromStartOfTokenOnly;
            _tokenBreaker = tokenBreaker;
            _optionalPrePartialMatchTokenBreaker = optionalPrePartialMatchTokenBreaker; // null is permitted
            _partialMatchWeightDeterminer = partialMatchWeightDeterminer;
        }
Exemple #13
        /// <summary>
        /// This will break a given source string and return results based upon the combination of partial matches (so results that only match part of the source string may be included
        /// in the returned data). The token breaker and the weight combiner must be specified by the caller - if the weight combiner returns zero then the result will not be included in
        /// the final data. To require that all tokens in the source content be present for any returned results, the following weightCombiner could be specified:
        ///  (tokenMatches, allTokens) => (tokenMatches.Count &lt; allTokens.Count) ? 0 : tokenMatches.Sum(m => m.Weight)
        /// </summary>
        /// <exception cref="ArgumentNullException">Thrown if index, source, tokenBreaker or weightCombiner is null</exception>
        /// <exception cref="ArgumentException">Thrown if the weightCombiner returns a negative value</exception>
        public static NonNullImmutableList <WeightedEntry <TKey> > GetPartialMatches <TKey>(
            this IIndexData <TKey> index,
            string source,
            ITokenBreaker tokenBreaker,
            WeightCombiner weightCombiner)
            if (index == null)
                throw new ArgumentNullException("index");
            if (source == null)
                throw new ArgumentNullException("source");
            if (tokenBreaker == null)
                throw new ArgumentNullException("tokenBreaker");
            if (weightCombiner == null)
                throw new ArgumentNullException("weightCombiner");

            // Break down the "source" search term and find matches for each token
            // - Each match maintains the weight multiplier applied to the string segment from the token breaker
            // - The Source Locations are annotated with additional data; the source segment string and what token index that is (so if the "source" value is broken into three, then
            //   each Source Location will have a SearchTerm property whose TokenIndex will be between 0 and 2, inclusive). This allows for a weightCombiner to be specified that
            //   ensures that every token that was extracted from the source value can be matched against a given result, if so desired.
            var matches = new List <Tuple <WeightedEntry <TKey>, SearchTermDetails> >();
            var weightAdjustedTokens = tokenBreaker.Break(source);

            // NOTE(review): the statement that the ".Select" below belongs to is not visible in this excerpt - presumably the matches for each token
            // are retrieved from the index and appended to the "matches" list; confirm against the complete file.
            for (var tokenIndex = 0; tokenIndex < weightAdjustedTokens.Count; tokenIndex++)
                var weightAdjustedToken = weightAdjustedTokens[tokenIndex];
                    .Select(match => Tuple.Create(match, new SearchTermDetails(tokenIndex, weightAdjustedToken.Token)))

            // Combine per-search-term results, grouping by result key and calculating the match weight for each token using the specified weightCombiner (this may also be
            // used to filter out results; if a match weight of zero is returned then the match will be ignored - this may be used to filter out results that only match two
            // out of three of the search terms, for example)
            var finalResults = NonNullImmutableList <WeightedEntry <TKey> > .Empty;
            var searchTerms  = new NonNullOrEmptyStringList(weightAdjustedTokens.Select(w => w.Token));

            // Keys are grouped using the index's own KeyComparer. NOTE(review): the arguments passed to the weightCombiner are only partially visible here.
            foreach (var matchesGroupedByKey in matches.GroupBy(m => m.Item1.Key, index.KeyComparer).Cast <IEnumerable <Tuple <WeightedEntry <TKey>, SearchTermDetails> > >())
                var combinedWeight = weightCombiner(
                    .Select(m => new MatchWeightWithSourceFieldLocations(
                // A negative weight is treated as an error, a zero weight drops the result and only a positive weight produces an entry
                if (combinedWeight < 0)
                    throw new ArgumentException("weightCombiner returned a negative value - invalid");
                else if (combinedWeight > 0)
                    finalResults = finalResults.Add(
                        new WeightedEntry <TKey>(
                            // If any contributing match has no recorded source locations then the combined entry records none either (null), otherwise all of
                            // the source locations for the key are flattened into a single list
                            matchesGroupedByKey.Any(m => m.Item1.SourceLocationsIfRecorded == null)
                                                                ? null
                                                                : matchesGroupedByKey.SelectMany(m => m.Item1.SourceLocationsIfRecorded).ToNonNullImmutableList()
Exemple #14
        /// <summary>
        /// This will break down a source search term into words (according to the logic of the specified token breaker) and then return matches where the words were found in a run in a
        /// content section. Unlike GetPartialMatches it is not possible for an entry to be considered a match because it contains all of the terms in its content, the terms must be
        /// present in one content field, together, in the order in which they are present in the search term. This allows for similar behaviour to that intended for the
        /// ConsecutiveTokenCombiningTokenBreaker, but this offers greater performance (constructing a TernarySearchTreeDictionary to back an IndexData instance can be expensive on
        /// processing time to generate, and disk / memory space to store, the runs of tokens). This also has the benefit that there is no cap on the number of tokens that can be
        /// matched consecutively (a limit on this had to be decided at index generation time when using the ConsecutiveTokenCombiningTokenBreaker). There are two sets of weight
        /// combining calculations required; the first (handled by the weightCombinerForConsecutiveRuns) determines a weight for a run of consecutive tokens - each run is considered
        /// a single match, effectively. Each call to the first weight combiner will have as many weights to combine as there are search terms, so if the "source" value is broken
        /// down into three words by the tokenBreaker then the weightCombinerForConsecutiveRuns will always be called with sets of three weights. The second weight combination
        /// is performed when multiple matches for a particular result must be combined to give a final match weight for that result.
        /// Note: This requires the index to have been built with source location data recorded - if the index's SourceLocationsAvailable property returns false then an ArgumentException
        /// will be thrown.
        /// </summary>
        /// <exception cref="ArgumentNullException">Thrown if any of the arguments is null</exception>
        /// <exception cref="ArgumentException">Thrown if index.SourceLocationsAvailable is false</exception>
        public static NonNullImmutableList <WeightedEntry <TKey> > GetConsecutiveMatches <TKey>(
            this IIndexData <TKey> index,
            string source,
            ITokenBreaker tokenBreaker,
            IndexGenerator.WeightedEntryCombiner weightCombinerForConsecutiveRuns,
            IndexGenerator.WeightedEntryCombiner weightCombinerForFinalMatches)
            if (index == null)
                throw new ArgumentNullException("index");
            if (source == null)
                throw new ArgumentNullException("source");
            if (tokenBreaker == null)
                throw new ArgumentNullException("tokenBreaker");
            if (weightCombinerForConsecutiveRuns == null)
                throw new ArgumentNullException("weightCombinerForConsecutiveRuns");
            if (weightCombinerForFinalMatches == null)
                throw new ArgumentNullException("weightCombinerForFinalMatches");

            if (!index.SourceLocationsAvailable)
                throw new ArgumentException($"The {nameof(index)} must include source location data in order to use identify Consecutive token matches");

            // If the token breaker won't actually translate the source value into multiple words then we can avoid all of the below work and just call index.GetMatches directly
            var weightAdjustedTokens = tokenBreaker.Break(source);

            // NOTE(review): the body of this single-token shortcut is not visible in this excerpt
            if (weightAdjustedTokens.Count == 1)

            // The index of this list will correspond to the index of the broken-down search terms
            var matchesForSearchTerms = new List <WeightedEntry <TKey>[]>();

            // Each token's matches have their weight scaled by the token's WeightMultiplier.
            // NOTE(review): the statement that adds these to matchesForSearchTerms is only partially visible here.
            foreach (var weightAdjustedToken in weightAdjustedTokens)
                    index.GetMatches(weightAdjustedToken.Token).Select(w => new WeightedEntry <TKey>(
                                                                           w.Weight * weightAdjustedToken.WeightMultiplier,

            // For each match of the first search term, try to identify a run of token matches for the same key and source field. Any such runs will be recorded in the consecutiveMatches
            // list - these represent content segments that match the entirety of the search term (the "source" argument).
            var consecutiveMatches = new List <WeightedEntry <TKey> >();
            var searchTerms        = new NonNullOrEmptyStringList(weightAdjustedTokens.Select(w => w.Token));

            // NOTE(review): First() will throw if matchesForSearchTerms is empty (i.e. the token breaker returned no tokens), unless that case is handled in
            // lines not visible in this excerpt - confirm.
            foreach (var firstTermMatch in matchesForSearchTerms.First().SelectMany(m => BreakWeightedEntryIntoIndividualSourceLocations(m)))
                var matchesForEntireTerm = NonNullImmutableList <WeightedEntry <TKey> > .Empty;
                matchesForEntireTerm = matchesForEntireTerm.Add(firstTermMatch);
                for (var termIndex = 1; termIndex < weightAdjustedTokens.Count; termIndex++)
                    // Note: SourceLocationsIfRecorded should never be null because we checked that the index reported that SourceLocationsAvailable was true (so not checking for null
                    // source locations here)
                    // A candidate for the nth term must share the first term's key and source field and must sit exactly termIndex tokens after the first term
                    var nTermMatch = matchesForSearchTerms[termIndex]
                                     .SelectMany(m => BreakWeightedEntryIntoIndividualSourceLocations <TKey>(m))
                                     .FirstOrDefault(m =>
                                                     index.KeyComparer.Equals(m.Key, firstTermMatch.Key) &&
                                                     (m.SourceLocationsIfRecorded.First().SourceFieldIndex == firstTermMatch.SourceLocationsIfRecorded.First().SourceFieldIndex) &&
                                                     (m.SourceLocationsIfRecorded.First().TokenIndex == firstTermMatch.SourceLocationsIfRecorded.First().TokenIndex + termIndex)
                    // NOTE(review): the action taken when no nth-term match is found is not visible here - presumably the run is abandoned; confirm
                    if (nTermMatch == null)
                    matchesForEntireTerm = matchesForEntireTerm.Add(nTermMatch);
                if (matchesForEntireTerm.Count < weightAdjustedTokens.Count)
                    // If we didn't manage to get a full set of search terms then this isn't a full match

                // Combine the WeightedEntry instances that represent a run of individual matches (one for each word in the "source" argument) into a single WeightedEntry that represents
                // the entirety of the search term (each of the matchesForEntireTerm WeightedEntry instances will have only a single Source Location since the match data was split up
                // above by calling BreakWeightedEntryIntoIndividualSourceLocations before trying to find the consecutive matches). See notes above about not checking whether
                // SourceLocationsIfRecorded is null (it shouldn't be because we checked index.SourceLocationsAvailable at the top of this method)
                var sourceLocationOfFirstTerm         = matchesForEntireTerm.First().SourceLocationsIfRecorded.Single();
                var sourceLocationOfLastTerm          = matchesForEntireTerm.Last().SourceLocationsIfRecorded.Single();
                var matchWeightForConsecutiveRunEntry = weightCombinerForConsecutiveRuns(
                    matchesForEntireTerm.Select(m => m.Weight).ToImmutableList()
                    new WeightedEntry <TKey>(
                        new NonNullImmutableList <SourceFieldLocation>(new[]
                    // Since we're creating a new SourceFieldLocation instance that is derived from a run of multiple tokens, the TokenIndex is going to be an approximation -
                    // taking the TokenIndex from the first search term probably makes the most sense. The SourceIndex and SourceTokenLength will be taken such that the entire
                    // run is covered (from the start of the first search term to the end of the last). Since this is the only Source Location instance for the WeightedEntry,
                    // its MatchWeightContribution value is equal to the WeightedEntry's Weight.
                    new SourceFieldLocation(
                        // Length of the whole run: end of the last term minus start of the first term
                        (sourceLocationOfLastTerm.SourceIndex + sourceLocationOfLastTerm.SourceTokenLength) - sourceLocationOfFirstTerm.SourceIndex,

            // The matches need grouping by key before returning
            // NOTE(review): the start of the return expression and the combiner call around the weights are not visible in this excerpt
                   .GroupBy(m => m.Key, index.KeyComparer)
                   .Cast <IEnumerable <WeightedEntry <TKey> > >()
                   .Select(matches => new WeightedEntry <TKey>(
                                   matches.Select(match => match.Weight).ToImmutableList()
                               matches.SelectMany(m => m.SourceLocationsIfRecorded).ToNonNullImmutableList()
Exemple #15
 /// <summary>
 /// Shorthand for the five-argument GetConsecutiveMatches: the DefaultConsecutiveRunsWeightCombiner is supplied as the combiner for
 /// runs of consecutive tokens (each run being treated as a single match) and the DefaultFinalMatchWeightCombiner is supplied as the
 /// combiner that merges those matches into a final weight for each result.
 /// </summary>
 public static NonNullImmutableList <WeightedEntry <TKey> > GetConsecutiveMatches <TKey>(this IIndexData <TKey> index, string source, ITokenBreaker tokenBreaker)
     => GetConsecutiveMatches(
         index,
         source,
         tokenBreaker,
         DefaultConsecutiveRunsWeightCombiner,
         DefaultFinalMatchWeightCombiner);
Exemple #16
        /// <summary>
        /// Builds index data for the supplied posts using three content retrievers, in this order: the plain text content, the title and the tags.
        /// Each retriever is paired with a token weight determiner created with a different numeric argument (1f, 5f and 3f respectively).
        /// NOTE(review): GetTokenWeightDeterminer is not visible in this excerpt - presumably the numeric argument is a weight multiplier; confirm.
        /// </summary>
        /// <exception cref="ArgumentNullException">Thrown if posts, sourceStringComparer or tokenBreaker is null</exception>
        private static IIndexData <int> GenerateIndexData(NonNullImmutableList <Post> posts, IStringNormaliser sourceStringComparer, ITokenBreaker tokenBreaker)
            if (posts == null)
                throw new ArgumentNullException(nameof(posts));
            if (sourceStringComparer == null)
                throw new ArgumentNullException(nameof(sourceStringComparer));
            if (tokenBreaker == null)
                throw new ArgumentNullException(nameof(tokenBreaker));

            // The Post (plain text) content is always the first field since its Content Retriever is first, this means that all source locations for the content
            // will have a SourceFieldIndex of zero
            var contentRetrievers = new List <ContentRetriever <Post, int> >
                // Field 0: the post body as plain text
                new ContentRetriever <Post, int>(
                    p => new PreBrokenContent <int>(p.Id, p.GetContentAsPlainText()),
                    GetTokenWeightDeterminer(1f, sourceStringComparer)
                // Field 1: the post title
                new ContentRetriever <Post, int>(
                    p => new PreBrokenContent <int>(p.Id, p.Title),
                    GetTokenWeightDeterminer(5f, sourceStringComparer)
                // Field 2: the post tags, with each tag supplied as a separate string
                new ContentRetriever <Post, int>(
                    p => new PreBrokenContent <int>(p.Id, new NonNullOrEmptyStringList(p.Tags.Select(tag => tag.Tag))),
                    GetTokenWeightDeterminer(3f, sourceStringComparer)

            // Weights for the same token are combined by summing them and source locations are captured.
            // NOTE(review): several constructor arguments and the call that produces the returned index data are not visible in this excerpt.
            return(new IndexGenerator <Post, int>(
                       new DefaultEqualityComparer <int>(),
                       weightedValues => weightedValues.Sum(),
                       captureSourceLocations: true,
                       new NullLogger()
Exemple #17
 /// <summary>
 /// Stores the wrapped token breaker. The argument is optional: no null check is performed, so null is accepted.
 /// </summary>
 public WhiteSpaceTokenBreaker(ITokenBreaker optionalWrappedTokenBreaker)
     => _optionalWrappedTokenBreaker = optionalWrappedTokenBreaker;