public PartialMatchingTokenBreaker(
    int minLengthOfPartialMatches,
    int maxLengthOfPartialMatches,
    ITokenBreaker tokenBreaker,
    PartialMatchWeightDeterminer partialMatchWeightDeterminer)
    : this(minLengthOfPartialMatches, maxLengthOfPartialMatches, false, tokenBreaker, partialMatchWeightDeterminer) { }
public QueryTranslator(
    IIndexData<TKey> standardMatchIndexData,
    IIndexData<TKey> preciseMatchIndexData,
    ITokenBreaker optionalQuotedValueConsecutiveTermTokenBreaker,
    MatchCombiner matchCombiner)
    : this(standardMatchIndexData, preciseMatchIndexData, optionalQuotedValueConsecutiveTermTokenBreaker, null, null, matchCombiner) { }
public PartialMatchingTokenBreaker(
    int minLengthOfPartialMatches,
    int maxLengthOfPartialMatches,
    bool fromStartOfTokenOnly,
    ITokenBreaker tokenBreaker,
    PartialMatchWeightDeterminer partialMatchWeightDeterminer)
    : this(minLengthOfPartialMatches, maxLengthOfPartialMatches, fromStartOfTokenOnly, tokenBreaker, null, partialMatchWeightDeterminer) { }
public WhiteSpaceExtendingTokenBreaker(ImmutableList<char> charsToTreatAsWhitespace, ITokenBreaker tokenBreaker)
{
    if (charsToTreatAsWhitespace == null) { throw new ArgumentNullException("charsToTreatAsWhitespace"); }
    if (tokenBreaker == null) { throw new ArgumentNullException("tokenBreaker"); }

    _charsToTreatAsWhitespace = charsToTreatAsWhitespace;
    _tokenBreaker = tokenBreaker;
}
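// Illustrative usage (a sketch, not taken from the library source): wrapping a plain WhiteSpaceTokenBreaker so
// that various punctuation characters also act as token boundaries - the particular character set chosen here
// is an arbitrary example
var punctuationAwareTokenBreaker = new WhiteSpaceExtendingTokenBreaker(
    new ImmutableList<char>(new[] { '<', '>', '[', ']', '(', ')', '{', '}', '.', ',' }),
    new WhiteSpaceTokenBreaker()
);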
public IndexGenerator(
    NonNullImmutableList<ContentRetriever<TSource, TKey>> contentRetrievers,
    IEqualityComparer<TKey> dataKeyComparer,
    IStringNormaliser sourceStringComparer,
    ITokenBreaker tokenBreaker,
    IndexGenerator.WeightedEntryCombiner weightedEntryCombiner,
    bool captureSourceLocations,
    ILogger logger)
{
    _contentRetrievers = contentRetrievers ?? throw new ArgumentNullException(nameof(contentRetrievers));
    _dataKeyComparer = dataKeyComparer ?? throw new ArgumentNullException(nameof(dataKeyComparer));
    _sourceStringComparer = sourceStringComparer ?? throw new ArgumentNullException(nameof(sourceStringComparer));
    _tokenBreaker = tokenBreaker ?? throw new ArgumentNullException(nameof(tokenBreaker));
    _weightedEntryCombiner = weightedEntryCombiner ?? throw new ArgumentNullException(nameof(weightedEntryCombiner));
    _captureSourceLocations = captureSourceLocations;
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public QueryTranslator(
    IIndexData<TKey> standardMatchIndexData,
    IIndexData<TKey> preciseMatchIndexData,
    ITokenBreaker optionalQuotedValueConsecutiveTermTokenBreaker,
    IndexGenerator.WeightedEntryCombiner optionalQuotedValueConsecutiveWeightCombinerForConsecutiveRuns,
    IndexGenerator.WeightedEntryCombiner optionalQuotedValueConsecutiveWeightCombinerForFinalMatches,
    MatchCombiner matchCombiner)
{
    if (standardMatchIndexData == null) { throw new ArgumentNullException("standardMatchIndexData"); }
    if (preciseMatchIndexData == null) { throw new ArgumentNullException("preciseMatchIndexData"); }
    if (matchCombiner == null) { throw new ArgumentNullException("matchCombiner"); }
    if (!preciseMatchIndexData.SourceLocationsAvailable)
    {
        throw new ArgumentException($"The {nameof(preciseMatchIndexData)} must include source location data in order to use the Query Translator");
    }

    // Can't actually determine for sure that the KeyComparer of the standardMatchIndexData is equivalent to that of the preciseMatchIndexData
    // (can't do an instance comparison since they may be different instances of the same implementation - they could even feasibly be different
    // classes with identical functionality) so we'll have to assume that the caller is behaving themselves. We'll take the KeyComparer of the
    // standardMatchIndexData for use when combining keys, excluding keys or otherwise processing the query segment requirements.
    _standardMatcher = new CachingResultMatcher(standardMatchIndexData.GetMatches);
    _preciseMatcher = new CachingResultMatcher(
        source => preciseMatchIndexData.GetConsecutiveMatches(
            source,
            optionalQuotedValueConsecutiveTermTokenBreaker ?? IndexData_Extensions_ConsecutiveMatches.DefaultTokenBreaker,
            optionalQuotedValueConsecutiveWeightCombinerForConsecutiveRuns ?? IndexData_Extensions_ConsecutiveMatches.DefaultConsecutiveRunsWeightCombiner,
            optionalQuotedValueConsecutiveWeightCombinerForFinalMatches ?? IndexData_Extensions_ConsecutiveMatches.DefaultFinalMatchWeightCombiner
        )
    );
    _keyComparer = standardMatchIndexData.KeyComparer;
    _matchCombiner = matchCombiner;
}
public AutomatedIndexGeneratorFactoryBuilder<TSource, TKey> SetTokenBreaker(ITokenBreaker tokenBreaker)
{
    if (tokenBreaker == null) { throw new ArgumentNullException("tokenBreaker"); }

    return new AutomatedIndexGeneratorFactoryBuilder<TSource, TKey>(
        _keyRetrieverOverride,
        _keyComparerOverride,
        _stringNormaliserOverride,
        tokenBreaker,
        _weightedEntryCombinerOverride,
        _propertyWeightAppliers,
        _tokenWeightDeterminerGeneratorOverride,
        _optionalPropertyForFirstContentRetriever,
        _captureSourceLocations,
        _loggerOverride
    );
}
public PostIndexGenerator(ITokenBreaker tokenBreaker, IStringNormaliser sourceStringComparer, ILogger logger)
{
    if (tokenBreaker == null) { throw new ArgumentNullException("tokenBreaker"); }
    if (sourceStringComparer == null) { throw new ArgumentNullException("sourceStringComparer"); }
    if (logger == null) { throw new ArgumentNullException("logger"); }

    _tokenBreaker = tokenBreaker;
    _sourceStringComparer = sourceStringComparer;

    // Providing the source string comparer as the HashSet's equality comparer means that stop word lookups are
    // performed against normalised values, consistent with how the indexed tokens themselves are compared
    _stopWordLookup = new HashSet<string>(Constants.GetStopWords("en"), _sourceStringComparer);

    _logger = logger;
}
public AutomatedIndexGeneratorFactory(
    Func<TSource, TKey> keyRetriever,
    IEqualityComparer<TKey> keyComparer,
    IStringNormaliser stringNormaliser,
    ITokenBreaker tokenBreaker,
    IndexGenerator.WeightedEntryCombiner weightedEntryCombiner,
    WeightDeterminerGenerator brokenTokenWeightDeterminerGenerator,
    PropertyInfo optionalPropertyForFirstContentRetriever,
    bool captureSourceLocations,
    ILogger logger)
{
    _keyRetriever = keyRetriever ?? throw new ArgumentNullException(nameof(keyRetriever));
    _keyComparer = keyComparer ?? throw new ArgumentNullException(nameof(keyComparer));
    _stringNormaliser = stringNormaliser ?? throw new ArgumentNullException(nameof(stringNormaliser));
    _tokenBreaker = tokenBreaker ?? throw new ArgumentNullException(nameof(tokenBreaker));
    _weightedEntryCombiner = weightedEntryCombiner ?? throw new ArgumentNullException(nameof(weightedEntryCombiner));
    _brokenTokenWeightDeterminerGenerator = brokenTokenWeightDeterminerGenerator ?? throw new ArgumentNullException(nameof(brokenTokenWeightDeterminerGenerator));
    _optionalPropertyForFirstContentRetriever = optionalPropertyForFirstContentRetriever;
    _captureSourceLocations = captureSourceLocations;
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public ConsecutiveTokenCombiningTokenBreaker(
    ITokenBreaker tokenBreaker,
    int maxNumberOfTokens,
    WeightMultiplierDeterminer weightMultiplierDeterminer)
{
    if (tokenBreaker == null) { throw new ArgumentNullException("tokenBreaker"); }
    if (maxNumberOfTokens < 1) { throw new ArgumentOutOfRangeException("maxNumberOfTokens", "must be >= 1"); }
    if (weightMultiplierDeterminer == null) { throw new ArgumentNullException("weightMultiplierDeterminer"); }

    _tokenBreaker = tokenBreaker;
    _maxNumberOfTokens = maxNumberOfTokens;
    _weightMultiplierDeterminer = weightMultiplierDeterminer;
}
private AutomatedIndexGeneratorFactoryBuilder(
    Func<TSource, TKey> keyRetrieverOverride,
    IEqualityComparer<TKey> keyComparerOverride,
    IStringNormaliser stringNormaliserOverride,
    ITokenBreaker tokenBreaker,
    IndexGenerator.WeightedEntryCombiner weightedEntryCombinerOverride,
    NonNullImmutableList<IModifyMatchWeights> propertyWeightAppliers,
    AutomatedIndexGeneratorFactory<TSource, TKey>.WeightDeterminerGenerator tokenWeightDeterminerGeneratorOverride,
    PropertyInfo optionalPropertyForFirstContentRetriever,
    bool captureSourceLocations,
    ILogger loggerOverride)
{
    _keyRetrieverOverride = keyRetrieverOverride;
    _keyComparerOverride = keyComparerOverride;
    _stringNormaliserOverride = stringNormaliserOverride;
    _tokenBreaker = tokenBreaker ?? throw new ArgumentNullException(nameof(tokenBreaker));
    _weightedEntryCombinerOverride = weightedEntryCombinerOverride;
    _propertyWeightAppliers = propertyWeightAppliers ?? throw new ArgumentNullException(nameof(propertyWeightAppliers));
    _tokenWeightDeterminerGeneratorOverride = tokenWeightDeterminerGeneratorOverride;
    _optionalPropertyForFirstContentRetriever = optionalPropertyForFirstContentRetriever;
    _captureSourceLocations = captureSourceLocations;
    _loggerOverride = loggerOverride;
}
public PartialMatchingTokenBreaker(
    int minLengthOfPartialMatches,
    int maxLengthOfPartialMatches,
    bool fromStartOfTokenOnly,
    ITokenBreaker tokenBreaker,
    ITokenBreaker optionalPrePartialMatchTokenBreaker,
    PartialMatchWeightDeterminer partialMatchWeightDeterminer)
{
    if (minLengthOfPartialMatches <= 0) { throw new ArgumentOutOfRangeException("minLengthOfPartialMatches", "must be greater than zero"); }
    if (maxLengthOfPartialMatches <= 0) { throw new ArgumentOutOfRangeException("maxLengthOfPartialMatches", "must be greater than zero"); }
    if (maxLengthOfPartialMatches < minLengthOfPartialMatches) { throw new ArgumentOutOfRangeException("maxLengthOfPartialMatches", "must not be less than minLengthOfPartialMatches"); }
    if (tokenBreaker == null) { throw new ArgumentNullException("tokenBreaker"); }
    if (partialMatchWeightDeterminer == null) { throw new ArgumentNullException("partialMatchWeightDeterminer"); }

    _minLengthOfPartialMatches = minLengthOfPartialMatches;
    _maxLengthOfPartialMatches = maxLengthOfPartialMatches;
    _fromStartOfTokenOnly = fromStartOfTokenOnly;
    _tokenBreaker = tokenBreaker;
    _optionalPrePartialMatchTokenBreaker = optionalPrePartialMatchTokenBreaker;
    _partialMatchWeightDeterminer = partialMatchWeightDeterminer;
}
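// Illustrative usage (a sketch): a token breaker that also indexes partial matches of at least four characters,
// taken from the start of each token only. The shape of the weight-determiner lambda (full token and partial
// variation in, weight multiplier out, where zero would suppress that variation) is an assumption made for this
// example, as is the choice of multiplier values.
var partialMatchTokenBreaker = new PartialMatchingTokenBreaker(
    4,
    int.MaxValue,
    true,
    new WhiteSpaceTokenBreaker(),
    (token, partialToken) => (partialToken.Length == token.Length) ? 1f : 0.2f
);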
/// <summary>
/// This will break a given source string and return results based upon the combination of partial matches (so results that only match part of the source string may be included
/// in the returned data). The token breaker and the match combiner must be specified by the caller - if the match combiner returns zero then the result will not be included in
/// the final data. To require that all tokens in the source content be present for any returned results, the following matchCombiner could be specified:
///   (tokenMatches, allTokens) => (tokenMatches.Count < allTokens.Count) ? 0 : tokenMatches.Sum(m => m.Weight)
/// </summary>
public static NonNullImmutableList<WeightedEntry<TKey>> GetPartialMatches<TKey>(
    this IIndexData<TKey> index,
    string source,
    ITokenBreaker tokenBreaker,
    WeightCombiner weightCombiner)
{
    if (index == null) { throw new ArgumentNullException("index"); }
    if (source == null) { throw new ArgumentNullException("source"); }
    if (tokenBreaker == null) { throw new ArgumentNullException("tokenBreaker"); }
    if (weightCombiner == null) { throw new ArgumentNullException("weightCombiner"); }

    // Break down the "source" search term and find matches for each token
    // - Each match maintains the weight multiplier applied to the string segment from the token breaker
    // - The Source Locations are annotated with additional data; the source segment string and its token index (so if the "source" value is broken into three, then
    //   each Source Location will have a SearchTerm property whose TokenIndex will be between 0 and 2, inclusive). This allows for a weightCombiner to be specified
    //   that ensures that every token that was extracted from the source value can be matched against a given result, if so desired.
    var matches = new List<Tuple<WeightedEntry<TKey>, SearchTermDetails>>();
    var weightAdjustedTokens = tokenBreaker.Break(source);
    for (var tokenIndex = 0; tokenIndex < weightAdjustedTokens.Count; tokenIndex++)
    {
        var weightAdjustedToken = weightAdjustedTokens[tokenIndex];
        matches.AddRange(
            index
                .GetMatches(weightAdjustedToken.Token)
                .Select(match => Tuple.Create(match, new SearchTermDetails(tokenIndex, weightAdjustedToken.Token)))
        );
    }

    // Combine per-search-term results, grouping by result key and calculating the match weight for each token using the specified weightCombiner (this may also be
    // used to filter out results; if a match weight of zero is returned then the match will be ignored - this may be used to filter out results that only match two
    // out of three of the search terms, for example)
    var finalResults = NonNullImmutableList<WeightedEntry<TKey>>.Empty;
    var searchTerms = new NonNullOrEmptyStringList(weightAdjustedTokens.Select(w => w.Token));
    foreach (var matchesGroupedByKey in matches.GroupBy(m => m.Item1.Key, index.KeyComparer).Cast<IEnumerable<Tuple<WeightedEntry<TKey>, SearchTermDetails>>>())
    {
        var combinedWeight = weightCombiner(
            matchesGroupedByKey
                .Select(m => new MatchWeightWithSourceFieldLocations(
                    m.Item1.Weight,
                    m.Item2,
                    m.Item1.SourceLocationsIfRecorded
                )).ToNonNullImmutableList(),
            searchTerms
        );
        if (combinedWeight < 0)
        {
            throw new ArgumentException("weightCombiner returned a negative value - invalid");
        }
        else if (combinedWeight > 0)
        {
            finalResults = finalResults.Add(
                new WeightedEntry<TKey>(
                    matchesGroupedByKey.First().Item1.Key,
                    combinedWeight,
                    matchesGroupedByKey.Any(m => m.Item1.SourceLocationsIfRecorded == null)
                        ? null
                        : matchesGroupedByKey.SelectMany(m => m.Item1.SourceLocationsIfRecorded).ToNonNullImmutableList()
                )
            );
        }
    }
    return finalResults;
}
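// Illustrative usage (a sketch): querying an existing index for a multi-word search term where every word must
// be matched for a result to be returned - the weightCombiner lambda is the one suggested in the summary comment
// above, and the IIndexData<int> "index" reference is assumed to have been generated elsewhere
var results = index.GetPartialMatches(
    "penguins antarctica",
    new WhiteSpaceTokenBreaker(),
    (tokenMatches, allTokens) => (tokenMatches.Count < allTokens.Count) ? 0 : tokenMatches.Sum(m => m.Weight)
);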
/// <summary>
/// This will break down a source search term into words (according to the logic of the specified token breaker) and then return matches where the words were found in a run in a
/// content section. Unlike GetPartialMatches, it is not possible for an entry to be considered a match solely because it contains all of the terms somewhere in its content; the
/// terms must be present in one content field, together, in the order in which they appear in the search term. This allows for similar behaviour to that intended for the
/// ConsecutiveTokenCombiningTokenBreaker, but with greater performance (constructing a TernarySearchTreeDictionary to back an IndexData instance that records runs of tokens can
/// be expensive in processing time to generate and in disk / memory space to store). This also has the benefit that there is no cap on the number of tokens that can be matched
/// consecutively (a limit on this had to be decided at index generation time when using the ConsecutiveTokenCombiningTokenBreaker). There are two sets of weight combining
/// calculations required; the first (handled by the weightCombinerForConsecutiveRuns) determines a weight for each run of consecutive tokens - each run is effectively considered
/// a single match. Each call to this first weight combiner will have as many weights to combine as there are search terms, so if the "source" value is broken down into three
/// words by the tokenBreaker then the weightCombinerForConsecutiveRuns will always be called with sets of three weights. The second weight combination is performed when multiple
/// matches for a particular result must be combined to give a final match weight for that result.
///
/// Note: This requires the index to have been built with source location data recorded - if the index's SourceLocationsAvailable property returns false then an ArgumentException
/// will be thrown.
/// </summary>
public static NonNullImmutableList<WeightedEntry<TKey>> GetConsecutiveMatches<TKey>(
    this IIndexData<TKey> index,
    string source,
    ITokenBreaker tokenBreaker,
    IndexGenerator.WeightedEntryCombiner weightCombinerForConsecutiveRuns,
    IndexGenerator.WeightedEntryCombiner weightCombinerForFinalMatches)
{
    if (index == null) { throw new ArgumentNullException("index"); }
    if (source == null) { throw new ArgumentNullException("source"); }
    if (tokenBreaker == null) { throw new ArgumentNullException("tokenBreaker"); }
    if (weightCombinerForConsecutiveRuns == null) { throw new ArgumentNullException("weightCombinerForConsecutiveRuns"); }
    if (weightCombinerForFinalMatches == null) { throw new ArgumentNullException("weightCombinerForFinalMatches"); }
    if (!index.SourceLocationsAvailable)
    {
        throw new ArgumentException($"The {nameof(index)} must include source location data in order to identify consecutive token matches");
    }

    // If the token breaker doesn't actually break the source value into multiple words then we can avoid all of the below work and just call index.GetMatches directly
    var weightAdjustedTokens = tokenBreaker.Break(source);
    if (weightAdjustedTokens.Count == 1)
    {
        return index.GetMatches(source);
    }

    // The index of this list will correspond to the index of the broken-down search terms
    var matchesForSearchTerms = new List<WeightedEntry<TKey>[]>();
    foreach (var weightAdjustedToken in weightAdjustedTokens)
    {
        matchesForSearchTerms.Add(
            index.GetMatches(weightAdjustedToken.Token).Select(w => new WeightedEntry<TKey>(
                w.Key,
                w.Weight * weightAdjustedToken.WeightMultiplier,
                w.SourceLocationsIfRecorded
            )).ToArray()
        );
    }

    // For each match of the first search term, try to identify a run of token matches for the same key and source field. Any such runs will be recorded in the
    // consecutiveMatches list - these represent content segments that match the entirety of the search term (the "source" argument).
    var consecutiveMatches = new List<WeightedEntry<TKey>>();
    var searchTerms = new NonNullOrEmptyStringList(weightAdjustedTokens.Select(w => w.Token));
    foreach (var firstTermMatch in matchesForSearchTerms.First().SelectMany(m => BreakWeightedEntryIntoIndividualSourceLocations(m)))
    {
        var matchesForEntireTerm = NonNullImmutableList<WeightedEntry<TKey>>.Empty;
        matchesForEntireTerm = matchesForEntireTerm.Add(firstTermMatch);
        for (var termIndex = 1; termIndex < weightAdjustedTokens.Count; termIndex++)
        {
            // Note: SourceLocationsIfRecorded should never be null because we checked that the index reported SourceLocationsAvailable as true (so not checking
            // for null source locations here)
            var nTermMatch = matchesForSearchTerms[termIndex]
                .SelectMany(m => BreakWeightedEntryIntoIndividualSourceLocations<TKey>(m))
                .FirstOrDefault(m =>
                    index.KeyComparer.Equals(m.Key, firstTermMatch.Key) &&
                    (m.SourceLocationsIfRecorded.First().SourceFieldIndex == firstTermMatch.SourceLocationsIfRecorded.First().SourceFieldIndex) &&
                    (m.SourceLocationsIfRecorded.First().TokenIndex == firstTermMatch.SourceLocationsIfRecorded.First().TokenIndex + termIndex)
                );
            if (nTermMatch == null)
            {
                break;
            }
            matchesForEntireTerm = matchesForEntireTerm.Add(nTermMatch);
        }
        if (matchesForEntireTerm.Count < weightAdjustedTokens.Count)
        {
            // If we didn't manage to match every search term then this isn't a full match
            continue;
        }

        // Combine the WeightedEntry instances that represent a run of individual matches (one for each word in the "source" argument) into a single WeightedEntry
        // that represents the entirety of the search term (each of the matchesForEntireTerm WeightedEntry instances will have only a single Source Location since
        // the match data was split up above by calling BreakWeightedEntryIntoIndividualSourceLocations before trying to find the consecutive matches). See notes
        // above about not checking whether SourceLocationsIfRecorded is null (it shouldn't be because we checked index.SourceLocationsAvailable at the top of
        // this method).
        var sourceLocationOfFirstTerm = matchesForEntireTerm.First().SourceLocationsIfRecorded.Single();
        var sourceLocationOfLastTerm = matchesForEntireTerm.Last().SourceLocationsIfRecorded.Single();
        var matchWeightForConsecutiveRunEntry = weightCombinerForConsecutiveRuns(
            matchesForEntireTerm.Select(m => m.Weight).ToImmutableList()
        );
        consecutiveMatches.Add(
            new WeightedEntry<TKey>(
                matchesForEntireTerm.First().Key,
                matchWeightForConsecutiveRunEntry,
                new NonNullImmutableList<SourceFieldLocation>(new[]
                {
                    // Since we're creating a new SourceFieldLocation instance that is derived from a run of multiple tokens, the TokenIndex is going to be an
                    // approximation - taking the TokenIndex from the first search term probably makes the most sense. The SourceIndex and SourceTokenLength are
                    // taken such that the entire run is covered (from the start of the first search term to the end of the last). Since this is the only Source
                    // Location instance for the WeightedEntry, its MatchWeightContribution value is equal to the WeightedEntry's Weight.
                    new SourceFieldLocation(
                        sourceLocationOfFirstTerm.SourceFieldIndex,
                        sourceLocationOfFirstTerm.TokenIndex,
                        sourceLocationOfFirstTerm.SourceIndex,
                        (sourceLocationOfLastTerm.SourceIndex + sourceLocationOfLastTerm.SourceTokenLength) - sourceLocationOfFirstTerm.SourceIndex,
                        matchWeightForConsecutiveRunEntry
                    )
                })
            )
        );
    }

    // The matches need grouping by key before returning
    return consecutiveMatches
        .GroupBy(m => m.Key, index.KeyComparer)
        .Cast<IEnumerable<WeightedEntry<TKey>>>()
        .Select(matches => new WeightedEntry<TKey>(
            matches.First().Key,
            weightCombinerForFinalMatches(
                matches.Select(match => match.Weight).ToImmutableList()
            ),
            matches.SelectMany(m => m.SourceLocationsIfRecorded).ToNonNullImmutableList()
        ))
        .ToNonNullImmutableList();
}
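// Illustrative usage (a sketch): here each consecutive run's weight is taken to be the sum of its individual
// token-match weights, while multiple runs within a single result are reduced to the greatest run weight -
// both combiner lambdas are example choices for this sketch, not the library's defaults
var consecutiveResults = index.GetConsecutiveMatches(
    "penguins in antarctica",
    new WhiteSpaceTokenBreaker(),
    weights => weights.Sum(), // weightCombinerForConsecutiveRuns
    weights => weights.Max()  // weightCombinerForFinalMatches
);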
/// <summary>
/// This GetConsecutiveMatches signature will call GetConsecutiveMatches specifying the DefaultConsecutiveRunsWeightCombiner and DefaultFinalMatchWeightCombiner
/// for the weightCombiner arguments (the DefaultConsecutiveRunsWeightCombiner to calculate the combined weight of a run of tokens which should be considered as
/// a single match and the DefaultFinalMatchWeightCombiner to combine all of these matches together for each result)
/// </summary>
public static NonNullImmutableList<WeightedEntry<TKey>> GetConsecutiveMatches<TKey>(this IIndexData<TKey> index, string source, ITokenBreaker tokenBreaker)
{
    return GetConsecutiveMatches(index, source, tokenBreaker, DefaultConsecutiveRunsWeightCombiner, DefaultFinalMatchWeightCombiner);
}
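// Illustrative usage (a sketch): the convenience overload, relying upon the default weight combiners described
// in the summary above - the index must have been generated with source locations recorded
var consecutiveResultsWithDefaults = index.GetConsecutiveMatches("penguins in antarctica", new WhiteSpaceTokenBreaker());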
private static IIndexData<int> GenerateIndexData(NonNullImmutableList<Post> posts, IStringNormaliser sourceStringComparer, ITokenBreaker tokenBreaker)
{
    if (posts == null) { throw new ArgumentNullException(nameof(posts)); }
    if (sourceStringComparer == null) { throw new ArgumentNullException(nameof(sourceStringComparer)); }
    if (tokenBreaker == null) { throw new ArgumentNullException(nameof(tokenBreaker)); }

    // The Post (plain text) content is always the first field since its Content Retriever is first, meaning that all source locations for the content
    // will have a SourceFieldIndex of zero
    var contentRetrievers = new List<ContentRetriever<Post, int>>
    {
        new ContentRetriever<Post, int>(
            p => new PreBrokenContent<int>(p.Id, p.GetContentAsPlainText()),
            GetTokenWeightDeterminer(1f, sourceStringComparer)
        ),
        new ContentRetriever<Post, int>(
            p => new PreBrokenContent<int>(p.Id, p.Title),
            GetTokenWeightDeterminer(5f, sourceStringComparer)
        ),
        new ContentRetriever<Post, int>(
            p => new PreBrokenContent<int>(p.Id, new NonNullOrEmptyStringList(p.Tags.Select(tag => tag.Tag))),
            GetTokenWeightDeterminer(3f, sourceStringComparer)
        )
    };

    return new IndexGenerator<Post, int>(
        contentRetrievers.ToNonNullImmutableList(),
        new DefaultEqualityComparer<int>(),
        sourceStringComparer,
        tokenBreaker,
        weightedValues => weightedValues.Sum(),
        captureSourceLocations: true,
        new NullLogger()
    ).Generate(posts.ToNonNullImmutableList());
}
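// Illustrative usage (a sketch): generating the index and then querying it - the "posts" reference and the
// choice of EnglishPluralityStringNormaliser (with its default configuration assumed here) are assumptions
// made for the example's sake
var postIndex = GenerateIndexData(posts, new EnglishPluralityStringNormaliser(), new WhiteSpaceTokenBreaker());
var matchesForPenguins = postIndex.GetMatches("penguin");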
public WhiteSpaceTokenBreaker(ITokenBreaker optionalWrappedTokenBreaker)
{
    _optionalWrappedTokenBreaker = optionalWrappedTokenBreaker;
}