/// <summary>
/// Breaks the value using the wrapped token breaker and returns each of the resulting tokens, followed by any partial match variations generated
/// from it. A variation is skipped if it is identical to the token that it was generated from or if the partial match weight determiner returns
/// zero for it. This will never return null. It will throw an exception for null input.
/// </summary>
public NonNullImmutableList<WeightAdjustingToken> Break(string value)
{
    if (value == null)
    {
        throw new ArgumentNullException("value");
    }

    var allTokens = new List<WeightAdjustingToken>();
    foreach (var sourceToken in _tokenBreaker.Break(value))
    {
        // The token from the wrapped breaker always goes in first, unaltered
        allTokens.Add(sourceToken);

        // Then the partial match data derived from that token
        foreach (var subToken in GetTokensForPartialMatchGeneration(sourceToken))
        {
            foreach (var variation in GenerateAllMatchVariations(subToken))
            {
                // A variation that is the same as the wrapped breaker's token has already been recorded above
                if (variation.Token == sourceToken.Token)
                {
                    continue;
                }

                // Negative multipliers are invalid, a multiplier of zero means that the variation should be excluded and anything
                // greater than zero gets combined with the multipliers of the source token and the sub token
                var partialMatchWeightMultiplier = _partialMatchWeightDeterminer(sourceToken.Token, variation.Token);
                if (partialMatchWeightMultiplier < 0)
                {
                    throw new Exception("partialMatchWeightMultiplier returned negative value");
                }
                if (partialMatchWeightMultiplier == 0)
                {
                    continue;
                }

                var variationLocation = variation.SourceLocation;
                allTokens.Add(new WeightAdjustingToken(
                    variation.Token,
                    sourceToken.WeightMultiplier * subToken.WeightMultiplier * partialMatchWeightMultiplier,
                    new SourceLocation(
                        variationLocation.TokenIndex,
                        variationLocation.SourceIndex,
                        variationLocation.SourceTokenLength
                    )
                ));
            }
        }
    }
    return allTokens.ToNonNullImmutableList();
}
/// <summary>
/// Replaces every occurrence of the configured "treat as whitespace" characters with a space and then hands the result to the wrapped token
/// breaker. This will never return null. It will throw an exception for null input.
/// </summary>
public NonNullImmutableList<WeightAdjustingToken> Break(string value)
{
    if (value == null)
    {
        throw new ArgumentNullException("value");
    }

    var valueToBreak = value;
    foreach (var whitespaceEquivalent in _charsToTreatAsWhitespace)
    {
        valueToBreak = valueToBreak.Replace(whitespaceEquivalent, ' ');
    }
    return _tokenBreaker.Break(valueToBreak);
}
/// <summary>
/// Breaks the value using the wrapped token breaker and then returns a token for every run of between one and _maxNumberOfTokens consecutive
/// tokens, where the tokens in a run are joined with single spaces. This will never return null. It will throw an exception for null input.
/// </summary>
public NonNullImmutableList<WeightAdjustingToken> Break(string value)
{
    if (value == null)
    {
        throw new ArgumentNullException("value");
    }

    var sourceTokens = _tokenBreaker.Break(value);
    var combinedTokens = NonNullImmutableList<WeightAdjustingToken>.Empty;
    for (var runLength = 1; runLength <= _maxNumberOfTokens; runLength++)
    {
        // Only start positions that leave enough tokens to fill the run are considered
        var numberOfRuns = sourceTokens.Count - (runLength - 1);
        for (var startIndex = 0; startIndex < numberOfRuns; startIndex++)
        {
            var run = sourceTokens.Skip(startIndex).Take(runLength).ToArray();
            var weightMultiplier = _weightMultiplierDeterminer(run.Select(t => t.WeightMultiplier).ToImmutableList());
            if ((weightMultiplier <= 0) || (weightMultiplier > 1))
            {
                throw new Exception("Specified WeightMultiplierDeterminer return an invalid value: " + weightMultiplier);
            }

            // The source length of the combined token runs from the start of the first token to the end of the last one - summing the
            // individual token lengths would not account for any breaking characters that appear between the tokens. The TokenIndex of
            // the first token is used for the combined token; this may not be strictly accurate but, since the tokens now overlap, it
            // is probably the best that can be done.
            var runStart = run[0].SourceLocation;
            var runEnd = run[run.Length - 1].SourceLocation;
            var combinedToken = new WeightAdjustingToken(
                string.Join(" ", run.Select(t => t.Token)),
                weightMultiplier,
                new SourceLocation(
                    runStart.TokenIndex,
                    runStart.SourceIndex,
                    (runEnd.SourceIndex + runEnd.SourceTokenLength) - runStart.SourceIndex
                )
            );
            combinedTokens = combinedTokens.Add(combinedToken);
        }
    }
    return combinedTokens;
}
/// <summary>
/// Builds an IndexData instance from the supplied data. Every item is passed to each of the configured content retrievers, the content sections
/// that are returned are broken into tokens, the tokens are normalised by the source string comparer and the weighted matches are then combined
/// for each token / key pair. This will never return null. It will throw an exception for null input.
/// </summary>
/// <param name="data">The source items to index (must not be null)</param>
/// <returns>The generated index data</returns>
/// <exception cref="ArgumentNullException">Thrown if data is null</exception>
/// <exception cref="Exception">
/// Thrown if a content retriever's InitialContentRetriever throws (the original exception is wrapped as the inner exception) or returns null
/// </exception>
public IndexData<TKey> Generate(NonNullImmutableList<TSource> data)
{
    if (data == null)
    {
        throw new ArgumentNullException("data");
    }

    // Build up data about token occurrences in the data
    // - We'll be using the token values in the indexContent dictionary after they have been normalised by the sourceStringComparer, this means
    //   that we don't need to specify the sourceStringComparer as the comparer for indexContent which may save some work depending upon the
    //   implementation of the sourceStringComparer
    var timer = new Stopwatch();
    timer.Start();
    var indexContent = new Dictionary<string, Dictionary<TKey, List<WeightedEntry<TKey>>>>();
    var timeElapsedForNextUpdateMessage = TimeSpan.FromSeconds(5);
    for (var index = 0; index < data.Count; index++)
    {
        var entry = data[index];
        var sourceFieldIndex = 0;
        foreach (var contentRetriever in _contentRetrievers)
        {
            PreBrokenContent<TKey> preBrokenContent;
            try
            {
                preBrokenContent = contentRetriever.InitialContentRetriever(entry);
            }
            catch (Exception e)
            {
                throw new Exception("contentRetriever.InitialContentRetriever threw exception", e);
            }
            if (preBrokenContent == null)
            {
                throw new Exception("contentRetriever.InitialContentRetriever returned null - this is invalid");
            }

            // Emit a progress message if at least five seconds have passed since the last one was due
            if (timer.Elapsed >= timeElapsedForNextUpdateMessage)
            {
                _logger.LogIgnoringAnyError(LogLevel.Debug, () => String.Format("Work completed: {0}%", ((index * 100f) / (float)data.Count).ToString("0.000")));
                timeElapsedForNextUpdateMessage = timer.Elapsed.Add(TimeSpan.FromSeconds(5));
            }

            foreach (var contentSection in preBrokenContent.Content)
            {
                foreach (var weightedTokenMatch in _tokenBreaker.Break(contentSection))
                {
                    // Strings that are reduced to "" by the normaliser have no meaning (they can't be searched for) and should be ignored
                    var normalisedToken = _sourceStringComparer.GetNormalisedString(weightedTokenMatch.Token);
                    if (normalisedToken == "")
                    {
                        continue;
                    }

                    Dictionary<TKey, List<WeightedEntry<TKey>>> allDataForToken;
                    if (!indexContent.TryGetValue(normalisedToken, out allDataForToken))
                    {
                        allDataForToken = new Dictionary<TKey, List<WeightedEntry<TKey>>>(_dataKeyComparer);
                        indexContent.Add(normalisedToken, allDataForToken);
                    }

                    // A single TryGetValue locates (or establishes) the list for this key, rather than a ContainsKey check followed by
                    // further lookups through the indexer for every token
                    List<WeightedEntry<TKey>> matchesForKey;
                    if (!allDataForToken.TryGetValue(preBrokenContent.Key, out matchesForKey))
                    {
                        matchesForKey = new List<WeightedEntry<TKey>>();
                        allDataForToken.Add(preBrokenContent.Key, matchesForKey);
                    }

                    // Each WeightedEntry requires a sourceLocation set which specifies a location in a content field - the SourceLocation
                    // returned by the Token Breaker has the token index, start point and length but it needs a distinct field index. The
                    // sourceFieldIndex counter (which moves on for every content section) provides that.
                    var matchWeight = contentRetriever.TokenWeightDeterminer(normalisedToken) * weightedTokenMatch.WeightMultiplier;
                    matchesForKey.Add(
                        new WeightedEntry<TKey>(
                            preBrokenContent.Key,
                            matchWeight,
                            _captureSourceLocations
                                ? (new[]
                                {
                                    new SourceFieldLocation(
                                        sourceFieldIndex,
                                        weightedTokenMatch.SourceLocation.TokenIndex,
                                        weightedTokenMatch.SourceLocation.SourceIndex,
                                        weightedTokenMatch.SourceLocation.SourceTokenLength,
                                        matchWeight
                                    )
                                }).ToNonNullImmutableList()
                                : null
                        )
                    );
                }

                // This has to be incremented for each content section successfully extracted from the source data, to ensure that each
                // section gets a unique SourceFieldLocation.SourceFieldIndex assigned to it
                sourceFieldIndex++;
            }

            if (sourceFieldIndex == 0)
            {
                // The sourceFieldIndex should move at least once for the first content retriever (even if it didn't manage to extract any
                // content using it) so that the index generator can be configured such that all source locations with SourceFieldIndex zero
                // can be guaranteed to have come from a particular property (if it retrieves no content then there will be no source location
                // instances with a SourceFieldIndex value of zero). This can be used for search term highlighting. Only the first content
                // retriever can be supported in this manner since if the first content retriever returns varying numbers of content sections
                // then all bets are off for synchronising field index values for the subsequent retrievers.
                sourceFieldIndex++;
            }
        }
    }
    _logger.LogIgnoringAnyError(
        LogLevel.Debug,
        () => String.Format("Time taken to generate initial token data: {0}ms ({1:0.00}ms per item)", timer.ElapsedMilliseconds, (float)timer.ElapsedMilliseconds / (float)data.Count)
    );
    timer.Restart();

    // Combine entries where Token and Key values match (as with the indexContent dictionary, we don't need to specify the sourceStringComparer
    // as the combinedContent dictionary comparer as all values were stored in indexContent after being normalised - this may save some work
    // depending upon the sourceStringComparer implementation). The dictionaries are enumerated as key / value pairs so that no further lookups
    // are required to get hold of the values.
    var combinedContent = new Dictionary<string, List<WeightedEntry<TKey>>>();
    foreach (var dataForToken in indexContent)
    {
        var combinedEntriesForToken = new List<WeightedEntry<TKey>>();
        foreach (var dataForKey in dataForToken.Value)
        {
            var matches = dataForKey.Value;
            combinedEntriesForToken.Add(
                new WeightedEntry<TKey>(
                    dataForKey.Key,
                    _weightedEntryCombiner(matches.Select(m => m.Weight).ToImmutableList()),
                    matches.Any(m => m.SourceLocationsIfRecorded == null)
                        ? null
                        : matches.SelectMany(m => m.SourceLocationsIfRecorded).ToNonNullImmutableList()
                )
            );
        }
        combinedContent.Add(dataForToken.Key, combinedEntriesForToken);
    }
    _logger.LogIgnoringAnyError(
        LogLevel.Debug,
        () => String.Format("Time taken to combine token data sets: {0}ms ({1:0.00}ms per item)", timer.ElapsedMilliseconds, (float)timer.ElapsedMilliseconds / (float)data.Count)
    );
    timer.Restart();

    // Translate this into an IndexData instance
    var indexData = new IndexData<TKey>(
        new TernarySearchTreeDictionary<NonNullImmutableList<WeightedEntry<TKey>>>(
            combinedContent.Select(tokenEntries => new KeyValuePair<string, NonNullImmutableList<WeightedEntry<TKey>>>(tokenEntries.Key, tokenEntries.Value.ToNonNullImmutableList())),
            _sourceStringComparer
        ),
        _dataKeyComparer
    );
    _logger.LogIgnoringAnyError(
        LogLevel.Debug,
        () => String.Format("Time taken to generate final IndexData: {0}ms ({1:0.00}ms per item)", timer.ElapsedMilliseconds, (float)timer.ElapsedMilliseconds / (float)data.Count)
    );
    return indexData;
}
/// <summary>
/// This will break down a source search term into words (according to the logic of the specified token breaker) and then return matches where the
/// words were found in a run in a content section. Unlike GetPartialMatches it is not possible for an entry to be considered a match because it
/// contains all of the terms in its content, the terms must be present in one content field, together, in the order in which they are present in
/// the search term. This allows for similar behaviour to that intended for the ConsecutiveTokenCombiningTokenBreaker, but this offers greater
/// performance (constructing a TernarySearchTreeDictionary to back an IndexData instance can be expensive on processing time to generate, and
/// disk / memory space to store, the runs of tokens). This also has the benefit that there is no cap on the number of tokens that can be matched
/// consecutively (a limit on this had to be decided at index generation time when using the ConsecutiveTokenCombiningTokenBreaker). There are two
/// sets of weight combining calculations required; the first (handled by the weightCombinerForConsecutiveRuns) determines a weight for a run of
/// consecutive tokens - each run is considered a single match, effectively. Each call to the first weight combiner will have as many weights to
/// combine as there are search terms, so if the "source" value is broken down into three words by the tokenBreaker then the
/// weightCombinerForConsecutiveRuns will always be called with sets of three weights. The second weight combination is performed when multiple
/// matches for a particular result must be combined to give a final match weight for that result.
///
/// If the token breaker extracts no tokens at all from the source value then an empty set of matches is returned.
///
/// Note: This requires the index to have been built with source location data recorded - if the index's SourceLocationsAvailable property returns
/// false then an ArgumentException will be thrown.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown if any of the arguments is null</exception>
/// <exception cref="ArgumentException">Thrown if the index reports that source locations are not available</exception>
public static NonNullImmutableList<WeightedEntry<TKey>> GetConsecutiveMatches<TKey>(
    this IIndexData<TKey> index,
    string source,
    ITokenBreaker tokenBreaker,
    IndexGenerator.WeightedEntryCombiner weightCombinerForConsecutiveRuns,
    IndexGenerator.WeightedEntryCombiner weightCombinerForFinalMatches)
{
    if (index == null)
    {
        throw new ArgumentNullException("index");
    }
    if (source == null)
    {
        throw new ArgumentNullException("source");
    }
    if (tokenBreaker == null)
    {
        throw new ArgumentNullException("tokenBreaker");
    }
    if (weightCombinerForConsecutiveRuns == null)
    {
        throw new ArgumentNullException("weightCombinerForConsecutiveRuns");
    }
    if (weightCombinerForFinalMatches == null)
    {
        throw new ArgumentNullException("weightCombinerForFinalMatches");
    }
    if (!index.SourceLocationsAvailable)
    {
        throw new ArgumentException($"The {nameof(index)} must include source location data in order to use identify Consecutive token matches");
    }

    var weightAdjustedTokens = tokenBreaker.Break(source);
    if (weightAdjustedTokens.Count == 0)
    {
        // With no search terms there is nothing that could be matched (without this check, the attempt to retrieve the matches for the first
        // search term further down would fail with an InvalidOperationException as there is no first search term)
        return NonNullImmutableList<WeightedEntry<TKey>>.Empty;
    }
    if (weightAdjustedTokens.Count == 1)
    {
        // If the token breaker won't actually translate the source value into multiple words then we can avoid all of the below work and just
        // call index.GetMatches directly
        // NOTE(review): this queries the raw source value rather than the single token and does not apply that token's WeightMultiplier, unlike
        // the multi-token path below - confirm that this is intentional
        return index.GetMatches(source);
    }

    // The index of this list will correspond to the index of the broken-down search terms. The matches for each term are broken into entries
    // by BreakWeightedEntryIntoIndividualSourceLocations here, once, so that this work is not repeated for every candidate run considered below.
    var matchesForSearchTerms = new List<WeightedEntry<TKey>[]>();
    foreach (var weightAdjustedToken in weightAdjustedTokens)
    {
        matchesForSearchTerms.Add(
            index.GetMatches(weightAdjustedToken.Token)
                .Select(w => new WeightedEntry<TKey>(
                    w.Key,
                    w.Weight * weightAdjustedToken.WeightMultiplier,
                    w.SourceLocationsIfRecorded
                ))
                .SelectMany(w => BreakWeightedEntryIntoIndividualSourceLocations<TKey>(w))
                .ToArray()
        );
    }

    // For each match of the first search term, try to identify a run of token matches for the same key and source field. Any such runs will be
    // recorded in the consecutiveMatches list - these represent content segments that match the entirety of the search term (the "source"
    // argument).
    var consecutiveMatches = new List<WeightedEntry<TKey>>();
    foreach (var firstTermMatch in matchesForSearchTerms[0])
    {
        var matchesForEntireTerm = NonNullImmutableList<WeightedEntry<TKey>>.Empty.Add(firstTermMatch);
        for (var termIndex = 1; termIndex < weightAdjustedTokens.Count; termIndex++)
        {
            // Note: SourceLocationsIfRecorded should never be null because we checked that the index reported that SourceLocationsAvailable
            // was true (so not checking for null source locations here)
            var nTermMatch = matchesForSearchTerms[termIndex]
                .FirstOrDefault(m =>
                    index.KeyComparer.Equals(m.Key, firstTermMatch.Key) &&
                    (m.SourceLocationsIfRecorded.First().SourceFieldIndex == firstTermMatch.SourceLocationsIfRecorded.First().SourceFieldIndex) &&
                    (m.SourceLocationsIfRecorded.First().TokenIndex == firstTermMatch.SourceLocationsIfRecorded.First().TokenIndex + termIndex)
                );
            if (nTermMatch == null)
            {
                break;
            }
            matchesForEntireTerm = matchesForEntireTerm.Add(nTermMatch);
        }
        if (matchesForEntireTerm.Count < weightAdjustedTokens.Count)
        {
            // If we didn't manage to get a full set of search terms then this isn't a full match
            continue;
        }

        // Combine the WeightedEntry instances that represent a run of individual matches (one for each word in the "source" argument) into a
        // single WeightedEntry that represents the entirety of the search term. Single() is used on the source locations because each entry is
        // expected to have exactly one after the breaking-down performed above. As before, SourceLocationsIfRecorded is not checked for null
        // since index.SourceLocationsAvailable was confirmed at the top of this method.
        var sourceLocationOfFirstTerm = matchesForEntireTerm.First().SourceLocationsIfRecorded.Single();
        var sourceLocationOfLastTerm = matchesForEntireTerm.Last().SourceLocationsIfRecorded.Single();
        var matchWeightForConsecutiveRunEntry = weightCombinerForConsecutiveRuns(
            matchesForEntireTerm.Select(m => m.Weight).ToImmutableList()
        );
        consecutiveMatches.Add(
            new WeightedEntry<TKey>(
                matchesForEntireTerm.First().Key,
                matchWeightForConsecutiveRunEntry,
                new NonNullImmutableList<SourceFieldLocation>(new[]
                {
                    // Since we're creating a new SourceFieldLocation instance that is derived from a run of multiple tokens, the TokenIndex is
                    // going to be an approximation - taking the TokenIndex from the first search term probably makes the most sense. The
                    // SourceIndex and SourceTokenLength will be taken such that the entire run is covered (from the start of the first search
                    // term to the end of the last). Since this is the only Source Location instance for the WeightedEntry, its
                    // MatchWeightContribution value is equal to the WeightedEntry's Weight.
                    new SourceFieldLocation(
                        sourceLocationOfFirstTerm.SourceFieldIndex,
                        sourceLocationOfFirstTerm.TokenIndex,
                        sourceLocationOfFirstTerm.SourceIndex,
                        (sourceLocationOfLastTerm.SourceIndex + sourceLocationOfLastTerm.SourceTokenLength) - sourceLocationOfFirstTerm.SourceIndex,
                        matchWeightForConsecutiveRunEntry
                    )
                })
            )
        );
    }

    // The matches need grouping by key before returning
    return consecutiveMatches
        .GroupBy(m => m.Key, index.KeyComparer)
        .Cast<IEnumerable<WeightedEntry<TKey>>>()
        .Select(matches => new WeightedEntry<TKey>(
            matches.First().Key,
            weightCombinerForFinalMatches(
                matches.Select(match => match.Weight).ToImmutableList()
            ),
            matches.SelectMany(m => m.SourceLocationsIfRecorded).ToNonNullImmutableList()
        ))
        .ToNonNullImmutableList();
}
/// <summary>
/// Splits content on whitespace characters. If a wrapped token breaker was provided then each of the tokens that it returns is split, otherwise
/// the entire value is treated as a single token to split. This will never return null. It will throw an exception for null input.
/// </summary>
public NonNullImmutableList<WeightAdjustingToken> Break(string value)
{
    if (value == null)
    {
        throw new ArgumentNullException("value");
    }

    IEnumerable<WeightAdjustingToken> tokensToBreak;
    if (_optionalWrappedTokenBreaker == null)
    {
        tokensToBreak = new[] { new WeightAdjustingToken(value, 1, new SourceLocation(0, 0, value.Length)) };
    }
    else
    {
        tokensToBreak = _optionalWrappedTokenBreaker.Break(value);
    }

    var results = new List<WeightAdjustingToken>();
    foreach (var sourceToken in tokensToBreak)
    {
        var content = sourceToken.Token;
        var segmentStart = 0;

        // Walking one position beyond the final character allows the end of the content to terminate a segment in the same way that a
        // whitespace character does
        for (var position = 0; position <= content.Length; position++)
        {
            if ((position < content.Length) && !char.IsWhiteSpace(content[position]))
            {
                continue;
            }

            // Consecutive (or leading) whitespace characters result in empty segments, which are not recorded
            if (position > segmentStart)
            {
                var segment = content.Substring(segmentStart, position - segmentStart);
                results.Add(new WeightAdjustingToken(
                    segment,
                    sourceToken.WeightMultiplier,
                    new SourceLocation(
                        results.Count,
                        sourceToken.SourceLocation.SourceIndex + segmentStart,
                        segment.Length
                    )
                ));
            }
            segmentStart = position + 1;
        }
    }
    return results.ToNonNullImmutableList();
}
/// <summary>
/// This will break a given source string and return results based upon the combination of partial matches (so results that only match part of the
/// source string may be included in the returned data). The token breaker and the match combiner must be specified by the caller - if the match
/// combiner returns zero then the result will not be included in the final data. To require that all tokens in the source content be present for
/// any returned results, the following matchCombiner could be specified:
///  (tokenMatches, allTokens) =&gt; (tokenMatches.Count &lt; allTokens.Count) ? 0 : tokenMatches.Sum(m =&gt; m.Weight)
/// </summary>
/// <exception cref="ArgumentNullException">Thrown if any of the arguments is null</exception>
/// <exception cref="ArgumentException">Thrown if the weightCombiner returns a negative value</exception>
public static NonNullImmutableList<WeightedEntry<TKey>> GetPartialMatches<TKey>(
    this IIndexData<TKey> index,
    string source,
    ITokenBreaker tokenBreaker,
    WeightCombiner weightCombiner)
{
    if (index == null)
    {
        throw new ArgumentNullException("index");
    }
    if (source == null)
    {
        throw new ArgumentNullException("source");
    }
    if (tokenBreaker == null)
    {
        throw new ArgumentNullException("tokenBreaker");
    }
    if (weightCombiner == null)
    {
        throw new ArgumentNullException("weightCombiner");
    }

    // Break down the "source" search term and find matches for each token
    // - Each match maintains the weight multiplier applied to the string segment from the token breaker (the weight of every match is scaled
    //   by the WeightMultiplier of the token that it was found for, as GetConsecutiveMatches does)
    // - Each match is paired with additional data; the source segment string and what token index that is (so if the "source" value is broken
    //   into three, then each match will have a SearchTermDetails whose token index will be between 0 and 2, inclusive). This allows for a
    //   weightCombiner to be specified that ensures that every token that was extracted from the source value can be matched against a given
    //   result, if so desired.
    var matches = new List<Tuple<WeightedEntry<TKey>, SearchTermDetails>>();
    var weightAdjustedTokens = tokenBreaker.Break(source);
    for (var tokenIndex = 0; tokenIndex < weightAdjustedTokens.Count; tokenIndex++)
    {
        // Note: The lambdas below capture the loop variable but AddRange enumerates them immediately, before tokenIndex moves on
        var weightAdjustedToken = weightAdjustedTokens[tokenIndex];
        matches.AddRange(
            index
                .GetMatches(weightAdjustedToken.Token)
                .Select(match => new WeightedEntry<TKey>(
                    match.Key,
                    match.Weight * weightAdjustedToken.WeightMultiplier,
                    match.SourceLocationsIfRecorded
                ))
                .Select(match => Tuple.Create(match, new SearchTermDetails(tokenIndex, weightAdjustedToken.Token)))
        );
    }

    // Combine per-search-term results, grouping by result key and calculating the match weight for each token using the specified
    // weightCombiner (this may also be used to filter out results; if a match weight of zero is returned then the match will be ignored - this
    // may be used to filter out results that only match two out of three of the search terms, for example)
    var finalResults = NonNullImmutableList<WeightedEntry<TKey>>.Empty;
    var searchTerms = new NonNullOrEmptyStringList(weightAdjustedTokens.Select(w => w.Token));
    foreach (var matchesGroupedByKey in matches.GroupBy(m => m.Item1.Key, index.KeyComparer).Cast<IEnumerable<Tuple<WeightedEntry<TKey>, SearchTermDetails>>>())
    {
        var combinedWeight = weightCombiner(
            matchesGroupedByKey
                .Select(m => new MatchWeightWithSourceFieldLocations(
                    m.Item1.Weight,
                    m.Item2,
                    m.Item1.SourceLocationsIfRecorded
                ))
                .ToNonNullImmutableList(),
            searchTerms
        );
        if (combinedWeight < 0)
        {
            throw new ArgumentException("weightCombiner returned a negative value - invalid");
        }
        else if (combinedWeight > 0)
        {
            // Source locations are only included in the combined entry if every one of the contributing matches has them recorded
            finalResults = finalResults.Add(
                new WeightedEntry<TKey>(
                    matchesGroupedByKey.First().Item1.Key,
                    combinedWeight,
                    matchesGroupedByKey.Any(m => m.Item1.SourceLocationsIfRecorded == null)
                        ? null
                        : matchesGroupedByKey.SelectMany(m => m.Item1.SourceLocationsIfRecorded).ToNonNullImmutableList()
                )
            );
        }
    }
    return finalResults;
}