private AutomatedIndexGeneratorFactory<TSource, TKey>.WeightDeterminerGenerator GetDefaultTokenWeightDeterminerGenerator(IStringNormaliser stringNormaliser)
{
    if (stringNormaliser == null)
        throw new ArgumentNullException("stringNormaliser");

    // Constructing a HashSet of the normalised versions of the stop words means that looking up whether normalised tokens are stop
    // words can be a lot faster (as neither the stop words nor the token need to be fed through the normaliser again)
    var hashSetOfNormalisedStopWords = new HashSet<string>(
        Constants.GetStopWords("en").Select(word => stringNormaliser.GetNormalisedString(word))
    );
    return property =>
    {
        // Reverse the propertyWeightAppliers so that later values added to the set take precedence (eg. if, for some reason, a x5 weight is
        // given to a property and then later it's set to be ignored, then we want to ignore it - which this will achieve)
        var propertyWeightApplier = _propertyWeightAppliers.Reverse().FirstOrDefault(p => p.AppliesTo(property));
        if ((propertyWeightApplier != null) && (propertyWeightApplier.WeightMultiplier == 0))
        {
            // A weight multiplier of zero means ignore this property, as does returning null from a WeightDeterminerGenerator call
            return null;
        }
        var weightMultiplier = (propertyWeightApplier != null) ? propertyWeightApplier.WeightMultiplier : 1;
        return normalisedToken => weightMultiplier * (hashSetOfNormalisedStopWords.Contains(normalisedToken) ? 0.01f : 1f);
    };
}
/// <summary>
/// This will return true if the specified key was found and will set the value output parameter to the corresponding value. If it returns false then the
/// value output parameter should not be considered to be defined.
/// </summary>
public bool TryGetValue(string key, out TValue value)
{
    if (key == null)
        throw new ArgumentNullException("key");

    var normalisedKey = _keyNormaliser.GetNormalisedString(key);
    if (normalisedKey != "")
    {
        var nodeIndex = 0;
        var keyIndex = 0;
        while (true)
        {
            if (_nodes[nodeIndex].Character == normalisedKey[keyIndex])
            {
                keyIndex++;
                if (keyIndex == normalisedKey.Length)
                {
                    if (_nodes[nodeIndex].IsKey)
                    {
                        value = _values[_nodes[nodeIndex].ValueIndex];
                        return true;
                    }
                    break;
                }
                if (_nodes[nodeIndex].MiddleChildIndex == -1)
                    break;
                nodeIndex = _nodes[nodeIndex].MiddleChildIndex;
            }
            else if (normalisedKey[keyIndex] < _nodes[nodeIndex].Character)
            {
                if (_nodes[nodeIndex].LeftChildIndex == -1)
                    break;
                nodeIndex = _nodes[nodeIndex].LeftChildIndex;
            }
            else
            {
                if (_nodes[nodeIndex].RightChildIndex == -1)
                    break;
                nodeIndex = _nodes[nodeIndex].RightChildIndex;
            }
        }
    }
    value = default(TValue);
    return false;
}
private static ContentRetriever<Post, int>.BrokenTokenWeightDeterminer GetTokenWeightDeterminer(float multiplier, IStringNormaliser sourceStringComparer)
{
    if (multiplier <= 0)
        throw new ArgumentOutOfRangeException(nameof(multiplier), "must be greater than zero");
    if (sourceStringComparer == null)
        throw new ArgumentNullException(nameof(sourceStringComparer));

    // Constructing a HashSet of the normalised versions of the stop words means that looking up whether normalised tokens are stop
    // words can be a lot faster (as neither the stop words nor the token need to be fed through the normaliser again)
    var hashSetOfNormalisedStopWords = new HashSet<string>(
        FullTextIndexer.Core.Constants.GetStopWords("en").Select(word => sourceStringComparer.GetNormalisedString(word))
    );
    return normalisedToken => multiplier * (hashSetOfNormalisedStopWords.Contains(normalisedToken) ? 0.01f : 1f);
}
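To make the weighting concrete, the sketch below shows how the returned delegate might behave. It assumes an IStringNormaliser implementation named DefaultStringNormaliser is available and that "the" appears in the English stop word list; neither is shown in the code above, so treat both as illustrative assumptions.

// Hypothetical usage sketch: a x5 multiplier (eg. for a Title property), with stop words still penalised
var normaliser = new DefaultStringNormaliser(); // assumption: any IStringNormaliser implementation will do
var titleTokenWeightDeterminer = GetTokenWeightDeterminer(5f, normaliser);

// Stop words are down-weighted rather than discarded outright
var stopWordWeight = titleTokenWeightDeterminer(normaliser.GetNormalisedString("the")); // 5 * 0.01f = 0.05
var regularWeight = titleTokenWeightDeterminer(normaliser.GetNormalisedString("cat")); // 5 * 1f = 5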
/// <summary>
/// This will never return null. It will throw an exception for null input.
/// </summary>
public IndexData<TKey> Generate(NonNullImmutableList<TSource> data)
{
    if (data == null)
        throw new ArgumentNullException("data");

    // Build up data about token occurrences in the data
    // - We'll be using the token values in the indexContent dictionary after they have been normalised by the sourceStringComparer, which means that we
    //   don't need to specify the sourceStringComparer as the comparer for indexContent and that may save some work depending upon the implementation of
    //   the sourceStringComparer
    var timer = new Stopwatch();
    timer.Start();
    var indexContent = new Dictionary<string, Dictionary<TKey, List<WeightedEntry<TKey>>>>();
    var timeElapsedForNextUpdateMessage = TimeSpan.FromSeconds(5);
    for (var index = 0; index < data.Count; index++)
    {
        var entry = data[index];
        var sourceFieldIndex = 0;
        foreach (var contentRetriever in _contentRetrievers)
        {
            PreBrokenContent<TKey> preBrokenContent;
            try
            {
                preBrokenContent = contentRetriever.InitialContentRetriever(entry);
            }
            catch (Exception e)
            {
                throw new Exception("contentRetriever.InitialContentRetriever threw exception", e);
            }
            if (preBrokenContent == null)
                throw new Exception("contentRetriever.InitialContentRetriever returned null - this is invalid");

            if (timer.Elapsed >= timeElapsedForNextUpdateMessage)
            {
                _logger.LogIgnoringAnyError(
                    LogLevel.Debug,
                    () => String.Format("Work completed: {0}%", ((index * 100f) / (float)data.Count).ToString("0.000"))
                );
                timeElapsedForNextUpdateMessage = timer.Elapsed.Add(TimeSpan.FromSeconds(5));
            }

            foreach (var contentSection in preBrokenContent.Content)
            {
                foreach (var weightedTokenMatch in _tokenBreaker.Break(contentSection))
                {
                    // Strings that are reduced to "" by the normaliser have no meaning (they can't be searched for) and should be ignored
                    var normalisedToken = _sourceStringComparer.GetNormalisedString(weightedTokenMatch.Token);
                    if (normalisedToken == "")
                        continue;

                    Dictionary<TKey, List<WeightedEntry<TKey>>> allDataForToken;
                    if (!indexContent.TryGetValue(normalisedToken, out allDataForToken))
                    {
                        allDataForToken = new Dictionary<TKey, List<WeightedEntry<TKey>>>(_dataKeyComparer);
                        indexContent.Add(normalisedToken, allDataForToken);
                    }
                    if (!allDataForToken.ContainsKey(preBrokenContent.Key))
                        allDataForToken.Add(preBrokenContent.Key, new List<WeightedEntry<TKey>>());

                    // Each WeightedEntry requires a sourceLocation set which specifies a location in a content field - the SourceLocation
                    // returned by the Token Breaker has the token index, start point and length but it needs a distinct field index. The
                    // index of the current Content Retriever will do fine.
                    var matchWeight = contentRetriever.TokenWeightDeterminer(normalisedToken) * weightedTokenMatch.WeightMultiplier;
                    allDataForToken[preBrokenContent.Key].Add(
                        new WeightedEntry<TKey>(
                            preBrokenContent.Key,
                            matchWeight,
                            _captureSourceLocations
                                ? (new[]
                                    {
                                        new SourceFieldLocation(
                                            sourceFieldIndex,
                                            weightedTokenMatch.SourceLocation.TokenIndex,
                                            weightedTokenMatch.SourceLocation.SourceIndex,
                                            weightedTokenMatch.SourceLocation.SourceTokenLength,
                                            matchWeight
                                        )
                                    }).ToNonNullImmutableList()
                                : null
                        )
                    );
                }

                // This has to be incremented for each content section successfully extracted from the source data, to ensure that each
                // section gets a unique SourceFieldLocation.SourceFieldIndex assigned to it
                sourceFieldIndex++;
            }
            if (sourceFieldIndex == 0)
            {
                // The sourceFieldIndex should move at least once for the first content retriever (even if it didn't manage to extract any content using
                // it) so that the index generator can be configured such that all source locations with SourceFieldIndex zero can be guaranteed to have
                // come from a particular property (if it retrieves no content then there will be no source location instances with a SourceFieldIndex
                // value of zero). This can be used for search term highlighting. Only the first content retriever can be supported in this manner since,
                // if the first content retriever returns varying numbers of content sections, all bets are off for synchronising field index values
                // for the subsequent retrievers.
                sourceFieldIndex++;
            }
        }
    }
    _logger.LogIgnoringAnyError(
        LogLevel.Debug,
        () => String.Format("Time taken to generate initial token data: {0}ms ({1:0.00}ms per item)", timer.ElapsedMilliseconds, (float)timer.ElapsedMilliseconds / (float)data.Count)
    );
    timer.Restart();

    // Combine entries where Token and Key values match (as with the indexContent dictionary, we don't need to specify the sourceStringComparer as the
    // combinedContent dictionary comparer since all values were stored in indexContent after being normalised - this may save some work depending upon
    // the sourceStringComparer implementation)
    var combinedContent = new Dictionary<string, List<WeightedEntry<TKey>>>();
    foreach (var token in indexContent.Keys)
    {
        combinedContent.Add(token, new List<WeightedEntry<TKey>>());
        foreach (var key in indexContent[token].Keys)
        {
            var matches = indexContent[token][key];
            combinedContent[token].Add(
                new WeightedEntry<TKey>(
                    key,
                    _weightedEntryCombiner(matches.Select(m => m.Weight).ToImmutableList()),
                    matches.Any(m => m.SourceLocationsIfRecorded == null)
                        ? null
                        : matches.SelectMany(m => m.SourceLocationsIfRecorded).ToNonNullImmutableList()
                )
            );
        }
    }
    _logger.LogIgnoringAnyError(
        LogLevel.Debug,
        () => String.Format("Time taken to combine token data sets: {0}ms ({1:0.00}ms per item)", timer.ElapsedMilliseconds, (float)timer.ElapsedMilliseconds / (float)data.Count)
    );
    timer.Restart();

    // Translate this into an IndexData instance
    var indexData = new IndexData<TKey>(
        new TernarySearchTreeDictionary<NonNullImmutableList<WeightedEntry<TKey>>>(
            combinedContent.Select(entry => new KeyValuePair<string, NonNullImmutableList<WeightedEntry<TKey>>>(entry.Key, entry.Value.ToNonNullImmutableList())),
            _sourceStringComparer
        ),
        _dataKeyComparer
    );
    _logger.LogIgnoringAnyError(
        LogLevel.Debug,
        () => String.Format("Time taken to generate final IndexData: {0}ms ({1:0.00}ms per item)", timer.ElapsedMilliseconds, (float)timer.ElapsedMilliseconds / (float)data.Count)
    );
    return indexData;
}
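The _weightedEntryCombiner referenced above is what collapses multiple matches of the same token against the same key into a single weight. Its exact delegate type isn't shown in this excerpt, so the sketch below stands in with a plain Func and is only illustrative of the sort of combination the second phase performs.

// Hypothetical stand-in for _weightedEntryCombiner: given every weight recorded for one token against one key,
// return the single combined weight (summing is one obvious choice)
Func<IEnumerable<float>, float> weightedEntryCombiner = weights => weights.Sum();

// eg. if "cat" was matched once in a Title section (weight 5) and twice in a Body section (weights 1 and 1) of the
// same Post, the three WeightedEntry instances would be collapsed into a single entry with weight 7
var combinedWeight = weightedEntryCombiner(new[] { 5f, 1f, 1f }); // 7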
public TernarySearchTreeStructDictionary(IEnumerable<KeyValuePair<string, TValue>> data, IStringNormaliser keyNormaliser)
{
    if (data == null)
        throw new ArgumentNullException("data");
    if (keyNormaliser == null)
        throw new ArgumentNullException("keyNormaliser");

    var nodes = new List<Node> { GetUnintialisedNode() };
    var values = new List<TValue>();
    var keys = new HashSet<string>(keyNormaliser);
    foreach (var entry in data)
    {
        var key = entry.Key;
        if (key == null)
            throw new ArgumentException("Null key encountered in data");
        var normalisedKey = keyNormaliser.GetNormalisedString(key);
        if (normalisedKey == "")
            throw new ArgumentException("key value results in blank string when normalised: " + key);
        if (keys.Contains(normalisedKey))
            throw new ArgumentException("key value results in duplicate normalised key: " + key);
        keys.Add(key);

        if (nodes[0].Character == (char)0)
            nodes[0] = SetCharacter(nodes[0], normalisedKey[0]);
        var nodeIndex = 0;
        var keyIndex = 0;
        while (true)
        {
            if (nodes[nodeIndex].Character == normalisedKey[keyIndex])
            {
                keyIndex++;
                if (keyIndex == normalisedKey.Length)
                {
                    var newValueIndex = values.Count;
                    values.Add(entry.Value);
                    nodes[nodeIndex] = SetValueIndex(nodes[nodeIndex], newValueIndex);
                    break;
                }
                if (nodes[nodeIndex].MiddleChildIndex == -1)
                {
                    var newNode = SetCharacter(GetUnintialisedNode(), normalisedKey[keyIndex]);
                    var newNodeIndex = nodes.Count;
                    nodes.Add(newNode);
                    nodes[nodeIndex] = SetMiddleChildIndex(nodes[nodeIndex], newNodeIndex);
                    nodeIndex = newNodeIndex;
                }
                else
                    nodeIndex = nodes[nodeIndex].MiddleChildIndex;
                continue;
            }
            else if (normalisedKey[keyIndex] < nodes[nodeIndex].Character)
            {
                if (nodes[nodeIndex].LeftChildIndex == -1)
                {
                    var newNode = SetCharacter(GetUnintialisedNode(), normalisedKey[keyIndex]);
                    var newNodeIndex = nodes.Count;
                    nodes.Add(newNode);
                    nodes[nodeIndex] = SetLeftChildIndex(nodes[nodeIndex], newNodeIndex);
                    nodeIndex = newNodeIndex;
                }
                else
                    nodeIndex = nodes[nodeIndex].LeftChildIndex;
            }
            else
            {
                if (nodes[nodeIndex].RightChildIndex == -1)
                {
                    var newNode = SetCharacter(GetUnintialisedNode(), normalisedKey[keyIndex]);
                    var newNodeIndex = nodes.Count;
                    nodes.Add(newNode);
                    nodes[nodeIndex] = SetRightChildIndex(nodes[nodeIndex], newNodeIndex);
                    nodeIndex = newNodeIndex;
                }
                else
                    nodeIndex = nodes[nodeIndex].RightChildIndex;
            }
        }
    }
    _nodes = nodes.ToArray();
    _values = values.ToArray();
    _keyNormaliser = keyNormaliser;
    _keys = keys.ToList().AsReadOnly();
}
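Tying this constructor together with the TryGetValue implementation shown earlier, usage might look like the following sketch. The int TValue and the DefaultStringNormaliser are assumptions for illustration; the point is simply that lookups are performed against the normalised form of the key.

// Hypothetical usage sketch of the struct-node ternary search tree dictionary
var lookup = new TernarySearchTreeStructDictionary<int>(
    new[]
    {
        new KeyValuePair<string, int>("cat", 1),
        new KeyValuePair<string, int>("cart", 2)
    },
    new DefaultStringNormaliser() // assumption: any IStringNormaliser implementation
);

int value;
if (lookup.TryGetValue("Cat", out value))
{
    // Assuming the normaliser is case-insensitive, "Cat" is normalised before the node array is walked
    // and so this lookup succeeds with value == 1
}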
private static Node Add(Node root, IStringNormaliser keyNormaliser, IEnumerable<KeyValuePair<string, TValue>> data, Combiner combine)
{
    if (keyNormaliser == null)
        throw new ArgumentNullException("keyNormaliser");
    if (data == null)
        throw new ArgumentNullException("data");
    if (combine == null)
        throw new ArgumentNullException(nameof(combine));

    if (!data.Any())
        return root;

    if (root != null)
        root = root.Clone();
    foreach (var entry in data)
    {
        var key = entry.Key;
        if (key == null)
            throw new ArgumentException("Null key encountered in data");
        var normalisedKey = keyNormaliser.GetNormalisedString(key);
        if (normalisedKey == "")
            throw new ArgumentException("key value results in blank string when normalised: " + key);

        if (root == null)
            root = new Node() { Character = normalisedKey[0] };
        var node = root;
        var index = 0;
        while (true)
        {
            if (node.Character == normalisedKey[index])
            {
                index++;
                if (index == normalisedKey.Length)
                {
                    node.Value = ((node.Value != null) && (entry.Value != null))
                        ? combine(node.Value, entry.Value)
                        : ((node.Value != null) ? node.Value : entry.Value);

                    // If we ended up with a null Value then the Combiner may have removed it, in which case we should set the Key to null as well
                    node.Key = (node.Value == null) ? null : normalisedKey;
                    break;
                }
                if (node.MiddleChild == null)
                    node.MiddleChild = new Node() { Character = normalisedKey[index], Parent = node };
                node = node.MiddleChild;
            }
            else if (normalisedKey[index] < node.Character)
            {
                if (node.LeftChild == null)
                    node.LeftChild = new Node() { Character = normalisedKey[index], Parent = node };
                node = node.LeftChild;
            }
            else
            {
                if (node.RightChild == null)
                    node.RightChild = new Node() { Character = normalisedKey[index], Parent = node };
                node = node.RightChild;
            }
        }
    }
    return root;
}
/// <summary>
/// Combine additional data with an existing root's content, returning a new Node (unless zero data entries were specified, in which case the original
/// reference will be returned). This will throw an exception for a null keyNormaliser or data (or if any keys in the data return null, empty string
/// or duplicates when run through the keyNormaliser). If a null root is specified then a new root will be generated.
/// </summary>
private static Node Add(Node root, IStringNormaliser keyNormaliser, IEnumerable<KeyValuePair<string, TValue>> data)
{
    if (keyNormaliser == null)
        throw new ArgumentNullException("keyNormaliser");
    if (data == null)
        throw new ArgumentNullException("data");

    if (!data.Any())
        return root;

    if (root != null)
        root = root.Clone();
    foreach (var entry in data)
    {
        var key = entry.Key;
        if (key == null)
            throw new ArgumentException("Null key encountered in data");
        var normalisedKey = keyNormaliser.GetNormalisedString(key);
        if (normalisedKey == "")
            throw new ArgumentException("key value results in blank string when normalised: " + key);

        if (root == null)
            root = new Node() { Character = normalisedKey[0] };
        var node = root;
        var index = 0;
        while (true)
        {
            if (node.Character == normalisedKey[index])
            {
                index++;
                if (index == normalisedKey.Length)
                {
                    node.Key = normalisedKey;
                    node.Value = entry.Value;
                    break;
                }
                if (node.MiddleChild == null)
                    node.MiddleChild = new Node() { Character = normalisedKey[index], Parent = node };
                node = node.MiddleChild;
            }
            else if (normalisedKey[index] < node.Character)
            {
                if (node.LeftChild == null)
                    node.LeftChild = new Node() { Character = normalisedKey[index], Parent = node };
                node = node.LeftChild;
            }
            else
            {
                if (node.RightChild == null)
                    node.RightChild = new Node() { Character = normalisedKey[index], Parent = node };
                node = node.RightChild;
            }
        }
    }
    return root;
}
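As a rough sketch of the difference between the two Add overloads above (Node and the Combiner delegate are private details of the containing class, and the string TValue plus the DefaultStringNormaliser are assumptions for illustration): without a combiner, a repeated key simply overwrites the existing value, while the Combiner overload is handed both values and decides the result.

// Hypothetical illustration only - assumes TValue is string for this instance of the class
var normaliser = new DefaultStringNormaliser(); // assumption: any IStringNormaliser implementation
var root = Add(null, normaliser, new[] { new KeyValuePair<string, string>("cat", "first") });

// Plain overload: repeating the key overwrites, so the "cat" node's Value becomes "second"
root = Add(root, normaliser, new[] { new KeyValuePair<string, string>("cat", "second") });

// Combiner overload: both the existing and the new value are passed to the combine delegate,
// so the "cat" node's Value becomes "second, third"
root = Add(root, normaliser, new[] { new KeyValuePair<string, string>("cat", "third") }, (existing, added) => existing + ", " + added);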