public EnglishPluralityStringNormaliser(IEnumerable<PluralEntry> plurals, IStringNormaliser optionalPreNormaliser, PreNormaliserWorkOptions preNormaliserWork)
{
    if (plurals == null)
        throw new ArgumentNullException(nameof(plurals));

    // Combine every defined flag so that we can verify that preNormaliserWork contains no undefined bits
    var allPreNormaliserOptions = (PreNormaliserWorkOptions)0;
    foreach (PreNormaliserWorkOptions option in Enum.GetValues(typeof(PreNormaliserWorkOptions)))
        allPreNormaliserOptions = allPreNormaliserOptions | option;
    if ((preNormaliserWork & allPreNormaliserOptions) != preNormaliserWork)
        throw new ArgumentOutOfRangeException(nameof(preNormaliserWork));

    var pluralsTidied = new List<PluralEntry>();
    foreach (var plural in plurals)
    {
        if (plural == null)
            throw new ArgumentException("Null reference encountered in plurals set");
        pluralsTidied.Add(plural);
    }

    // Although we don't need the plurals once the normaliser has been generated in normal operation, if the instance is to be
    // serialised then we need to record them so that the normaliser can be re-generated at deserialisation (as the normaliser
    // that is generated cannot be serialised - see GetObjectData)
    _plurals = pluralsTidied;
    _normaliser = GenerateNormaliser();
    _optionalPreNormaliser = optionalPreNormaliser;
    _preNormaliserWork = preNormaliserWork;
}
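// Illustrative usage sketch (added; not part of the source). The PreNormaliserWorkOptions flag names, the hypothetical
// DefaultStringNormaliser and the public visibility of DefaultPlurals are assumptions for illustration - the flags tell this
// class which work (eg. lower-casing, trimming) the pre-normaliser already performs so that it isn't repeated here.
var pluralityNormaliser = new EnglishPluralityStringNormaliser(
    EnglishPluralityStringNormaliser.DefaultPlurals,      // the standard PluralEntry set (see the constructor overload below)
    new DefaultStringNormaliser(),                        // hypothetical IStringNormaliser that lower-cases and trims
    PreNormaliserWorkOptions.PreNormaliserLowerCases | PreNormaliserWorkOptions.PreNormaliserTrims
);
// Assuming the default plural set covers the simple append-s form, singular and plural variants normalise to the same value
var treatedTheSame = pluralityNormaliser.GetNormalisedString("cat") == pluralityNormaliser.GetNormalisedString("cats");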
public TextFileBasedTSTWordLookup(FileInfo file, IStringNormaliser stringNormaliser)
{
    _file = file ?? throw new ArgumentNullException(nameof(file));
    _stringNormaliser = stringNormaliser ?? throw new ArgumentNullException(nameof(stringNormaliser));
    _data = new Lazy<TernarySearchTreeDictionary<bool>>(
        GenerateLookup,
        true // isThreadSafe
    );
}
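// Added note: because the Lazy<T> above is constructed with isThreadSafe set to true, GenerateLookup will run at most once
// even if multiple threads request _data.Value concurrently - every caller then shares the single generated tree. A minimal
// sketch of how the lookup might be consumed (this Contains method and the tree's TryGetValue call are assumptions for
// illustration, not confirmed API):
public bool Contains(string word)
{
    if (word == null)
        throw new ArgumentNullException(nameof(word));
    // The first access triggers the (potentially expensive) file parse; subsequent calls reuse the cached tree
    return _data.Value.TryGetValue(_stringNormaliser.GetNormalisedString(word), out _);
}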
private TernarySearchTreeDictionary(Node rootIfAny, IStringNormaliser keyNormaliser)
{
    if (keyNormaliser == null)
        throw new ArgumentNullException(nameof(keyNormaliser));

    _root = rootIfAny;
    _keyNormaliser = keyNormaliser;
}
public AuthUserBaseRepository(IOptions<MongoDbSettings> dbSettings, IStringNormaliser normaliser)
{
    if (dbSettings == null)
        throw new ArgumentNullException(nameof(dbSettings));

    // Note: a constructor can never return null, so there is no need to null-check the client or database references here -
    // validating the incoming arguments is what guards against failure
    var client = new MongoClient(dbSettings.Value.ConnectionString);
    _database = client.GetDatabase(dbSettings.Value.Database);
    _users = _database.GetCollection<TAuthUser>("AuthUsers");
    _normaliser = normaliser ?? throw new ArgumentNullException(nameof(normaliser));
}
public TernarySearchTreeDictionary(IEnumerable<KeyValuePair<string, TValue>> data, IStringNormaliser keyNormaliser)
{
    if (data == null)
        throw new ArgumentNullException(nameof(data));
    if (keyNormaliser == null)
        throw new ArgumentNullException(nameof(keyNormaliser));

    _root = Add(null, keyNormaliser, data);
    _keyNormaliser = keyNormaliser;
}
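// Illustrative usage sketch (added; not part of the source): the keyNormaliser determines what counts as the "same" key, so
// any lookup form that normalises identically should locate the same entry. The CaseInsensitiveStringNormaliser here is a
// hypothetical IStringNormaliser used only for illustration.
var tree = new TernarySearchTreeDictionary<int>(
    new[]
    {
        new KeyValuePair<string, int>("Alpha", 1),
        new KeyValuePair<string, int>("beta", 2)
    },
    new CaseInsensitiveStringNormaliser()
);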
public IndexGenerator(
    NonNullImmutableList<ContentRetriever<TSource, TKey>> contentRetrievers,
    IEqualityComparer<TKey> dataKeyComparer,
    IStringNormaliser sourceStringComparer,
    ITokenBreaker tokenBreaker,
    IndexGenerator.WeightedEntryCombiner weightedEntryCombiner,
    bool captureSourceLocations,
    ILogger logger)
{
    _contentRetrievers = contentRetrievers ?? throw new ArgumentNullException(nameof(contentRetrievers));
    _dataKeyComparer = dataKeyComparer ?? throw new ArgumentNullException(nameof(dataKeyComparer));
    _sourceStringComparer = sourceStringComparer ?? throw new ArgumentNullException(nameof(sourceStringComparer));
    _tokenBreaker = tokenBreaker ?? throw new ArgumentNullException(nameof(tokenBreaker));
    _weightedEntryCombiner = weightedEntryCombiner ?? throw new ArgumentNullException(nameof(weightedEntryCombiner));
    _captureSourceLocations = captureSourceLocations;
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public PostIndexGenerator(ITokenBreaker tokenBreaker, IStringNormaliser sourceStringComparer, ILogger logger)
{
    if (tokenBreaker == null)
        throw new ArgumentNullException(nameof(tokenBreaker));
    if (sourceStringComparer == null)
        throw new ArgumentNullException(nameof(sourceStringComparer));
    if (logger == null)
        throw new ArgumentNullException(nameof(logger));

    _tokenBreaker = tokenBreaker;
    _sourceStringComparer = sourceStringComparer;

    // Using the string normaliser as the HashSet's equality comparer means that stop word lookups compare normalised values,
    // so any token whose normalised form matches a normalised stop word will be treated as a stop word
    _stopWordLookup = new HashSet<string>(Constants.GetStopWords("en"), _sourceStringComparer);

    _logger = logger;
}
public AutomatedIndexGeneratorFactory(
    Func<TSource, TKey> keyRetriever,
    IEqualityComparer<TKey> keyComparer,
    IStringNormaliser stringNormaliser,
    ITokenBreaker tokenBreaker,
    IndexGenerator.WeightedEntryCombiner weightedEntryCombiner,
    WeightDeterminerGenerator brokenTokenWeightDeterminerGenerator,
    PropertyInfo optionalPropertyForFirstContentRetriever,
    bool captureSourceLocations,
    ILogger logger)
{
    _keyRetriever = keyRetriever ?? throw new ArgumentNullException(nameof(keyRetriever));
    _keyComparer = keyComparer ?? throw new ArgumentNullException(nameof(keyComparer));
    _stringNormaliser = stringNormaliser ?? throw new ArgumentNullException(nameof(stringNormaliser));
    _tokenBreaker = tokenBreaker ?? throw new ArgumentNullException(nameof(tokenBreaker));
    _weightedEntryCombiner = weightedEntryCombiner ?? throw new ArgumentNullException(nameof(weightedEntryCombiner));
    _brokenTokenWeightDeterminerGenerator = brokenTokenWeightDeterminerGenerator ?? throw new ArgumentNullException(nameof(brokenTokenWeightDeterminerGenerator));
    _optionalPropertyForFirstContentRetriever = optionalPropertyForFirstContentRetriever;
    _captureSourceLocations = captureSourceLocations;
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
private AutomatedIndexGeneratorFactoryBuilder(
    Func<TSource, TKey> keyRetrieverOverride,
    IEqualityComparer<TKey> keyComparerOverride,
    IStringNormaliser stringNormaliserOverride,
    ITokenBreaker tokenBreaker,
    IndexGenerator.WeightedEntryCombiner weightedEntryCombinerOverride,
    NonNullImmutableList<IModifyMatchWeights> propertyWeightAppliers,
    AutomatedIndexGeneratorFactory<TSource, TKey>.WeightDeterminerGenerator tokenWeightDeterminerGeneratorOverride,
    PropertyInfo optionalPropertyForFirstContentRetriever,
    bool captureSourceLocations,
    ILogger loggerOverride)
{
    _keyRetrieverOverride = keyRetrieverOverride;
    _keyComparerOverride = keyComparerOverride;
    _stringNormaliserOverride = stringNormaliserOverride;
    _tokenBreaker = tokenBreaker ?? throw new ArgumentNullException(nameof(tokenBreaker));
    _weightedEntryCombinerOverride = weightedEntryCombinerOverride;
    _propertyWeightAppliers = propertyWeightAppliers ?? throw new ArgumentNullException(nameof(propertyWeightAppliers));
    _tokenWeightDeterminerGeneratorOverride = tokenWeightDeterminerGeneratorOverride;
    _optionalPropertyForFirstContentRetriever = optionalPropertyForFirstContentRetriever;
    _captureSourceLocations = captureSourceLocations;
    _loggerOverride = loggerOverride;
}
private static IndexData<TKey> RebuildIndexFromMatchDataAndReferencedKeys(
    BinaryReader reader,
    ImmutableList<TKey> keys,
    IStringNormaliser stringNormaliser,
    IEqualityComparer<TKey> keyComparer)
{
    if (reader == null)
        throw new ArgumentNullException(nameof(reader));
    if (keys == null)
        throw new ArgumentNullException(nameof(keys));
    if (stringNormaliser == null)
        throw new ArgumentNullException(nameof(stringNormaliser));
    if (keyComparer == null)
        throw new ArgumentNullException(nameof(keyComparer));

    var numberOfTokens = reader.ReadInt32();
    var matchDictionary = new Dictionary<string, NonNullImmutableList<WeightedEntry<TKey>>>(stringNormaliser);
    for (var tokenIndex = 0; tokenIndex < numberOfTokens; tokenIndex++)
    {
        var token = reader.ReadString();
        var numberOfMatchesForToken = reader.ReadInt32();
        var matches = NonNullImmutableList<WeightedEntry<TKey>>.Empty;
        for (var matchIndex = 0; matchIndex < numberOfMatchesForToken; matchIndex++)
        {
            var keyIndex = reader.ReadInt32();
            if ((keyIndex < 0) || (keyIndex >= keys.Count))
                throw new Exception("Invalid keyIndex (" + keyIndex + ")");
            var matchWeight = reader.ReadSingle();

            // A zero source location count indicates that source locations were not recorded when the index was serialised
            var numberOfSourceLocations = reader.ReadInt32();
            NonNullImmutableList<SourceFieldLocation> sourceLocationsIfRecorded;
            if (numberOfSourceLocations == 0)
                sourceLocationsIfRecorded = null;
            else
            {
                sourceLocationsIfRecorded = NonNullImmutableList<SourceFieldLocation>.Empty;
                for (var sourceLocationIndex = 0; sourceLocationIndex < numberOfSourceLocations; sourceLocationIndex++)
                {
                    sourceLocationsIfRecorded = sourceLocationsIfRecorded.Add(
                        new SourceFieldLocation(
                            reader.ReadInt32(),
                            reader.ReadInt32(),
                            reader.ReadInt32(),
                            reader.ReadInt32(),
                            reader.ReadSingle()
                        )
                    );
                }
            }
            matches = matches.Add(
                new WeightedEntry<TKey>(
                    keys[keyIndex],
                    matchWeight,
                    sourceLocationsIfRecorded
                )
            );
        }
        matchDictionary.Add(token, matches);
    }
    return new IndexData<TKey>(
        new TernarySearchTreeDictionary<NonNullImmutableList<WeightedEntry<TKey>>>(
            matchDictionary,
            stringNormaliser
        ),
        keyComparer
    );
}
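// Added reference (reconstructed from the reads above; not present in the source): the serialised match data layout is
//   Int32  numberOfTokens
//   then per token:
//     String token
//     Int32  numberOfMatchesForToken
//     then per match:
//       Int32  keyIndex (an index into the shared keys list)
//       Single matchWeight
//       Int32  numberOfSourceLocations (zero meaning no source locations were recorded)
//       then per source location: four Int32 values and a Single, passed positionally to the SourceFieldLocation constructor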
private static IIndexData<int> GenerateIndexData(NonNullImmutableList<Post> posts, IStringNormaliser sourceStringComparer, ITokenBreaker tokenBreaker)
{
    if (posts == null)
        throw new ArgumentNullException(nameof(posts));
    if (sourceStringComparer == null)
        throw new ArgumentNullException(nameof(sourceStringComparer));
    if (tokenBreaker == null)
        throw new ArgumentNullException(nameof(tokenBreaker));

    // The Post (plain text) content is always the first field since its Content Retriever is first, meaning that all source
    // locations for the content will have a SourceFieldIndex of zero
    var contentRetrievers = new List<ContentRetriever<Post, int>>
    {
        new ContentRetriever<Post, int>(
            p => new PreBrokenContent<int>(p.Id, p.GetContentAsPlainText()),
            GetTokenWeightDeterminer(1f, sourceStringComparer)
        ),
        new ContentRetriever<Post, int>(
            p => new PreBrokenContent<int>(p.Id, p.Title),
            GetTokenWeightDeterminer(5f, sourceStringComparer)
        ),
        new ContentRetriever<Post, int>(
            p => new PreBrokenContent<int>(p.Id, new NonNullOrEmptyStringList(p.Tags.Select(tag => tag.Tag))),
            GetTokenWeightDeterminer(3f, sourceStringComparer)
        )
    };

    return new IndexGenerator<Post, int>(
        contentRetrievers.ToNonNullImmutableList(),
        new DefaultEqualityComparer<int>(),
        sourceStringComparer,
        tokenBreaker,
        weightedValues => weightedValues.Sum(),
        captureSourceLocations: true,
        new NullLogger()
    ).Generate(posts.ToNonNullImmutableList());
}
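// Illustrative sketch (added; not part of the source): querying the generated index. GetMatches and the Key / Weight
// properties on the returned entries are assumptions for illustration. Because Title tokens are weighted 5x and Tags 3x
// relative to body content (see the multipliers above), a Post with the term in its Title should rank above one that only
// mentions it in the body.
var index = GenerateIndexData(posts, sourceStringComparer, tokenBreaker);
foreach (var match in index.GetMatches("immutable"))
    Console.WriteLine($"Post {match.Key} matched with weight {match.Weight}");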
private static ContentRetriever<Post, int>.BrokenTokenWeightDeterminer GetTokenWeightDeterminer(float multiplier, IStringNormaliser sourceStringComparer)
{
    if (multiplier <= 0)
        throw new ArgumentOutOfRangeException(nameof(multiplier), "must be greater than zero");
    if (sourceStringComparer == null)
        throw new ArgumentNullException(nameof(sourceStringComparer));

    // Constructing a HashSet of the normalised versions of the stop words means that looking up whether normalised tokens
    // are stop words can be a lot faster (as neither the stop words nor the token need to be fed through the normaliser again)
    var hashSetOfNormalisedStopWords = new HashSet<string>(
        FullTextIndexer.Core.Constants.GetStopWords("en").Select(word => sourceStringComparer.GetNormalisedString(word))
    );
    return normalisedToken => multiplier * (hashSetOfNormalisedStopWords.Contains(normalisedToken) ? 0.01f : 1f);
}
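// Illustrative sketch (added; not part of the source): the returned delegate maps an already-normalised token to a weight.
// Stop words are damped to 1% rather than discarded outright, so they can still contribute to matches, just very weakly.
var titleWeightDeterminer = GetTokenWeightDeterminer(5f, sourceStringComparer);
var stopWordWeight = titleWeightDeterminer(sourceStringComparer.GetNormalisedString("the"));    // 5 * 0.01f = 0.05f
var ordinaryWeight = titleWeightDeterminer(sourceStringComparer.GetNormalisedString("search")); // 5 * 1f    = 5f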
public TernarySearchTreeStructDictionary(IEnumerable<KeyValuePair<string, TValue>> data, IStringNormaliser keyNormaliser)
{
    if (data == null)
        throw new ArgumentNullException(nameof(data));
    if (keyNormaliser == null)
        throw new ArgumentNullException(nameof(keyNormaliser));

    var nodes = new List<Node> { GetUnintialisedNode() };
    var values = new List<TValue>();

    // The keyNormaliser doubles as the HashSet's equality comparer, so duplicate detection operates on normalised key values
    var keys = new HashSet<string>(keyNormaliser);
    foreach (var entry in data)
    {
        var key = entry.Key;
        if (key == null)
            throw new ArgumentException("Null key encountered in data");
        var normalisedKey = keyNormaliser.GetNormalisedString(key);
        if (normalisedKey == "")
            throw new ArgumentException("key value results in blank string when normalised: " + key);
        if (keys.Contains(normalisedKey))
            throw new ArgumentException("key value results in duplicate normalised key: " + key);
        keys.Add(key);

        // The root node's character is only unset before the first key is processed
        if (nodes[0].Character == (char)0)
            nodes[0] = SetCharacter(nodes[0], normalisedKey[0]);

        var nodeIndex = 0;
        var keyIndex = 0;
        while (true)
        {
            if (nodes[nodeIndex].Character == normalisedKey[keyIndex])
            {
                keyIndex++;
                if (keyIndex == normalisedKey.Length)
                {
                    // The entire key has been matched; record the value against this node
                    var newValueIndex = values.Count;
                    values.Add(entry.Value);
                    nodes[nodeIndex] = SetValueIndex(nodes[nodeIndex], newValueIndex);
                    break;
                }
                if (nodes[nodeIndex].MiddleChildIndex == -1)
                {
                    var newNode = SetCharacter(GetUnintialisedNode(), normalisedKey[keyIndex]);
                    var newNodeIndex = nodes.Count;
                    nodes.Add(newNode);
                    nodes[nodeIndex] = SetMiddleChildIndex(nodes[nodeIndex], newNodeIndex);
                    nodeIndex = newNodeIndex;
                }
                else
                    nodeIndex = nodes[nodeIndex].MiddleChildIndex;
                continue;
            }
            else if (normalisedKey[keyIndex] < nodes[nodeIndex].Character)
            {
                if (nodes[nodeIndex].LeftChildIndex == -1)
                {
                    var newNode = SetCharacter(GetUnintialisedNode(), normalisedKey[keyIndex]);
                    var newNodeIndex = nodes.Count;
                    nodes.Add(newNode);
                    nodes[nodeIndex] = SetLeftChildIndex(nodes[nodeIndex], newNodeIndex);
                    nodeIndex = newNodeIndex;
                }
                else
                    nodeIndex = nodes[nodeIndex].LeftChildIndex;
            }
            else
            {
                if (nodes[nodeIndex].RightChildIndex == -1)
                {
                    var newNode = SetCharacter(GetUnintialisedNode(), normalisedKey[keyIndex]);
                    var newNodeIndex = nodes.Count;
                    nodes.Add(newNode);
                    nodes[nodeIndex] = SetRightChildIndex(nodes[nodeIndex], newNodeIndex);
                    nodeIndex = newNodeIndex;
                }
                else
                    nodeIndex = nodes[nodeIndex].RightChildIndex;
            }
        }
    }
    _nodes = nodes.ToArray();
    _values = values.ToArray();
    _keyNormaliser = keyNormaliser;
    _keys = keys.ToList().AsReadOnly();
}
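// Added note (not part of the source): unlike the class-based Node tree elsewhere in this section, this variant flattens the
// ternary search tree into a Node[] in which LeftChildIndex / MiddleChildIndex / RightChildIndex are integer offsets into the
// same array (-1 meaning "no child") and values live in a parallel TValue[]. The likely motivation - stated here as an
// assumption - is fewer object allocations and better memory locality than a pointer-linked node graph.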
public AutomatedIndexGeneratorFactoryBuilder<TSource, TKey> SetStringNormaliser(IStringNormaliser stringNormaliser)
{
    if (stringNormaliser == null)
        throw new ArgumentNullException(nameof(stringNormaliser));

    return new AutomatedIndexGeneratorFactoryBuilder<TSource, TKey>(
        _keyRetrieverOverride,
        _keyComparerOverride,
        stringNormaliser,
        _tokenBreaker,
        _weightedEntryCombinerOverride,
        _propertyWeightAppliers,
        _tokenWeightDeterminerGeneratorOverride,
        _optionalPropertyForFirstContentRetriever,
        _captureSourceLocations,
        _loggerOverride
    );
}
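// Illustrative sketch (added; not part of the source): the builder is immutable, so SetStringNormaliser returns a new builder
// with the override applied rather than mutating this one. The chained Get() calls below are assumptions for illustration:
// the first assumed to produce the configured AutomatedIndexGeneratorFactory and the second the IndexGenerator itself.
var indexGenerator = new AutomatedIndexGeneratorFactoryBuilder<Post, int>()
    .SetStringNormaliser(new EnglishPluralityStringNormaliser(new DefaultStringNormaliser(), PreNormaliserWorkOptions.PreNormaliserDoesNothing))
    .Get()  // build the factory
    .Get(); // build the index generator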
private AutomatedIndexGeneratorFactory<TSource, TKey>.WeightDeterminerGenerator GetDefaultTokenWeightDeterminerGenerator(IStringNormaliser stringNormaliser)
{
    if (stringNormaliser == null)
        throw new ArgumentNullException(nameof(stringNormaliser));

    // Constructing a HashSet of the normalised versions of the stop words means that looking up whether normalised tokens
    // are stop words can be a lot faster (as neither the stop words nor the token need to be fed through the normaliser again)
    var hashSetOfNormalisedStopWords = new HashSet<string>(
        Constants.GetStopWords("en").Select(word => stringNormaliser.GetNormalisedString(word))
    );
    return property =>
    {
        // Reverse the propertyWeightAppliers so that later values added to the set take precedence (eg. if, for some reason,
        // a x5 weight is given to a property and then later it's set to be ignored, then we want to ignore it - which this
        // will achieve)
        var propertyWeightApplier = _propertyWeightAppliers.Reverse().FirstOrDefault(p => p.AppliesTo(property));
        if ((propertyWeightApplier != null) && (propertyWeightApplier.WeightMultiplier == 0))
        {
            // A weight multiplier of zero means ignore this property, as does returning null from a WeightDeterminerGenerator call
            return null;
        }
        var weightMultiplier = (propertyWeightApplier != null) ? propertyWeightApplier.WeightMultiplier : 1;
        return normalisedToken => weightMultiplier * (hashSetOfNormalisedStopWords.Contains(normalisedToken) ? 0.01f : 1f);
    };
}
/// <summary>
/// Combine additional data with an existing root's content, returning a new Node (unless zero data entries were specified, in
/// which case the original reference will be returned). This will throw an exception for a null keyNormaliser or data (or if
/// any keys in the data are null, or produce an empty string or duplicates when run through the keyNormaliser). If a null
/// root is specified then a new root will be generated.
/// </summary>
private static Node Add(Node root, IStringNormaliser keyNormaliser, IEnumerable<KeyValuePair<string, TValue>> data)
{
    if (keyNormaliser == null)
        throw new ArgumentNullException(nameof(keyNormaliser));
    if (data == null)
        throw new ArgumentNullException(nameof(data));

    if (!data.Any())
        return root;

    // Work against a clone so that the existing tree is not mutated
    if (root != null)
        root = root.Clone();
    foreach (var entry in data)
    {
        var key = entry.Key;
        if (key == null)
            throw new ArgumentException("Null key encountered in data");
        var normalisedKey = keyNormaliser.GetNormalisedString(key);
        if (normalisedKey == "")
            throw new ArgumentException("key value results in blank string when normalised: " + key);

        if (root == null)
            root = new Node { Character = normalisedKey[0] };

        var node = root;
        var index = 0;
        while (true)
        {
            if (node.Character == normalisedKey[index])
            {
                index++;
                if (index == normalisedKey.Length)
                {
                    node.Key = normalisedKey;
                    node.Value = entry.Value;
                    break;
                }
                if (node.MiddleChild == null)
                    node.MiddleChild = new Node { Character = normalisedKey[index], Parent = node };
                node = node.MiddleChild;
            }
            else if (normalisedKey[index] < node.Character)
            {
                if (node.LeftChild == null)
                    node.LeftChild = new Node { Character = normalisedKey[index], Parent = node };
                node = node.LeftChild;
            }
            else
            {
                if (node.RightChild == null)
                    node.RightChild = new Node { Character = normalisedKey[index], Parent = node };
                node = node.RightChild;
            }
        }
    }
    return root;
}
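// Added walkthrough (not part of the source): how the ternary layout falls out of the Add logic above. Inserting "cat" then
// "car" with a pass-through normaliser: 'c'-'a'-'t' are linked via MiddleChild as each character matches in turn; for "car",
// 'c' and 'a' match in place and at the 't' node the comparison 'r' < 't' creates a LeftChild holding 'r', which becomes the
// terminal node for "car". Middle children advance one character through the key; left/right children branch to smaller or
// larger characters at the same key position.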
public UserRepository(IOptions<MongoDbSettings> dbSettings, IStringNormaliser normaliser) : base(dbSettings, normaliser) { }
public EnglishPluralityStringNormaliser(IStringNormaliser optionalPreNormaliser, PreNormaliserWorkOptions preNormaliserWork) : this(DefaultPlurals, optionalPreNormaliser, preNormaliserWork) { }
private static Node Add(Node root, IStringNormaliser keyNormaliser, IEnumerable<KeyValuePair<string, TValue>> data, Combiner combine)
{
    if (keyNormaliser == null)
        throw new ArgumentNullException(nameof(keyNormaliser));
    if (data == null)
        throw new ArgumentNullException(nameof(data));
    if (combine == null)
        throw new ArgumentNullException(nameof(combine));

    if (!data.Any())
        return root;

    // Work against a clone so that the existing tree is not mutated
    if (root != null)
        root = root.Clone();
    foreach (var entry in data)
    {
        var key = entry.Key;
        if (key == null)
            throw new ArgumentException("Null key encountered in data");
        var normalisedKey = keyNormaliser.GetNormalisedString(key);
        if (normalisedKey == "")
            throw new ArgumentException("key value results in blank string when normalised: " + key);

        if (root == null)
            root = new Node { Character = normalisedKey[0] };

        var node = root;
        var index = 0;
        while (true)
        {
            if (node.Character == normalisedKey[index])
            {
                index++;
                if (index == normalisedKey.Length)
                {
                    // If values exist on both sides then combine them, otherwise take whichever is non-null
                    node.Value = ((node.Value != null) && (entry.Value != null))
                        ? combine(node.Value, entry.Value)
                        : ((node.Value != null) ? node.Value : entry.Value);
                    // If we ended up with a null Value then the Combiner may have removed it, in which case we should set
                    // the Key to null as well
                    node.Key = (node.Value == null) ? null : normalisedKey;
                    break;
                }
                if (node.MiddleChild == null)
                    node.MiddleChild = new Node { Character = normalisedKey[index], Parent = node };
                node = node.MiddleChild;
            }
            else if (normalisedKey[index] < node.Character)
            {
                if (node.LeftChild == null)
                    node.LeftChild = new Node { Character = normalisedKey[index], Parent = node };
                node = node.LeftChild;
            }
            else
            {
                if (node.RightChild == null)
                    node.RightChild = new Node { Character = normalisedKey[index], Parent = node };
                node = node.RightChild;
            }
        }
    }
    return root;
}
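// Illustrative sketch (added; not part of the source): the Combiner decides what happens when an incoming entry's normalised
// key collides with an existing node's value. A last-write-wins combiner - consistent with the _takeNewValue name used by the
// constructor below, though its exact definition is an assumption - might look like:
private static readonly Combiner _takeNewValue = (existingValue, newValue) => newValue;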
public TernarySearchTreeDictionary(IEnumerable<KeyValuePair<string, TValue>> data, IStringNormaliser keyNormaliser)
    : this(Add(root: null, keyNormaliser: keyNormaliser, data: data, combine: _takeNewValue), keyNormaliser) { }