/// <summary>
/// Full Post representation (extends the base post data with redirect slugs, markdown content, related-post ids and tags).
/// All reference-typed arguments are required; markdownContent must contain non-whitespace content.
/// </summary>
public Post(
	int id,
	DateTime posted,
	DateTime lastModified,
	string slug,
	NonNullOrEmptyStringList redirectFromSlugs,
	string title,
	bool isHighlight,
	string markdownContent,
	ImmutableList<int> relatedPosts,
	ImmutableList<int> autoSuggestedRelatedPosts,
	NonNullImmutableList<TagSummary> tags)
	: base(id, posted, lastModified, slug, title, isHighlight)
{
	// Fix: the original message read "Null/blank markdownContent content" (duplicated word) and did not
	// specify which parameter was at fault - include the paramName so callers get a precise exception
	if (string.IsNullOrWhiteSpace(markdownContent))
		throw new ArgumentException("Null/blank markdownContent", nameof(markdownContent));

	RedirectFromSlugs = redirectFromSlugs ?? throw new ArgumentNullException(nameof(redirectFromSlugs));
	MarkdownContent = markdownContent;
	RelatedPosts = relatedPosts ?? throw new ArgumentNullException(nameof(relatedPosts));
	AutoSuggestedRelatedPosts = autoSuggestedRelatedPosts ?? throw new ArgumentNullException(nameof(autoSuggestedRelatedPosts));
	Tags = tags ?? throw new ArgumentNullException(nameof(tags));
}
/// <summary>
/// Simple Post representation - every reference-typed argument is required (an ArgumentNullException is
/// thrown for any null value, identifying the offending parameter).
/// </summary>
public Post(int id, NonBlankTrimmedString title, NonBlankTrimmedString body, NonBlankTrimmedString author, DateTime publishedAt, NonNullOrEmptyStringList tags)
{
	// ??-throw expression form rather than a chain of guard-clause if statements; the observable
	// behaviour is identical since a throwing constructor never publishes the partially-built instance
	Id = id;
	PublishedAt = publishedAt;
	Title = title ?? throw new ArgumentNullException(nameof(title));
	Body = body ?? throw new ArgumentNullException(nameof(body));
	Author = author ?? throw new ArgumentNullException(nameof(author));
	Tags = tags ?? throw new ArgumentNullException(nameof(tags));
}
/// <summary>
/// Variant of Post whose related-post references are full PostStub instances rather than bare ids; the
/// base constructor is handed the projected ids while the stubs themselves are retained here.
/// </summary>
public PostWithRelatedPostStubs(
	int id,
	DateTime posted,
	DateTime lastModified,
	string slug,
	NonNullOrEmptyStringList redirectFromSlugs,
	string title,
	bool isHighlight,
	string markdownContent,
	NonNullImmutableList<PostStub> relatedPosts,
	NonNullImmutableList<PostStub> autoSuggestedRelatedPosts,
	NonNullImmutableList<TagSummary> tags)
	: base(
		id, posted, lastModified, slug, redirectFromSlugs, title, isHighlight, markdownContent,
		// The Empty fallback only prevents a NullReferenceException in the Select projection below;
		// null arguments are still rejected by the guard clauses in the constructor body
		(relatedPosts ?? NonNullImmutableList<PostStub>.Empty).Select(p => p.Id).ToImmutableList(),
		(autoSuggestedRelatedPosts ?? NonNullImmutableList<PostStub>.Empty).Select(p => p.Id).ToImmutableList(),
		tags)
{
	// Explicit guard clauses rather than ??-throw assignment expressions
	if (relatedPosts == null)
		throw new ArgumentNullException(nameof(relatedPosts));
	if (autoSuggestedRelatedPosts == null)
		throw new ArgumentNullException(nameof(autoSuggestedRelatedPosts));
	RelatedPosts = relatedPosts;
	AutoSuggestedRelatedPosts = autoSuggestedRelatedPosts;
}
/// <summary>
/// This will never return null, it will throw an exception for null input.
/// </summary>
public PostIndexContent GenerateIndexContent(NonNullImmutableList<Post> posts)
{
	if (posts == null)
		throw new ArgumentNullException(nameof(posts));

	// In common language, characters such as "." and "," indicate breaks in words (unlike "'" or "-"
	// which are commonly part of words). Because the post content contains C# (and similar) source
	// code, a raft of additional symbols must be treated as word boundaries too.
	var wordBreakCharacters = new ImmutableList<char>(new[]
	{
		'<', '>', '[', ']', '(', ')', '{', '}',
		'.', ',', ':', ';', '"', '?', '!',
		'/', '\\', '@', '+', '|', '='
	});
	var tokenBreaker = new WhiteSpaceExtendingTokenBreaker(wordBreakCharacters, new WhiteSpaceTokenBreaker());

	// Search index tokens are never shown to site users (queries go in, results come out), so an
	// aggressive normaliser (plurality handling, lower-casing, trimming) may safely be applied
	var searchIndex = GenerateIndexData(
		posts,
		new EnglishPluralityStringNormaliser(
			DefaultStringNormaliser.Instance,
			EnglishPluralityStringNormaliser.PreNormaliserWorkOptions.PreNormaliserLowerCases
			| EnglishPluralityStringNormaliser.PreNormaliserWorkOptions.PreNormaliserTrims
		),
		tokenBreaker
	);

	// AutoComplete content WILL be visible to users, so tokens are taken unaltered and then filtered
	// down to values that look like real search words: at least three characters, not starting with
	// punctuation (eg. quoted values) and containing no digits. Distinct values (ignoring case) are
	// kept only if they produce at least one match against the search index (the difference in
	// normalisation could otherwise yield words that match nothing), then ordered alphabetically
	// (again ignoring case) for the final content.
	var unalteredIndex = GenerateIndexData(posts, new NonAlteringStringNormaliser(), tokenBreaker);
	var autoCompleteWords = unalteredIndex.GetAllTokens()
		.Select(token => token.Trim())
		.Where(token => (token.Length >= 3) && !char.IsPunctuation(token[0]) && !token.Any(c => char.IsNumber(c)))
		.Distinct(StringComparer.OrdinalIgnoreCase)
		.Where(token => searchIndex.GetMatches(token).Any())
		.OrderBy(token => token.ToLower());
	return new PostIndexContent(searchIndex, new NonNullOrEmptyStringList(autoCompleteWords));
}
/// <summary>
/// Associates a non-null key with the content sections that are to be broken into tokens; both
/// arguments are required.
/// </summary>
public PreBrokenContent(TKey key, NonNullOrEmptyStringList content)
{
	// nameof rather than magic strings so the paramName values survive renames
	// (behaviour is unchanged: nameof(key) == "key" and nameof(content) == "content")
	if (key == null)
		throw new ArgumentNullException(nameof(key));
	if (content == null)
		throw new ArgumentNullException(nameof(content));
	Key = key;
	Content = content;
}
/// <summary>
/// This will break down a source search term into words (according to the logic of the specified token breaker) and then return matches where the words were found in a run in a
/// content section. Unlike GetPartialMatches it is not possible for an entry to be considered a match because it contains all of the terms in its content, the terms must be
/// present in one content field, together, in the order in which they are present in the search term. This allows for similar behaviour to that intended for the
/// ConsecutiveTokenCombiningTokenBreaker, but this offers greater performance (constructing a TernarySearchTreeDictionary to back an IndexData instance can be expensive on
/// processing time to generate, and disk / memory space to store, the runs of tokens). This also has the benefit that there is no cap on the number of tokens that can be
/// matched consecutively (a limit on this had to be decided at index generation time when using the ConsecutiveTokenCombiningTokenBreaker). There are two sets of weight
/// combining calculations required; the first (handled by the weightCombinerForConsecutiveRuns) determines a weight for each run of consecutive tokens - each run is considered
/// a single match, effectively. Each call to the first weight combiner will have as many weights to combine as there are search terms, so if the "source" value is broken
/// down into three words by the tokenBreaker then the weightCombinerForConsecutiveRuns will always be called with sets of three weights. The second weight combination
/// is performed when multiple matches for a particular result must be combined to give a final match weight for that result.
///
/// Note: This requires the index to have been built with source location data recorded - if the index's SourceLocationsAvailable property returns false then an ArgumentException
/// will be thrown.
/// </summary>
public static NonNullImmutableList<WeightedEntry<TKey>> GetConsecutiveMatches<TKey>(
	this IIndexData<TKey> index,
	string source,
	ITokenBreaker tokenBreaker,
	IndexGenerator.WeightedEntryCombiner weightCombinerForConsecutiveRuns,
	IndexGenerator.WeightedEntryCombiner weightCombinerForFinalMatches)
{
	if (index == null)
	{
		throw new ArgumentNullException("index");
	}
	if (source == null)
	{
		throw new ArgumentNullException("source");
	}
	if (tokenBreaker == null)
	{
		throw new ArgumentNullException("tokenBreaker");
	}
	if (weightCombinerForConsecutiveRuns == null)
	{
		throw new ArgumentNullException("weightCombinerForConsecutiveRuns");
	}
	if (weightCombinerForFinalMatches == null)
	{
		throw new ArgumentNullException("weightCombinerForFinalMatches");
	}
	// NOTE(review): the message below reads "in order to use identify Consecutive token matches" - looks like a stray word ("use" or "identify"); left as-is here since it is a runtime string
	if (!index.SourceLocationsAvailable)
	{
		throw new ArgumentException($"The {nameof(index)} must include source location data in order to use identify Consecutive token matches");
	}

	// If the token breaker won't actually translate the source value into multiple words then we can avoid all of the below work and just call index.GetMatches directly
	var weightAdjustedTokens = tokenBreaker.Break(source);
	if (weightAdjustedTokens.Count == 1)
	{
		return (index.GetMatches(source));
	}

	// Look up the matches for each broken-down search term up front - the index of this list corresponds to the index of the broken-down search terms. Each match's weight
	// is scaled by the WeightMultiplier that the token breaker assigned to its term.
	var matchesForSearchTerms = new List<WeightedEntry<TKey>[]>();
	foreach (var weightAdjustedToken in weightAdjustedTokens)
	{
		matchesForSearchTerms.Add(
			index.GetMatches(weightAdjustedToken.Token).Select(w => new WeightedEntry<TKey>(
				w.Key,
				w.Weight * weightAdjustedToken.WeightMultiplier,
				w.SourceLocationsIfRecorded
			)).ToArray()
		);
	}

	// For each match of the first search term, try to identify a run of token matches for the same key and source field. Any such runs will be recorded in the
	// consecutiveMatches list - these represent content segments that match the entirety of the search term (the "source" argument).
	var consecutiveMatches = new List<WeightedEntry<TKey>>();
	var searchTerms = new NonNullOrEmptyStringList(weightAdjustedTokens.Select(w => w.Token));
	foreach (var firstTermMatch in matchesForSearchTerms.First().SelectMany(m => BreakWeightedEntryIntoIndividualSourceLocations(m)))
	{
		var matchesForEntireTerm = NonNullImmutableList<WeightedEntry<TKey>>.Empty;
		matchesForEntireTerm = matchesForEntireTerm.Add(firstTermMatch);
		for (var termIndex = 1; termIndex < weightAdjustedTokens.Count; termIndex++)
		{
			// Look for a match for search term number "termIndex" that is for the same key, in the same source field, exactly "termIndex" token positions after the
			// first term's location (ie. the terms appear consecutively and in order).
			// Note: SourceLocationsIfRecorded should never be null because we checked that the index reported that SourceLocationsAvailable was true (so not checking
			// for null source locations here)
			var nTermMatch = matchesForSearchTerms[termIndex]
				.SelectMany(m => BreakWeightedEntryIntoIndividualSourceLocations<TKey>(m))
				.FirstOrDefault(m =>
					index.KeyComparer.Equals(m.Key, firstTermMatch.Key) &&
					(m.SourceLocationsIfRecorded.First().SourceFieldIndex == firstTermMatch.SourceLocationsIfRecorded.First().SourceFieldIndex) &&
					(m.SourceLocationsIfRecorded.First().TokenIndex == firstTermMatch.SourceLocationsIfRecorded.First().TokenIndex + termIndex)
				);
			if (nTermMatch == null)
			{
				break;
			}
			matchesForEntireTerm = matchesForEntireTerm.Add(nTermMatch);
		}
		if (matchesForEntireTerm.Count < weightAdjustedTokens.Count)
		{
			// If we didn't manage to get a full set of search terms then this isn't a full match
			continue;
		}

		// Combine the WeightedEntry instances that represent a run of individual matches (one for each word in the "source" argument) into a single WeightedEntry that
		// represents the entirety of the search term (each of the matchesForEntireTerm WeightedEntry instances will have only a single Source Location since the match
		// data was split up above by calling BreakWeightedEntryIntoIndividualSourceLocations before trying to find the consecutive matches). See notes above about not
		// checking whether SourceLocationsIfRecorded is null (it shouldn't be, because we checked index.SourceLocationsAvailable at the top of this method).
		var sourceLocationOfFirstTerm = matchesForEntireTerm.First().SourceLocationsIfRecorded.Single();
		var sourceLocationOfLastTerm = matchesForEntireTerm.Last().SourceLocationsIfRecorded.Single();
		var matchWeightForConsecutiveRunEntry = weightCombinerForConsecutiveRuns(
			matchesForEntireTerm.Select(m => m.Weight).ToImmutableList()
		);
		consecutiveMatches.Add(
			new WeightedEntry<TKey>(
				matchesForEntireTerm.First().Key,
				matchWeightForConsecutiveRunEntry,
				new NonNullImmutableList<SourceFieldLocation>(new[]
				{
					// Since we're creating a new SourceFieldLocation instance that is derived from a run of multiple tokens, the TokenIndex is going to be an
					// approximation - taking the TokenIndex from the first search term probably makes the most sense. The SourceIndex and SourceTokenLength will be
					// taken such that the entire run is covered (from the start of the first search term to the end of the last). Since this is the only Source
					// Location instance for the WeightedEntry, its MatchWeightContribution value is equal to the WeightedEntry's Weight.
					new SourceFieldLocation(
						sourceLocationOfFirstTerm.SourceFieldIndex,
						sourceLocationOfFirstTerm.TokenIndex,
						sourceLocationOfFirstTerm.SourceIndex,
						(sourceLocationOfLastTerm.SourceIndex + sourceLocationOfLastTerm.SourceTokenLength) - sourceLocationOfFirstTerm.SourceIndex,
						matchWeightForConsecutiveRunEntry
					)
				})
			)
		);
	}

	// The matches need grouping by key before returning: each key's run-weights are folded into a single final weight via weightCombinerForFinalMatches, and the
	// source locations of all of that key's runs are concatenated
	return (consecutiveMatches
		.GroupBy(m => m.Key, index.KeyComparer)
		.Cast<IEnumerable<WeightedEntry<TKey>>>()
		.Select(matches => new WeightedEntry<TKey>(
			matches.First().Key,
			weightCombinerForFinalMatches(
				matches.Select(match => match.Weight).ToImmutableList()
			),
			matches.SelectMany(m => m.SourceLocationsIfRecorded).ToNonNullImmutableList()
		))
		.ToNonNullImmutableList());
}
/// <summary>
/// This will never return null nor contain any null entries
/// </summary>
public async Task<NonNullImmutableList<Post>> Get()
{
	// The redirects set contains tuples From, To slugs (blank lines and those starting with a "#" are ignored, as are any that don't have any whitespace)
	const string redirectsFilename = "Redirects.txt";
	var redirectsFile = _folder.FirstOrDefault(file => file.Name.Equals(redirectsFilename, StringComparison.OrdinalIgnoreCase));
	IEnumerable<Tuple<string, string>> redirects;
	if (redirectsFile == null)
	{
		redirects = new List<Tuple<string, string>>();
	}
	else
	{
		// Normalise all line endings to "\n", then parse each surviving line as "{from} {to}" (any run of internal whitespace is first collapsed to single spaces
		// and the line is split on the FIRST space only, so the "to" half may itself contain spaces)
		redirects = (await ReadFileContents(redirectsFile))
			.Replace("\r\n", "\n")
			.Replace("\r", "\n")
			.Split('\n')
			.Select(entry => entry.Trim())
			.Where(entry => (entry != "") && !entry.StartsWith("#") && entry.Any(c => char.IsWhiteSpace(c)))
			.Select(entry => new string(entry.Select(c => char.IsWhiteSpace(c) ? ' ' : c).ToArray()))
			.Select(entry => entry.Split(new[] { ' ' }, 2))
			.Select(values => Tuple.Create(values[0], values[1]));
	}

	// The relatedPostRelationships set contains a map of Post Id to Ids of related Posts (in the order that they should appear)
	// NOTE(review): the helper is called ReadRedirects but appears to parse id-to-ids mappings here - presumably a reused line-parsing helper; confirm the name
	const string relatedPostsFilename = "RelatedPosts.txt";
	var relatedPostsFile = _folder.FirstOrDefault(file => file.Name.Equals(relatedPostsFilename, StringComparison.OrdinalIgnoreCase));
	var relatedPostRelationships = (relatedPostsFile == null)
		? new Dictionary<int, ImmutableList<int>>()
		: await ReadRedirects(relatedPostsFile);

	// There is similar data in the AutoSuggestedRelatedPosts.txt file but the manually-created RelatedPosts.txt should take precedence in cases
	// where Post Ids appear in both
	const string autoSuggestedRelatedPostsFilename = "AutoSuggestedRelatedPosts.txt";
	var autoSuggestedRelatedPostsFile = _folder.FirstOrDefault(file => file.Name.Equals(autoSuggestedRelatedPostsFilename, StringComparison.OrdinalIgnoreCase));
	var autoSuggestedRelatedPostRelationships = (autoSuggestedRelatedPostsFile == null)
		? new Dictionary<int, ImmutableList<int>>()
		: await ReadRedirects(autoSuggestedRelatedPostsFile);

	// We can use this functionality from the FullTextIndexer to generate the Post slug (it will replace accented characters, normalise whitespace,
	// remove punctuation and lower case the content - all we need to do then is replace spaces with hyphens)
	var stringNormaliser = DefaultStringNormaliser.Instance;
	var posts = new List<Post>();
	foreach (var file in _folder.Where(file => file.Name.EndsWith(".txt", StringComparison.OrdinalIgnoreCase)))
	{
		// NOTE(review): only Redirects.txt is explicitly skipped here (and with InvariantCultureIgnoreCase, unlike the OrdinalIgnoreCase used above); the
		// RelatedPosts / AutoSuggestedRelatedPosts files are presumably rejected by TryToGetFileSummaryEntry below - confirm that is the intent
		if (file.Name.Equals(redirectsFilename, StringComparison.InvariantCultureIgnoreCase))
		{
			continue;
		}
		var fileSummary = TryToGetFileSummaryEntry(file.Name);
		if (fileSummary != null)
		{
			var fileContents = await ReadFileContents(file);
			var title = TryToGetTitle(fileContents);
			if (title != null)
			{
				// 2014-09-17 DWR: Titles such as "C# State Machines" were being converted into "c-state-machines" which isn't as descriptive as
				// I'd like, "c-sharp-state-machines" is better. The replacement is done for "C#" and "F#" (a space is required after the
				// replacement content otherwise the "sharp" gets rolled into the next word)
				var slugBase = title.Replace("C#", "c sharp ").Replace("F#", "f sharp ");
				var slug = stringNormaliser.GetNormalisedString(slugBase).Replace(' ', '-');
				var redirectsForPost = new NonNullOrEmptyStringList(
					redirects.Where(r => r.Item2 == slug).Select(r => r.Item1)
				);
				// On this pass, set every tag's NumberOfPosts value to one since we don't have enough data to know better. After all of the
				// posts have been loaded, this can be fixed before the method terminates.
				if (!relatedPostRelationships.TryGetValue(fileSummary.Id, out var relatedPosts))
				{
					relatedPosts = null;
				}
				if ((relatedPosts != null) || !autoSuggestedRelatedPostRelationships.TryGetValue(fileSummary.Id, out var autoSuggestedRelatedPosts))
				{
					// Only check the autoSuggestedRelatedPostRelationships if there was no relatedPostRelationships entry - this allows for posts
					// to be specified as having no suggestions (manually-specified or auto-suggested) by having an entry in the manually-specified
					// file that has the post id but zero suggestions.
					// (Definite-assignment note: autoSuggestedRelatedPosts is declared by the out var in the condition above; when the || short-circuits
					// it is assigned null here, so it is definitely assigned on every path.)
					autoSuggestedRelatedPosts = null;
				}
				posts.Add(new Post(
					fileSummary.Id,
					fileSummary.PostDate,
					file.LastModified.DateTime,
					slug,
					redirectsForPost,
					title,
					fileSummary.IsHighlight,
					fileContents,
					relatedPosts ?? new ImmutableList<int>(),
					autoSuggestedRelatedPosts ?? new ImmutableList<int>(),
					fileSummary.Tags.Select(tag => new TagSummary(tag, 1)).ToNonNullImmutableList()
				));
			}
		}
	}

	// Second pass: count how many posts carry each tag (case-insensitively) and rebuild every Post with accurate TagSummary counts in place of the
	// placeholder value of one that was used while loading
	var tagCounts = posts
		.SelectMany(post => post.Tags)
		.Select(tag => tag.Tag)
		.GroupBy(tag => tag, StringComparer.OrdinalIgnoreCase)
		.ToDictionary(groupedTag => groupedTag.Key, groupedTag => groupedTag.Count(), StringComparer.OrdinalIgnoreCase);
	return (new NonNullImmutableList<Post>(posts.Select(post =>
		new Post(
			post.Id,
			post.Posted,
			post.LastModified,
			post.Slug,
			post.RedirectFromSlugs,
			post.Title,
			post.IsHighlight,
			post.MarkdownContent,
			post.RelatedPosts,
			post.AutoSuggestedRelatedPosts,
			post.Tags.Select(tag => new TagSummary(tag.Tag, tagCounts[tag.Tag])).ToNonNullImmutableList()
		)
	)));
}
/// <summary>
/// Pairs the search index with the auto-complete word list generated from the same posts; both
/// arguments are required.
/// </summary>
public PostIndexContent(IIndexData<int> searchIndex, NonNullOrEmptyStringList autoCompleteContent)
{
	// Explicit guard clauses rather than ??-throw assignment expressions
	if (searchIndex == null)
		throw new ArgumentNullException(nameof(searchIndex));
	if (autoCompleteContent == null)
		throw new ArgumentNullException(nameof(autoCompleteContent));
	_searchIndex = searchIndex;
	AutoCompleteContent = autoCompleteContent;
}
/// <summary>
/// This will break a given source string and return results based upon the combination of partial matches (so results that only match part of the source string may be
/// included in the returned data). The token breaker and the match combiner must be specified by the caller - if the match combiner returns zero then the result will not
/// be included in the final data. To require that all tokens in the source content be present for any returned results, the following matchCombiner could be specified:
///   (tokenMatches, allTokens) => (tokenMatches.Count &lt; allTokens.Count) ? 0 : tokenMatches.Sum(m => m.Weight)
/// </summary>
public static NonNullImmutableList<WeightedEntry<TKey>> GetPartialMatches<TKey>(
	this IIndexData<TKey> index,
	string source,
	ITokenBreaker tokenBreaker,
	WeightCombiner weightCombiner)
{
	// nameof rather than magic strings so the paramName values survive renames (behaviour unchanged: nameof(index) == "index", etc.)
	if (index == null)
	{
		throw new ArgumentNullException(nameof(index));
	}
	if (source == null)
	{
		throw new ArgumentNullException(nameof(source));
	}
	if (tokenBreaker == null)
	{
		throw new ArgumentNullException(nameof(tokenBreaker));
	}
	if (weightCombiner == null)
	{
		throw new ArgumentNullException(nameof(weightCombiner));
	}

	// Break down the "source" search term and find matches for each token
	// - Each match maintains the weight multiplier applied to the string segment from the token breaker
	// - Each match is annotated with additional data; the source segment string and its token index (so if the "source" value is broken into three, then each
	//   SearchTermDetails will have a TokenIndex between 0 and 2, inclusive). This allows for a weightCombiner to be specified that ensures that every token that was
	//   extracted from the source value can be matched against a given result, if so desired.
	var matches = new List<Tuple<WeightedEntry<TKey>, SearchTermDetails>>();
	var weightAdjustedTokens = tokenBreaker.Break(source);
	for (var tokenIndex = 0; tokenIndex < weightAdjustedTokens.Count; tokenIndex++)
	{
		var weightAdjustedToken = weightAdjustedTokens[tokenIndex];
		matches.AddRange(
			index
				.GetMatches(weightAdjustedToken.Token)
				.Select(match => Tuple.Create(match, new SearchTermDetails(tokenIndex, weightAdjustedToken.Token)))
		);
	}

	// Combine per-search-term results, grouping by result key and calculating the match weight for each token using the specified weightCombiner (this may also be
	// used to filter out results; if a match weight of zero is returned then the match will be ignored - this may be used to filter out results that only match two
	// out of three of the search terms, for example). A negative combined weight is a contract violation by the caller-provided combiner.
	var finalResults = NonNullImmutableList<WeightedEntry<TKey>>.Empty;
	var searchTerms = new NonNullOrEmptyStringList(weightAdjustedTokens.Select(w => w.Token));
	foreach (var matchesGroupedByKey in matches.GroupBy(m => m.Item1.Key, index.KeyComparer).Cast<IEnumerable<Tuple<WeightedEntry<TKey>, SearchTermDetails>>>())
	{
		var combinedWeight = weightCombiner(
			matchesGroupedByKey
				.Select(m => new MatchWeightWithSourceFieldLocations(
					m.Item1.Weight,
					m.Item2,
					m.Item1.SourceLocationsIfRecorded
				)).ToNonNullImmutableList(),
			searchTerms
		);
		if (combinedWeight < 0)
		{
			throw new ArgumentException("weightCombiner returned a negative value - invalid");
		}
		else if (combinedWeight > 0)
		{
			finalResults = finalResults.Add(
				new WeightedEntry<TKey>(
					matchesGroupedByKey.First().Item1.Key,
					combinedWeight,
					// If any contributing match lacks recorded source locations then the combined entry cannot reliably report them, so null is used in that case
					matchesGroupedByKey.Any(m => m.Item1.SourceLocationsIfRecorded == null)
						? null
						: matchesGroupedByKey.SelectMany(m => m.Item1.SourceLocationsIfRecorded).ToNonNullImmutableList()
				)
			);
		}
	}
	return finalResults;
}