Example 1
0
        /// <summary>
        /// Represents a fully-populated blog post. All reference-type arguments must be non-null and
        /// markdownContent must contain non-whitespace content; related posts are referenced by id.
        /// </summary>
        public Post(
            int id,
            DateTime posted,
            DateTime lastModified,
            string slug,
            NonNullOrEmptyStringList redirectFromSlugs,
            string title,
            bool isHighlight,
            string markdownContent,
            ImmutableList <int> relatedPosts,
            ImmutableList <int> autoSuggestedRelatedPosts,
            NonNullImmutableList <TagSummary> tags)
            : base(id, posted, lastModified, slug, title, isHighlight)
        {
            // Whitespace-only content is rejected as well as null/empty
            if (string.IsNullOrWhiteSpace(markdownContent))
            {
                throw new ArgumentException("Null/blank markdownContent specified");
            }

            RedirectFromSlugs         = redirectFromSlugs ?? throw new ArgumentNullException(nameof(redirectFromSlugs));
            MarkdownContent           = markdownContent;
            RelatedPosts              = relatedPosts ?? throw new ArgumentNullException(nameof(relatedPosts));
            AutoSuggestedRelatedPosts = autoSuggestedRelatedPosts ?? throw new ArgumentNullException(nameof(autoSuggestedRelatedPosts));
            Tags = tags ?? throw new ArgumentNullException(nameof(tags));
        }
Example 2
0
        /// <summary>
        /// Simple immutable post representation - every reference-type argument must be non-null
        /// (an ArgumentNullException is thrown otherwise)
        /// </summary>
        public Post(int id, NonBlankTrimmedString title, NonBlankTrimmedString body, NonBlankTrimmedString author, DateTime publishedAt, NonNullOrEmptyStringList tags)
        {
            Id          = id;
            Title       = title ?? throw new ArgumentNullException(nameof(title));
            Body        = body ?? throw new ArgumentNullException(nameof(body));
            Author      = author ?? throw new ArgumentNullException(nameof(author));
            PublishedAt = publishedAt;
            Tags        = tags ?? throw new ArgumentNullException(nameof(tags));
        }
 /// <summary>
 /// Variant of Post where related posts are full stubs rather than bare ids. The base constructor
 /// requires id lists, so the stub lists are projected down to ids for the base call (coalescing to
 /// an empty list there so that a null argument reaches the explicit guard clauses below rather
 /// than causing a NullReferenceException in the Select).
 /// </summary>
 public PostWithRelatedPostStubs(
     int id,
     DateTime posted,
     DateTime lastModified,
     string slug,
     NonNullOrEmptyStringList redirectFromSlugs,
     string title,
     bool isHighlight,
     string markdownContent,
     NonNullImmutableList <PostStub> relatedPosts,
     NonNullImmutableList <PostStub> autoSuggestedRelatedPosts,
     NonNullImmutableList <TagSummary> tags)
     : base(
         id,
         posted,
         lastModified,
         slug,
         redirectFromSlugs,
         title,
         isHighlight,
         markdownContent,
         (relatedPosts ?? NonNullImmutableList <PostStub> .Empty).Select(p => p.Id).ToImmutableList(),
         (autoSuggestedRelatedPosts ?? NonNullImmutableList <PostStub> .Empty).Select(p => p.Id).ToImmutableList(),
         tags)
 {
     if (relatedPosts == null)
     {
         throw new ArgumentNullException(nameof(relatedPosts));
     }
     if (autoSuggestedRelatedPosts == null)
     {
         throw new ArgumentNullException(nameof(autoSuggestedRelatedPosts));
     }

     RelatedPosts = relatedPosts;
     AutoSuggestedRelatedPosts = autoSuggestedRelatedPosts;
 }
Example 4
0
        /// <summary>
        /// This will never return null, it will throw an exception for null input.
        /// </summary>
        public PostIndexContent GenerateIndexContent(NonNullImmutableList <Post> posts)
        {
            if (posts == null)
            {
                throw new ArgumentNullException(nameof(posts));
            }

            // Characters such as "." and "," indicate word breaks in common language (unlike "'" or "-", which are
            // commonly part of words). Content containing C# (or other similar languages) introduces a raft of extra
            // characters that need to be treated the same way, so they are all handed to the token breaker here.
            var tokenBreaker = new WhiteSpaceExtendingTokenBreaker(
                new ImmutableList <char>(new[] {
                '<', '>', '[', ']', '(', ')', '{', '}',
                '.', ',', ':', ';', '"', '?', '!',
                '/', '\\',
                '@', '+', '|', '='
            }),
                new WhiteSpaceTokenBreaker()
                );

            // The searchable index applies aggressive normalisation (plurality handling, lower-casing, trimming) - that
            // is acceptable because these tokens are never shown to a site user; strings are passed into the index to
            // match against and only the results are visible.
            var searchIndex = GenerateIndexData(
                posts,
                new EnglishPluralityStringNormaliser(
                    DefaultStringNormaliser.Instance,
                    EnglishPluralityStringNormaliser.PreNormaliserWorkOptions.PreNormaliserLowerCases
                    | EnglishPluralityStringNormaliser.PreNormaliserWorkOptions.PreNormaliserTrims
                    ),
                tokenBreaker
                );

            // AutoComplete content WILL be visible to the user, so token normalisation can't be as aggressive here. A
            // second token set is generated in the same manner but with the strings taken unaltered, then filtered down
            // to values that look like searchable words: at least three characters, not starting with punctuation (eg.
            // quoted values) and containing no digits. Distinct values (ignoring case) are taken, a final pass against
            // the Search Index data removes any AutoComplete words that would match nothing when searched on (possible
            // because the two indexes normalise differently) and the survivors are ordered alphabetically (again,
            // ignoring case) to give the final content.
            var unalteredIndex = GenerateIndexData(
                posts,
                new NonAlteringStringNormaliser(),
                tokenBreaker
                );
            var autoCompleteTokens = new NonNullOrEmptyStringList(
                unalteredIndex.GetAllTokens()
                .Select(t => t.Trim())
                .Where(t => (t.Length >= 3) && !char.IsPunctuation(t[0]) && !t.Any(c => char.IsNumber(c)))
                .Distinct(StringComparer.OrdinalIgnoreCase)
                .Where(t => searchIndex.GetMatches(t).Any())
                .OrderBy(t => t.ToLower())
                );

            return(new PostIndexContent(searchIndex, autoCompleteTokens));
        }
        /// <summary>
        /// Associates a key with the content sections that will be broken into tokens. Neither argument
        /// may be null.
        /// </summary>
        public PreBrokenContent(TKey key, NonNullOrEmptyStringList content)
        {
            // nameof is used (rather than hard-coded strings) for consistency with the other
            // constructors in this project and so that renames can't leave the messages stale
            if (key == null)
            {
                throw new ArgumentNullException(nameof(key));
            }
            if (content == null)
            {
                throw new ArgumentNullException(nameof(content));
            }

            Key     = key;
            Content = content;
        }
Example 6
0
        /// <summary>
        /// This will break down a source search term into words (according to the logic of the specified token breaker) and then return matches where the words were found in a run in a
        /// content section. Unlike GetPartialMatches it is not possible for an entry to be considered a match because it contains all of the terms in its content, the terms must be
        /// present in one content field, together, in the order in which they are present in the search term. This allows for similar behaviour to that intended for the
        /// ConsecutiveTokenCombiningTokenBreaker, but this offers greater performance (constructing a TernarySearchTreeDictionary to back an IndexData instance can be expensive on
        /// processing time to generate, and disk / memory space to store, the runs of tokens). This also has the benefit that there is no cap on the number of tokens that can be
        /// matched consecutively (a limit on this had to be decided at index generation time when using the ConsecutiveTokenCombiningTokenBreaker). There are two sets of weight
        /// combining calculations required; the first (handled by the weightCombinerForConsecutiveRuns) determines a weight for run of consecutive tokens - each run is considered
        /// a single match, effectively. Each call to the first weight comber will have as many weights to combine as there are search terms, so if the "source" value is broken
        /// down into three words by the tokenBreaker then the weightCombinerForConsecutiveRuns will always be called with sets of three weights. The second weight combination
        /// is performed when multiple matches for a particular result must be combined to give a final match weight for that result.
        ///
        /// Note: This requires the index to have been built with source location data recorded - if the index's SourceLocationsAvailable property returns false then an ArgumentException
        /// will be thrown.
        /// </summary>
        public static NonNullImmutableList <WeightedEntry <TKey> > GetConsecutiveMatches <TKey>(
            this IIndexData <TKey> index,
            string source,
            ITokenBreaker tokenBreaker,
            IndexGenerator.WeightedEntryCombiner weightCombinerForConsecutiveRuns,
            IndexGenerator.WeightedEntryCombiner weightCombinerForFinalMatches)
        {
            if (index == null)
            {
                throw new ArgumentNullException(nameof(index));
            }
            if (source == null)
            {
                throw new ArgumentNullException(nameof(source));
            }
            if (tokenBreaker == null)
            {
                throw new ArgumentNullException(nameof(tokenBreaker));
            }
            if (weightCombinerForConsecutiveRuns == null)
            {
                throw new ArgumentNullException(nameof(weightCombinerForConsecutiveRuns));
            }
            if (weightCombinerForFinalMatches == null)
            {
                throw new ArgumentNullException(nameof(weightCombinerForFinalMatches));
            }

            if (!index.SourceLocationsAvailable)
            {
                throw new ArgumentException($"The {nameof(index)} must include source location data in order to identify consecutive token matches");
            }

            // If the token breaker won't actually translate the source value into multiple words then we can avoid all of the below work and just call index.GetMatches directly
            var weightAdjustedTokens = tokenBreaker.Break(source);

            if (weightAdjustedTokens.Count == 1)
            {
                return(index.GetMatches(source));
            }

            // The index of this list will correspond to the index of the broken-down search terms
            var matchesForSearchTerms = new List <WeightedEntry <TKey>[]>();

            foreach (var weightAdjustedToken in weightAdjustedTokens)
            {
                matchesForSearchTerms.Add(
                    index.GetMatches(weightAdjustedToken.Token).Select(w => new WeightedEntry <TKey>(
                                                                           w.Key,
                                                                           w.Weight * weightAdjustedToken.WeightMultiplier,
                                                                           w.SourceLocationsIfRecorded
                                                                           )).ToArray()
                    );
            }

            // For each match of the first search term, try to identify a run of token matches for the same key and source field. Any such runs will be recorded in the consecutiveMatches
            // list - these represent content segments that match the entirety of the search term (the "source" argument).
            var consecutiveMatches = new List <WeightedEntry <TKey> >();

            foreach (var firstTermMatch in matchesForSearchTerms.First().SelectMany(m => BreakWeightedEntryIntoIndividualSourceLocations(m)))
            {
                var matchesForEntireTerm = NonNullImmutableList <WeightedEntry <TKey> > .Empty;
                matchesForEntireTerm = matchesForEntireTerm.Add(firstTermMatch);
                for (var termIndex = 1; termIndex < weightAdjustedTokens.Count; termIndex++)
                {
                    // Note: SourceLocationsIfRecorded should never be null because we checked that the index reported that SourceLocationsAvailable was true (so not checking for null
                    // source locations here)
                    var nTermMatch = matchesForSearchTerms[termIndex]
                                     .SelectMany(m => BreakWeightedEntryIntoIndividualSourceLocations <TKey>(m))
                                     .FirstOrDefault(m =>
                                                     index.KeyComparer.Equals(m.Key, firstTermMatch.Key) &&
                                                     (m.SourceLocationsIfRecorded.First().SourceFieldIndex == firstTermMatch.SourceLocationsIfRecorded.First().SourceFieldIndex) &&
                                                     (m.SourceLocationsIfRecorded.First().TokenIndex == firstTermMatch.SourceLocationsIfRecorded.First().TokenIndex + termIndex)
                                                     );
                    if (nTermMatch == null)
                    {
                        break;
                    }
                    matchesForEntireTerm = matchesForEntireTerm.Add(nTermMatch);
                }
                if (matchesForEntireTerm.Count < weightAdjustedTokens.Count)
                {
                    // If we didn't manage to get a full set of search terms then this isn't a full match
                    continue;
                }

                // Combine the WeightedEntry instances that represent a run of individual matches (one for each word in the "source" argument) into a single WeightedEntry that represents
                // the entirety of the search term (each of the matchesForEntireTerm WeightedEntry instances will have only a single Source Location since the match data was split up
                // above by calling BreakWeightedEntryIntoIndividualSourceLocations before trying to find the consecutive matches). See notes above about not checking whether
                // SourceLocationsIfRecorded is null (it shouldn't be because we index.SourceLocationsAvailable at the top of this method)
                var sourceLocationOfFirstTerm         = matchesForEntireTerm.First().SourceLocationsIfRecorded.Single();
                var sourceLocationOfLastTerm          = matchesForEntireTerm.Last().SourceLocationsIfRecorded.Single();
                var matchWeightForConsecutiveRunEntry = weightCombinerForConsecutiveRuns(
                    matchesForEntireTerm.Select(m => m.Weight).ToImmutableList()
                    );
                consecutiveMatches.Add(
                    new WeightedEntry <TKey>(
                        matchesForEntireTerm.First().Key,
                        matchWeightForConsecutiveRunEntry,
                        new NonNullImmutableList <SourceFieldLocation>(new[]
                {
                    // Since we're creating a new SourceFieldLocation instance that is derived from a run of multiple tokens, the TokenIndex is going to be an approximation -
                    // taking the TokenIndex from the first search term probably makes the most sense. The SourceIndex and SourceTokenLength will be taken such that the entire
                    // run is covered (from the start of the first search term to the end of the last). Since this is the only Source Location instance for the WeightedEntry,
                    // its MatchWeightContribution value is equal to the WeightedEntry's Weight.
                    new SourceFieldLocation(
                        sourceLocationOfFirstTerm.SourceFieldIndex,
                        sourceLocationOfFirstTerm.TokenIndex,
                        sourceLocationOfFirstTerm.SourceIndex,
                        (sourceLocationOfLastTerm.SourceIndex + sourceLocationOfLastTerm.SourceTokenLength) - sourceLocationOfFirstTerm.SourceIndex,
                        matchWeightForConsecutiveRunEntry
                        )
                })
                        )
                    );
            }

            // The matches need grouping by key before returning (the Cast widens each IGrouping to a plain IEnumerable; the key is re-read from the first entry of each group)
            return(consecutiveMatches
                   .GroupBy(m => m.Key, index.KeyComparer)
                   .Cast <IEnumerable <WeightedEntry <TKey> > >()
                   .Select(matches => new WeightedEntry <TKey>(
                               matches.First().Key,
                               weightCombinerForFinalMatches(
                                   matches.Select(match => match.Weight).ToImmutableList()
                                   ),
                               matches.SelectMany(m => m.SourceLocationsIfRecorded).ToNonNullImmutableList()
                               ))
                   .ToNonNullImmutableList());
        }
Example 7
0
        /// <summary>
        /// This will never return null nor contain any null entries
        /// </summary>
        public async Task <NonNullImmutableList <Post> > Get()
        {
            // The redirects set contains tuples From, To slugs (blank lines and those starting with a "#" are ignored, as are any that don't have any whitespace)
            const string redirectsFilename = "Redirects.txt";
            var          redirectsFile     = _folder.FirstOrDefault(file => file.Name.Equals(redirectsFilename, StringComparison.OrdinalIgnoreCase));
            IEnumerable <Tuple <string, string> > redirects;

            if (redirectsFile == null)
            {
                redirects = new List <Tuple <string, string> >();
            }
            else
            {
                // Normalise all line endings to "\n", then keep only non-blank, non-comment lines that contain at least
                // one whitespace character; collapse all whitespace to single spaces and split each line into two values
                // on the first space ("From To")
                redirects = (await ReadFileContents(redirectsFile))
                            .Replace("\r\n", "\n")
                            .Replace("\r", "\n")
                            .Split('\n')
                            .Select(entry => entry.Trim())
                            .Where(entry => (entry != "") && !entry.StartsWith("#") && entry.Any(c => char.IsWhiteSpace(c)))
                            .Select(entry => new string(entry.Select(c => char.IsWhiteSpace(c) ? ' ' : c).ToArray()))
                            .Select(entry => entry.Split(new[] { ' ' }, 2))
                            .Select(values => Tuple.Create(values[0], values[1]));
            }

            // The relatedPostRelationships set contains a map of Post Id to Ids of related Posts (in the order that they should appear)
            // NOTE(review): ReadRedirects is also used here to parse the id-map files - presumably its format logic is shared; confirm the name isn't hiding a copy/paste error
            const string relatedPostsFilename     = "RelatedPosts.txt";
            var          relatedPostsFile         = _folder.FirstOrDefault(file => file.Name.Equals(relatedPostsFilename, StringComparison.OrdinalIgnoreCase));
            var          relatedPostRelationships = (relatedPostsFile == null)
                                ? new Dictionary <int, ImmutableList <int> >()
                                : await ReadRedirects(relatedPostsFile);

            // There is similar data in the AutoSuggestedRelatedPosts.txt file but the manually-created RelatedPosts.txt should take precedence in cases
            // where Post Ids appear in both
            const string autoSuggestedRelatedPostsFilename     = "AutoSuggestedRelatedPosts.txt";
            var          autoSuggestedRelatedPostsFile         = _folder.FirstOrDefault(file => file.Name.Equals(autoSuggestedRelatedPostsFilename, StringComparison.OrdinalIgnoreCase));
            var          autoSuggestedRelatedPostRelationships = (autoSuggestedRelatedPostsFile == null)
                                ? new Dictionary <int, ImmutableList <int> >()
                                : await ReadRedirects(autoSuggestedRelatedPostsFile);

            // We can use this functionality from the FullTextIndexer to generate the Post slug (it will replace accented characters, normalise whitespace,
            // remove punctuation and lower case the content - all we need to do then is replace spaces with hypens)
            var stringNormaliser = DefaultStringNormaliser.Instance;
            var posts            = new List <Post>();

            // NOTE(review): only Redirects.txt is explicitly skipped here - RelatedPosts.txt and AutoSuggestedRelatedPosts.txt also end ".txt" and
            // presumably rely on TryToGetFileSummaryEntry returning null for them; confirm that is the case
            foreach (var file in _folder.Where(file => file.Name.EndsWith(".txt", StringComparison.OrdinalIgnoreCase)))
            {
                // NOTE(review): InvariantCultureIgnoreCase is used here while OrdinalIgnoreCase is used for the same filename above - confirm whether this difference is intentional
                if (file.Name.Equals(redirectsFilename, StringComparison.InvariantCultureIgnoreCase))
                {
                    continue;
                }

                var fileSummary = TryToGetFileSummaryEntry(file.Name);
                if (fileSummary != null)
                {
                    var fileContents = await ReadFileContents(file);

                    // Files whose contents don't yield a title are skipped entirely
                    var title = TryToGetTitle(fileContents);
                    if (title != null)
                    {
                        // 2014-09-17 DWR: Titles such as "C# State Machines" were being converted into "c-state-machines" which isn't as descriptive as
                        // I'd like, "c-sharp-state-machines" is better. The replacement is done for "C#" and "F#" (a space is required after the
                        // replacement content otherwise the "sharp" gets rolled into the next word)
                        var slugBase         = title.Replace("C#", "c sharp ").Replace("F#", "f sharp ");
                        var slug             = stringNormaliser.GetNormalisedString(slugBase).Replace(' ', '-');
                        var redirectsForPost = new NonNullOrEmptyStringList(
                            redirects.Where(r => r.Item2 == slug).Select(r => r.Item1)
                            );

                        // On this pass, set every tag's NumberOfPosts value to one since we don't have enough data to know better. After all of the
                        // posts have been loaded, this can be fixed before the method terminates.
                        if (!relatedPostRelationships.TryGetValue(fileSummary.Id, out var relatedPosts))
                        {
                            relatedPosts = null;
                        }
                        if ((relatedPosts != null) || !autoSuggestedRelatedPostRelationships.TryGetValue(fileSummary.Id, out var autoSuggestedRelatedPosts))
                        {
                            // Only check the autoSuggestedRelatedPostRelationships if there was no relatedPostRelationships entry - this allows for posts
                            // to be specified as having no suggestions (manually-specified or auto-suggested) by having an entry in the manually-specified
                            // file that has the post id but zero suggestions.
                            autoSuggestedRelatedPosts = null;
                        }
                        posts.Add(new Post(
                                      fileSummary.Id,
                                      fileSummary.PostDate,
                                      file.LastModified.DateTime,
                                      slug,
                                      redirectsForPost,
                                      title,
                                      fileSummary.IsHighlight,
                                      fileContents,
                                      relatedPosts ?? new ImmutableList <int>(),
                                      autoSuggestedRelatedPosts ?? new ImmutableList <int>(),
                                      fileSummary.Tags.Select(tag => new TagSummary(tag, 1)).ToNonNullImmutableList()
                                      ));
                    }
                }
            }

            // Count how many posts use each tag (case-insensitively) so that the placeholder NumberOfPosts-of-one values set above can be corrected
            var tagCounts = posts
                            .SelectMany(post => post.Tags)
                            .Select(tag => tag.Tag)
                            .GroupBy(tag => tag, StringComparer.OrdinalIgnoreCase)
                            .ToDictionary(groupedTag => groupedTag.Key, groupedTag => groupedTag.Count(), StringComparer.OrdinalIgnoreCase);

            // Rebuild every Post with the same data except that each TagSummary now carries the real post count for its tag
            return(new NonNullImmutableList <Post>(posts.Select(post =>
                                                                new Post(
                                                                    post.Id,
                                                                    post.Posted,
                                                                    post.LastModified,
                                                                    post.Slug,
                                                                    post.RedirectFromSlugs,
                                                                    post.Title,
                                                                    post.IsHighlight,
                                                                    post.MarkdownContent,
                                                                    post.RelatedPosts,
                                                                    post.AutoSuggestedRelatedPosts,
                                                                    post.Tags.Select(tag => new TagSummary(tag.Tag, tagCounts[tag.Tag])).ToNonNullImmutableList()
                                                                    )
                                                                )));
        }
Example 8
0
 /// <summary>
 /// Combines the search index with the auto-complete word list; both arguments are required
 /// (an ArgumentNullException is thrown for a null value)
 /// </summary>
 public PostIndexContent(IIndexData <int> searchIndex, NonNullOrEmptyStringList autoCompleteContent)
 {
     if (searchIndex == null)
     {
         throw new ArgumentNullException(nameof(searchIndex));
     }
     if (autoCompleteContent == null)
     {
         throw new ArgumentNullException(nameof(autoCompleteContent));
     }

     _searchIndex        = searchIndex;
     AutoCompleteContent = autoCompleteContent;
 }
Example 9
0
        /// <summary>
        /// This will break a given source string and return results based upon the combination of partial matches (so results that only match part of the source string may be included
        /// in the returned data). The token breaker and the match combiner must be specified by the caller - if the match combiner returns zero then the result will not be included in
        /// the final data. To require that all tokens in the source content be present for any returned results, the following matchCombiner could be specified:
        ///  (tokenMatches, allTokens) => (tokenMatches.Count &lt; allTokens.Count) ? 0 : tokenMatches.Sum(m => m.Weight)
        /// </summary>
        public static NonNullImmutableList <WeightedEntry <TKey> > GetPartialMatches <TKey>(
            this IIndexData <TKey> index,
            string source,
            ITokenBreaker tokenBreaker,
            WeightCombiner weightCombiner)
        {
            if (index == null)
            {
                throw new ArgumentNullException(nameof(index));
            }
            if (source == null)
            {
                throw new ArgumentNullException(nameof(source));
            }
            if (tokenBreaker == null)
            {
                throw new ArgumentNullException(nameof(tokenBreaker));
            }
            if (weightCombiner == null)
            {
                throw new ArgumentNullException(nameof(weightCombiner));
            }

            // Break down the "source" search term and find matches for each token
            // - Each match maintains the weight multiplier applied to the string segment from the token breaker
            // - The Source Locations are annotated with additional data; the source segment string and what token index that is (so if the "source" value is broken into three, then
            //   each Source Location will have a SearchTerm property whose TokenIndex will be between 0 and 2, inclusive). This allows for a weightCombiner to be specified that
            //   ensures that every token that was extract from the source value can be matched against a given result, if so desired.
            var matches = new List <Tuple <WeightedEntry <TKey>, SearchTermDetails> >();
            var weightAdjustedTokens = tokenBreaker.Break(source);

            for (var tokenIndex = 0; tokenIndex < weightAdjustedTokens.Count; tokenIndex++)
            {
                var weightAdjustedToken = weightAdjustedTokens[tokenIndex];
                matches.AddRange(
                    index
                    .GetMatches(weightAdjustedToken.Token)
                    .Select(match => Tuple.Create(match, new SearchTermDetails(tokenIndex, weightAdjustedToken.Token)))
                    );
            }

            // Combine per-search-term results, grouping by result key and calculating the match weight for each token using the specified weightCombiner (this may also be
            // used to filter out results; if a match weight of zero is returned then the match will be ignored - this may used to filter out results that only match two
            // out of three of the search terms, for example)
            var finalResults = NonNullImmutableList <WeightedEntry <TKey> > .Empty;
            var searchTerms  = new NonNullOrEmptyStringList(weightAdjustedTokens.Select(w => w.Token));

            // (the Cast widens each IGrouping to a plain IEnumerable; the key is re-read from the first entry of each group below)
            foreach (var matchesGroupedByKey in matches.GroupBy(m => m.Item1.Key, index.KeyComparer).Cast <IEnumerable <Tuple <WeightedEntry <TKey>, SearchTermDetails> > >())
            {
                var combinedWeight = weightCombiner(
                    matchesGroupedByKey
                    .Select(m => new MatchWeightWithSourceFieldLocations(
                                m.Item1.Weight,
                                m.Item2,
                                m.Item1.SourceLocationsIfRecorded
                                )).ToNonNullImmutableList(),
                    searchTerms
                    );
                if (combinedWeight < 0)
                {
                    throw new ArgumentException("weightCombiner returned a negative value - invalid");
                }
                else if (combinedWeight > 0)
                {
                    // If ANY of the grouped matches lacks source locations then none can be reported for the combined entry
                    finalResults = finalResults.Add(
                        new WeightedEntry <TKey>(
                            matchesGroupedByKey.First().Item1.Key,
                            combinedWeight,
                            matchesGroupedByKey.Any(m => m.Item1.SourceLocationsIfRecorded == null)
                                                                ? null
                                                                : matchesGroupedByKey.SelectMany(m => m.Item1.SourceLocationsIfRecorded).ToNonNullImmutableList()
                            )
                        );
                }
            }
            return(finalResults);
        }