/// <summary>
        /// Annotates the given sequence of <see cref="Document"/> objects by adding a <b>_highlight</b> field;
        /// the <b>_highlight</b> field will contain the best matching text fragment from the <see cref="Document"/> 
        /// object's full-text field.
        /// </summary>
        /// <remarks>
        /// The hits are materialized into a list, indexed into a temporary in-memory index, and then
        /// highlighted against that index. Any exception raised while indexing or highlighting is logged
        /// and swallowed, so the returned documents may lack the <b>_highlight</b> field if that step failed.
        /// </remarks>
        /// <param name="hits">The sequence of <see cref="Document"/> objects.</param>
        /// <param name="criteria">The search criteria that produced the hits.</param>
        /// <returns>
        /// The materialized list of the original Document objects, with a <b>_highlight</b> field added
        /// to each Document when highlighting succeeded.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="hits"/> or <paramref name="criteria"/> is <c>null</c>.
        /// </exception>
        /// <exception cref="ArgumentException">The criteria's Query is null, empty, or whitespace.</exception>
        public static IEnumerable<Document> GenerateHighlights(this IEnumerable<Document> hits, SearchCriteria criteria)
        {
            if (hits == null)
                throw new ArgumentNullException(nameof(hits));
            if (criteria == null)
                throw new ArgumentNullException(nameof(criteria));
            if (String.IsNullOrWhiteSpace(criteria.Query))
                throw new ArgumentException("SearchCriteria.Query cannot be empty");

            // Materialize once: the same documents are both indexed and returned.
            var documents = hits.ToList();
            try
            {
                // The analyzer is disposable; release it even when indexing fails.
                using (var analyzer = new FullTextAnalyzer())
                {
                    var indexDirectory = new RAMDirectory();
                    try
                    {
                        var config = new IndexWriterConfig(analyzer).SetRAMBufferSizeMB(_ramBufferSizeMB);
                        var writer = new IndexWriter(indexDirectory, config);
                        try
                        {
                            BuidIndex(documents, writer);
                            GenerateHighlights(documents, writer, criteria);

                            // Empty the temporary index before it is closed.
                            writer.DeleteAll();
                            writer.Commit();
                        }
                        finally
                        {
                            // Always close the writer, even if indexing or highlighting threw.
                            writer.Close();
                        }
                    }
                    finally
                    {
                        // Always close the in-memory directory, after the writer has been closed.
                        indexDirectory.Close();
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: highlighting failures are logged, and the hits are still returned.
                _log.Error(ex);
            }

            return documents;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Breaks up the input text into individual tokens.
        /// </summary>
        /// <remarks>
        /// Arguments are validated immediately when this method is called; the tokens themselves are
        /// produced lazily as the returned sequence is enumerated.
        /// </remarks>
        /// <param name="text">The input text.</param>
        /// <param name="enableStemming">if set to <c>true</c>, the FullTextIndex will stem 
        /// the tokens that make up the texts, using the Porter stemming algorithm.</param>
        /// <param name="ignoreCase">if set to <c>true</c>, character casing is ignored.</param>
        /// <param name="separatorChars">A string whose component characters will be used to split the texts into tokens.
        /// A null or blank value falls back to the default separator characters.</param> 
        /// <returns>A lazily evaluated sequence of the tokens emitted by the analyzer for <paramref name="text"/>.</returns>
        /// <exception cref="ArgumentException"><paramref name="text"/> is null, empty, or whitespace.</exception>
        public static IEnumerable<string> Tokenize(string text, bool enableStemming = true, bool ignoreCase = true, string separatorChars = DEFAULT_SEPARATOR_CHARS)
        {
            // Validate here rather than inside the iterator: an iterator body does not run until the
            // first MoveNext(), which would defer this exception to the point of enumeration.
            if (String.IsNullOrWhiteSpace(text))
                throw new ArgumentException("text cannot be null or blank");
            if (String.IsNullOrWhiteSpace(separatorChars))
                separatorChars = DEFAULT_SEPARATOR_CHARS;

            return TokenizeIterator(text, enableStemming, ignoreCase, separatorChars);
        }

        /// <summary>
        /// Lazily yields the tokens of already-validated input; the iterator half of <c>Tokenize</c>.
        /// </summary>
        private static IEnumerable<string> TokenizeIterator(string text, bool enableStemming, bool ignoreCase, string separatorChars)
        {
            using (var analyzer = new FullTextAnalyzer(enableStemming, ignoreCase, separatorChars))
            {
                using (var stream = analyzer.TokenStream("text", text))
                {
                    // Direct cast: an unexpected attribute type fails here with a clear
                    // InvalidCastException instead of a later NullReferenceException.
                    var termAttribute = (CharTermAttribute)stream.AddAttribute(typeof(CharTermAttribute));

                    // The stream is reset before the first IncrementToken() call.
                    stream.Reset();
                    while (stream.IncrementToken())
                    {
                        yield return termAttribute.ToString();
                    }
                    stream.End();
                }
            }
        }