/// <summary>
/// Creates a <see cref="BaseJobsIndexer" />, storing the collaborators it is given.
/// </summary>
/// <param name="jobsProvider">The jobs provider.</param>
/// <param name="stopWordsRemover">The search filter stored as the stop words remover.</param>
/// <param name="tagSanitiser">The HTML tag sanitiser.</param>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="jobsProvider" />, <paramref name="stopWordsRemover" /> or
/// <paramref name="tagSanitiser" /> is <c>null</c>.
/// </exception>
protected BaseJobsIndexer(IJobsDataProvider jobsProvider, ISearchFilter stopWordsRemover, IHtmlTagSanitiser tagSanitiser)
{
    // Validate and store each dependency in turn; arguments are checked in
    // declaration order, so the first null one is the one reported.
    if (jobsProvider == null)
    {
        throw new ArgumentNullException(nameof(jobsProvider));
    }
    _jobsProvider = jobsProvider;

    if (stopWordsRemover == null)
    {
        throw new ArgumentNullException(nameof(stopWordsRemover));
    }
    _stopWordsRemover = stopWordsRemover;

    if (tagSanitiser == null)
    {
        throw new ArgumentNullException(nameof(tagSanitiser));
    }
    _tagSanitiser = tagSanitiser;
}
/// <summary>
/// Get the text content of an HTML string, but without text used for links
/// </summary>
/// <param name="text">The HTML to extract text from.</param>
/// <param name="tagSanitiser">The tag sanitiser used to strip the remaining (non-anchor) tags.</param>
/// <returns>
/// <paramref name="text" /> unchanged when it is <c>null</c> or empty; otherwise the HTML-decoded,
/// trimmed text that remains after anchor elements (including their link text) and all other tags are removed.
/// </returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="tagSanitiser" /> is <c>null</c> and <paramref name="text" /> is not null or empty.
/// </exception>
public string TextOutsideLinks(string text, IHtmlTagSanitiser tagSanitiser)
{
    // Nothing to process; note this returns before the sanitiser is validated.
    if (String.IsNullOrEmpty(text))
    {
        return text;
    }

    if (tagSanitiser == null)
    {
        throw new ArgumentNullException(nameof(tagSanitiser));
    }

    // Remove any links including the link text.
    //  - Singleline lets '.' match newlines, so anchors whose content spans several lines are removed too.
    //  - IgnoreCase covers upper-case markup such as <A HREF="...">...</A>.
    //  - \s (rather than a literal space) accepts a tab or line break between the tag name and its attributes.
    const string anythingExceptEndAnchor = "((?!</a>).)*";
    text = Regex.Replace(
        text,
        @"<a\s[^>]*>" + anythingExceptEndAnchor + "</a>",
        String.Empty,
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // Remove any other HTML, and what's left is text outside links
    text = HttpUtility.HtmlDecode(tagSanitiser.StripTags(text));

    // Drop leading/trailing whitespace left over once the markup has gone
    return text.Trim();
}