/// <summary>
/// Queues one piece of HTML content for (re)indexing, once for every page on
/// which its module is published. Pending pages are skipped.
/// </summary>
/// <param name="content">
/// The HTML content to index. A null value is logged and ignored instead of
/// failing with a NullReferenceException.
/// </param>
private static void IndexItem(HtmlContent content)
{
    bool disableSearchIndex = ConfigHelper.GetBoolProperty("DisableSearchIndex", false);
    if (disableSearchIndex) { return; }

    if (content == null)
    {
        log.Error("content passed in to IndexItem was null");
        return;
    }

    Guid htmlFeatureGuid = new Guid("113FB01C-6408-4607-B0F7-1379E2512396");
    ModuleDefinition htmlFeature = new ModuleDefinition(htmlFeatureGuid);
    Module module = new Module(content.ModuleId);

    // Generally we should not include the page meta because it can produce
    // duplicate results, one for each instance of html content on the page,
    // since they all share the same page meta. Page meta should reflect the
    // content of the page, so indexing the content alone is sufficient.
    // The setting does not change per page, so it is read once up front;
    // comparing a missing (null) setting with "true" is simply false.
    bool indexPageMeta = ConfigurationManager.AppSettings["IndexPageMeta"] == "true";

    // get list of pages where this module is published
    List<PageModule> pageModules = PageModule.GetPageModulesByModule(content.ModuleId);

    foreach (PageModule pageModule in pageModules)
    {
        PageSettings pageSettings = new PageSettings(content.SiteId, pageModule.PageId);

        // don't index pending/unpublished pages
        if (pageSettings.IsPending) { continue; }

        IndexItem indexItem = new IndexItem();

        // a null path is treated like an empty one: the item's own default is kept
        if (!string.IsNullOrEmpty(content.SearchIndexPath))
        {
            indexItem.IndexPath = content.SearchIndexPath;
        }

        indexItem.SiteId = content.SiteId;
        indexItem.PageId = pageModule.PageId;
        indexItem.PageName = pageSettings.PageName;
        indexItem.ViewRoles = pageSettings.AuthorizedRoles;
        indexItem.ModuleViewRoles = module.ViewRoles;

        if (pageSettings.UseUrl)
        {
            indexItem.ViewPage = pageSettings.Url.Replace("~/", string.Empty);
            indexItem.UseQueryStringParams = false;
        }

        if (indexPageMeta)
        {
            indexItem.PageMetaDescription = pageSettings.PageMetaDescription;
            indexItem.PageMetaKeywords = pageSettings.PageMetaKeyWords;
        }

        indexItem.FeatureId = htmlFeatureGuid.ToString();
        indexItem.FeatureName = htmlFeature.FeatureName;
        indexItem.FeatureResourceFile = htmlFeature.ResourceFile;

        indexItem.ItemId = content.ItemId;
        indexItem.ModuleId = content.ModuleId;
        indexItem.ModuleTitle = module.ModuleTitle;
        indexItem.Title = content.Title;
        // markup is stripped so tags and script fragments do not end up in the index
        indexItem.Content = SecurityHelper.RemoveMarkup(content.Body);
        indexItem.PublishBeginDate = pageModule.PublishBeginDate;
        indexItem.PublishEndDate = pageModule.PublishEndDate;

        IndexHelper.RebuildIndex(indexItem);
    }

    log.Debug("Indexed " + content.Title);
}
/// <summary>
/// Re-indexes every HTML content row published on the given page.
/// </summary>
/// <param name="pageSettings">
/// Page whose HTML content is indexed. A null value is logged as an error and
/// nothing is indexed; pending pages are skipped as well.
/// </param>
/// <param name="indexPath">Index location handed through to IndexHelper.RebuildIndex.</param>
public override void RebuildIndex(PageSettings pageSettings, string indexPath)
{
    if (ConfigHelper.GetBoolProperty("DisableSearchIndex", false)) { return; }

    if (pageSettings == null)
    {
        log.Error("pageSettings passed in to HtmlContentIndexBuilderProvider.RebuildIndex was null");
        return;
    }

    // pending/unpublished pages stay out of the index
    if (pageSettings.IsPending) { return; }

    log.Info("HtmlContentIndexBuilderProvider indexing page - " + pageSettings.PageName);

    try
    {
        Guid featureGuid = new Guid("113FB01C-6408-4607-B0F7-1379E2512396");
        ModuleDefinition feature = new ModuleDefinition(featureGuid);
        List<PageModule> modulesOnPage = PageModule.GetPageModulesByPage(pageSettings.PageId);

        HtmlRepository htmlRepository = new HtmlRepository();
        DataTable contentRows = htmlRepository.GetHtmlContentByPage(
            pageSettings.SiteId,
            pageSettings.PageId);

        foreach (DataRow contentRow in contentRows.Rows)
        {
            IndexItem item = new IndexItem();
            item.SiteId = pageSettings.SiteId;
            item.PageId = pageSettings.PageId;
            item.PageName = pageSettings.PageName;

            // Page meta is normally left out: every html instance on the page
            // shares the same meta, so indexing it yields duplicate results.
            // The content itself is enough unless the setting opts in.
            if (string.Equals(ConfigurationManager.AppSettings["IndexPageMeta"], "true", StringComparison.Ordinal))
            {
                item.PageMetaDescription = pageSettings.PageMetaDescription;
                item.PageMetaKeywords = pageSettings.PageMetaKeyWords;
            }

            item.ViewRoles = pageSettings.AuthorizedRoles;
            item.ModuleViewRoles = contentRow["ViewRoles"].ToString();

            if (pageSettings.UseUrl)
            {
                item.ViewPage = pageSettings.Url.Replace("~/", string.Empty);
                item.UseQueryStringParams = false;
            }

            item.FeatureId = featureGuid.ToString();
            item.FeatureName = feature.FeatureName;
            item.FeatureResourceFile = feature.ResourceFile;

            item.ItemId = Convert.ToInt32(contentRow["ItemID"]);
            item.ModuleId = Convert.ToInt32(contentRow["ModuleID"]);
            item.ModuleTitle = contentRow["ModuleTitle"].ToString();
            item.Title = contentRow["Title"].ToString();

            // Markup removal was added 2010-01-30 because javascript fragments
            // such as ]]> were appearing in search results when the content
            // contained javascript.
            string rawBody = contentRow["Body"].ToString();
            item.Content = SecurityHelper.RemoveMarkup(rawBody);

            // copy the publish dates from every page-module record of this module
            // (no early exit, so the last matching record wins)
            foreach (PageModule published in modulesOnPage)
            {
                if (published.ModuleId != item.ModuleId) { continue; }

                item.PublishBeginDate = published.PublishBeginDate;
                item.PublishEndDate = published.PublishEndDate;
            }

            IndexHelper.RebuildIndex(item, indexPath);

            log.Debug("Indexed " + item.Title);
        }
    }
    catch (System.Data.Common.DbException ex)
    {
        log.Error(ex);
    }
}
/// <summary>
/// Builds the Lucene document for one index item: exact-match fields for
/// filtering, tokenized fields for full-text search, and stored-only fields
/// that are returned with a hit but never searched.
/// </summary>
/// <param name="indexItem">The item whose properties are copied into the document.</param>
/// <returns>The populated document.</returns>
private Document GetDocument(IndexItem indexItem)
{
    Document document = new Document();

    // ---- searchable fields ----
    document.Add(new Field("Key", indexItem.Key, Field.Store.YES, Field.Index.UN_TOKENIZED));
    document.Add(new Field("SiteID", indexItem.SiteId.ToString(CultureInfo.InvariantCulture), Field.Store.YES, Field.Index.UN_TOKENIZED));
    document.Add(new Field("ViewRoles", indexItem.ViewRoles, Field.Store.YES, Field.Index.NO));

    // one "Role" field per non-empty page role
    foreach (string pageRole in indexItem.ViewRoles.Split(';'))
    {
        if (pageRole.Length == 0) { continue; }
        document.Add(new Field("Role", pageRole, Field.Store.YES, Field.Index.UN_TOKENIZED));
    }

    // one "ModuleRole" field per non-empty module role
    foreach (string moduleRole in indexItem.ModuleViewRoles.Split(';'))
    {
        if (moduleRole.Length == 0) { continue; }
        document.Add(new Field("ModuleRole", moduleRole, Field.Store.YES, Field.Index.UN_TOKENIZED));
    }

    document.Add(new Field("FeatureId", indexItem.FeatureId, Field.Store.YES, Field.Index.UN_TOKENIZED));
    document.Add(new Field("PageID", indexItem.PageId.ToString(CultureInfo.InvariantCulture), Field.Store.YES, Field.Index.UN_TOKENIZED));
    document.Add(new Field("ModuleID", indexItem.ModuleId.ToString(CultureInfo.InvariantCulture), Field.Store.YES, Field.Index.UN_TOKENIZED));
    document.Add(new Field("ItemID", indexItem.ItemId.ToString(CultureInfo.InvariantCulture), Field.Store.YES, Field.Index.UN_TOKENIZED));

    // the "s" format is the sortable date pattern
    document.Add(new Field("PublishBeginDate", indexItem.PublishBeginDate.ToString("s"), Field.Store.YES, Field.Index.UN_TOKENIZED));
    document.Add(new Field("PublishEndDate", indexItem.PublishEndDate.ToString("s"), Field.Store.YES, Field.Index.UN_TOKENIZED));
    document.Add(new Field("IndexedUtc", DateTime.UtcNow.ToString("s"), Field.Store.YES, Field.Index.UN_TOKENIZED));

    document.Add(new Field("PageName", indexItem.PageName, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
    document.Add(new Field("ModuleTitle", indexItem.ModuleTitle, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
    document.Add(new Field("Title", indexItem.Title, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
    document.Add(new Field("PageMetaDesc", indexItem.PageMetaDescription, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));

    // one "Keyword" field per non-blank, trimmed meta keyword
    foreach (string rawKeyword in indexItem.PageMetaKeywords.Split(','))
    {
        string keyword = rawKeyword.Trim();
        if (keyword.Length > 0)
        {
            document.Add(new Field("Keyword", keyword, Field.Store.YES, Field.Index.UN_TOKENIZED));
        }
    }

    string textContent = ConvertToText(indexItem.Content);

    // short texts are used as-is; longer ones are cut to an excerpt plus ellipsis
    string intro;
    if (textContent.Length < 100)
    {
        intro = textContent;
    }
    else
    {
        intro = UIHelper.CreateExcerpt(textContent, 97) + "...";
    }
    document.Add(new Field("Intro", intro, Field.Store.YES, Field.Index.UN_TOKENIZED));

    // Other content is optional (used for blog comments, could be used elsewhere).
    // The combined text is always indexed; it is stored too only when results
    // highlighting needs the original text back.
    string searchableText = textContent + " " + ConvertToText(indexItem.OtherContent);
    Field.Store contentsStore = storeContentForResultsHighlighting ? Field.Store.YES : Field.Store.NO;
    document.Add(new Field("contents", searchableText, contentsStore, Field.Index.TOKENIZED, Field.TermVector.YES));

    // ---- unsearchable (stored only) fields ----
    document.Add(new Field("Feature", indexItem.FeatureName, Field.Store.YES, Field.Index.NO));
    document.Add(new Field("FeatureResourceFile", indexItem.FeatureResourceFile, Field.Store.YES, Field.Index.NO));
    document.Add(new Field("PageNumber", indexItem.PageNumber.ToString(CultureInfo.InvariantCulture), Field.Store.YES, Field.Index.NO));
    document.Add(new Field("ViewPage", indexItem.ViewPage, Field.Store.YES, Field.Index.NO));
    document.Add(new Field("UseQueryStringParams", indexItem.UseQueryStringParams.ToString(), Field.Store.YES, Field.Index.NO));
    document.Add(new Field("QueryStringAddendum", indexItem.QueryStringAddendum, Field.Store.YES, Field.Index.NO));

    return document;
}
/// <summary>
/// Applies a batch of queued index operations to the index at
/// <paramref name="indexPath"/> in two passes: first every queued key is
/// deleted through an IndexReader, then every row that is not "remove only"
/// is deserialized and written through an IndexWriter. Queue rows are deleted
/// as they are handled.
/// </summary>
/// <param name="q">
/// Queue rows; the columns read are ItemKey, RemoveOnly, RowId and SerializedItem.
/// </param>
/// <param name="indexPath">Location of the index to update.</param>
/// <remarks>
/// The reader and the writer are closed in finally blocks so that an
/// unexpected exception part-way through a pass cannot leave the index
/// opened (and locked) by a leaked reader or writer.
/// </remarks>
private void ProcessQueue(DataTable q, string indexPath)
{
    rowsProcessed = 0;
    rowsToProcess = q.Rows.Count;

    // first process deletes with reader
    IndexReader reader = null;
    try
    {
        reader = IndexReader.Open(indexPath);

        foreach (DataRow row in q.Rows)
        {
            string itemKey = row["ItemKey"].ToString();
            Term term = new Term("Key", itemKey);
            try
            {
                reader.DeleteDocuments(term);
                log.Debug("reader.DeleteDocuments(term) for Key " + itemKey);
            }
            catch (Exception ge)
            {
                // TODO: monitor what real exceptions if any occur and then
                // change this catch to catch only the expected ones
                // instead of non specific exception
                log.Error(ge);
            }

            bool removeOnly = Convert.ToBoolean(row["RemoveOnly"]);
            if (removeOnly)
            {
                // nothing will be re-added for this row, so it is finished now
                Int64 rowId = Convert.ToInt64(row["RowId"]);
                IndexingQueue.Delete(rowId);
            }

            if (DateTime.UtcNow > nextStatusUpdateTime)
            {
                // don't mark as complete because there may be more queue items
                // for different index paths in a multi site installation
                bool markAsComplete = false;
                ReportStatus(markAsComplete);
            }
        }
    }
    catch (IOException ex)
    {
        log.Info("IndexWriter swallowed exception this is not unexpected if building or rebuilding the search index ", ex);
        errorCount += 1;
    }
    catch (TypeInitializationException ex)
    {
        log.Info("IndexWriter swallowed exception ", ex);
        errorCount += 1;
    }
    finally
    {
        if (reader != null)
        {
            try
            {
                reader.Close();
            }
            catch (IOException ex)
            {
                // same handling a failing Close received when it lived inside the try block
                log.Info("IndexWriter swallowed exception this is not unexpected if building or rebuilding the search index ", ex);
                errorCount += 1;
            }
        }
    }

    // next add items with writer
    IndexWriter indexWriter = GetWriter(indexPath);
    if (indexWriter == null)
    {
        log.Error("failed to get IndexWriter for path: " + indexPath);
        errorCount += 1;
        return;
    }

    try
    {
        foreach (DataRow row in q.Rows)
        {
            bool removeOnly = Convert.ToBoolean(row["RemoveOnly"]);
            if (!removeOnly)
            {
                try
                {
                    IndexItem indexItem = (IndexItem)SerializationHelper.DeserializeFromString(typeof(IndexItem), row["SerializedItem"].ToString());
                    Document doc = GetDocument(indexItem);
                    WriteToIndex(doc, indexWriter);
                    log.Debug("called WriteToIndex(doc, indexWriter) for key " + indexItem.Key);

                    // the row is only dequeued once it has been written
                    Int64 rowId = Convert.ToInt64(row["RowId"]);
                    IndexingQueue.Delete(rowId);
                }
                catch (Exception ex)
                {
                    log.Error(ex);
                }
            }

            if (DateTime.UtcNow > nextStatusUpdateTime)
            {
                // don't mark as complete because there may be more queue items
                // for different index paths in a multi site installation
                bool markAsComplete = false;
                ReportStatus(markAsComplete);
            }
        }

        try
        {
            indexWriter.Optimize();
        }
        catch (IOException ex)
        {
            log.Error(ex);
        }
    }
    finally
    {
        // always close the writer, even if a row conversion or Optimize failed unexpectedly
        try
        {
            indexWriter.Close();
        }
        catch (IOException ex)
        {
            log.Error(ex);
        }
    }
}
/// <summary>
/// Takes the given item out of the underlying list.
/// </summary>
/// <param name="item">The index item to remove.</param>
public void Remove(IndexItem item)
{
    List.Remove(item);
}
/// <summary>
/// Puts the given item into the underlying list.
/// </summary>
/// <param name="item">The index item to add.</param>
public void Add(IndexItem item)
{
    List.Add(item);
}
/// <summary>
/// Searches the site's index for <paramref name="queryText"/> across the
/// Title, ModuleTitle, contents, PageName, PageMetaDesc and Keyword fields and
/// returns one page of results. Supports restricting the search to multiple features.
/// </summary>
/// <param name="siteId">Site whose index is searched (resolved through GetIndexPath).</param>
/// <param name="isAdmin">When true, no role filtering is applied.</param>
/// <param name="userRoles">Roles of the current user, used to build the role filter queries.</param>
/// <param name="queryText">The search terms; null or empty yields an empty result.</param>
/// <param name="highlightResults">
/// When true (and not in backward compatibility mode), each result's Intro is
/// replaced by the best highlighted fragment of its stored "contents" field, if any.
/// </param>
/// <param name="highlightedFragmentSize">Fragment size passed to the highlighter's fragmenter.</param>
/// <param name="pageNumber">1-based page of results to return.</param>
/// <param name="pageSize">Number of results per page.</param>
/// <param name="totalHits">
/// Receives the number of hits. In backward compatibility mode it is reduced by
/// the hits filtered out while building the current page.
/// </param>
/// <param name="invalidQuery">Set to true when the query cannot be parsed or expands to too many clauses.</param>
/// <param name="moduleIDs">
/// Optional feature guids; when any non-empty guid is supplied only documents
/// whose FeatureId matches one of them are returned.
/// </param>
/// <returns>The requested page of results; empty when there is no index, no query text or the query is invalid.</returns>
public static IndexItemCollection Search(
    int siteId,
    bool isAdmin,
    List<string> userRoles,
    string queryText,
    bool highlightResults,
    int highlightedFragmentSize,
    int pageNumber,
    int pageSize,
    out int totalHits,
    out bool invalidQuery,
    params Guid[] moduleIDs)
{
    invalidQuery = false;
    totalHits = 0;

    string indexPath = GetIndexPath(siteId);
    IndexItemCollection results = new IndexItemCollection();

    if (string.IsNullOrEmpty(queryText))
    {
        return results;
    }

    // backward compatibility mode is on unless explicitly set to "false"
    bool useBackwardCompatibilityMode =
        ConfigurationManager.AppSettings["SearchUseBackwardCompatibilityMode"] != "false";

    // module role filtering is off unless explicitly set to "true"
    bool includeModuleRoleFilters =
        ConfigurationManager.AppSettings["SearchIncludeModuleRoleFilters"] == "true";

    if (!IndexReader.IndexExists(indexPath))
    {
        return results;
    }

    if (log.IsDebugEnabled)
    {
        log.Debug("Entered Search, indexPath = " + indexPath);
    }

    // UTC so the measured duration cannot be skewed by a daylight-saving shift
    long startTicks = DateTime.UtcNow.Ticks;

    IndexSearcher searcher = null;
    try
    {
        BooleanQuery mainQuery = new BooleanQuery();

        if ((!isAdmin) && (!useBackwardCompatibilityMode))
        {
            AddRoleQueries(userRoles, mainQuery);
        }

        if ((!isAdmin) && includeModuleRoleFilters)
        {
            AddModuleRoleQueries(userRoles, mainQuery);
        }

        // the Keyword field is searched with wildcards stripped from the query text
        Query multiQuery = MultiFieldQueryParser.Parse(
            new string[] { queryText, queryText, queryText, queryText, queryText, queryText.Replace("*", string.Empty) },
            new string[] { "Title", "ModuleTitle", "contents", "PageName", "PageMetaDesc", "Keyword" },
            new StandardAnalyzer());

        mainQuery.Add(multiQuery, BooleanClause.Occur.MUST);

        if (!useBackwardCompatibilityMode)
        {
            // only documents whose publish window contains the current time
            Term beginDateStart = new Term("PublishBeginDate", DateTime.MinValue.ToString("s"));
            Term beginDateEnd = new Term("PublishBeginDate", DateTime.UtcNow.ToString("s"));
            RangeQuery beginDateQuery = new RangeQuery(beginDateStart, beginDateEnd, true);
            mainQuery.Add(beginDateQuery, BooleanClause.Occur.MUST);

            Term endDateStart = new Term("PublishEndDate", DateTime.UtcNow.ToString("s"));
            Term endDateEnd = new Term("PublishEndDate", DateTime.MaxValue.ToString("s"));
            RangeQuery endDateQuery = new RangeQuery(endDateStart, endDateEnd, true);
            mainQuery.Add(endDateQuery, BooleanClause.Occur.MUST);
        }

        if (moduleIDs != null && moduleIDs.Length > 0)
        {
            BooleanQuery featureFilter = new BooleanQuery();

            foreach (Guid featureGuid in moduleIDs)
            {
                if (featureGuid != Guid.Empty)
                {
                    featureFilter.Add(new TermQuery(new Term("FeatureId", featureGuid.ToString())), BooleanClause.Occur.SHOULD);
                }
            }

            if (featureFilter.Clauses().Count > 0)
            {
                mainQuery.Add(featureFilter, BooleanClause.Occur.MUST);
            }
        }

        searcher = new IndexSearcher(indexPath);

        // a 0 based collection
        Hits hits = searcher.Search(mainQuery);

        int startHit = 0;
        if (pageNumber > 1)
        {
            startHit = ((pageNumber - 1) * pageSize);
        }

        totalHits = hits.Length();

        int end = startHit + pageSize;
        if (totalHits <= end)
        {
            end = totalHits;
        }

        int itemsAdded = 0;
        int itemsToAdd = end;

        // In backward compatibility mode, if multiple pages of results are found we
        // may not be showing every user the correct number of hits they can see,
        // as we only filter out the current page. We may decrement total hits when
        // filtering results, so keep the original count.
        int actualHits = totalHits;

        if (!useBackwardCompatibilityMode)
        {
            // This new way is much cleaner: all filtering is done by the query so
            // the hit count is true, whereas with the old way it could be wrong
            // since results were possibly filtered out after the query returned.
            QueryScorer scorer = new QueryScorer(multiQuery);
            Formatter formatter = new SimpleHTMLFormatter("<span class='searchterm'>", "</span>");
            Highlighter highlighter = new Highlighter(formatter, scorer);
            highlighter.SetTextFragmenter(new SimpleFragmenter(highlightedFragmentSize));

            for (int i = startHit; i < itemsToAdd; i++)
            {
                IndexItem indexItem = new IndexItem(hits.Doc(i), hits.Score(i));

                if (highlightResults)
                {
                    // "contents" is only present when the index stores it; without it
                    // there is nothing to highlight and the indexed Intro is kept.
                    string storedContents = hits.Doc(i).Get("contents");
                    if (!string.IsNullOrEmpty(storedContents))
                    {
                        try
                        {
                            TokenStream stream = new StandardAnalyzer().TokenStream("contents", new StringReader(storedContents));
                            string highlightedResult = highlighter.GetBestFragment(stream, storedContents);
                            if (highlightedResult != null)
                            {
                                indexItem.Intro = highlightedResult;
                            }
                        }
                        catch (NullReferenceException ex)
                        {
                            // highlighting is best effort; fall back to the indexed Intro
                            log.Debug("highlighting failed for a search result", ex);
                        }
                    }
                }

                results.Add(indexItem);
                itemsAdded += 1;
            }
        }
        else
        {
            // backward compatible with old indexes
            for (int i = startHit; i < itemsToAdd; i++)
            {
                bool needToDecrementTotalHits = false;

                if (isAdmin
                    || WebUser.IsContentAdmin
                    || WebUser.IsInRoles(hits.Doc(i).Get("ViewRoles")))
                {
                    IndexItem indexItem = new IndexItem(hits.Doc(i), hits.Score(i));

                    if ((DateTime.UtcNow > indexItem.PublishBeginDate)
                        && (DateTime.UtcNow < indexItem.PublishEndDate))
                    {
                        results.Add(indexItem);
                        // keep the count in step so ItemCount reflects what was returned
                        itemsAdded += 1;
                    }
                    else
                    {
                        needToDecrementTotalHits = true;
                    }
                }
                else
                {
                    needToDecrementTotalHits = true;
                }

                // filtered out a result so need to decrement
                if (needToDecrementTotalHits)
                {
                    totalHits -= 1;

                    // We are not getting as many results as the page size, so look
                    // one hit further - but never past the last available hit,
                    // which would make hits.Doc(i) fail.
                    if (itemsToAdd < actualHits)
                    {
                        itemsToAdd += 1;
                    }
                }
            }
        }

        results.ItemCount = itemsAdded;
        results.PageIndex = pageNumber;
        results.ExecutionTime = DateTime.UtcNow.Ticks - startTicks;
    }
    catch (ParseException ex)
    {
        invalidQuery = true;
        log.Error("handled error for search terms " + queryText, ex);
        // these parser exceptions are generally caused by
        // spambots posting too much junk into the search form
        // here's an option to automatically ban the ip address
        HandleSpam(queryText, ex);

        return results;
    }
    catch (BooleanQuery.TooManyClauses ex)
    {
        invalidQuery = true;
        log.Error("handled error for search terms " + queryText, ex);
        return results;
    }
    finally
    {
        // release the searcher on every path, including unexpected exceptions
        if (searcher != null)
        {
            searcher.Close();
        }
    }

    return results;
}