private void Insert(IndexRecord indexRecord)
{
    IndexWriter writer = Writer;
    if (writer == null)
    {
        return;
    }
    try
    {
        Document doc = new Document();
        doc.Add(new Field(IndexRecord.URIFIELD, indexRecord.Uri, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field(IndexRecord.ENTITYFIELD, indexRecord.Entity, Field.Store.YES, Field.Index.NOT_ANALYZED));
        // The content is supplied through a reader, so it is tokenized but never stored.
        doc.Add(new Field(IndexRecord.CONTENTFIELD, new StringReader(IndexRecord.ProcessContent(indexRecord.Content))));
#pragma warning disable CS0618 // Type or member is obsolete
        doc.Add(new Field(IndexRecord.TIMESTAMPFIELD, DateField.DateToString(DateTime.Now), Field.Store.YES, Field.Index.NO));
#pragma warning restore CS0618 // Type or member is obsolete
        doc.Add(new Field(IndexRecord.VIEWERFIELD, indexRecord.Viewer, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field(IndexRecord.TITLEFIELD, indexRecord.Title, Field.Store.YES, Field.Index.NOT_ANALYZED));
        // Each key goes into its own numbered field (KEYFIELDPREFIX + 1, 2, ...).
        int i = 1;
        foreach (string key in indexRecord.Keys)
        {
            doc.Add(new Field(string.Format("{0}{1}", IndexRecord.KEYFIELDPREFIX, (i++).ToString()), key, Field.Store.YES, Field.Index.NO));
        }
        GXLogging.Debug(log, "AddDocument:" + indexRecord.Uri + " content:" + indexRecord.Content);
        writer.AddDocument(doc, m_analyzer);
        // Periodically optimize the index once enough documents have been added.
        if (m_counter++ > Settings.Instance.OptimizeThreshold)
        {
            m_counter = 0;
            GXLogging.Warn(log, "Optimizing index");
            writer.Optimize();
        }
    }
    catch (Exception e)
    {
        GXLogging.Error(log, "Insert error", e);
    }
    finally
    {
        try
        {
            // Dispose the writer after every insert and close the shared searcher
            // so the next search reopens against the updated index.
            writer.Dispose();
            Searcher.Instance.Close();
        }
        catch (Exception ex)
        {
            GXLogging.Error(log, "Close writer error", ex);
        }
    }
}
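
// A minimal lookup sketch for the records indexed above, assuming the same
// Lucene.Net 3.x API that Insert uses (requires the Lucene.Net.Documents,
// Lucene.Net.Index and Lucene.Net.Search namespaces). FindByUri and its
// IndexSearcher parameter are illustrative, not part of the original code.
private Document FindByUri(IndexSearcher searcher, string uri)
{
    // URIFIELD is indexed NOT_ANALYZED, so an exact TermQuery matches it verbatim.
    Query query = new TermQuery(new Term(IndexRecord.URIFIELD, uri));
    TopDocs hits = searcher.Search(query, 1);
    return hits.TotalHits > 0 ? searcher.Doc(hits.ScoreDocs[0].Doc) : null;
}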
/// <summary>
/// Loops through a list of stories and adds them to the index. If the crawl is
/// an incremental update, each story is first removed and then added again.
/// </summary>
/// <param name="modifier">IndexModifier used to update the index</param>
/// <param name="isIncrementalCrawl">bool indicating whether the stories should
/// be removed from the existing index before being added again.</param>
/// <param name="stories">StoryCollection containing the stories to add/update
/// in the index</param>
private void AddStoriesToIndex(IndexModifier modifier, bool isIncrementalCrawl, StoryCollection stories)
{
    if (isIncrementalCrawl)
    {
        // Remove the stories from the index that have been updated.
        Log.DebugFormat("Updating index, removing {0} stories", stories.Count);
        foreach (Story s in stories)
        {
            Term existingItem = new Term("id", s.StoryID.ToString());
            modifier.DeleteDocuments(existingItem);
        }
    }

    // Add the new documents.
    Log.DebugFormat("Adding batch of {0} stories to the index", stories.Count);
    foreach (Story story in stories)
    {
        // Spam stories shouldn't be added to the index.
        if (story.IsSpam)
        {
            continue;
        }

        Document doc = new Document();
        doc.Add(new Field("url", story.Url, Field.Store.NO, Field.Index.TOKENIZED));
        doc.Add(new Field("title", story.Title, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
        doc.Add(new Field("description", story.Description, Field.Store.NO, Field.Index.TOKENIZED));
        doc.Add(new Field("users", GetUserWhoKickedSearchString(story), Field.Store.NO, Field.Index.TOKENIZED));
        doc.Add(new Field("category", story.Category.Name, Field.Store.NO, Field.Index.TOKENIZED));
        doc.Add(new Field("tags", GetStoryTags(story), Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
        doc.Add(new Field("id", story.StoryID.ToString(), Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.Add(new Field("kickCount", story.KickCount.ToString(), Field.Store.NO, Field.Index.UN_TOKENIZED));
        doc.Add(new Field("dateAdded", DateField.DateToString(story.CreatedOn), Field.Store.NO, Field.Index.UN_TOKENIZED));
        modifier.AddDocument(doc);
        Log.DebugFormat("StoryId {0} added to index", story.StoryID);
    }
}
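
// A hedged query sketch against the fields built above, assuming the same
// Lucene.Net 2.x API that AddStoriesToIndex uses (Hits and RangeQuery; both
// were removed in later versions). FindStoriesAddedSince is illustrative, not
// part of the original code. DateField strings sort lexicographically, so a
// RangeQuery over "dateAdded" behaves as a true date range.
public static Hits FindStoriesAddedSince(IndexSearcher searcher, DateTime since)
{
    Term lower = new Term("dateAdded", DateField.DateToString(since));
    Term upper = new Term("dateAdded", DateField.DateToString(DateTime.Now));
    return searcher.Search(new RangeQuery(lower, upper, true));
}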
/// <summary>
/// Creates an index document.
/// </summary>
/// <param name="id">Document ID</param>
/// <param name="author">Author</param>
/// <param name="cat">Article category (top-level category ID)</param>
/// <param name="title">Article title</param>
/// <param name="body">Article body</param>
/// <param name="tag">Tags</param>
/// <param name="path">Document path</param>
/// <returns>The populated Lucene document</returns>
public static Lucene.Net.Documents.Document CreateDocument(string id, string author, string cat, string title, string body, string tag, string path)
{
    Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
    doc.Add(new Field("id", id, Field.Store.YES, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("author", author, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("cat", cat, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("title", title, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("body", body, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_OFFSETS));
    doc.Add(new Field("tag", tag, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("path", path, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("date", DateField.DateToString(DateTime.Now), Field.Store.YES, Field.Index.NO, Field.TermVector.NO));

    // Set the boost: more recent articles get a larger boost, giving them a
    // better chance of ranking near the top of search results. The format string
    // yields e.g. "0.2013052409", so the parsed value grows with the timestamp.
    // Parse with the invariant culture so the literal "." is always accepted as
    // the decimal separator regardless of the current locale.
    float boost = Single.Parse(DateTime.Now.ToString("0.yyyyMMddhh"), System.Globalization.CultureInfo.InvariantCulture);
    doc.SetBoost(boost);

    // Determine the directory for the compressed copy of the document
    // (one folder per 10,000 document IDs).
    string fpath = Directorys.StoreDirectory + Math.Ceiling(Double.Parse(id) / 10000D).ToString("f0");
    if (!System.IO.Directory.Exists(fpath))
    {
        System.IO.Directory.CreateDirectory(fpath);
    }

    // Save the document as gzip to the corresponding location.
    StoreWriter store = new StoreWriter(fpath + @"\" + id + ".gz");
    store.WriteLine(author);
    store.WriteLine(cat);
    store.WriteLine(tag);
    store.WriteLine(title);
    store.WriteLine(path);
    store.WriteLine(body);
    store.Close();

    return doc;
}
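
// A hedged read-path sketch. StoreWriter is a custom class not shown here, so
// this assumes it simply gzip-compresses text lines; ReadStoredDocument is an
// illustrative counterpart, not part of the original code. The read order
// mirrors CreateDocument's WriteLine calls: author, cat, tag, title, path, body.
public static string[] ReadStoredDocument(string gzPath)
{
    using (var fs = System.IO.File.OpenRead(gzPath))
    using (var gz = new System.IO.Compression.GZipStream(fs, System.IO.Compression.CompressionMode.Decompress))
    using (var reader = new System.IO.StreamReader(gz))
    {
        string author = reader.ReadLine();
        string cat = reader.ReadLine();
        string tag = reader.ReadLine();
        string title = reader.ReadLine();
        string path = reader.ReadLine();
        string body = reader.ReadToEnd(); // the body may span multiple lines
        return new[] { author, cat, tag, title, path, body };
    }
}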