/// <summary>
/// Rebuilds the article full-text index from the v_article view.
/// NOTE(review): index path "D:\lucene" and the boosted article id "5938"
/// are hard-coded; consider moving them to configuration.
/// </summary>
private void CreateIndex2()
{
    // 'true' => recreate the index from scratch.
    IndexWriter iw = new IndexWriter("D:\\lucene", anay, true);
    try
    {
        DataTable dt = SqlHelper2.QueryTable("select a_id, b_name,u_nickname,a_title,a_content,b_id from v_article");
        foreach (DataRow dr in dt.Rows)
        {
            Document doc = new Document();
            string title = dr["a_title"].ToString();
            string content = dr["a_content"].ToString();
            string nickname = dr["u_nickname"].ToString();
            string bname = dr["b_name"].ToString();
            string bid = dr["b_id"].ToString();
            string aid = dr["a_id"].ToString();
            if (aid == "5938")
            {
                // Promote this specific article in search results.
                doc.SetBoost(100);
            }
            doc.Add(Field.Keyword("title", title));
            doc.Add(Field.Keyword("content", content));
            doc.Add(Field.Keyword("nick", nickname));
            doc.Add(Field.Text("bname", bname));
            doc.Add(Field.Keyword("bid", bid));
            doc.Add(Field.Keyword("aid", aid));
            iw.AddDocument(doc);
        }
        iw.Optimize();
    }
    finally
    {
        // BUGFIX: previously the writer was only closed on the success path,
        // leaking the index write lock if the query or indexing threw.
        iw.Close();
    }
    Response.Write("<script>alert('建立索引完成!');</script>");
}
public virtual void TestDocBoost_Renamed_Method()
{
    RAMDirectory store = new RAMDirectory();
    IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);

    // Two identical fields; the second carries a field-level boost of 2.
    Fieldable plainField = new Field("field", "word", Field.Store.YES, Field.Index.TOKENIZED);
    Fieldable boostedField = new Field("field", "word", Field.Store.YES, Field.Index.TOKENIZED);
    boostedField.SetBoost(2.0f);

    // Four documents whose effective boosts (doc boost * field boost)
    // come out to 1, 2, 3 and 4 respectively.
    Lucene.Net.Documents.Document[] docs = new Lucene.Net.Documents.Document[]
    {
        new Lucene.Net.Documents.Document(),
        new Lucene.Net.Documents.Document(),
        new Lucene.Net.Documents.Document(),
        new Lucene.Net.Documents.Document()
    };
    docs[2].SetBoost(3.0f);
    docs[3].SetBoost(2.0f);
    docs[0].Add(plainField);   // effective boost = 1
    docs[1].Add(boostedField); // effective boost = 2
    docs[2].Add(plainField);   // effective boost = 3
    docs[3].Add(boostedField); // effective boost = 4

    foreach (Lucene.Net.Documents.Document doc in docs)
    {
        writer.AddDocument(doc);
    }
    writer.Optimize();
    writer.Close();

    // Scores must strictly increase with the effective boost.
    float[] scores = new float[4];
    new IndexSearcher(store).Search(new TermQuery(new Term("field", "word")), new AnonymousClassHitCollector(scores, this));
    float previousScore = 0.0f;
    for (int i = 0; i < 4; i++)
    {
        Assert.IsTrue(scores[i] > previousScore);
        previousScore = scores[i];
    }
}
/// <summary>
/// Builds the Lucene document for an article and persists a gzip copy of it.
/// </summary>
/// <param name="id">Document id (numeric string)</param>
/// <param name="author">Author</param>
/// <param name="cat">Article category (top-level category id)</param>
/// <param name="title">Article title</param>
/// <param name="body">Article body</param>
/// <param name="tag">Tags</param>
/// <param name="path">Document path</param>
/// <returns>The populated Lucene document.</returns>
public static Lucene.Net.Documents.Document CreateDocument(string id, string author, string cat, string title, string body, string tag, string path)
{
    Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
    doc.Add(new Field("id", id, Field.Store.YES, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("author", author, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("cat", cat, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("title", title, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("body", body, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_OFFSETS));
    doc.Add(new Field("tag", tag, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("path", path, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("date", DateField.DateToString(DateTime.Now), Field.Store.YES, Field.Index.NO, Field.TermVector.NO));

    // Boost newer articles: the boost grows with publication time
    // ("0.yyyyMMddhh"), so recent articles rank earlier in results.
    // BUGFIX: parse with the invariant culture — on locales where '.' is not
    // the decimal separator, Single.Parse threw or produced a wildly wrong
    // boost value.
    // NOTE(review): "hh" is the 12-hour clock; "HH" may have been intended —
    // confirm before changing, since it would alter existing boost values.
    float boost = Single.Parse(
        DateTime.Now.ToString("0.yyyyMMddhh", System.Globalization.CultureInfo.InvariantCulture),
        System.Globalization.CultureInfo.InvariantCulture);
    doc.SetBoost(boost);

    // Pick the archive folder: documents are grouped 10000 per directory.
    string fpath = Directorys.StoreDirectory + Math.Ceiling(Double.Parse(id) / 10000D).ToString("f0");
    if (!System.IO.Directory.Exists(fpath))
    {
        System.IO.Directory.CreateDirectory(fpath);
    }

    // Persist the document gzip-compressed next to the index.
    // BUGFIX: close the writer even if a write fails, so the file handle is
    // not leaked.
    StoreWriter store = new StoreWriter(fpath + @"\" + id + ".gz");
    try
    {
        store.WriteLine(author);
        store.WriteLine(cat);
        store.WriteLine(tag);
        store.WriteLine(title);
        store.WriteLine(path);
        store.WriteLine(body);
    }
    finally
    {
        store.Close();
    }
    return (doc);
}
/// <summary>
/// Maps the given documents through the view generator and writes the results
/// into the Lucene index, firing index-update triggers around deletes and adds.
/// </summary>
/// <param name="viewGenerator">Supplies the map definitions applied to each document.</param>
/// <param name="documents">Raw documents to index; each must expose a non-null __document_id.</param>
/// <param name="context">Work context used for triggers and error reporting.</param>
/// <param name="actions">Storage accessor passed through to the robust enumeration.</param>
/// <param name="minimumTimestamp">Unused in this body — TODO confirm callers rely on it elsewhere.</param>
public override void IndexDocuments(AbstractViewGenerator viewGenerator, IEnumerable<object> documents, WorkContext context, IStorageActionsAccessor actions, DateTime minimumTimestamp)
{
    var count = 0;
    Write(context, (indexWriter, analyzer, stats) =>
    {
        var processedKeys = new HashSet<string>();
        // One batcher per registered index-update trigger (nulls filtered out).
        var batchers = context.IndexUpdateTriggers.Select(x => x.CreateBatcher(name))
            .Where(x => x != null)
            .ToList();
        // Lazy wrapper: as the robust enumeration below pulls each document,
        // this Select fires the delete trigger and removes any stale index
        // entry for the document id — exactly once per distinct id
        // (processedKeys guards against duplicates).
        var documentsWrapped = documents.Select((dynamic doc) =>
        {
            if(doc.__document_id == null)
                throw new ArgumentException(string.Format("Cannot index something which doesn't have a document id, but got: '{0}'", doc));
            // NOTE(review): count is also incremented in the foreach below and
            // once more per emitted index entry, so the final figure is not a
            // plain document count — confirm before "fixing".
            count++;
            string documentId = doc.__document_id.ToString();
            if (processedKeys.Add(documentId) == false)
                return doc;
            // Trigger failures are logged and recorded, never propagated.
            batchers.ApplyAndIgnoreAllErrors(
                exception =>
                {
                    logIndexing.WarnException(
                        string.Format("Error when executed OnIndexEntryDeleted trigger for index '{0}', key: '{1}'", name, documentId),
                        exception);
                    context.AddError(name, documentId, exception.Message);
                },
                trigger => trigger.OnIndexEntryDeleted(documentId));
            indexWriter.DeleteDocuments(new Term(Constants.DocumentIdFieldName, documentId.ToLowerInvariant()));
            return doc;
        });
        var anonymousObjectToLuceneDocumentConverter = new AnonymousObjectToLuceneDocumentConverter(indexDefinition);
        // The Lucene Document and the id Field are reused across iterations to
        // avoid per-document allocations; fields are cleared and the id value
        // is overwritten each time.
        var luceneDoc = new Document();
        var documentIdField = new Field(Constants.DocumentIdFieldName, "dummy", Field.Store.YES, Field.Index.ANALYZED_NO_NORMS);
        foreach (var doc in RobustEnumerationIndex(documentsWrapped, viewGenerator.MapDefinitions, actions, context, stats))
        {
            count++;
            float boost;
            var indexingResult = GetIndexingResult(doc, anonymousObjectToLuceneDocumentConverter, out boost);
            if (indexingResult.NewDocId != null && indexingResult.ShouldSkip == false)
            {
                count += 1;
                luceneDoc.GetFields().Clear();
                luceneDoc.SetBoost(boost);
                documentIdField.SetValue(indexingResult.NewDocId.ToLowerInvariant());
                luceneDoc.Add(documentIdField);
                foreach (var field in indexingResult.Fields)
                {
                    luceneDoc.Add(field);
                }
                // Fire the created trigger before the document is written so
                // triggers may still mutate luceneDoc; errors are swallowed
                // into the context error list.
                batchers.ApplyAndIgnoreAllErrors(
                    exception =>
                    {
                        logIndexing.WarnException(
                            string.Format( "Error when executed OnIndexEntryCreated trigger for index '{0}', key: '{1}'", name, indexingResult.NewDocId), exception);
                        context.AddError(name, indexingResult.NewDocId, exception.Message);
                    },
                    trigger => trigger.OnIndexEntryCreated(indexingResult.NewDocId, luceneDoc));
                LogIndexedDocument(indexingResult.NewDocId, luceneDoc);
                AddDocumentToIndex(indexWriter, luceneDoc, analyzer);
            }
            stats.IndexingSuccesses++;
        }
        // Dispose every batcher; disposal failures are logged, not thrown.
        batchers.ApplyAndIgnoreAllErrors(
            e =>
            {
                logIndexing.WarnException("Failed to dispose on index update trigger", e);
                context.AddError(name, null, e.Message);
            },
            x => x.Dispose());
        return count;
    });
    logIndexing.Debug("Indexed {0} documents for {1}", count, name);
}
/// <summary>
/// Indexes every document in <paramref name="docs"/> into <paramref name="dir"/>
/// sequentially on the calling thread.
/// </summary>
/// <param name="docs">Map whose values are the <see cref="Document"/>s to index.</param>
/// <param name="dir">Target Lucene directory.</param>
public static void IndexSerial(System.Collections.IDictionary docs, Directory dir)
{
    IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
    try
    {
        // index all docs in a single thread
        foreach (Document d in docs.Values)
        {
            // Copy fields in their ORIGINAL order. When a document has several
            // fields with the same name, term position/offset information
            // depends on insertion order, so the fields must not be sorted
            // ({{Lucene.Net-2.9.1}} note from the original code).
            Document d1 = new Document();
            d1.SetBoost(d.GetBoost());
            foreach (Fieldable field in d.GetFields())
            {
                d1.Add(field);
            }
            w.AddDocument(d1);
        }
    }
    finally
    {
        // BUGFIX: previously the writer was only closed on the success path,
        // leaking the index write lock if AddDocument threw.
        w.Close();
    }
}
/// <summary>
/// Builds the Lucene document for a ticket: id (stored only), title, details,
/// tags and the concatenated comment text, each with its own boost. Tickets
/// that are not "Closed" additionally receive a document-level boost.
/// NOTE: "Docuement" typo in the name is kept for caller compatibility.
/// </summary>
private Document CreateIndexDocuementForTicket(Ticket ticket)
{
    // Flatten all comment bodies into one newline-terminated blob.
    StringBuilder commentBuilder = new StringBuilder();
    foreach (var commentBody in (from c in ticket.TicketComments select c.Comment))
    {
        commentBuilder.AppendLine(commentBody);
    }
    string allComments = commentBuilder.ToString();

    // Stored but not indexed: used purely to map hits back to tickets.
    var idField = new Lucene.Net.Documents.Field(
        "ticketid",
        ticket.TicketId.ToString(),
        Lucene.Net.Documents.Field.Store.YES,
        Lucene.Net.Documents.Field.Index.NO,
        Lucene.Net.Documents.Field.TermVector.NO);

    var titleField = new Lucene.Net.Documents.Field(
        "title",
        ticket.Title ?? string.Empty,
        Lucene.Net.Documents.Field.Store.YES,
        Lucene.Net.Documents.Field.Index.ANALYZED,
        Lucene.Net.Documents.Field.TermVector.YES);
    titleField.SetBoost(1.5F);

    var detailsField = new Lucene.Net.Documents.Field(
        "details",
        ticket.Details ?? string.Empty,
        Lucene.Net.Documents.Field.Store.NO,
        Lucene.Net.Documents.Field.Index.ANALYZED,
        Lucene.Net.Documents.Field.TermVector.YES);
    detailsField.SetBoost(1F);

    var tagsField = new Lucene.Net.Documents.Field(
        "tags",
        ticket.TagList ?? string.Empty,
        Lucene.Net.Documents.Field.Store.NO,
        Lucene.Net.Documents.Field.Index.ANALYZED,
        Lucene.Net.Documents.Field.TermVector.NO);
    tagsField.SetBoost(2F);

    var commentsField = new Lucene.Net.Documents.Field(
        "comments",
        allComments ?? string.Empty,
        Lucene.Net.Documents.Field.Store.NO,
        Lucene.Net.Documents.Field.Index.ANALYZED,
        Lucene.Net.Documents.Field.TermVector.YES);
    commentsField.SetBoost(.8F);

    var doc = new Document();
    doc.Add(idField);
    doc.Add(titleField);
    doc.Add(detailsField);
    doc.Add(tagsField);
    doc.Add(commentsField);

    // Rank open tickets well above closed ones.
    if (ticket.CurrentStatus != "Closed")
    {
        doc.SetBoost(10F);
    }
    return doc;
}
/// <summary>
/// Translates an entity instance into a Lucene document: applies the mapped
/// class-level boost, records the entity type and id, then adds the mapped
/// property fields.
/// </summary>
public Document GetDocument(object instance, object id, Type entityType)
{
    var document = new Document();

    // Class-level boost, if one was configured in the mapping.
    if (rootClassMapping.Boost != null)
    {
        document.SetBoost(rootClassMapping.Boost.Value);
    }

    // TODO: Check if that should be an else?
    {
        // Store the CLR type so queries can filter results by entity class.
        document.Add(new Field(CLASS_FIELDNAME, TypeHelper.LuceneTypeName(entityType), Field.Store.YES, Field.Index.UN_TOKENIZED));
        idMapping.Bridge.Set(idMapping.Name, id, document, Field.Store.YES, Field.Index.UN_TOKENIZED, idMapping.Boost);
    }

    BuildDocumentFields(instance, document, rootClassMapping, string.Empty);
    return document;
}
/// <summary>
/// Builds the Lucene document for an article and persists a gzip copy of it.
/// (Original doc comments were mojibake; reconstructed in English.)
/// </summary>
/// <param name="id">Document id (numeric string)</param>
/// <param name="author">Author</param>
/// <param name="cat">Article category id</param>
/// <param name="title">Article title</param>
/// <param name="body">Article body</param>
/// <param name="tag">Tags</param>
/// <param name="path">Document path</param>
/// <returns>The populated Lucene document.</returns>
public static Lucene.Net.Documents.Document CreateDocument(string id, string author, string cat, string title, string body, string tag, string path)
{
    Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
    doc.Add(new Field("id", id, Field.Store.YES, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("author", author, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("cat", cat, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("title", title, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("body", body, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_OFFSETS));
    doc.Add(new Field("tag", tag, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("path", path, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
    doc.Add(new Field("date", DateField.DateToString(DateTime.Now), Field.Store.YES, Field.Index.NO, Field.TermVector.NO));

    // Time-based boost ("0.yyyyMMddhh"): newer articles rank earlier.
    // BUGFIX: both the format and the parse now use the invariant culture —
    // on locales where '.' is not the decimal separator, Single.Parse threw
    // or produced a wildly wrong boost value.
    float boost = Single.Parse(
        DateTime.Now.ToString("0.yyyyMMddhh", System.Globalization.CultureInfo.InvariantCulture),
        System.Globalization.CultureInfo.InvariantCulture);
    doc.SetBoost(boost);

    // Archive folder: documents are grouped 10000 per directory.
    string fpath = Directorys.StoreDirectory + Math.Ceiling(Double.Parse(id) / 10000D).ToString("f0");
    if (!System.IO.Directory.Exists(fpath))
    {
        System.IO.Directory.CreateDirectory(fpath);
    }

    // Persist the document gzip-compressed.
    // BUGFIX: close the writer even if a write fails, so the file handle is
    // not leaked.
    StoreWriter store = new StoreWriter(fpath + @"\" + id + ".gz");
    try
    {
        store.WriteLine(author);
        store.WriteLine(cat);
        store.WriteLine(tag);
        store.WriteLine(title);
        store.WriteLine(path);
        store.WriteLine(body);
    }
    finally
    {
        store.Close();
    }
    return doc;
}
/// <summary>
/// Translates an entity instance into a Lucene document: applies the metadata
/// boost, records the concrete CLR type and id, then adds the property fields.
/// </summary>
public Document GetDocument(object instance, object id)
{
    Document document = new Document();
    System.Type instanceClass = instance.GetType();

    // Class-level boost, if one was configured in the mapping metadata.
    if (rootPropertiesMetadata.boost != null)
    {
        document.SetBoost(rootPropertiesMetadata.boost.Value);
    }

    // TODO: Check if that should be an else?
    {
        // Record the concrete CLR type so searches can filter by entity class.
        document.Add(new Field(CLASS_FIELDNAME, TypeHelper.LuceneTypeName(instanceClass), Field.Store.YES, Field.Index.UN_TOKENIZED));
        idBridge.Set(idKeywordName, id, document, Field.Store.YES, Field.Index.UN_TOKENIZED, idBoost);
    }

    BuildDocumentFields(instance, document, rootPropertiesMetadata);
    return document;
}
/// <summary>
/// Builds a single demo document, boosts it at both field and document level,
/// writes it to the index, optimizes, and closes the writer.
/// </summary>
/// <remarks>
/// Field option reference (translated from the original Chinese notes):
///   Field.Store.YES stores the raw (pre-analysis) value; Field.Store.NO does
///   not (storage is independent of indexing); Field.Store.COMPRESS stores
///   compressed — useful for long text or binary data at some CPU cost.
///   Field.Index.ANALYZED tokenizes then indexes; ANALYZED_NO_NORMS does the
///   same but collapses norms to a single byte to save space;
///   NOT_ANALYZED indexes the whole value as one term; NOT_ANALYZED_NO_NORMS
///   additionally drops full norms.
///   Field.TermVector.* controls whether term vectors (terms with their
///   per-document frequencies) are stored: NO, YES, WITH_POSITIONS,
///   WITH_OFFSETS, or WITH_POSITIONS_OFFSETS.
/// </remarks>
public void CreateIndex()
{
    var document = new Document();

    // Field-level boost: terms from this field score 1.1x.
    var titleField = new Field("title", "笑傲江湖", Field.Store.YES, Field.Index.ANALYZED);
    titleField.SetBoost(1.1f);
    document.Add(titleField);

    // Document-level boost (default is 1.0).
    document.SetBoost(2);

    this.indexWriter.AddDocument(document);

    // Compact the index structure.
    this.indexWriter.Optimize();
    this.indexWriter.Close();
}
/// <summary>
/// Indexes the key/value pairs of a crawled file entry. The "content" value is
/// a file path streamed through an analyzer chosen by file type; all other
/// values are indexed un-normalized. Hidden files are down-boosted.
/// </summary>
/// <param name="keyValueDic">Dictionary object holding the key value pairs</param>
public void Index(StringDictionary keyValueDic)
{
    Document doc = new Document();
    foreach (string key in keyValueDic.Keys)
    {
        if (keyValueDic[key] == null)
        {
            continue;
        }
        if (key == "content")
        {
            try
            {
                string type = keyValueDic["type"];
                // For archive files use the standard analyzer; everything
                // else goes through the stop-word analyzer.
                if (type == ".rar" || type == ".zip" || type == ".gz" || type == ".bz2" || type == ".tar")
                    pfaw.AddAnalyzer("content", standardAnalyzer);
                else
                    pfaw.AddAnalyzer("content", stopAnalyzer);
                doc.Add(new Field(key, new StreamReader(keyValueDic[key])));
            }
            catch
            {
                // Best effort: unreadable content simply is not indexed.
            }
        }
        else if (key == "size")
            doc.Add(new Field(key, keyValueDic[key].PadLeft(12, '0'), Field.Store.YES, Field.Index.NO_NORMS));
        else
            doc.Add(new Field(key, keyValueDic[key].ToLower(), Field.Store.YES, Field.Index.NO_NORMS));
    }
    try
    {
        // BUGFIX: a missing "attr" entry used to throw a NullReferenceException
        // here, which the catch silently swallowed — skipping AddDocument and
        // dropping the file from the index entirely. Null-check first.
        string attr = keyValueDic["attr"];
        if (attr != null && attr.ToLower().Contains("hidden"))
            doc.SetBoost(.5f); // setting the ranking or boosting factor of the document
        index.AddDocument(doc);
    }
    catch
    {
        // Best effort: indexing failures are ignored (original behavior).
    }
}
/// <summary>
/// Creates a Lucene document for one crawled article, applies the given
/// document-level boost, and writes it through the supplied writer.
/// </summary>
private void AddDocument(IndexWriter writer, string title, string url, string site, string body, string publish_time, int boost)
{
    var document = new Document();
    // Stored and analyzed: full-text searchable.
    document.Add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
    // Stored but not analyzed: exact-match metadata only.
    document.Add(new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("site", site, Field.Store.YES, Field.Index.NOT_ANALYZED));
    // Stored and analyzed: full-text searchable.
    document.Add(new Field("body", body, Field.Store.YES, Field.Index.ANALYZED));
    // Stored but not analyzed: exact-match metadata only.
    document.Add(new Field("publish_time", publish_time, Field.Store.YES, Field.Index.NOT_ANALYZED));
    // Document-level relevance boost.
    document.SetBoost(boost);
    writer.AddDocument(document);
}
/// <summary>
/// Indexes every document in <paramref name="docs"/> into
/// <paramref name="dir"/> sequentially on the calling thread.
/// </summary>
public static void IndexSerial(System.Collections.IDictionary docs, Directory dir)
{
    IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer());
    foreach (Document original in docs.Values)
    {
        // IMPORTANT: fields are re-added in their ORIGINAL order, never
        // sorted. When a document carries several fields with the same name
        // they are effectively concatenated, so term position/offset
        // information depends on insertion order.
        Document copy = new Document();
        copy.SetBoost(original.GetBoost());
        foreach (Fieldable field in original.GetFields())
        {
            copy.Add(field);
        }
        writer.AddDocument(copy);
    }
    writer.Close();
}
/// <summary>
/// Coroutine that (re)indexes one update record: deletes any stale index entry
/// for the resource, then — for adds — fetches the revision data over HTTP,
/// builds a type-specific Lucene document (pages/files/comments/users), and
/// writes it to the per-wiki index instance.
/// </summary>
/// <param name="data">The queued update record (id, meta document, action stack).</param>
/// <param name="result">Coroutine completion result; returned exactly once on every exit path.</param>
private Yield OnQueueExpire(UpdateRecord data, Result result) {
    _log.DebugFormat("indexing '{0}'", data.Id);
    // Normalize the id to a canonical localhost URI used as the index key.
    XUri docId = data.Id.WithHost("localhost").WithPort(80);
    string wikiid = data.Id.Host;
    if(string.IsNullOrEmpty(wikiid)) {
        wikiid = "default";
    }
    XDoc revision = null;
    XUri revisionUri = null;
    // Channel layout: segment[1] = resource type, segment[2] = action.
    XUri channel = data.Meta["channel"].AsUri;
    string type = channel.Segments[1];
    string action = channel.Segments[2];
    string contentUri = string.Empty;
    _log.DebugFormat("processing action '{0}' for resource type '{1}' and id '{2}'", action, type, data.Id);
    Term deleteTerm;
    // if this is an Add we need to validate the data before we get to a possible delete
    string oldDocUri = docId.ToString().ToLowerInvariant();
    // Choose the term used to delete any existing index entry for this resource.
    switch(type) {
    case "pages":
        // Archived pages were indexed under their non-archive URI.
        if(oldDocUri.Contains("@api/deki/archive/")) {
            oldDocUri = oldDocUri.Replace("@api/deki/archive/", "@api/deki/");
        }
        deleteTerm = new Term("uri", oldDocUri);
        break;
    case "users":
        var userId = data.Meta["userid"].AsText;
        deleteTerm = new Term("id.user", userId);
        break;
    default:
        deleteTerm = new Term("uri", oldDocUri);
        break;
    }
    if(data.ActionStack.IsAdd) {
        if(data.Meta.IsEmpty) {
            throw new DreamBadRequestException("document is empty");
        }
        // Resolve the revision URI (and, where applicable, the content URI)
        // for the resource type; fail fast on missing data.
        switch(type) {
        case "files":
            revisionUri = data.Meta["revision.uri"].AsUri;
            contentUri = data.Meta["content.uri"].AsText;
            if(string.IsNullOrEmpty(contentUri)) {
                throw new DreamBadRequestException(string.Format("missing content uri for '{0}'", data.Id));
            }
            break;
        case "pages":
            revisionUri = data.Meta["revision.uri"].AsUri;
            contentUri = data.Meta["content.uri[@type='application/xml']"].AsText;
            if(string.IsNullOrEmpty(contentUri)) {
                throw new DreamBadRequestException(string.Format("missing xml content uri for '{0}'", data.Id));
            }
            break;
        case "comments":
            revisionUri = data.Meta["uri"].AsUri;
            break;
        case "users":
            revisionUri = data.Meta["uri"].AsUri;
            break;
        }
        if(revisionUri == null) {
            throw new DreamBadRequestException(string.Format("missing revision uri for '{0}'", data.Id));
        }
        Result<DreamMessage> revisionResult;
        // NOTE(review): the {0}/{1} placeholders are swapped relative to the
        // argument order, so this logs "for <uri> from <id>".
        _log.DebugFormat("fetching revision for {1} from {0}", data.Id, revisionUri);
        yield return revisionResult = Plug.New(revisionUri).With("apikey", _apikey).GetAsync();
        if(!revisionResult.Value.IsSuccessful) {
            throw BadRequestException(revisionResult.Value, "unable to fetch revision info from '{0}' (status: {1})", data.Meta["revision.uri"].AsText, revisionResult.Value.Status);
        }
        revision = revisionResult.Value.ToDocument();
    }
    // Always remove the stale entry, whether or not we re-add below.
    _log.DebugFormat("deleting '{0}' from index using uri {1}", data.Id, oldDocUri);
    GetInstance(wikiid).DeleteDocuments(deleteTerm);
    // build new document
    string text = string.Empty;
    if(data.ActionStack.IsAdd) {
        _log.DebugFormat("adding '{0}' to index", data.Id);
        var d = new Document();
        d.Add(new Field("uri", docId.ToString().ToLowerInvariant(), Field.Store.YES, Field.Index.UN_TOKENIZED));
        d.Add(new Field("mime", revision["contents/@type"].AsText ?? "", Field.Store.YES, Field.Index.TOKENIZED));
        // Edit date: files use creation date; comments fall back to post date.
        DateTime editDate;
        string editDateStringFromDoc = (type == "files") ? revision["date.created"].AsText : revision["date.edited"].AsText;
        DateTime.TryParse(editDateStringFromDoc, out editDate);
        if(type == "comments" && editDate == DateTime.MinValue) {
            // if editDate is still min, we didn't find an edit date and need to use post date
            DateTime.TryParse(revision["date.posted"].AsText, out editDate);
        }
        if(editDate != DateTime.MinValue) {
            var editDateString = editDate.ToUniversalTime().ToString("yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture.DateTimeFormat);
            d.Add(new Field("date.edited", editDateString, Field.Store.YES, Field.Index.UN_TOKENIZED));
        }
        string language = null;
        switch(type) {
        case "pages": {
            // filter what we actually index
            var ns = revision["namespace"].AsText;
            if(Array.IndexOf(_indexNamespaceWhitelist, ns) < 0) {
                _log.DebugFormat("not indexing '{0}', namespace '{1}' is not in whitelist", data.Id, ns);
                result.Return();
                yield break;
            }
            string path = revision["path"].AsText ?? string.Empty;
            d.Add(new Field("path", path, Field.Store.YES, Field.Index.TOKENIZED));
            d.Add(new Field("id.page", revision["@id"].AsText ?? "0", Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("title", revision["title"].AsText ?? string.Empty, Field.Store.YES, Field.Index.TOKENIZED));
            d.Add(new Field("title.sort", revision["title"].AsText ?? string.Empty, Field.Store.NO, Field.Index.UN_TOKENIZED));
            d.Add(new Field("namespace", ns ?? string.Empty, Field.Store.NO, Field.Index.UN_TOKENIZED));
            d.Add(new Field("type", "wiki", Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("author", revision["user.author/username"].AsText ?? string.Empty, Field.Store.YES, Field.Index.TOKENIZED));
            d.Add(new Field("author.sort", revision["user.author/username"].AsText ?? string.Empty, Field.Store.NO, Field.Index.UN_TOKENIZED));
            // store the original page title in case display title was set
            int index = path.LastIndexOf('/');
            if(index > 0) {
                path = path.Substring(index + 1);
            }
            d.Add(new Field("path.title", path, Field.Store.YES, Field.Index.TOKENIZED));
            var pageUri = data.Meta["uri"].AsUri;
            _log.DebugFormat("fetching page info: {0}", pageUri);
            Result<DreamMessage> pageResult;
            yield return pageResult = Plug.New(pageUri).With("apikey", _apikey).GetAsync();
            DreamMessage page = pageResult.Value;
            if(!page.IsSuccessful) {
                throw BadRequestException(page, "unable to fetch page data from '{0}' for '{1}'", contentUri, data.Id);
            }
            XDoc pageDoc = page.ToDocument();
            var score = pageDoc["rating/@score"].AsText;
            if(!string.IsNullOrEmpty(score)) {
                d.Add(new Field("rating.score", score, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }
            d.Add(new Field("creator", pageDoc["user.createdby/username"].AsText ?? string.Empty, Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("date.created", DateTimeToString(pageDoc["date.created"].AsDate), Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("rating.count", pageDoc["rating/@count"].AsText ?? "0", Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("title.parent", pageDoc["page.parent/title"].AsText ?? "", Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("path.parent", pageDoc["page.parent/path"].AsText ?? "", Field.Store.YES, Field.Index.UN_TOKENIZED));
            // Index every ancestor path so subtree queries can match.
            foreach(var ancestor in pageDoc["//page.parent/path"]) {
                var ancestorPath = ancestor.AsText;
                if(string.IsNullOrEmpty(ancestorPath)) {
                    continue;
                }
                d.Add(new Field("path.ancestor", ancestorPath, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }
            var parentId = pageDoc["page.parent/@id"].AsUInt;
            if(parentId.HasValue) {
                d.Add(new Field("id.parent", parentId.Value.ToString(), Field.Store.YES, Field.Index.UN_TOKENIZED));
            }
            // check if this is a redirect
            if(!pageDoc["page.redirectedto"].IsEmpty) {
                // redirect
                if(!(Config["index-redirects"].AsBool ?? false)) {
                    _log.DebugFormat("indexing of redirects is disabled, not indexing '{0}'", data.Id);
                    result.Return();
                    yield break;
                }
                _log.DebugFormat("indexing redirect, leave content empty");
                d.Add(new Field("size", "0", Field.Store.YES, Field.Index.UN_TOKENIZED));
            } else {
                language = pageDoc["language"].AsText;
                // fetch the page
                _log.DebugFormat("fetching page content: {0}", contentUri);
                DreamMessage content = null;
                yield return Plug.New(contentUri).With("apikey", _apikey).WithTimeout(TimeSpan.FromMinutes(10))
                    .Get(new Result<DreamMessage>())
                    .Set(x => content = x);
                if(!content.IsSuccessful) {
                    throw BadRequestException(content, "unable to fetch content from '{0}' for '{1}'", contentUri, data.Id);
                }
                text = _htmlConverter.Convert(content.ToDocument());
                d.Add(new Field("size", content.ContentLength.ToString(), Field.Store.YES, Field.Index.UN_TOKENIZED));
            }
            // process tags, if they exist
            if(!data.Meta["tags.uri"].IsEmpty) {
                Result<DreamMessage> tagsResult;
                yield return tagsResult = Plug.New(data.Meta["tags.uri"].AsUri).With("apikey", _apikey).GetAsync();
                if(!tagsResult.Value.IsSuccessful) {
                    throw BadRequestException(tagsResult.Value, "unable to fetch tags from '{0}' for '{1}'", data.Meta["tags.uri"].AsText, data.Id);
                }
                XDoc tags = tagsResult.Value.ToDocument();
                StringBuilder sb = new StringBuilder();
                foreach(XDoc v in tags["tag/@value"]) {
                    sb.AppendFormat("{0}\n", v.AsText);
                }
                d.Add(new Field("tag", sb.ToString(), Field.Store.YES, Field.Index.TOKENIZED));
            }
            // Save page properties
            yield return Coroutine.Invoke(AddPropertiesToDocument, d, pageDoc["properties"], new Result());
            // set document boost based on namespace
            d.SetBoost(GetNamespaceBoost(revision["namespace"].AsText));
            break;
        }
        case "files": {
            var ns = revision["page.parent/namespace"].AsText;
            if(Array.IndexOf(_indexNamespaceWhitelist, ns) < 0) {
                _log.DebugFormat("not indexing '{0}', namespace '{1}' is not in whitelist", data.Id, ns);
                result.Return();
                yield break;
            }
            d.Add(new Field("namespace", ns ?? string.Empty, Field.Store.NO, Field.Index.UN_TOKENIZED));
            var filename = revision["filename"].AsText;
            string extension = Path.GetExtension(filename);
            d.Add(new Field("path", revision["page.parent/path"].AsText ?? string.Empty, Field.Store.YES, Field.Index.TOKENIZED));
            d.Add(new Field("title.page", revision["page.parent/title"].AsText ?? string.Empty, Field.Store.YES, Field.Index.TOKENIZED));
            d.Add(new Field("id.page", revision["page.parent/@id"].AsText ?? "0", Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("id.file", revision["@id"].AsText ?? "0", Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("extension", extension ?? string.Empty, Field.Store.NO, Field.Index.TOKENIZED));
            d.Add(new Field("filename", filename ?? string.Empty, Field.Store.NO, Field.Index.TOKENIZED));
            d.Add(new Field("title", filename ?? string.Empty, Field.Store.YES, Field.Index.TOKENIZED));
            d.Add(new Field("title.sort", filename ?? string.Empty, Field.Store.NO, Field.Index.UN_TOKENIZED));
            d.Add(new Field("author", revision["user.createdby/username"].AsText ?? string.Empty, Field.Store.YES, Field.Index.TOKENIZED));
            d.Add(new Field("author.sort", revision["user.createdby/username"].AsText ?? string.Empty, Field.Store.NO, Field.Index.UN_TOKENIZED));
            d.Add(new Field("description", revision["description"].AsText ?? string.Empty, Field.Store.YES, Field.Index.TOKENIZED));
            d.Add(new Field("type", GetDocumentType(extension), Field.Store.YES, Field.Index.UN_TOKENIZED));
            // convert binary types to text
            Result<Tuplet<string, int>> contentResult;
            yield return contentResult = Coroutine.Invoke(ConvertToText, extension, new XUri(contentUri), new Result<Tuplet<string, int>>());
            Tuplet<string, int> content = contentResult.Value;
            text = content.Item1;
            var size = content.Item2;
            if(size == 0) {
                // since ConvertToText only gets the byte size if there is a converter for the filetype,
                // we fall back to the size in the document if it comes back as zero
                size = revision["contents/@size"].AsInt ?? 0;
            }
            d.Add(new Field("size", size.ToString(), Field.Store.YES, Field.Index.UN_TOKENIZED));
            break;
        }
        case "comments": {
            var ns = revision["page.parent/namespace"].AsText;
            if(Array.IndexOf(_indexNamespaceWhitelist, ns) < 0) {
                _log.DebugFormat("not indexing '{0}', namespace '{1}' is not in whitelist", data.Id, ns);
                result.Return();
                yield break;
            }
            d.Add(new Field("namespace", ns ?? string.Empty, Field.Store.NO, Field.Index.UN_TOKENIZED));
            text = revision["content"].AsText ?? string.Empty;
            d.Add(new Field("comments", text, Field.Store.YES, Field.Index.TOKENIZED));
            d.Add(new Field("type", "comment", Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("path", revision["page.parent/path"].AsText ?? string.Empty, Field.Store.YES, Field.Index.TOKENIZED));
            d.Add(new Field("id.page", revision["page.parent/@id"].AsText ?? "0", Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("title.page", revision["page.parent/title"].AsText ?? string.Empty, Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("id.comment", revision["@id"].AsText ?? "0", Field.Store.YES, Field.Index.UN_TOKENIZED));
            string title = "Comment #" + revision["number"].AsInt;
            d.Add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED));
            d.Add(new Field("title.sort", title, Field.Store.NO, Field.Index.UN_TOKENIZED));
            // Prefer the last editor; fall back to the original author.
            string author = revision["user.editedby/username"].AsText ?? revision["user.createdby/username"].AsText ?? "";
            d.Add(new Field("author", author, Field.Store.YES, Field.Index.TOKENIZED));
            d.Add(new Field("author.sort", author, Field.Store.NO, Field.Index.UN_TOKENIZED));
            break;
        }
        case "users": {
            d.Add(new Field("type", "user", Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("id.user", revision["@id"].AsText, Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("username", revision["username"].AsText, Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("email", revision["email"].AsText ?? string.Empty, Field.Store.NO, Field.Index.UN_TOKENIZED));
            var fullname = revision["fullname"].AsText ?? string.Empty;
            d.Add(new Field("fullname", fullname, Field.Store.YES, Field.Index.ANALYZED));
            d.Add(new Field("fullname.sort", fullname, Field.Store.NO, Field.Index.NOT_ANALYZED));
            d.Add(new Field("date.lastlogin", DateTimeToString(revision["date.lastlogin"].AsDate), Field.Store.NO, Field.Index.UN_TOKENIZED));
            d.Add(new Field("date.created", DateTimeToString(revision["date.created"].AsDate), Field.Store.YES, Field.Index.UN_TOKENIZED));
            d.Add(new Field("language", revision["language"].AsText ?? string.Empty, Field.Store.NO, Field.Index.UN_TOKENIZED));
            d.Add(new Field("service.authentication.id", revision["service.authentication/@id"].AsText ?? string.Empty, Field.Store.NO, Field.Index.UN_TOKENIZED));
            foreach(XDoc group in revision["groups/group"]) {
                d.Add(new Field("group.id", group["@id"].AsText ?? string.Empty, Field.Store.NO, Field.Index.UN_TOKENIZED));
                d.Add(new Field("group", group["groupname"].AsText ?? string.Empty, Field.Store.NO, Field.Index.UN_TOKENIZED));
            }
            // NOTE (MaxM): User properties are only automatically included for current user so they need to be retrieved.
            Result<DreamMessage> propertyResult;
            yield return propertyResult = Plug.New(revisionUri).At("properties").With("apikey", _apikey).GetAsync();
            if(!propertyResult.Value.IsSuccessful) {
                throw BadRequestException(propertyResult.Value, "unable to fetch properties for user id '{0}' for '{1}'", revision["@id"].AsText, data.Id);
            }
            XDoc propertiesDoc = propertyResult.Value.ToDocument();
            // Save user properties
            yield return Coroutine.Invoke(AddPropertiesToDocument, d, propertiesDoc, new Result());
            break;
        }
        }// switch(type)
        // Common fields: full content (index-only), truncated stored preview,
        // and a word count computed over the extracted text.
        string preview = text;
        if(preview.Length > _previewLength) {
            preview = preview.Substring(0, _previewLength);
        }
        d.Add(new Field("content", text, Field.Store.NO, Field.Index.TOKENIZED));
        d.Add(new Field("preview", preview, Field.Store.YES, Field.Index.TOKENIZED));
        d.Add(new Field("wordcount", _wordcountRegex.Matches(text).Count.ToString(), Field.Store.YES, Field.Index.UN_TOKENIZED));
        if(type == "files" || type == "comments") {
            // fetch parent page for language
            string parentUri = revision["page.parent/@href"].AsText;
            if(!string.IsNullOrEmpty(parentUri)) {
                Result<DreamMessage> parentResult;
                yield return parentResult = Plug.New(parentUri).With("apikey", _apikey).GetAsync();
                if(!parentResult.Value.IsSuccessful) {
                    throw new DreamBadRequestException(string.Format("unable to fetch parent from '{0}' for '{1}'", contentUri, data.Id));
                }
                XDoc parent = parentResult.Value.ToDocument();
                language = parent["language"].AsText;
            }
        }
        if(string.IsNullOrEmpty(language)) {
            language = "neutral";
        }
        d.Add(new Field("language", language, Field.Store.YES, Field.Index.UN_TOKENIZED));
        _log.DebugFormat("Adding document for '{0}' to index", data.Id);
        GetInstance(wikiid).AddDocument(d);
    }
    _log.DebugFormat("completed indexing '{0}'", data.Id);
    result.Return();
}