public void TestRollbackIntegrityWithBufferFlush() { Directory dir = new MockRAMDirectory(); IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); for (int i = 0; i < 5; i++) { Document doc = new Document(); doc.Add(new Field("pk", i.ToString(), Field.Store.YES, Field.Index.ANALYZED_NO_NORMS)); w.AddDocument(doc); } w.Close(); // If buffer size is small enough to cause a flush, errors ensue... w = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); w.SetMaxBufferedDocs(2); Term pkTerm = new Term("pk", ""); for (int i = 0; i < 3; i++) { Document doc = new Document(); String value = i.ToString(); doc.Add(new Field("pk", value, Field.Store.YES, Field.Index.ANALYZED_NO_NORMS)); doc.Add(new Field("text", "foo", Field.Store.YES, Field.Index.ANALYZED_NO_NORMS)); w.UpdateDocument(pkTerm.CreateTerm(value), doc); } w.Rollback(); IndexReader r = IndexReader.Open(dir, true); Assert.AreEqual(5, r.NumDocs(), "index should contain same number of docs post rollback"); r.Close(); dir.Close(); }
// ... has multiple qualifications
private Document MakeQualification(string qualification, int year)
{
    Document job = new Document();
    job.Add(NewStringField("qualification", qualification, Field.Store.YES));
    job.Add(new IntField("year", year, Field.Store.NO));
    return job;
}
/// <summary>
/// Converts a BarThread into a <see cref="Lucene.Net.Documents.Document"/>
/// </summary>
/// <param name="barThread">The forum-thread entity</param>
/// <returns>Lucene.Net.Documents.Document</returns>
public static Document Convert(BarThread barThread)
{
    Document doc = new Document();

    // Index the basic thread information
    doc.Add(new Field(BarIndexDocument.SectionId, barThread.SectionId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field(BarIndexDocument.ThreadId, barThread.ThreadId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field(BarIndexDocument.PostId, "0", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field(BarIndexDocument.Subject, barThread.Subject.ToLower(), Field.Store.YES, Field.Index.ANALYZED));
    doc.Add(new Field(BarIndexDocument.Body, HtmlUtility.StripHtml(barThread.GetBody(), true, false).ToLower(), Field.Store.NO, Field.Index.ANALYZED));
    doc.Add(new Field(BarIndexDocument.Author, barThread.Author, Field.Store.YES, Field.Index.ANALYZED));
    doc.Add(new Field(BarIndexDocument.IsPost, "0", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field(BarIndexDocument.DateCreated, DateTools.DateToString(barThread.DateCreated, DateTools.Resolution.DAY), Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field(BarIndexDocument.TenantTypeId, barThread.TenantTypeId, Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Index the thread's tags
    TagService tagService = new TagService(TenantTypeIds.Instance().BarThread());
    IEnumerable<ItemInTag> itemInTags = tagService.GetItemInTagsOfItem(barThread.ThreadId);
    foreach (ItemInTag itemInTag in itemInTags)
    {
        doc.Add(new Field(BarIndexDocument.Tag, itemInTag.TagName.ToLower(), Field.Store.YES, Field.Index.ANALYZED));
    }
    return doc;
}
public void TestReverse() { Directory dir = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, Similarity, TimeZone); Document doc = new Document(); doc.Add(NewStringField("value", "foo", Field.Store.NO)); doc.Add(NewStringField("value", "bar", Field.Store.NO)); doc.Add(NewStringField("id", "1", Field.Store.YES)); writer.AddDocument(doc); doc = new Document(); doc.Add(NewStringField("value", "baz", Field.Store.NO)); doc.Add(NewStringField("id", "2", Field.Store.YES)); writer.AddDocument(doc); IndexReader ir = writer.Reader; writer.Dispose(); IndexSearcher searcher = NewSearcher(ir); Sort sort = new Sort(new SortedSetSortField("value", true)); TopDocs td = searcher.Search(new MatchAllDocsQuery(), 10, sort); assertEquals(2, td.TotalHits); // 'bar' comes before 'baz' assertEquals("2", searcher.Doc(td.ScoreDocs[0].Doc).Get("id")); assertEquals("1", searcher.Doc(td.ScoreDocs[1].Doc).Get("id")); ir.Dispose(); dir.Dispose(); }
public virtual void TestNGramPrefixGridLosAngeles() { SpatialContext ctx = SpatialContext.GEO; TermQueryPrefixTreeStrategy prefixGridStrategy = new TermQueryPrefixTreeStrategy(new QuadPrefixTree(ctx), "geo"); Spatial4n.Core.Shapes.IShape point = ctx.MakePoint(-118.243680, 34.052230); Document losAngeles = new Document(); losAngeles.Add(new StringField("name", "Los Angeles", Field.Store.YES)); foreach (IndexableField field in prefixGridStrategy.CreateIndexableFields(point)) { losAngeles.Add(field); } losAngeles.Add(new StoredField(prefixGridStrategy.FieldName, point.toString()));//just for diagnostics addDocumentsAndCommit(Arrays.AsList(losAngeles)); // This won't work with simple spatial context... SpatialArgsParser spatialArgsParser = new SpatialArgsParser(); // TODO... use a non polygon query // SpatialArgs spatialArgs = spatialArgsParser.parse( // "Intersects(POLYGON((-127.00390625 39.8125,-112.765625 39.98828125,-111.53515625 31.375,-125.94921875 30.14453125,-127.00390625 39.8125)))", // new SimpleSpatialContext()); // Query query = prefixGridStrategy.makeQuery(spatialArgs, fieldInfo); // SearchResults searchResults = executeQuery(query, 1); // assertEquals(1, searchResults.numFound); }
/// <summary>
/// Converts a MicroblogEntity into a <see cref="Lucene.Net.Documents.Document"/>
/// </summary>
/// <param name="microblog">The microblog entity</param>
/// <returns>Lucene.Net.Documents.Document</returns>
public static Document Convert(MicroblogEntity microblog)
{
    Document doc = new Document();

    // Index the basic microblog information
    doc.Add(new Field(MicroblogIndexDocument.MicroblogId, microblog.MicroblogId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    if (microblog.OriginalMicroblog != null)
    {
        doc.Add(new Field(MicroblogIndexDocument.Body, HtmlUtility.StripHtml(microblog.Body, true, false).ToLower() + HtmlUtility.StripHtml(microblog.OriginalMicroblog.Body, true, false).ToLower(), Field.Store.NO, Field.Index.ANALYZED));
    }
    else
    {
        doc.Add(new Field(MicroblogIndexDocument.Body, HtmlUtility.StripHtml(microblog.Body, true, false).ToLower(), Field.Store.NO, Field.Index.ANALYZED));
    }
    doc.Add(new Field(MicroblogIndexDocument.DateCreated, DateTools.DateToString(microblog.DateCreated, DateTools.Resolution.MILLISECOND), Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field(MicroblogIndexDocument.HasMusic, microblog.HasMusic ? "1" : "0", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field(MicroblogIndexDocument.HasPhoto, microblog.HasPhoto ? "1" : "0", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field(MicroblogIndexDocument.HasVideo, microblog.HasVideo ? "1" : "0", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field(MicroblogIndexDocument.IsOriginality, microblog.ForwardedMicroblogId == 0 ? "1" : "0", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field(MicroblogIndexDocument.TenantTypeId, microblog.TenantTypeId, Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Index the microblog's topics (tags)
    TagService tagService = new TagService(TenantTypeIds.Instance().Microblog());
    IEnumerable<ItemInTag> itemInTags = tagService.GetItemInTagsOfItem(microblog.MicroblogId);
    foreach (ItemInTag itemInTag in itemInTags)
    {
        doc.Add(new Field(MicroblogIndexDocument.Topic, itemInTag.TagName.ToLower(), Field.Store.YES, Field.Index.ANALYZED));
    }
    return doc;
}
public void MyTestMethod_index()
{
    string strIndexDir = @"D:\Index";
    Lucene.Net.Store.Directory indexDir = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir));
    // Version parameter is used for backward compatibility. Stop words can also be passed to avoid indexing certain words.
    Analyzer std = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
    // Create an IndexWriter object (true = create a new index, replacing any existing one).
    using (IndexWriter idxw = new IndexWriter(indexDir, std, true, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
        // This read must run: `file` is used below to populate the "text" field.
        var file = System.IO.File.ReadAllText(@"d:\test.txt");
        Lucene.Net.Documents.Field fldText = new Lucene.Net.Documents.Field("text", file,
            Lucene.Net.Documents.Field.Store.YES,
            Lucene.Net.Documents.Field.Index.ANALYZED,
            Lucene.Net.Documents.Field.TermVector.YES);
        doc.Add(fldText);
        doc.Add(new Field("addtime", System.DateTime.Now.ToString(), Lucene.Net.Documents.Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
        // Write the document to the index
        idxw.AddDocument(doc);
        // Optimize and close the writer
        idxw.Optimize();
    }
    Console.WriteLine("Indexing Done");
}
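// A minimal companion sketch, not part of the original source: searching the "text" field built by
// MyTestMethod_index above with the Lucene.Net 3.0 QueryParser. The query string "lucene" and the
// method name are placeholders; the index path matches the one used above.
public void MyTestMethod_search()
{
    using (var indexDir = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(@"D:\Index")))
    using (var searcher = new IndexSearcher(indexDir, true)) // read-only searcher
    {
        var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "text", new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30));
        TopDocs hits = searcher.Search(parser.Parse("lucene"), 10);
        foreach (ScoreDoc sd in hits.ScoreDocs)
        {
            // "text" was stored with Field.Store.YES, so the original value can be read back
            Console.WriteLine(searcher.Doc(sd.Doc).Get("text"));
        }
    }
}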
public override void SetUp() { base.SetUp(); _dir = NewDirectory(); _indexWriter = new RandomIndexWriter(Random(), _dir, new MockAnalyzer(Random()), Similarity, TimeZone); FieldType ft = new FieldType(TextField.TYPE_STORED); ft.StoreTermVectors = true; ft.StoreTermVectorOffsets = true; ft.StoreTermVectorPositions = true; Analyzer analyzer = new MockAnalyzer(Random()); Document doc; for (int i = 0; i < 100; i++) { doc = new Document(); doc.Add(new Field(_idFieldName, Random().toString(), ft)); doc.Add(new Field(_textFieldName, new StringBuilder(Random().toString()).append(Random().toString()).append( Random().toString()).toString(), ft)); doc.Add(new Field(_classFieldName, Random().toString(), ft)); _indexWriter.AddDocument(doc, analyzer); } _indexWriter.Commit(); _originalIndex = SlowCompositeReaderWrapper.Wrap(_indexWriter.Reader); }
private static void AddTextToIndex(int txts, string text, IndexWriter writer) { Document doc = new Document(); doc.Add(new Field("id", txts.ToString(), Field.Store.YES, Field.Index.UN_TOKENIZED)); doc.Add(new Field("postBody", text, Field.Store.YES, Field.Index.TOKENIZED)); writer.AddDocument(doc); }
public virtual void TestPositionIncrementGap() { Analyzer analyzer = new AnonymousClassAnalyzer(this); IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); Document doc = new Document(); doc.Add(new Field("repeated", "repeated one", Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field("repeated", "repeated two", Field.Store.YES, Field.Index.ANALYZED)); writer.AddDocument(doc); writer.Commit(); SegmentInfo info = writer.NewestSegment(); writer.Close(); SegmentReader reader = SegmentReader.Get(true, info, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); TermPositions termPositions = reader.TermPositions(new Term("repeated", "repeated")); Assert.IsTrue(termPositions.Next()); int freq = termPositions.Freq; Assert.AreEqual(2, freq); Assert.AreEqual(0, termPositions.NextPosition()); Assert.AreEqual(502, termPositions.NextPosition()); }
// TODO: refactor call interface: way too many parameters to be legible. public void AddDocumentMetadata(bool is_deleted, string fingerprint, string title, string author, string year, string comment, string tag, string annotation, string bibtex, Utilities.BibTex.Parsing.BibTexItem bibtex_item) { Lucene.Net.Documents.Document document = null; // Create the document only if it is not to be deleted if (!is_deleted) { document = new Lucene.Net.Documents.Document(); document.Add(new Field("fingerprint", fingerprint, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); document.Add(new Field("page", "0", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); StringBuilder content_sb = new StringBuilder(); AddDocumentMetadata_SB(document, content_sb, "title", title); AddDocumentMetadata_SB(document, content_sb, "author", author); AddDocumentMetadata_SB(document, content_sb, "year", year); AddDocumentMetadata_SB(document, content_sb, "comment", comment); AddDocumentMetadata_SB(document, content_sb, "tag", tag); AddDocumentMetadata_SB(document, content_sb, "annotation", annotation); AddDocumentMetadata_SB(document, content_sb, "bibtex", bibtex); AddDocumentMetadata_BibTex(document, bibtex_item); string content = content_sb.ToString(); document.Add(new Field("content", content, Field.Store.NO, Field.Index.ANALYZED)); } AddDocumentPage_INTERNAL(fingerprint, 0, document); }
/// <summary>
/// Adds a product to the index, storing term vectors for all fields.
/// </summary>
/// <param name="p">The product to index</param>
/// <param name="writer">The index writer to add the document to</param>
private static void AddDocumentToIndex(Product p, IndexWriter writer)
{
    Document doc = new Document();
    doc.Add(new Field("Name", p.Name, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.YES));
    doc.Add(new Field("Origin", p.Origin.ToString(), Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.YES));
    doc.Add(new Field("Price", p.Price.ToString(), Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.YES));
    writer.AddDocument(doc);
}
protected override void AddSpecialFields(Document document, Item item) { Assert.ArgumentNotNull(document, "document"); Assert.ArgumentNotNull(item, "item"); document.Add(this.CreateTextField(BuiltinFields.Name, item.Name)); document.Add(this.CreateDataField(BuiltinFields.Name, item.Name)); this.DetectRemovalFilterAndProcess(document, item, "DisplayName", BuiltinFields.Name, (itm) => item.Appearance.DisplayName); this.DetectRemovalFilterValueField(document, item, "Icon", BuiltinFields.Icon, itm => itm.Appearance.Icon); this.DetectRemovalFilterAndProcess(document, item, "Creator", BuiltinFields.Creator, itm => itm.Statistics.CreatedBy); this.DetectRemovalFilterAndProcess(document, item, "Editor", BuiltinFields.Editor, itm => itm.Statistics.UpdatedBy); this.DetectRemovalFilterAndProcess(document, item, "AllTemplates", BuiltinFields.AllTemplates, this.GetAllTemplates); this.DetectRemovalFilterAndProcess(document, item, "TemplateName", BuiltinFields.TemplateName, itm => itm.TemplateName); if (this.DetectRemoval("Hidden")) { if (this.IsHidden(item)) { this.DetectRemovalFilterValueField(document, item, "Hidden", BuiltinFields.Hidden, itm => "1"); } } this.DetectRemovalFilterValueField(document, item, "Created", BuiltinFields.Created, itm => item[FieldIDs.Created]); this.DetectRemovalFilterValueField(document, item, "Updated", BuiltinFields.Updated, itm => item[FieldIDs.Updated]); this.DetectRemovalFilterAndProcess(document, item, "Path", BuiltinFields.Path, this.GetItemPath); this.DetectRemovalFilterAndProcess(document, item, "Links", BuiltinFields.Links, this.GetItemLinks); var tags = this.Tags; if (tags.Length > 0) { document.Add(this.CreateTextField(BuiltinFields.Tags, tags)); document.Add(this.CreateDataField(BuiltinFields.Tags, tags)); } }
public void CreateSearchIndex() { directory = new RAMDirectory(); analyzer = new StandardAnalyzer(Version.LUCENE_30); var ixw = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); LookupTable = new Dictionary<string, BaseContent>(); foreach (BaseContent p in Service.PoIs.ToList()) { var document = new Document(); document.Add(new Field("id", p.Id.ToString(), Field.Store.YES, Field.Index.NO, Field.TermVector.NO)); string all = p.Name + " "; foreach (MetaInfo mi in p.EffectiveMetaInfo) { string value; if (mi.Type != MetaTypes.text || !p.Labels.TryGetValue(mi.Label, out value)) continue; document.Add(new Field(mi.Label, value, Field.Store.YES, Field.Index.ANALYZED)); all += value + " "; } document.Add(new Field("All", all, Field.Store.YES, Field.Index.ANALYZED)); LookupTable[p.Id.ToString()] = p; ixw.AddDocument(document); } ixw.Commit(); }
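// Hedged sketch, not in the original: one way the catch-all "All" field built by CreateSearchIndex
// might be queried. Assumes the same `directory`, `analyzer`, and `LookupTable` members; the
// method name and hit limit are illustrative.
public IEnumerable<BaseContent> Search(string text)
{
    using (var searcher = new IndexSearcher(directory, true))
    {
        var parser = new QueryParser(Version.LUCENE_30, "All", analyzer);
        TopDocs hits = searcher.Search(parser.Parse(text), 25);
        foreach (ScoreDoc sd in hits.ScoreDocs)
        {
            string id = searcher.Doc(sd.Doc).Get("id"); // "id" is stored, so it round-trips
            if (LookupTable.ContainsKey(id))
                yield return LookupTable[id];
        }
    }
}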
public static Document readTXT(string path)
{
    Document doc = new Document();
    doc.Add(new Field("Path", path, Field.Store.YES, Field.Index.ANALYZED));
    doc.Add(new Field("Content", readText(path), Field.Store.YES, Field.Index.ANALYZED));
    return doc;
}
/*
 * Indexes a single FILE
 */
private static void BuildIndexFiles(string file, StandardAnalyzer analyzer, FSDirectory indexDir, IndexWriter indexWriter)
{
    StringBuilder toText = new StringBuilder();
    LDocument document;
    switch (getExtension(file))
    {
        case ".docx": toText = WordToText(file); break;
        case ".pdf": toText = PdfToText(file); break;
        case ".txt": toText = TxtToText(file); break;
    }

    // Index the file
    document = new LDocument();
    document.Add(new Field("Filename", file, Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("Path", file, Field.Store.YES, Field.Index.NOT_ANALYZED));
    document.Add(new Field("Content", toText.ToString(), Field.Store.YES, Field.Index.ANALYZED));
    indexWriter.AddDocument(document);
    indexWriter.Optimize();
    indexWriter.Flush(false, false, false);
}
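// The extraction helpers used above (WordToText, PdfToText, TxtToText) are not shown in this source.
// A hypothetical sketch of the plain-text one is below; the .docx and .pdf variants would need a
// document library (for example the Open XML SDK or a PDF text extractor).
private static StringBuilder TxtToText(string file)
{
    // Read the whole file at once; a streaming read would suit very large files better.
    return new StringBuilder(System.IO.File.ReadAllText(file));
}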
public void CreateIndex(Analyzer analyzer)
{
    FSDirectory fsDir = new SimpleFSDirectory(new DirectoryInfo(_indexerFolder));
    IndexWriter indexWriter = new IndexWriter(fsDir, analyzer, true, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
    string[] files = System.IO.Directory.GetFiles(_textFilesFolder, Config.FileSearchPattern, SearchOption.AllDirectories);
    foreach (string file in files)
    {
        string name = new FileInfo(file).Name;
        string content = File.ReadAllText(file);
        Document doc = new Document();
        doc.Add(new Field(Config.Field_Path, file, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field(Config.Field_Name, name, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field(Config.Field_Content, content, Field.Store.NO, Field.Index.ANALYZED));
        indexWriter.AddDocument(doc);
        Console.WriteLine("{0} - {1}", file, name);
    }
    indexWriter.Optimize();
    indexWriter.Dispose();
    Console.WriteLine("File count: {0}", files.Length);
}
private static void IndexIndicator(IndicatorMetadata indicatorMetadata, IEnumerable<IndicatorMetadataTextProperty> properties, IndexWriter writer) { Document doc = new Document(); doc.Add(new Field("id", indicatorMetadata.IndicatorId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); var text = indicatorMetadata.Descriptive; StringBuilder sb = new StringBuilder(); foreach (var indicatorMetadataTextProperty in properties) { var key = indicatorMetadataTextProperty.ColumnName; if (text.ContainsKey(key)) { sb.Append(text[key]); sb.Append(" "); } } doc.Add(new Field("IndicatorText", sb.ToString().ToLower(), Field.Store.NO, Field.Index.ANALYZED)); writer.AddDocument(doc); }
public static void IndexTopics(CSETWebEntities entity, IndexWriter writer) { foreach (CATALOG_RECOMMENDATIONS_DATA data in entity.CATALOG_RECOMMENDATIONS_DATA) { Lucene.Net.Documents.Document lucDoc = new Lucene.Net.Documents.Document(); string text = ""; text += " " + data.Heading + " " + data.Requirement + " " + data.Supplemental_Guidance + " " + data.Enhancement; lucDoc.Add(new Field(FieldNames.SHORT_NAME, data.Topic_Name, Field.Store.YES, Field.Index.ANALYZED)); lucDoc.Add(new Field(FieldNames.TEXT, text, Field.Store.YES, Field.Index.ANALYZED)); lucDoc.Add(new Field(FieldNames.RESOURCE_TYPE, ResourceTypeEnum.Catalog_Recommendation.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); lucDoc.Add(new Field(FieldNames.DOC_ID, data.Data_Id.ToString(), Field.Store.YES, Field.Index.NO)); writer.AddDocument(lucDoc); } foreach (PROCUREMENT_LANGUAGE_DATA data in entity.PROCUREMENT_LANGUAGE_DATA) { Lucene.Net.Documents.Document lucDoc = new Lucene.Net.Documents.Document(); string text = ""; text += " " + data.Basis + " " + data.Language_Guidance + " " + data.Procurement_Language + " " + data.Fatmeasures + " " + data.Satmeasures + " " + data.Maintenance_Guidance; lucDoc.Add(new Field(FieldNames.SHORT_NAME, data.Topic_Name, Field.Store.YES, Field.Index.ANALYZED)); lucDoc.Add(new Field(FieldNames.TEXT, text, Field.Store.YES, Field.Index.ANALYZED)); lucDoc.Add(new Field(FieldNames.RESOURCE_TYPE, ResourceTypeEnum.Procurement_Language.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); lucDoc.Add(new Field(FieldNames.DOC_ID, data.Procurement_Id.ToString(), Field.Store.YES, Field.Index.NO)); writer.AddDocument(lucDoc); } }
public virtual void TestMixedTermVectorSettingsSameField() { Document doc = new Document(); // f1 first without tv then with tv doc.Add(new Field("f1", "v1", Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.NO)); doc.Add(new Field("f1", "v2", Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS)); // f2 first with tv then without tv doc.Add(new Field("f2", "v1", Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS)); doc.Add(new Field("f2", "v2", Field.Store.YES, Field.Index.NOT_ANALYZED, TermVector.NO)); IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED); writer.AddDocument(doc); writer.Close(); _TestUtil.CheckIndex(dir); IndexReader reader = IndexReader.Open(dir, true); // f1 ITermFreqVector tfv1 = reader.GetTermFreqVector(0, "f1"); Assert.IsNotNull(tfv1); Assert.AreEqual(2, tfv1.GetTerms().Length, "the 'with_tv' setting should rule!"); // f2 ITermFreqVector tfv2 = reader.GetTermFreqVector(0, "f2"); Assert.IsNotNull(tfv2); Assert.AreEqual(2, tfv2.GetTerms().Length, "the 'with_tv' setting should rule!"); }
public void CreateIndex() { Analyzer analyzer = new MockAnalyzer(Random()); IndexWriter writer = new IndexWriter (dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); try { for (int docid = 0; docid < NUM_DOCS; docid++) { Document d = new Document(); d.Add(NewStringField("docid", "" + docid, Field.Store.YES)); d.Add(NewStringField("never_load", "fail", Field.Store.YES)); foreach (string f in FIELDS) { for (int val = 0; val < NUM_VALUES; val++) { d.Add(NewStringField(f, docid + "_" + f + "_" + val, Field.Store.YES)); } } d.Add(NewStringField("load_later", "yes", Field.Store.YES)); writer.AddDocument(d); } } finally { writer.Dispose(); } }
public virtual void TestLUCENE_1590() { Document doc = new Document(); // f1 has no norms doc.Add(new Field("f1", "v1", Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)); doc.Add(new Field("f1", "v2", Field.Store.YES, Field.Index.NO)); // f2 has no TF Field f = new Field("f2", "v1", Field.Store.NO, Field.Index.ANALYZED); f.OmitTermFreqAndPositions = true; doc.Add(f); doc.Add(new Field("f2", "v2", Field.Store.YES, Field.Index.NO)); IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED); writer.AddDocument(doc); writer.Optimize(); // be sure to have a single segment writer.Close(); _TestUtil.CheckIndex(dir); SegmentReader reader = SegmentReader.GetOnlySegmentReader(dir); FieldInfos fi = reader.FieldInfos(); // f1 Assert.IsFalse(reader.HasNorms("f1"), "f1 should have no norms"); Assert.IsFalse(fi.FieldInfo("f1").omitTermFreqAndPositions_ForNUnit, "omitTermFreqAndPositions field bit should not be set for f1"); // f2 Assert.IsTrue(reader.HasNorms("f2"), "f2 should have norms"); Assert.IsTrue(fi.FieldInfo("f2").omitTermFreqAndPositions_ForNUnit, "omitTermFreqAndPositions field bit should be set for f2"); }
private IndexWriter InitIndex(IConcurrentMergeScheduler scheduler, Random random, MockDirectoryWrapper dir, bool initialCommit) { dir.LockFactory = NoLockFactory.DoNoLockFactory; scheduler.SetSuppressExceptions(); IndexWriter writer = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)) .SetMaxBufferedDocs(10) .SetMergeScheduler(scheduler)); if (initialCommit) { writer.Commit(); } Document doc = new Document(); doc.Add(NewTextField("content", "aaa", Field.Store.NO)); doc.Add(NewTextField("id", "0", Field.Store.NO)); for (int i = 0; i < 157; i++) { writer.AddDocument(doc); } return writer; }
public void IndexDocuments(IEnumerable<Document> documents)
{
    try
    {
        var analyzer = new StandardAnalyzer(Version.LUCENE_30);
        bool createIndexFiles = !LuceneDirectory.FileExists("segments.gen");
        using (var writer = new IndexWriter(LuceneDirectory, analyzer, createIndexFiles, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED))
        {
            try
            {
                foreach (var document in documents.Where(d => d.FileExtension == ".pdf"))
                {
                    string documentBody = GetPlainTextFromDocument(document);
                    var doc = new LuceneDocument();
                    doc.Add(new Field("FileID", document.FileID.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                    doc.Add(new Field("Title", document.Title, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("Body", documentBody, Field.Store.YES, Field.Index.ANALYZED));
                    writer.AddDocument(doc);
                }
                writer.Optimize();
            }
            catch
            {
                // Indexing failures are swallowed here; consider logging them instead.
            }
            finally
            {
                analyzer.Close();
            }
        }
    }
    catch
    {
        // Failures opening the index are swallowed here; consider logging them instead.
    }
}
private static void AddDocuments(IndexWriter writer) { var pages = PagesMetadata.Instance; var posts = PostsMetadata.Instance; foreach (var page in pages.List) { var doc = new Document(); doc.Add(new Field("Url", "/" + page.Slug, Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.Add(new Field("Title", page.Title, Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field("Body", new Page(page.Slug, pages).BodyWithoutHtml, Field.Store.YES, Field.Index.ANALYZED)); writer.AddDocument(doc); } foreach (var post in posts.List) { var doc = new Document(); doc.Add(new Field("Url", "/blog/" + post.Slug, Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.Add(new Field("Title", post.Title, Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field("Description", post.ShortDescription, Field.Store.YES, Field.Index.ANALYZED)); if (post.PublishDate != DateTime.MinValue) doc.Add(new Field("PublishDate", post.PublishDate.ToString("dd MMMM yyyy"), Field.Store.YES, Field.Index.NOT_ANALYZED)); if (post.LastUpdatedDate != DateTime.MinValue) doc.Add(new Field("LastUpdatedDate", post.LastUpdatedDate.ToString("dd MMMM yyyy"), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.Add(new Field("Author", post.Author, Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field("Body", new Post(post.Slug, posts).BodyWithoutHtml, Field.Store.YES, Field.Index.ANALYZED)); writer.AddDocument(doc); } }
public override void WriteEndVersion(Process process, AbstractConnection input, Entity entity, bool force = false) { if (entity.Updates + entity.Inserts <= 0 && !force) return; var versionType = entity.Version == null ? "string" : entity.Version.SimpleType; var end = entity.End ?? new DefaultFactory(Logger).Convert(entity.End, versionType); using (var dir = LuceneDirectoryFactory.Create(this, TflBatchEntity(entity.ProcessName))) { using (var writer = new IndexWriter(dir, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED)) { var doc = new Document(); doc.Add(new NumericField("id", global::Lucene.Net.Documents.Field.Store.YES, true).SetIntValue(entity.TflBatchId)); doc.Add(new global::Lucene.Net.Documents.Field("process", entity.ProcessName, global::Lucene.Net.Documents.Field.Store.YES, global::Lucene.Net.Documents.Field.Index.NOT_ANALYZED_NO_NORMS)); doc.Add(new global::Lucene.Net.Documents.Field("connection", input.Name, global::Lucene.Net.Documents.Field.Store.YES, global::Lucene.Net.Documents.Field.Index.NOT_ANALYZED_NO_NORMS)); doc.Add(new global::Lucene.Net.Documents.Field("entity", entity.Alias, global::Lucene.Net.Documents.Field.Store.YES, global::Lucene.Net.Documents.Field.Index.NOT_ANALYZED_NO_NORMS)); doc.Add(new NumericField("updates", global::Lucene.Net.Documents.Field.Store.YES, true).SetLongValue(entity.Updates)); doc.Add(new NumericField("inserts", global::Lucene.Net.Documents.Field.Store.YES, true).SetLongValue(entity.Inserts)); doc.Add(new NumericField("deletes", global::Lucene.Net.Documents.Field.Store.YES, true).SetLongValue(entity.Deletes)); doc.Add(LuceneWriter.CreateField("version", versionType, new SearchType { Analyzer = "keyword" }, end)); doc.Add(new global::Lucene.Net.Documents.Field("version_type", versionType, global::Lucene.Net.Documents.Field.Store.YES, global::Lucene.Net.Documents.Field.Index.NOT_ANALYZED_NO_NORMS)); doc.Add(new NumericField("tflupdate", global::Lucene.Net.Documents.Field.Store.YES, true).SetLongValue(DateTime.UtcNow.Ticks)); writer.AddDocument(doc); writer.Commit(); writer.Optimize(); } } }
public virtual void TestBadPrefixTreePrune() { trie = new QuadPrefixTree(ctx, 12); TermQueryPrefixTreeStrategy strategy = new TermQueryPrefixTreeStrategy(trie, "geo"); Document doc = new Document(); doc.Add(new TextField("id", "1", Field.Store.YES)); IShape area = ctx.MakeRectangle(-122.82, -122.78, 48.54, 48.56); Field[] fields = strategy.CreateIndexableFields(area, 0.025); foreach (Field field in fields) { doc.Add(field); } AddDocument(doc); IPoint upperleft = ctx.MakePoint(-122.88, 48.54); IPoint lowerright = ctx.MakePoint(-122.82, 48.62); Query query = strategy.MakeQuery(new SpatialArgs(SpatialOperation.Intersects, ctx.MakeRectangle(upperleft, lowerright))); Commit(); TopDocs search = indexSearcher.Search(query, 10); ScoreDoc[] scoreDocs = search.ScoreDocs; foreach (ScoreDoc scoreDoc in scoreDocs) { Console.WriteLine(indexSearcher.Doc(scoreDoc.Doc)); } assertEquals(1, search.TotalHits); }
/// <summary>
/// Creates an index document
/// </summary>
/// <param name="dic"></param>
public void AddLuceneIndex(Dictionary<string, string> dic)
{
    //var analyzer = new StandardAnalyzer(Version.LUCENE_30);
    var analyzer = GetAnalyzer();
    using (var directory = GetLuceneDirectory())
    using (var writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        var doc = new Document();
        foreach (KeyValuePair<string, string> pair in dic)
        {
            // Add a new index entry.
            // Field.Store.YES: stores the original value; only then can it be read back later with doc.Get("number").
            // Field.Index.NOT_ANALYZED: indexes the value without tokenizing it.
            //todo: boost
            if (NotAnalyzeFields.Exists(one => one == pair.Key))
            {
                doc.Add(new Field(pair.Key, pair.Value, Field.Store.YES, Field.Index.NOT_ANALYZED));
            }
            else
            {
                doc.Add(new Field(pair.Key, pair.Value, Field.Store.YES, Field.Index.ANALYZED));
            }
        }
        //doc.Boost
        writer.AddDocument(doc);
        writer.Commit();
        writer.Optimize();
        analyzer.Close();
    }
}
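// Hedged companion sketch, not in the original: reading a stored value back out of the index built
// by AddLuceneIndex. Assumes GetLuceneDirectory() opens the same directory and that `key` names one
// of the NOT_ANALYZED fields (for example "number" from the comment above).
public string GetStoredValue(string key, string value, string fieldToReturn)
{
    using (var directory = GetLuceneDirectory())
    using (var searcher = new IndexSearcher(directory, true))
    {
        // An exact match against a NOT_ANALYZED field is a raw TermQuery; no analyzer is involved.
        TopDocs hits = searcher.Search(new TermQuery(new Term(key, value)), 1);
        return hits.TotalHits > 0 ? searcher.Doc(hits.ScoreDocs[0].Doc).Get(fieldToReturn) : null;
    }
}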
public void CreateIndex()
{
    IProductService productService = new ProductService();
    int count = productService.GetProductCount(string.Empty);
    var data = productService.GetProducts(count, 1, string.Empty);

    // Switch to the multi-file index format. The default (true) builds a compound-file index;
    // set it to false to generate the multi-file structure, which is easier to inspect.
    //this.indexWriter.SetUseCompoundFile(false);

    foreach (var productInfo in data)
    {
        var doc = new Document();
        // Add fields to the document
        var field1 = new Field("title", productInfo.Title, Field.Store.YES, Field.Index.ANALYZED);
        doc.Add(field1);
        field1 = new Field("Category", productInfo.CategoryName, Field.Store.YES, Field.Index.ANALYZED);
        doc.Add(field1);
        field1 = new Field("Desc", productInfo.Desc ?? "", Field.Store.YES, Field.Index.ANALYZED);
        doc.Add(field1);
        this.indexWriter.AddDocument(doc);
    }

    // Optimize the index structure
    this.indexWriter.Optimize();
    this.indexWriter.Commit();
    // Close the writer
    this.indexWriter.Close();
}
public Engine() { var directory = new RAMDirectory(); var analyzer = new StandardAnalyzer(Version.LUCENE_30); using (var indexWriter = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED)) { for (int i = 0; i < 10000; i++) { Console.Write("."); var document = new Document(); document.Add(new Field("Id", i.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); document.Add(new Field("Name", "Name" + i.ToString(), Field.Store.YES, Field.Index.ANALYZED)); indexWriter.AddDocument(document); } } Console.ReadKey(); var queryParser = new QueryParser(Version.LUCENE_30, "Name", analyzer); var query = queryParser.Parse("Name37~"); IndexReader indexReader = IndexReader.Open(directory, true); var searcher = new IndexSearcher(indexReader); TopDocs resultDocs = searcher.Search(query, indexReader.MaxDoc); }
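// Hedged follow-up sketch, not in the original: the Engine constructor above runs the fuzzy query
// "Name37~" but never reads the results. Continuing in the same scope, the hits could be
// materialized like this using the stored "Id" and "Name" fields.
foreach (ScoreDoc scoreDoc in resultDocs.ScoreDocs)
{
    var hit = searcher.Doc(scoreDoc.Doc);
    // A fuzzy query matches near spellings, so "Name37", "Name3", "Name370" and the like can all appear.
    Console.WriteLine("{0}: {1} (score {2})", hit.Get("Id"), hit.Get("Name"), scoreDoc.Score);
}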
public void Set(string name, object value, Document document, Field.Store store, Field.Index index, float? boost)
{
    DateTime date = (DateTime)value;
    int year = date.Year;
    int month = date.Month;
    int day = date.Day;

    // set year
    Field field = new Field(name + ".year", year.ToString(), store, index);
    if (boost != null)
    {
        field.SetBoost(boost.Value);
    }
    document.Add(field);

    // set month, zero-padded to two digits so lexicographic order matches numeric order
    field = new Field(name + ".month", month.ToString("D2"), store, index);
    if (boost != null)
    {
        field.SetBoost(boost.Value);
    }
    document.Add(field);

    // set day, likewise zero-padded
    field = new Field(name + ".day", day.ToString("D2"), store, index);
    if (boost != null)
    {
        field.SetBoost(boost.Value);
    }
    document.Add(field);
}
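// Hedged sketch, not in the original: because the year/month/day subfields above are zero-padded
// strings, lexicographic order matches chronological order, and a range can be expressed with
// TermRangeQuery (Lucene.Net 3.x API). The field name "published" is a placeholder.
Query YearRange(string name, int fromYear, int toYear)
{
    // Inclusive range over the ".year" subfield, e.g. published.year:[2010 TO 2015]
    return new TermRangeQuery(name + ".year", fromYear.ToString(), toYear.ToString(), true, true);
}
// Usage: searcher.Search(YearRange("published", 2010, 2015), 10);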
public Document BuildRecord() { var doc = new Document(); var numericField = new NumericField("DatabaseID", Field.Store.YES, false); numericField.SetIntValue(Email.ID); doc.Add(numericField); var field = new Field("UniqueID", UniqueID, Field.Store.YES, Field.Index.NOT_ANALYZED); doc.Add(field); field = new Field("Title", Title, Field.Store.YES, Field.Index.NOT_ANALYZED); doc.Add(field); field = new Field("Description", Description, Field.Store.YES, Field.Index.NOT_ANALYZED); doc.Add(field); field = new Field("Type", Type, Field.Store.YES, Field.Index.ANALYZED); doc.Add(field); /* field = new Field("Name", EventDescription.Name, Field.Store.YES, Field.Index.ANALYZED); doc.Add(field);*/ return doc; }
static void AddDocument(int id, string title, IndexWriter writer) { Document doc = new Document(); doc.Add(new Field("id", id.ToString(), Field.Store.YES, Field.Index.UN_TOKENIZED)); doc.Add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED)); writer.AddDocument(doc); }
public virtual void TestRollbackIntegrityWithBufferFlush() { Directory dir = NewDirectory(); RandomIndexWriter rw = new RandomIndexWriter(Random(), dir, Similarity, TimeZone); for (int i = 0; i < 5; i++) { Document doc = new Document(); doc.Add(NewStringField("pk", Convert.ToString(i), Field.Store.YES)); rw.AddDocument(doc); } rw.Dispose(); // If buffer size is small enough to cause a flush, errors ensue... IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMaxBufferedDocs(2).SetOpenMode(IndexWriterConfig.OpenMode_e.APPEND)); for (int i = 0; i < 3; i++) { Document doc = new Document(); string value = Convert.ToString(i); doc.Add(NewStringField("pk", value, Field.Store.YES)); doc.Add(NewStringField("text", "foo", Field.Store.YES)); w.UpdateDocument(new Term("pk", value), doc); } w.Rollback(); IndexReader r = DirectoryReader.Open(dir); Assert.AreEqual(5, r.NumDocs, "index should contain same number of docs post rollback"); r.Dispose(); dir.Dispose(); }
public void AddOrUpdateDocuments(params CmsDocument[] documents) { DeleteDocuments(documents); using (var writer = new IndexWriter(_Directory, _Analyzer, false, new IndexWriter.MaxFieldLength(1024 * 1024 * 4))) { foreach (var document in documents) { if (document.Id == Guid.Empty) throw new ArgumentOutOfRangeException("Attempt to index transient document: " + document.Title); var doc = new Document(); doc.Add(new Field(CmsDocumentField.Id.ToString(), document.Id.ToString("b"), Field.Store.YES, Field.Index.NOT_ANALYZED)); if (!String.IsNullOrEmpty(document.Title)) doc.Add(new Field(CmsDocumentField.Title.ToString(), document.Title, Field.Store.YES, Field.Index.ANALYZED)); foreach (var tag in document.Tags) { doc.Add(new Field(CmsDocumentField.Tag.ToString(), tag, Field.Store.YES, Field.Index.ANALYZED)); } foreach (var partValue in document.Parts.Select(p => p.Value)) { if(!String.IsNullOrEmpty(partValue)) doc.Add(new Field(CmsDocumentField.Value.ToString(), partValue, Field.Store.NO, Field.Index.ANALYZED)); } writer.AddDocument(doc); } writer.Flush(true, true, true); } }
public void TestMax() { Directory dir = NewDirectory(); RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, Similarity, TimeZone); Document doc = new Document(); doc.Add(new SortedSetDocValuesField("value", new BytesRef("foo"))); doc.Add(new SortedSetDocValuesField("value", new BytesRef("bar"))); doc.Add(NewStringField("id", "1", Field.Store.YES)); writer.AddDocument(doc); doc = new Document(); doc.Add(new SortedSetDocValuesField("value", new BytesRef("baz"))); doc.Add(NewStringField("id", "2", Field.Store.YES)); writer.AddDocument(doc); IndexReader ir = writer.Reader; writer.Dispose(); // slow wrapper does not support random access ordinals (there is no need for that!) IndexSearcher searcher = NewSearcher(ir, false); Sort sort = new Sort(new SortedSetSortField("value", false, Selector.MAX)); TopDocs td = searcher.Search(new MatchAllDocsQuery(), 10, sort); assertEquals(2, td.TotalHits); // 'baz' comes before 'foo' assertEquals("2", searcher.Doc(td.ScoreDocs[0].Doc).Get("id")); assertEquals("1", searcher.Doc(td.ScoreDocs[1].Doc).Get("id")); assertNoFieldCaches(); ir.Dispose(); dir.Dispose(); }
public void CreateIndex(List<ISearchEntity> CreateEntities)
{
    Analyzer analyzer = new StandardAnalyzer();
    // Third parameter: whether to recreate the index. True always clears it and rebuilds from
    // scratch; False appends to the existing index incrementally.
    IndexWriter writer = new IndexWriter(ConfigElement.IndexDirectory, analyzer, true);
    foreach (ISearchEntity IndexEntity in CreateEntities)
    {
        ProductModel product = (ProductModel)IndexEntity;
        Document doc = new Document();
        doc.Add(new Field("productid", Convert.ToString(product.EntityIdentity), Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.Add(new Field("productname", Convert.ToString(product.ProductName), Field.Store.YES, Field.Index.TOKENIZED));
        doc.Add(new Field("cateid", Convert.ToString(product.CategoryID), Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.Add(new Field("catepath", Convert.ToString(product.CategoryPath), Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.Add(new Field("keywords", Convert.ToString(product.Keywords), Field.Store.YES, Field.Index.TOKENIZED));
        doc.Add(new Field("description", Convert.ToString(product.Description), Field.Store.YES, Field.Index.TOKENIZED));
        doc.Add(new Field("price", Convert.ToString(product.Price), Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.Add(new Field("createtime", Convert.ToString(product.CreateTime), Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.Add(new Field("updatetime", Convert.ToString(product.UpdateTime), Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.Add(new Field("mainimage", Convert.ToString(product.ProductImage), Field.Store.YES, Field.Index.UN_TOKENIZED));
        writer.AddDocument(doc);
        Console.WriteLine("created index for {0}:{1}", product.EntityIdentity, product.ProductName);
    }
    writer.Optimize();
    writer.Close();
}
private void AddDoc(IndexWriter iw, int i) { Document d = new Document(); IFieldable f; int scoreAndID = i + 1; f = new Field(ID_FIELD, Id2String(scoreAndID), Field.Store.YES, Field.Index.NOT_ANALYZED); // for debug purposes f.OmitNorms = true; d.Add(f); f = new Field(TEXT_FIELD, "text of doc" + scoreAndID + TextLine(i), Field.Store.NO, Field.Index.ANALYZED); // for regular search f.OmitNorms = true; d.Add(f); f = new Field(INT_FIELD, "" + scoreAndID, Field.Store.NO, Field.Index.NOT_ANALYZED); // for function scoring f.OmitNorms = true; d.Add(f); f = new Field(FLOAT_FIELD, scoreAndID + ".000", Field.Store.NO, Field.Index.NOT_ANALYZED); // for function scoring f.OmitNorms = true; d.Add(f); iw.AddDocument(d); Log("added: " + d); }
private void AddDoc(IndexWriter writer, String name, String id) { Document doc = new Document(); doc.Add(new Field("name", name, Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field("id", id, Field.Store.YES, Field.Index.ANALYZED)); writer.AddDocument(doc); }
// This method indexes the given text, replacing any previous document with the same id.
private static void AddToIndex(int id, string text, IndexWriter writer)
{
    Term term = new Term("id", id.ToString());
    Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
    doc.Add(new Field("id", id.ToString(), Field.Store.YES, Field.Index.ANALYZED));
    doc.Add(new Field("mainText", text, Field.Store.YES, Field.Index.ANALYZED));
    // UpdateDocument atomically deletes any document matching `term` and then adds `doc`,
    // so a separate AddDocument call would only index the document twice.
    writer.UpdateDocument(term, doc);
}
public void CreateIndex(Analyzer analyzer)
{
    FSDirectory fsDir = new SimpleFSDirectory(new DirectoryInfo(_indexerFolder));
    IndexWriter indexWriter = new IndexWriter(fsDir, analyzer, true, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
    Stopwatch stopWatch = Stopwatch.StartNew();
    int analyzedCount = 0;
    string[] files = System.IO.Directory.GetFiles(_textFilesFolder, this._fileSearchPattern, SearchOption.AllDirectories);

    // Count the total number of pages that need indexing
    int totalPages = GetTotalPages(files);
    WriteLog("Total pages statistics takes {0}ms", stopWatch.ElapsedMilliseconds);
    stopWatch.Restart();

    // Start indexing
    foreach (string pdfFile in files)
    {
        var fileInfo = new FileInfo(pdfFile);
        var fileName = fileInfo.Name;
        Aspose.Pdf.Document pdfDocument = new Aspose.Pdf.Document(pdfFile);
        WriteLog("Current file is {0}", pdfFile);
        // Note: PDF page numbers start at 1
        for (int i = 1; i <= pdfDocument.Pages.Count; i++)
        {
            Page page = pdfDocument.Pages[i];
            // Use a fresh absorber per page so Text holds only this page's content
            TextAbsorber textAbsorber = new TextAbsorber();
            page.Accept(textAbsorber);
            string pageContent = textAbsorber.Text;
            Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
            doc.Add(new Field(LuceneConfig.Field_Path, pdfFile, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field(LuceneConfig.Field_FileName, fileName, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field(LuceneConfig.Field_PageNumber, i.ToString(), Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field(LuceneConfig.Field_ContentByPage, pageContent, Field.Store.NO, Field.Index.ANALYZED));
            indexWriter.AddDocument(doc);
            analyzedCount++;
            RaiseProgressChanged(analyzedCount * 100 / totalPages);
        }
    }
    indexWriter.Optimize();
    indexWriter.Dispose();
    stopWatch.Stop();
    Console.WriteLine("All completed. It takes {0}ms", stopWatch.ElapsedMilliseconds);
}
private void AddNoProxDoc(IndexWriter writer) { Document doc = new Document(); Field f = new Field("content3", "aaa", Field.Store.YES, Field.Index.ANALYZED); f.OmitTermFreqAndPositions = true; doc.Add(f); f = new Field("content4", "aaa", Field.Store.YES, Field.Index.NO); f.OmitTermFreqAndPositions = true; doc.Add(f); writer.AddDocument(doc); }
private void AddDoc(IndexWriter writer, int id) { Document doc = new Document(); doc.Add(new Field("content", "aaa", Field.Store.NO, Field.Index.ANALYZED)); doc.Add(new Field("id", System.Convert.ToString(id), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.Add(new Field("autf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.Add(new Field("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.Add(new Field("content2", "here is more content with aaa aaa aaa", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.Add(new Field("fie\u2C77ld", "field with non-ascii name", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); /* This was used in 2.9 to generate an index with compressed field: * if (id % 2 == 0) * { * doc.Add(new Field("compressed", TEXT_TO_COMPRESS, Field.Store.COMPRESS, Field.Index.NOT_ANALYZED)); * doc.Add(new Field("compressedSize", System.Convert.ToString(TEXT_COMPRESSED_LENGTH), Field.Store.YES, Field.Index.NOT_ANALYZED)); * } * else * { * doc.Add(new Field("compressed", BINARY_TO_COMPRESS, Field.Store.COMPRESS)); * doc.Add(new Field("compressedSize", System.Convert.ToString(BINARY_COMPRESSED_LENGTH), Field.Store.YES, Field.Index.NOT_ANALYZED)); * }*/ // Add numeric fields, to test if flex preserves encoding doc.Add(new NumericField("trieInt", 4).SetIntValue(id)); doc.Add(new NumericField("trieLong", 4).SetLongValue(id)); writer.AddDocument(doc); }
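// Hedged sketch, not in the original: the trie fields above are written with precisionStep 4, so a
// NumericRangeQuery over them should be built with the same step (Lucene.Net 3.x API).
Query TrieIntRange(int lower, int upper)
{
    // Inclusive range over "trieInt"; the precisionStep argument (4) mirrors the NumericField above
    return NumericRangeQuery.NewIntRange("trieInt", 4, lower, upper, true, true);
}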
/// <summary>
/// Adds a single item to the index
/// </summary>
/// <param name="data"></param>
/// <param name="writer"></param>
private static void AddToLuceneIndex(SampleData data, IndexWriter writer)
{
    // Remove any older entry for this Id first, then add the fresh document
    var searchQuery = new TermQuery(new Term("Id", data.Id.ToString()));
    writer.DeleteDocuments(searchQuery);
    var doc = new Lucene.Net.Documents.Document();
    doc.Add(new Field("Id", data.Id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field("Name", data.Name, Field.Store.YES, Field.Index.ANALYZED));
    doc.Add(new Field("Description", data.Description, Field.Store.YES, Field.Index.ANALYZED));
    writer.AddDocument(doc);
}
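// Hedged aside, not in the original: the delete-then-add pattern above (which appears again in
// _addToLuceneIndex below) can be collapsed into one atomic call when the key field is
// NOT_ANALYZED, since IndexWriter.UpdateDocument performs the delete and the add together:
private static void AddOrReplaceInLuceneIndex(SampleData data, IndexWriter writer)
{
    var doc = new Lucene.Net.Documents.Document();
    doc.Add(new Field("Id", data.Id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field("Name", data.Name, Field.Store.YES, Field.Index.ANALYZED));
    doc.Add(new Field("Description", data.Description, Field.Store.YES, Field.Index.ANALYZED));
    writer.UpdateDocument(new Term("Id", data.Id.ToString()), doc);
}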
public void AddDocumentPage(bool is_deleted, string fingerprint, int page, string content) { Lucene.Net.Documents.Document document = null; // Create the document only if it is not to be deleted if (!is_deleted) { document = new Lucene.Net.Documents.Document(); document.Add(new Field("fingerprint", fingerprint, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); document.Add(new Field("page", Convert.ToString(page), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); document.Add(new Field("content", content, Field.Store.NO, Field.Index.ANALYZED)); } AddDocumentPage_INTERNAL(fingerprint, page, document); }
public Document Create(SearchItem searchItem)
{
    var document = new Lucene.Net.Documents.Document();
    foreach (var keyword in searchItem.Keywords)
    {
        document.Add(this.CreateKeywordField(Constants.Search.Keyword, keyword));
    }
    document.Add(this.CreateStoredField(Constants.Search.EntityID, searchItem.EntityID.ToString()));
    document.Add(this.CreateIndexedTextField(Constants.Search.Abstract, searchItem.Abstract));
    document.Add(this.CreateIndexedTextField(Constants.Search.Text, searchItem.Text));
    return document;
}
public virtual void TestTokenReuse() { Analyzer analyzer = new AnonymousClassAnalyzer1(this); IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); Document doc = new Document(); doc.Add(new Field("f1", "a 5 a a", Field.Store.YES, Field.Index.ANALYZED)); writer.AddDocument(doc); writer.Commit(); SegmentInfo info = writer.NewestSegment(); writer.Close(); SegmentReader reader = SegmentReader.Get(true, info, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); TermPositions termPositions = reader.TermPositions(new Term("f1", "a")); Assert.IsTrue(termPositions.Next()); int freq = termPositions.Freq; Assert.AreEqual(3, freq); Assert.AreEqual(0, termPositions.NextPosition()); Assert.AreEqual(true, termPositions.IsPayloadAvailable); Assert.AreEqual(6, termPositions.NextPosition()); Assert.AreEqual(false, termPositions.IsPayloadAvailable); Assert.AreEqual(7, termPositions.NextPosition()); Assert.AreEqual(false, termPositions.IsPayloadAvailable); }
// Indexing... public void IndexText(List <Collection> collections) { foreach (Collection c in collections) { Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document(); // TODO: Enter code to index text Lucene.Net.Documents.Field field_DocID = new Lucene.Net.Documents.Field("DocID", c.DocID, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); doc.Add(field_DocID); Lucene.Net.Documents.Field field_Title = new Lucene.Net.Documents.Field("Title", c.Title, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); doc.Add(field_Title); doc.Add(new Lucene.Net.Documents.Field("Author", c.Author, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.Add(new Lucene.Net.Documents.Field("Bibliographic", c.Bibliographic, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.Add(new Lucene.Net.Documents.Field("Words", c.Words, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); writer.AddDocument(doc); } }
/// <summary> Adds the fields above to a document </summary> /// <param name="doc">The document to write /// </param> public static void SetupDoc(Document doc) { for (int i = 0; i < Fields.Length; i++) { doc.Add(Fields[i]); } }
public void MrsJones() { var dir = new RAMDirectory(); var analyzer = new LowerCaseKeywordAnalyzer(); var writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); var document = new Lucene.Net.Documents.Document(); document.Add(new Field("Name", "MRS. SHABA", Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)); writer.AddDocument(document); writer.Close(true); var searcher = new IndexSearcher(dir, true); var termEnum = searcher.GetIndexReader().Terms(); while (termEnum.Next()) { var buffer = termEnum.Term().Text(); Console.WriteLine(buffer); } var queryParser = new RangeQueryParser(Version.LUCENE_29, "", analyzer); var query = queryParser.Parse("Name:\"MRS. S*\""); Console.WriteLine(query); var result = searcher.Search(query, 10); Assert.NotEqual(0, result.TotalHits); }
void Index() { Lucene.Net.Index.IndexWriter wr = new Lucene.Net.Index.IndexWriter(dir, new Lucene.Net.Analysis.WhitespaceAnalyzer(), Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED); Lucene.Net.Documents.Document doc = null; Lucene.Net.Documents.Field f = null; doc = new Lucene.Net.Documents.Document(); f = new Lucene.Net.Documents.Field("field", "a b c d", Lucene.Net.Documents.Field.Store.NO, Lucene.Net.Documents.Field.Index.ANALYZED); doc.Add(f); wr.AddDocument(doc); doc = new Lucene.Net.Documents.Document(); f = new Lucene.Net.Documents.Field("field", "a b a d", Lucene.Net.Documents.Field.Store.NO, Lucene.Net.Documents.Field.Index.ANALYZED); doc.Add(f); wr.AddDocument(doc); doc = new Lucene.Net.Documents.Document(); f = new Lucene.Net.Documents.Field("field", "a b e f", Lucene.Net.Documents.Field.Store.NO, Lucene.Net.Documents.Field.Index.ANALYZED); doc.Add(f); wr.AddDocument(doc); doc = new Lucene.Net.Documents.Document(); f = new Lucene.Net.Documents.Field("field", "x y z", Lucene.Net.Documents.Field.Store.NO, Lucene.Net.Documents.Field.Index.ANALYZED); doc.Add(f); wr.AddDocument(doc); wr.Close(); }
public virtual void TestPreAnalyzedField() { IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); Document doc = new Document(); doc.Add(new Field("preanalyzed", new AnonymousClassTokenStream(this), TermVector.NO)); writer.AddDocument(doc); writer.Commit(); SegmentInfo info = writer.NewestSegment(); writer.Close(); SegmentReader reader = SegmentReader.Get(true, info, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); TermPositions termPositions = reader.TermPositions(new Term("preanalyzed", "term1")); Assert.IsTrue(termPositions.Next()); Assert.AreEqual(1, termPositions.Freq); Assert.AreEqual(0, termPositions.NextPosition()); termPositions.Seek(new Term("preanalyzed", "term2")); Assert.IsTrue(termPositions.Next()); Assert.AreEqual(2, termPositions.Freq); Assert.AreEqual(1, termPositions.NextPosition()); Assert.AreEqual(3, termPositions.NextPosition()); termPositions.Seek(new Term("preanalyzed", "term3")); Assert.IsTrue(termPositions.Next()); Assert.AreEqual(1, termPositions.Freq); Assert.AreEqual(2, termPositions.NextPosition()); }
// Activity 9 public void IndexText(string text) { // TODO: Enter code to index text Lucene.Net.Documents.Field field = new Lucene.Net.Documents.Field("text", text, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document(); doc.Add(field); writer.AddDocument(doc); }
private void EnsureWriterHasChanges() { var doc = new Lucene.Net.Documents.Document(); var field = new Lucene.Net.Documents.Field("Path", "/root/indexing_writinggapandgettingunprocessedactivitiesswithgap/fake", LucField.Store.YES, LucField.Index.NOT_ANALYZED, LucField.TermVector.NO); doc.Add(field); LuceneManager._writer.AddDocument(doc); }
public void constructor_should_convert_document_and_scoredoc_to_properties_and_parse_createdon_date() { // Arrange LuceneDocument document = new LuceneDocument(); document.Add(CreateField("id", "123")); document.Add(CreateField("title", "the title")); document.Add(CreateField("contentsummary", "the summary")); document.Add(CreateField("tags", "tag1 tag2")); document.Add(CreateField("createdby", "gandhi")); document.Add(CreateField("contentlength", "999")); document.Add(CreateField("createdon", DateTime.Today.ToString())); ScoreDoc scoreDoc = new ScoreDoc(0, 9.50f); // Act SearchResultViewModel model = new SearchResultViewModel(document, scoreDoc); // Assert Assert.That(model.Id, Is.EqualTo(123)); Assert.That(model.Title, Is.EqualTo("the title")); Assert.That(model.ContentSummary, Is.EqualTo("the summary")); Assert.That(model.Tags, Is.EqualTo("tag1 tag2")); Assert.That(model.CreatedBy, Is.EqualTo("gandhi")); Assert.That(model.ContentLength, Is.EqualTo(999)); Assert.That(model.CreatedOn, Is.EqualTo(DateTime.Today)); // only the date should be parsed Assert.That(model.Score, Is.EqualTo(9.50f)); }
/// <summary> /// Indexes the document. /// </summary> /// <typeparam name="T"></typeparam> /// <param name="document">The document.</param> /// <param name="indexName">Name of the index.</param> /// <param name="mappingType">Type of the mapping.</param> public override void IndexDocument <T>(T document, string indexName = null, string mappingType = null) { try { Type documentType = document.GetType(); if (indexName == null) { indexName = documentType.Name.ToLower(); } if (mappingType == null) { mappingType = documentType.Name.ToLower(); } if (!_indexes.ContainsKey(mappingType)) { CreateIndex(documentType); } var index = _indexes[mappingType]; Document doc = new Document(); foreach (var typeMappingProperty in index.MappingProperties.Values) { TextField textField = new TextField(typeMappingProperty.Name, documentType.GetProperty(typeMappingProperty.Name).GetValue(document, null).ToStringSafe().ToLower(), global::Lucene.Net.Documents.Field.Store.YES); textField.Boost = typeMappingProperty.Boost; doc.Add(textField); } IndexModelBase docIndexModelBase = document as IndexModelBase; string indexValue = LuceneID(mappingType, docIndexModelBase.Id); doc.AddStringField("type", mappingType, global::Lucene.Net.Documents.Field.Store.YES); doc.AddStringField("id", docIndexModelBase.Id.ToString(), global::Lucene.Net.Documents.Field.Store.YES); doc.AddStringField("index", indexValue, global::Lucene.Net.Documents.Field.Store.YES); // Stores all the properties as JSON to retrieve object on lookup. doc.AddStoredField("JSON", document.ToJson()); // Use the analyzer in fieldAnalyzers if that field is in that dictionary, otherwise use StandardAnalyzer. var analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer: new StandardAnalyzer(_matchVersion, new CharArraySet(_matchVersion, 0, true)), fieldAnalyzers: index.FieldAnalyzers); OpenWriter(); lock ( _lockWriter ) { if (_writer != null) { _writer.UpdateDocument(new Term("index", indexValue), doc, analyzer); // Must specify analyzer because the default analyzer that is specified in indexWriterConfig is null. } } } catch (Exception ex) { HttpContext context2 = HttpContext.Current; ExceptionLogService.LogException(ex, context2); } }
private static void _addToLuceneIndex(SampleData sampleData, IndexWriter writer) { // remove older index entry var searchQuery = new Lucene.Net.Search.TermQuery(new Term("Id", sampleData.Id.ToString())); writer.DeleteDocuments(searchQuery); // add new index entry var doc = new Lucene.Net.Documents.Document(); // add lucene fields mapped to db fields doc.Add(new Field("Id", sampleData.Id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.Add(new Field("Name", sampleData.Name, Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field("Description", sampleData.Description, Field.Store.YES, Field.Index.ANALYZED)); // add entry to index writer.AddDocument(doc); }
public Lucene.Net.Documents.Document CreateDocWith(string fileContent) { Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document(); string[] tags = { ".I", "\n.T\n", "\n.A\n", "\n.B\n", "\n.W\n" }; string[] splitedContentWithTags = fileContent.Split(tags, StringSplitOptions.None); // edit indexing method here doc.Add(new Field(DOCID_FN, splitedContentWithTags[0], Field.Store.NO, Field.Index.NO)); doc.Add(new Field(TITLE_FN, splitedContentWithTags[1], Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field(AUTHOR_FN, splitedContentWithTags[2], Field.Store.YES, Field.Index.ANALYZED)); doc.Add(new Field(BIBLIOGRAPHICINFORMATION_FN, splitedContentWithTags[4].Replace(splitedContentWithTags[1] + "\n", ""), // remove title from abstract Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.Add(new Field(ABSTRACT_FN, splitedContentWithTags[4], Field.Store.YES, Field.Index.ANALYZED)); return(doc); }
private SegmentInfo IndexDoc(IndexWriter writer, System.String fileName) { System.IO.FileInfo file = new System.IO.FileInfo(System.IO.Path.Combine(workDir.FullName, fileName)); Document doc = FileDocument.Document(file); doc.Add(new Field("contents", new System.IO.StreamReader(file.FullName))); writer.AddDocument(doc); writer.Commit(); return(writer.NewestSegment()); }
public void CreateIndex()
{
    var indexDirectory = FSDirectory.Open(new System.IO.DirectoryInfo(Settings.IndexLocation));
    var stdAnalyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
    var startTime = DateTime.Now;
    Console.WriteLine("Indexing Started at " + startTime.ToString());
    try
    {
        using (var indexWriter = new IndexWriter(indexDirectory, stdAnalyzer, true, IndexWriter.MaxFieldLength.UNLIMITED))
        {
            var files = System.IO.Directory.GetFiles(Settings.DataFileLocation, "*.pdf", System.IO.SearchOption.AllDirectories);
            foreach (var file in files)
            {
                using (var reader = new iTextSharp.text.pdf.PdfReader(file))
                {
                    var totPages = reader.NumberOfPages;
                    for (int pageNo = 1; pageNo <= totPages; pageNo++)
                    {
                        var document = new Lucene.Net.Documents.Document();
                        // Extract this page only; accumulating into one StringBuilder across pages
                        // would make each "content" field contain all previous pages as well.
                        var text = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, pageNo);
                        document.Add(new Field("file", file, Field.Store.YES, Field.Index.ANALYZED));
                        document.Add(new Field("pageno", pageNo.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                        document.Add(new Field("content", text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
                        indexWriter.AddDocument(document);
                    }
                }
            }
            // Optimize once at the end; doing it once per page is extremely slow
            indexWriter.Optimize();
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine("Failed to Index {0}", ex.StackTrace.ToString());
    }
    var endTime = DateTime.Now;
    Console.WriteLine("Indexing Completed at " + endTime.ToString());
}
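// Hedged companion sketch, not in the original: querying the per-page index built above and
// reporting the file and page number of each hit. Field names match those indexed in CreateIndex;
// the method name and hit limit are illustrative.
public void SearchPages(string term)
{
    var indexDirectory = FSDirectory.Open(new System.IO.DirectoryInfo(Settings.IndexLocation));
    using (var searcher = new IndexSearcher(indexDirectory, true))
    {
        var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "content", new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29));
        TopDocs hits = searcher.Search(parser.Parse(term), 20);
        foreach (ScoreDoc sd in hits.ScoreDocs)
        {
            var doc = searcher.Doc(sd.Doc);
            Console.WriteLine("{0} (page {1})", doc.Get("file"), doc.Get("pageno"));
        }
    }
}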