public override void OnIndexEntryCreated(string entryKey, Lucene.Net.Documents.Document document)
{
    lock (parent.DataTable)
    {
        parent.DataTable.Rows.Add(entryKey, document.GetField("Project").StringValue);
    }
}
public virtual void TestRAMDirectoryString()
{
    MockRAMDirectory ramDir = new MockRAMDirectory(indexDir.FullName);

    // Check size
    Assert.AreEqual(ramDir.SizeInBytes(), ramDir.GetRecomputedSizeInBytes());

    // Open a reader to verify the document count
    IndexReader reader = IndexReader.Open(ramDir);
    Assert.AreEqual(docsToAdd, reader.NumDocs());

    // Open a searcher to check that all docs are there
    IndexSearcher searcher = new IndexSearcher(reader);

    // Fetch every document
    for (int i = 0; i < docsToAdd; i++)
    {
        Document doc = searcher.Doc(i);
        Assert.IsTrue(doc.GetField("content") != null);
    }

    // Cleanup
    reader.Close();
    searcher.Close();
}
protected override unsafe Document DirectGet(Lucene.Net.Documents.Document input, string id, IState state)
{
    var reduceValue = input.GetField(Constants.Documents.Indexing.Fields.ReduceKeyValueFieldName).GetBinaryValue(state);

    // Pin the managed buffer and wrap it in a BlittableJsonReaderObject without copying.
    var result = new BlittableJsonReaderObject((byte*)_context.PinObjectAndGetAddress(reduceValue), reduceValue.Length, _context);

    return new Document
    {
        Data = result
    };
}
public static void Extract()
{
    if (LuceneService.DirReader != null)
    {
        // Linear scan over every document in the index; this does not scale well,
        // but it is the simplest way to run the NLP pipeline over the whole corpus.
        for (int i = 0; i < LuceneService.DirReader.MaxDoc; i++)
        {
            Lucene.Net.Documents.Document document = LuceneService.DirReader.Document(i);
            CoreDocument coredoc = GetAnnotatedDocument(document.GetField(ProjectInfo.TextFieldKey).GetStringValue());
            ExtractNERTags(coredoc, document);
            if (DetectQuestion(coredoc))
            {
                IsQuestionList.Add(document.GetField("id").GetInt32Value().Value);
            }
            ExtractKeyPhrases(coredoc, document.GetField("id").GetInt32Value().Value);
            System.Console.WriteLine(i); // progress indicator
        }
    }
}
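// A minimal defensive sketch (not part of the original code): Document.GetField returns
// null when a field is absent, and GetInt32Value() returns a nullable int, so chained
// calls like document.GetField("id").GetInt32Value().Value above throw if "id" is
// missing or non-numeric. The helper name TryGetInt32 is hypothetical.
private static bool TryGetInt32(Lucene.Net.Documents.Document document, string name, out int value)
{
    value = 0;
    var field = document.GetField(name);
    if (field == null)
        return false;

    int? stored = field.GetInt32Value();
    if (!stored.HasValue)
        return false;

    value = stored.Value;
    return true;
}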
public virtual void Test()
{
    Assert.IsTrue(dir != null);
    Assert.IsTrue(fieldInfos != null);
    try
    {
        FieldsReader reader = new FieldsReader(dir, "test", fieldInfos);
        Assert.IsTrue(reader != null);
        Assert.IsTrue(reader.Size() == 1);
        Document doc = reader.Doc(0);
        Assert.IsTrue(doc != null);
        Assert.IsTrue(doc.GetField("textField1") != null);
        Field field = doc.GetField("textField2");
        Assert.IsTrue(field != null);
        Assert.IsTrue(field.IsTermVectorStored());
        reader.Close();
    }
    catch (System.IO.IOException e)
    {
        System.Console.Error.WriteLine(e.StackTrace);
        Assert.Fail();
    }
}
protected override unsafe Document DirectGet(Lucene.Net.Documents.Document input, string id, DocumentFields fields, IState state)
{
    var reduceValue = input.GetField(Constants.Documents.Indexing.Fields.ReduceKeyValueFieldName).GetBinaryValue(state);

    // Unlike the pinning variant above, this copies the reduce value into an
    // unmanaged buffer that the resulting BlittableJsonReaderObject then owns.
    var allocation = _context.GetMemory(reduceValue.Length);

    UnmanagedWriteBuffer buffer = new UnmanagedWriteBuffer(_context, allocation);
    buffer.Write(reduceValue, 0, reduceValue.Length);

    var result = new BlittableJsonReaderObject(allocation.Address, reduceValue.Length, _context, buffer);

    return new Document
    {
        Data = result
    };
}
private bool VerifyIndex(Directory directory, int startAt)
{
    // Returns true if any document's stored "count" field does not match its
    // expected position in the index.
    bool fail = false;
    IndexReader reader = IndexReader.Open(directory, true, null);
    int max = reader.MaxDoc;
    for (int i = 0; i < max; i++)
    {
        Document temp = reader.Document(i, null);
        // Compare the stored "count" value to the document number it should hold.
        if (!temp.GetField("count").StringValue(null).Equals((i + startAt) + ""))
        {
            fail = true;
            System.Console.Out.WriteLine("Document " + (i + startAt) + " is returning document " + temp.GetField("count").StringValue(null));
        }
    }
    reader.Close();
    return fail;
}
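// Hypothetical call site (not in the original): note the inverted convention above,
// where VerifyIndex returns true on failure, so a passing test asserts the result is false.
Assert.IsFalse(VerifyIndex(dir, 0), "documents in the index do not match their expected positions");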
private static void ExtractNouns(CoreDocument coredoc, Lucene.Net.Documents.Document document)
{
    List<string> nouns = new List<string>();
    for (int i = 0; i < coredoc.sentences().size(); i++)
    {
        CoreSentence sent = (CoreSentence)coredoc.sentences().get(i);
        for (int j = 0; j < sent.tokens().size(); j++)
        {
            // Condition: the word is a noun (the Penn Treebank noun tags NN, NNS,
            // NNP, NNPS all contain "NN").
            if (sent.posTags() != null && sent.posTags().get(j) != null)
            {
                string posTags = sent.posTags().get(j).ToString();
                if (posTags.Contains("NN"))
                {
                    // Strip the trailing "-N" token-index suffix that CoreLabel's
                    // ToString() appends (note: this only handles single-digit indices).
                    var noun = sent.tokens().get(j).ToString();
                    noun = noun.Remove(noun.Length - 2);
                    nouns.Add(noun);
                }
            }
        }
    }
    NounPhrases.Add(document.GetField("id").GetInt32Value().Value, nouns);
}
public virtual void searchIndex(System.String dirName, System.String oldName)
{
    //QueryParser parser = new QueryParser("contents", new WhitespaceAnalyzer());
    //Query query = parser.parse("handle:1");

    dirName = FullDir(dirName);
    Directory dir = FSDirectory.Open(new System.IO.FileInfo(dirName));
    IndexSearcher searcher = new IndexSearcher(dir);
    IndexReader reader = searcher.GetIndexReader();

    _TestUtil.CheckIndex(dir);

    for (int i = 0; i < 35; i++)
    {
        if (!reader.IsDeleted(i))
        {
            Document d = reader.Document(i);
            System.Collections.IList fields = d.GetFields();
            if (!oldName.StartsWith("19.") && !oldName.StartsWith("20.") && !oldName.StartsWith("21.") && !oldName.StartsWith("22."))
            {
                if (d.GetField("content3") == null)
                {
                    Assert.AreEqual(5, fields.Count);

                    Field f = (Field)d.GetField("id");
                    Assert.AreEqual("" + i, f.StringValue());

                    f = (Field)d.GetField("utf8");
                    Assert.AreEqual("Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", f.StringValue());

                    f = (Field)d.GetField("autf8");
                    Assert.AreEqual("Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", f.StringValue());

                    f = (Field)d.GetField("content2");
                    Assert.AreEqual("here is more content with aaa aaa aaa", f.StringValue());

                    f = (Field)d.GetField("fie\u2C77ld");
                    Assert.AreEqual("field with non-ascii name", f.StringValue());
                }
            }
        }
        else
        {
            // Only ID 7 is deleted
            Assert.AreEqual(7, i);
        }
    }

    ScoreDoc[] hits = searcher.Search(new TermQuery(new Term("content", "aaa")), null, 1000).ScoreDocs;

    // First document should be #21 since its norm was increased:
    Document d2 = searcher.Doc(hits[0].doc);
    Assert.AreEqual("21", d2.Get("id"), "didn't get the right document first");

    TestHits(hits, 34, searcher.GetIndexReader());

    if (!oldName.StartsWith("19.") && !oldName.StartsWith("20.") && !oldName.StartsWith("21.") && !oldName.StartsWith("22."))
    {
        // Test on indices >= 2.3
        hits = searcher.Search(new TermQuery(new Term("utf8", "\u0000")), null, 1000).ScoreDocs;
        Assert.AreEqual(34, hits.Length);
        hits = searcher.Search(new TermQuery(new Term("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne")), null, 1000).ScoreDocs;
        Assert.AreEqual(34, hits.Length);
        hits = searcher.Search(new TermQuery(new Term("utf8", "ab\ud917\udc17cd")), null, 1000).ScoreDocs;
        Assert.AreEqual(34, hits.Length);
    }

    searcher.Close();
    dir.Close();
}
public void array_is_flat()
{
    var result = _document.GetField("ListOfItems").StringValue();
    result.ShouldEqual("One Two");
}
public virtual void TestIndexStoreCombos()
{
    MockRAMDirectory dir = new MockRAMDirectory();
    IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
    byte[] b = new byte[50];
    for (int i = 0; i < 50; i++)
        b[i] = (byte)(i + 77);

    Document doc = new Document();
    Field f = new Field("binary", b, 10, 17, Field.Store.YES);
    f.SetTokenStream(new WhitespaceTokenizer(new System.IO.StringReader("doc1field1")));
    Field f2 = new Field("string", "value", Field.Store.YES, Field.Index.ANALYZED);
    f2.SetTokenStream(new WhitespaceTokenizer(new System.IO.StringReader("doc1field2")));
    doc.Add(f);
    doc.Add(f2);
    w.AddDocument(doc);

    // Add 2 docs to test in-memory merging.
    f.SetTokenStream(new WhitespaceTokenizer(new System.IO.StringReader("doc2field1")));
    f2.SetTokenStream(new WhitespaceTokenizer(new System.IO.StringReader("doc2field2")));
    w.AddDocument(doc);

    // Force segment flush so we can force a segment merge with doc3 later.
    w.Commit();

    f.SetTokenStream(new WhitespaceTokenizer(new System.IO.StringReader("doc3field1")));
    f2.SetTokenStream(new WhitespaceTokenizer(new System.IO.StringReader("doc3field2")));
    w.AddDocument(doc);
    w.Commit();
    w.Optimize(); // force segment merge

    IndexReader ir = IndexReader.Open(dir);
    doc = ir.Document(0);
    f = doc.GetField("binary");
    b = f.GetBinaryValue();
    Assert.IsTrue(b != null);
    Assert.AreEqual(17, b.Length);
    Assert.AreEqual(87, b[0]);

    Assert.IsTrue(ir.Document(0).GetFieldable("binary").IsBinary());
    Assert.IsTrue(ir.Document(1).GetFieldable("binary").IsBinary());
    Assert.IsTrue(ir.Document(2).GetFieldable("binary").IsBinary());

    Assert.AreEqual("value", ir.Document(0).Get("string"));
    Assert.AreEqual("value", ir.Document(1).Get("string"));
    Assert.AreEqual("value", ir.Document(2).Get("string"));

    // Test that the terms were indexed.
    Assert.IsTrue(ir.TermDocs(new Term("binary", "doc1field1")).Next());
    Assert.IsTrue(ir.TermDocs(new Term("binary", "doc2field1")).Next());
    Assert.IsTrue(ir.TermDocs(new Term("binary", "doc3field1")).Next());
    Assert.IsTrue(ir.TermDocs(new Term("string", "doc1field2")).Next());
    Assert.IsTrue(ir.TermDocs(new Term("string", "doc2field2")).Next());
    Assert.IsTrue(ir.TermDocs(new Term("string", "doc3field2")).Next());

    ir.Close();
    dir.Close();
}
public virtual void TestMergeCompressedFields()
{
    System.IO.FileInfo indexDir = new System.IO.FileInfo(System.IO.Path.Combine(SupportClass.AppSettings.Get("tempDir", ""), "mergecompressedfields"));
    Directory dir = FSDirectory.Open(indexDir);
    try
    {
        for (int i = 0; i < 5; i++)
        {
            // Must make a new writer & doc each time, w/ different fields,
            // so bulk merge of stored fields cannot run:
            IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), i == 0, IndexWriter.MaxFieldLength.UNLIMITED);
            try
            {
                w.SetMergeFactor(5);
                w.SetMergeScheduler(new SerialMergeScheduler());
                Document doc = new Document();
                doc.Add(new Field("test1", "this is some data that will be compressed this this this", Field.Store.COMPRESS, Field.Index.NO));
                doc.Add(new Field("test2", new byte[20], Field.Store.COMPRESS));
                doc.Add(new Field("field" + i, "random field", Field.Store.NO, Field.Index.ANALYZED));
                w.AddDocument(doc);
            }
            finally
            {
                w.Close();
            }
        }

        byte[] cmp = new byte[20];

        IndexReader r = IndexReader.Open(dir);
        try
        {
            for (int i = 0; i < 5; i++)
            {
                Document doc = r.Document(i);
                Assert.AreEqual("this is some data that will be compressed this this this", doc.GetField("test1").StringValue());
                byte[] b = doc.GetField("test2").BinaryValue();
                Assert.AreEqual(cmp.Length, b.Length);
                for (int j = 0; j < b.Length; j++)
                    Assert.AreEqual(cmp[j], b[j]);
            }
        }
        finally
        {
            r.Close();
        }
    }
    finally
    {
        dir.Close();
        _TestUtil.RmDir(indexDir);
    }
}
public virtual void TestBinaryFieldOffsetLength()
{
    MockRAMDirectory dir = new MockRAMDirectory();
    IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
    byte[] b = new byte[50];
    for (int i = 0; i < 50; i++)
        b[i] = (byte)(i + 77);

    Document doc = new Document();
    Field f = new Field("binary", b, 10, 17, Field.Store.YES);

    // Before indexing, the field still exposes the full buffer plus the slice bounds.
    byte[] bx = f.GetBinaryValue();
    Assert.IsTrue(bx != null);
    Assert.AreEqual(50, bx.Length);
    Assert.AreEqual(10, f.GetBinaryOffset());
    Assert.AreEqual(17, f.GetBinaryLength());

    doc.Add(f);
    w.AddDocument(doc);
    w.Close();

    IndexReader ir = IndexReader.Open(dir);
    doc = ir.Document(0);
    f = doc.GetField("binary");
    b = f.GetBinaryValue();

    // Only the 17-byte slice was stored; its first byte is b[10] == 10 + 77 == 87.
    Assert.IsTrue(b != null);
    Assert.AreEqual(17, b.Length);
    Assert.AreEqual(87, b[0]);

    ir.Close();
    dir.Close();
}
private static void ExtractNERTags(CoreDocument coredoc, Lucene.Net.Documents.Document document)
{
    // Entity mentions are reported per sentence, so the same document id can show up
    // several times; mentions after the first are appended to the existing entry.
    if (coredoc == null)
        return;

    List nerList = coredoc.entityMentions();
    if (nerList.size() == 0)
        return;

    var key = document.GetField("id").GetInt32Value().Value;
    for (int j = 0; j < nerList.size(); j++)
    {
        CoreEntityMention em = (CoreEntityMention)nerList.get(j);
        switch (em.entityType())
        {
            case "DATE":
                if (!DateList.ContainsKey(key)) { DateList.Add(key, em.text()); }
                else { DateList.TryUpdate(key, DateList[key] + ", " + em.text()); }
                break;
            case "TIME":
                if (!TimeList.ContainsKey(key)) { TimeList.Add(key, em.text()); }
                else { TimeList.TryUpdate(key, TimeList[key] + ", " + em.text()); }
                break;
            case "LOCATION":
                if (!LocList.ContainsKey(key)) { LocList.Add(key, em.text()); }
                else { LocList.TryUpdate(key, LocList[key] + ", " + em.text()); }
                break;
            case "ORGANIZATION":
                if (!OrgList.ContainsKey(key)) { OrgList.Add(key, em.text()); }
                else { OrgList.TryUpdate(key, OrgList[key] + ", " + em.text()); }
                break;
            case "URL":
                if (!URLList.ContainsKey(key)) { URLList.Add(key, em.text()); }
                else { URLList.TryUpdate(key, URLList[key] + ", " + em.text()); }
                break;
        }
    }
}
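// A hypothetical alternative sketch (not in the original code): dispatching through a
// type-to-dictionary map removes the repeated branches above and makes copy-paste
// mix-ups between the target dictionaries impossible. It assumes the *List fields can
// be treated as plain Dictionary<int, string> instances; AppendMention is a made-up name.
private static readonly Dictionary<string, Dictionary<int, string>> MentionTargets =
    new Dictionary<string, Dictionary<int, string>>
    {
        { "DATE", DateList }, { "TIME", TimeList }, { "LOCATION", LocList },
        { "ORGANIZATION", OrgList }, { "URL", URLList },
    };

private static void AppendMention(string entityType, int key, string text)
{
    Dictionary<int, string> target;
    if (!MentionTargets.TryGetValue(entityType, out target))
        return; // entity types we do not track are ignored

    // The first mention starts the entry; later mentions are appended, comma-separated.
    target[key] = target.ContainsKey(key) ? target[key] + ", " + text : text;
}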
public override void OnIndexEntryCreated(string indexName, string entryKey, Lucene.Net.Documents.Document document)
{
    DataTable.Rows.Add(entryKey, document.GetField("Project").StringValue());
}
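// A defensive variant sketch (an assumption, not the original behavior): GetField
// returns null when an index entry has no "Project" field, so the direct StringValue()
// calls in the handlers above would throw for such entries. The empty-string fallback
// here is hypothetical.
public override void OnIndexEntryCreated(string indexName, string entryKey, Lucene.Net.Documents.Document document)
{
    var projectField = document.GetField("Project");
    DataTable.Rows.Add(entryKey, projectField != null ? projectField.StringValue() : string.Empty);
}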