/// <summary>
/// Rebuilds the suggester's index from the supplied entries. Any searcher manager
/// and writer left over from an earlier build are disposed first; a new writer is
/// then opened with <see cref="OpenMode.CREATE"/> and every entry is added to it.
/// </summary>
/// <param name="enumerator">Source of suggestion text, contexts, weight and optional payload.</param>
/// <exception cref="System.ArgumentNullException"><paramref name="enumerator"/> is <c>null</c>.</exception>
public override void Build(IInputEnumerator enumerator)
{
    // Validate before touching any state: otherwise a null argument is only noticed
    // after the previous searcher/writer are gone and a CREATE-mode writer was opened.
    if (enumerator is null)
    {
        throw new System.ArgumentNullException(nameof(enumerator));
    }

    if (m_searcherMgr != null)
    {
        m_searcherMgr.Dispose();
        m_searcherMgr = null;
    }

    if (writer != null)
    {
        writer.Dispose();
        writer = null;
    }

    bool success = false;
    try
    {
        // First pass: build a temporary normal Lucene index,
        // just indexing the suggestions as they iterate:
        writer = new IndexWriter(dir, GetIndexWriterConfig(matchVersion, GetGramAnalyzer(), OpenMode.CREATE));

        // TODO: use threads?
        while (enumerator.MoveNext())
        {
            BytesRef text = enumerator.Current;
            BytesRef payload = enumerator.HasPayloads ? enumerator.Payload : null;
            Add(text, enumerator.Contexts, enumerator.Weight, payload);
        }

        m_searcherMgr = new SearcherManager(writer, true, null);
        success = true;
    }
    finally
    {
        if (!success)
        {
            // The build failed part-way: release the half-built writer and forget it.
            IOUtils.DisposeWhileHandlingException(writer);
            writer = null;
        }
    }
}
/// <summary>
/// Indexes generated documents and checks that a <see cref="DocumentValueSourceDictionary"/>
/// built on a constant value source (10) yields, for every document, the indexed term,
/// a weight of 10 and the stored payload, leaving no document unvisited.
/// </summary>
public void TestWithValueSource()
{
    Directory directory = NewDirectory();
    IndexWriterConfig config = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    config.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter indexWriter = new RandomIndexWriter(Random, directory, config);

    IDictionary<string, Document> pending = GenerateIndexDocuments(AtLeast(100));
    foreach (Document document in pending.Values)
    {
        indexWriter.AddDocument(document);
    }
    indexWriter.Commit();
    indexWriter.Dispose();

    IndexReader reader = DirectoryReader.Open(directory);
    IDictionary dictionary = new DocumentValueSourceDictionary(reader, FIELD_NAME, new DoubleConstValueSource(10), PAYLOAD_FIELD_NAME);
    IInputEnumerator entries = dictionary.GetEntryEnumerator();
    while (entries.MoveNext())
    {
        // Each term must map to exactly one generated document; drop it once seen.
        string term = entries.Current.Utf8ToString();
        Document expected = pending[term];
        pending.Remove(term);

        assertTrue(entries.Current.equals(new BytesRef(expected.Get(FIELD_NAME))));
        assertEquals(entries.Weight, 10);
        assertTrue(entries.Payload.equals(expected.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
    }

    // Every generated document must have been returned by the dictionary.
    assertTrue(pending.Count == 0);
    reader.Dispose();
    directory.Dispose();
}
/// <summary>
/// Feeds a <see cref="FileDictionary"/> with generated input that uses " , " as the field
/// delimiter and checks term, weight and payload of every entry against the generated rows.
/// Rows without a third column are expected to produce an empty payload.
/// </summary>
public void TestFileWithDifferentDelimiter()
{
    const string delimiter = " , ";

    KeyValuePair<IList<IList<string>>, string> generated = generateFileInput(AtLeast(100), delimiter, true, true);
    Stream stream = new MemoryStream(generated.Value.getBytes(Encoding.UTF8));
    FileDictionary dictionary = new FileDictionary(stream, delimiter);
    IList<IList<string>> expectedRows = generated.Key;
    IInputEnumerator iterator = dictionary.GetEntryEnumerator();
    assertTrue(iterator.HasPayloads);

    int seen = 0;
    while (iterator.MoveNext())
    {
        assertTrue(expectedRows.Count > seen);
        IList<string> row = expectedRows[seen];
        assertTrue(row.Count >= 2); // at least term and weight
        assertEquals(row[0], iterator.Current.Utf8ToString());
        assertEquals(long.Parse(row[1], CultureInfo.InvariantCulture), iterator.Weight);

        if (row.Count != 3)
        {
            assertEquals(iterator.Payload.Length, 0);
        }
        else
        {
            assertEquals(row[2], iterator.Payload.Utf8ToString());
        }

        seen++;
    }

    assertEquals(seen, expectedRows.Count);
}
/// <summary>
/// Creates a new enumerator that reads <paramref name="source"/> to its end, buffering
/// each entry with its weight and, when the source reports them, its payload and context set.
/// The source's comparer is recorded once the source has been read completely.
/// </summary>
/// <param name="source">The enumerator whose entries are buffered.</param>
public BufferedInputEnumerator(IInputEnumerator source)
{
    hasPayloads = source.HasPayloads;
    hasContexts = source.HasContexts;

    for (int slot = 0; source.MoveNext(); slot++)
    {
        m_entries.Append(source.Current);

        if (hasPayloads)
        {
            m_payloads.Append(source.Payload);
        }

        if (hasContexts)
        {
            m_contextSets.Add(source.Contexts);
        }

        // Make room for the weight of this entry if the array is full.
        if (slot >= m_freqs.Length)
        {
            m_freqs = ArrayUtil.Grow(m_freqs, m_freqs.Length + 1);
        }

        m_freqs[slot] = source.Weight;
    }

    comp = source.Comparer;
}
/// <summary>
/// Builds the ternary tree from the supplied entries. Input that is not already ordered by
/// <see cref="BytesRef.UTF8SortedAsUTF16Comparer"/> is wrapped in a
/// <see cref="SortedInputEnumerator"/> first; each term is converted from UTF-8 to UTF-16
/// and inserted together with its weight.
/// </summary>
/// <param name="enumerator">Source of terms and weights; must not report payloads or contexts.</param>
/// <exception cref="ArgumentNullException"><paramref name="enumerator"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">The enumerator reports payloads or contexts.</exception>
public override void Build(IInputEnumerator enumerator)
{
    // Guard against null so callers get a descriptive exception, as the other suggesters do.
    if (enumerator is null)
    {
        throw new ArgumentNullException(nameof(enumerator));
    }

    if (enumerator.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }

    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }

    root = new TernaryTreeNode();

    // buffer first
#pragma warning disable 612, 618
    if (enumerator.Comparer != BytesRef.UTF8SortedAsUTF16Comparer)
    {
        // make sure it's sorted and the comparer uses UTF16 sort order
        enumerator = new SortedInputEnumerator(enumerator, BytesRef.UTF8SortedAsUTF16Comparer);
    }
#pragma warning restore 612, 618

    List<string> tokens = new List<string>();
    List<object> vals = new List<object>();
    CharsRef charsSpare = new CharsRef();

    while (enumerator.MoveNext())
    {
        BytesRef spare = enumerator.Current;
        charsSpare.Grow(spare.Length);
        UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
        tokens.Add(charsSpare.ToString());
        vals.Add(enumerator.Weight);
    }

    autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count - 1, root);
}
/// <summary>
/// Wraps <paramref name="source"/>. When debugging assertions are enabled, asserts that
/// the source does not report payloads.
/// </summary>
internal WFSTInputEnumerator(IInputEnumerator source)
    : base(source)
{
    if (!Debugging.AssertsEnabled)
    {
        return;
    }

    Debugging.Assert(!source.HasPayloads);
}
/// <summary>
/// Creates the game, storing the supplied collaborators and creating the layer
/// collection together with a scroll layer that covers the screen rectangle.
/// </summary>
/// <param name="device">Graphics device kept by the game.</param>
/// <param name="gameController">Controller kept by the game.</param>
/// <param name="rect">Screen rectangle; also handed to the scroll layer.</param>
/// <param name="content">Content manager kept by the game.</param>
/// <param name="input">Input enumerator kept by the game.</param>
public XoGame(
    GraphicsDevice device,
    GameControler gameController,
    Rectangle rect,
    ContentManager content,
    IInputEnumerator input)
{
    this.gameController = gameController;
    screenRect = rect;
    this.content = content;
    this.device = device;
    inputEnumerator = input;

    // The scroll layer is built on top of the (still empty) layer collection.
    gameLayers = new LayersCollection();
    scrollLayer = new ScrollLayer(gameLayers, screenRect);
}
/// <summary>
/// Creates a new sorted wrapper around <paramref name="source"/>. Entries are ordered by
/// <paramref name="comparer"/>; entries that compare equal are ordered by ascending cost.
/// </summary>
/// <param name="source">The enumerator whose entries are sorted.</param>
/// <param name="comparer">The comparer applied to the decoded entry bytes.</param>
public SortedInputEnumerator(IInputEnumerator source, IComparer<BytesRef> comparer)
{
    this.tieBreakByCostComparer = Comparer<BytesRef>.Create((first, second) =>
    {
        // Work on shallow copies in case decoding changes the BytesRef:
        BytesRef firstView = new BytesRef();
        BytesRef secondView = new BytesRef();
        ByteArrayDataInput decoder = new ByteArrayDataInput();

        firstView.Bytes = first.Bytes;
        firstView.Offset = first.Offset;
        firstView.Length = first.Length;

        secondView.Bytes = second.Bytes;
        secondView.Offset = second.Offset;
        secondView.Length = second.Length;

        long firstCost = Decode(firstView, decoder);
        long secondCost = Decode(secondView, decoder);

        if (HasPayloads)
        {
            DecodePayload(firstView, decoder);
            DecodePayload(secondView, decoder);
        }

        if (HasContexts)
        {
            DecodeContexts(firstView, decoder);
            DecodeContexts(secondView, decoder);
        }

        // LUCENENET NOTE: outerInstance.Comparer != outerInstance.comparer!!
        int byBytes = this.comparer.Compare(firstView, secondView);
        if (byBytes != 0)
        {
            return byBytes;
        }

        // Same bytes: fall back to the cost.
        return firstCost < secondCost ? -1 : (firstCost > secondCost ? 1 : 0);
    });

    this.hasPayloads = source.HasPayloads;
    this.hasContexts = source.HasContexts;
    this.source = source;
    this.comparer = comparer;

    // Sort() must run last: it relies on the fields assigned above.
    this.reader = Sort();
}
/// <summary>
/// Indexes generated documents and verifies that a <see cref="DocumentDictionary"/> configured
/// with weight, payload and context fields returns the stored term, weight, payload and number
/// of contexts for each document. Afterwards only the terms the generator reported as invalid
/// may be left over.
/// </summary>
public void TestWithContexts()
{
    Directory directory = NewDirectory();
    IndexWriterConfig config = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    config.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter indexWriter = new RandomIndexWriter(Random, directory, config);

    KeyValuePair<IList<string>, IDictionary<string, Document>> generated = GenerateIndexDocuments(AtLeast(1000), true, true);
    IDictionary<string, Document> pending = generated.Value;
    IList<string> invalidTerms = generated.Key;
    foreach (Document document in pending.Values)
    {
        indexWriter.AddDocument(document);
    }
    indexWriter.Commit();
    indexWriter.Dispose();

    IndexReader reader = DirectoryReader.Open(directory);
    IDictionary dictionary = new DocumentDictionary(reader, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME, CONTEXT_FIELD_NAME);
    IInputEnumerator entries = dictionary.GetEntryEnumerator();
    while (entries.MoveNext())
    {
        string term = entries.Current.Utf8ToString();
        Document expected = pending[term];
        pending.Remove(term);

        assertTrue(entries.Current.equals(new BytesRef(expected.Get(FIELD_NAME))));

        // A document without a weight field is expected to report weight 0.
        IIndexableField weightField = expected.GetField(WEIGHT_FIELD_NAME);
        var expectedWeight = (weightField != null) ? weightField.GetInt64ValueOrDefault() : 0;
        assertEquals(entries.Weight, expectedWeight);

        assertTrue(entries.Payload.equals(expected.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));

        ISet<BytesRef> expectedContexts = new JCG.HashSet<BytesRef>();
        ICollection<BytesRef> actualContexts = entries.Contexts;
        foreach (IIndexableField contextField in expected.GetFields(CONTEXT_FIELD_NAME))
        {
            expectedContexts.Add(contextField.GetBinaryValue());
        }
        assertEquals(expectedContexts.Count, actualContexts.Count);
    }

    // Whatever the dictionary skipped must be one of the invalid documents.
    foreach (string invalidTerm in invalidTerms)
    {
        var leftover = pending[invalidTerm];
        pending.Remove(invalidTerm);
        assertNotNull(leftover);
    }

    assertTrue(pending.Count == 0);
    reader.Dispose();
    directory.Dispose();
}
/// <summary>
/// Indexes generated documents and verifies that a <see cref="DocumentValueSourceDictionary"/>
/// whose weight is the sum of three long fields returns, for every document, the stored term,
/// the summed weight, the payload and a context set equal to the document's context fields.
/// </summary>
public void TestWithContext()
{
    Directory directory = NewDirectory();
    IndexWriterConfig config = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    config.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter indexWriter = new RandomIndexWriter(Random, directory, config);

    IDictionary<string, Document> pending = GenerateIndexDocuments(AtLeast(100));
    foreach (Document document in pending.Values)
    {
        indexWriter.AddDocument(document);
    }
    indexWriter.Commit();
    indexWriter.Dispose();

    IndexReader reader = DirectoryReader.Open(directory);
    ValueSource[] weightSources = new ValueSource[]
    {
        new Int64FieldSource(WEIGHT_FIELD_NAME_1),
        new Int64FieldSource(WEIGHT_FIELD_NAME_2),
        new Int64FieldSource(WEIGHT_FIELD_NAME_3)
    };
    IDictionary dictionary = new DocumentValueSourceDictionary(reader, FIELD_NAME, new SumSingleFunction(weightSources), PAYLOAD_FIELD_NAME, CONTEXTS_FIELD_NAME);
    IInputEnumerator entries = dictionary.GetEntryEnumerator();
    while (entries.MoveNext())
    {
        string term = entries.Current.Utf8ToString();
        Document expected = pending[term];
        pending.Remove(term);

        long first = expected.GetField(WEIGHT_FIELD_NAME_1).GetInt64ValueOrDefault();
        long second = expected.GetField(WEIGHT_FIELD_NAME_2).GetInt64ValueOrDefault();
        long third = expected.GetField(WEIGHT_FIELD_NAME_3).GetInt64ValueOrDefault();

        assertTrue(entries.Current.equals(new BytesRef(expected.Get(FIELD_NAME))));
        assertEquals(entries.Weight, (first + second + third));
        assertTrue(entries.Payload.equals(expected.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));

        // LUCENENET NOTE: This test was once failing because we used SCG.HashSet<T> whose
        // Equals() implementation does not check for set equality. As a result SortedInputEnumerator
        // had been modified to reverse the results to get the test to pass. However, using JCG.HashSet<T>
        // ensures that set equality (that is equality that doesn't care about order of items) is respected.
        // SortedInputEnumerator has also had the specific sorting removed.
        ISet<BytesRef> expectedContexts = new JCG.HashSet<BytesRef>();
        foreach (IIndexableField contextField in expected.GetFields(CONTEXTS_FIELD_NAME))
        {
            expectedContexts.Add(contextField.GetBinaryValue());
        }
        assertEquals(expectedContexts, entries.Contexts);
    }

    assertTrue(pending.Count == 0);
    reader.Dispose();
    directory.Dispose();
}
/// <summary>
/// Indexes generated documents and verifies that a <see cref="DocumentDictionary"/> configured
/// without a payload field returns the stored term and weight for each document and a
/// <c>null</c> payload. Afterwards only the terms the generator reported as invalid may be left.
/// </summary>
public void TestWithoutPayload()
{
    Directory directory = NewDirectory();
    IndexWriterConfig config = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    config.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter indexWriter = new RandomIndexWriter(Random, directory, config);

    KeyValuePair<IList<string>, IDictionary<string, Document>> generated = GenerateIndexDocuments(AtLeast(1000), false, false);
    IDictionary<string, Document> pending = generated.Value;
    IList<string> invalidTerms = generated.Key;
    foreach (Document document in pending.Values)
    {
        indexWriter.AddDocument(document);
    }
    indexWriter.Commit();
    indexWriter.Dispose();

    IndexReader reader = DirectoryReader.Open(directory);
    IDictionary dictionary = new DocumentDictionary(reader, FIELD_NAME, WEIGHT_FIELD_NAME);
    IInputEnumerator entries = dictionary.GetEntryEnumerator();
    while (entries.MoveNext())
    {
        var term = entries.Current.Utf8ToString();
        Document expected = pending[term];
        pending.Remove(term);

        assertTrue(entries.Current.equals(new BytesRef(expected.Get(FIELD_NAME))));

        // A document without a weight field is expected to report weight 0.
        IIndexableField weightField = expected.GetField(WEIGHT_FIELD_NAME);
        var expectedWeight = (weightField != null) ? weightField.GetInt64ValueOrDefault() : 0;
        assertEquals(entries.Weight, expectedWeight);

        assertEquals(entries.Payload, null);
    }

    // Whatever the dictionary skipped must be one of the invalid documents.
    foreach (string invalidTerm in invalidTerms)
    {
        var leftover = pending[invalidTerm];
        pending.Remove(invalidTerm);
        assertNotNull(leftover);
    }

    assertTrue(pending.Count == 0);
    reader.Dispose();
    directory.Dispose();
}
/// <summary>
/// Builds the FST from the supplied entries. The input is wrapped in a
/// <see cref="WFSTInputEnumerator"/>; an entry equal to the one added immediately before it
/// is skipped, and <c>count</c> is set to the number of entries actually added.
/// </summary>
/// <param name="enumerator">Source of terms and weights; must not report payloads or contexts.</param>
/// <exception cref="ArgumentNullException"><paramref name="enumerator"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">The enumerator reports payloads or contexts.</exception>
public override void Build(IInputEnumerator enumerator)
{
    // LUCENENET: Added guard clause for null
    if (enumerator is null)
    {
        throw new ArgumentNullException(nameof(enumerator));
    }

    if (enumerator.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }

    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }

    count = 0;

    IInputEnumerator wrapped = new WFSTInputEnumerator(enumerator);
    var intsScratch = new Int32sRef();
    BytesRef lastAdded = null;
    var outputs = PositiveInt32Outputs.Singleton;
    var fstBuilder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);

    while (wrapped.MoveNext())
    {
        BytesRef current = wrapped.Current;
        long cost = wrapped.Weight;

        if (lastAdded == null)
        {
            lastAdded = new BytesRef();
        }
        else if (current.Equals(lastAdded))
        {
            // for duplicate suggestions, the best weight is actually added
            continue;
        }

        Lucene.Net.Util.Fst.Util.ToInt32sRef(current, intsScratch);
        fstBuilder.Add(intsScratch, cost);
        lastAdded.CopyBytes(current);
        count++;
    }

    fst = fstBuilder.Finish();
}
/// <summary>
/// Creates a new iterator, wrapping the specified iterator and
/// returning elements in a random order.
/// </summary>
/// <param name="source">The enumerator to buffer (via the base constructor) and shuffle.</param>
public UnsortedInputEnumerator(IInputEnumerator source)
    : base(source)
{
    // Start from the identity permutation over the buffered entries.
    ords = new int[m_entries.Length];
    for (int i = 0; i < ords.Length; i++)
    {
        ords[i] = i;
    }

    // Fisher-Yates shuffle: position i is swapped with a position drawn from [0, i].
    // Drawing from the whole array on every step does not give all permutations
    // the same probability.
    Random random = new J2N.Randomizer();
    for (int i = ords.Length - 1; i > 0; i--)
    {
        int randomPosition = random.Next(i + 1);
        int temp = ords[i];
        ords[i] = ords[randomPosition];
        ords[randomPosition] = temp;
    }
}
/// <summary>
/// Indexes generated documents and checks a <see cref="DocumentValueSourceDictionary"/> that
/// sums three long weight fields: each returned entry must carry the document's term, the sum
/// of its three weights, its payload, and a context set equal to its context field values.
/// </summary>
public void TestWithContext()
{
    Directory indexDir = NewDirectory();
    IndexWriterConfig writerConfig = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    writerConfig.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter randomWriter = new RandomIndexWriter(Random, indexDir, writerConfig);

    IDictionary<string, Document> remaining = GenerateIndexDocuments(AtLeast(100));
    foreach (Document generatedDoc in remaining.Values)
    {
        randomWriter.AddDocument(generatedDoc);
    }
    randomWriter.Commit();
    randomWriter.Dispose();

    IndexReader indexReader = DirectoryReader.Open(indexDir);
    ValueSource[] summands = new ValueSource[]
    {
        new Int64FieldSource(WEIGHT_FIELD_NAME_1),
        new Int64FieldSource(WEIGHT_FIELD_NAME_2),
        new Int64FieldSource(WEIGHT_FIELD_NAME_3)
    };
    IDictionary dictionary = new DocumentValueSourceDictionary(indexReader, FIELD_NAME, new SumSingleFunction(summands), PAYLOAD_FIELD_NAME, CONTEXTS_FIELD_NAME);
    IInputEnumerator iterator = dictionary.GetEntryEnumerator();
    while (iterator.MoveNext())
    {
        string key = iterator.Current.Utf8ToString();
        Document source = remaining[key];
        remaining.Remove(key);

        long weight1 = source.GetField(WEIGHT_FIELD_NAME_1).GetInt64ValueOrDefault();
        long weight2 = source.GetField(WEIGHT_FIELD_NAME_2).GetInt64ValueOrDefault();
        long weight3 = source.GetField(WEIGHT_FIELD_NAME_3).GetInt64ValueOrDefault();

        assertTrue(iterator.Current.equals(new BytesRef(source.Get(FIELD_NAME))));
        assertEquals(iterator.Weight, (weight1 + weight2 + weight3));
        assertTrue(iterator.Payload.equals(source.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));

        // Collect the document's contexts into a set and compare it with the reported one.
        ISet<BytesRef> sourceContexts = new JCG.HashSet<BytesRef>();
        foreach (IIndexableField contextField in source.GetFields(CONTEXTS_FIELD_NAME))
        {
            sourceContexts.Add(contextField.GetBinaryValue());
        }
        assertEquals(sourceContexts, iterator.Contexts);
    }

    assertTrue(remaining.Count == 0);
    indexReader.Dispose();
    indexDir.Dispose();
}
/// <summary>
/// Builds the ternary search trie from the supplied entries. Input that reports a comparer
/// is wrapped in an <see cref="UnsortedInputEnumerator"/> before insertion; empty terms are
/// skipped. <c>count</c> is set to the number of terms put into the trie.
/// </summary>
/// <param name="enumerator">Source of terms and weights; must not report payloads or contexts.</param>
/// <exception cref="ArgumentNullException"><paramref name="enumerator"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">The enumerator reports payloads or contexts.</exception>
public override void Build(IInputEnumerator enumerator)
{
    // LUCENENET: Added guard clause for null
    if (enumerator is null)
    {
        throw new ArgumentNullException(nameof(enumerator));
    }

    if (enumerator.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }

    // Check contexts before wrapping: the wrapper buffers the whole source,
    // which is wasted work (and consumes the source) if the input is rejected anyway.
    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }

    if (enumerator.Comparer != null)
    {
        // make sure it's unsorted
        // NOTE: the random order could, by chance, still be a sorted iteration.
        enumerator = new UnsortedInputEnumerator(enumerator);
    }

    count = 0;
    trie = new JaspellTernarySearchTrie { MatchAlmostDiff = editDistance };

    var charsSpare = new CharsRef();
    while (enumerator.MoveNext())
    {
        BytesRef spare = enumerator.Current;
        long weight = enumerator.Weight;
        if (spare.Length == 0)
        {
            continue;
        }

        charsSpare.Grow(spare.Length);
        UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
        trie.Put(charsSpare.ToString(), weight);

        // Keep the entry count in step with what was inserted, as the other builders do.
        count++;
    }
}
/// <summary>
/// Opens a reader over an index that contains no documents and checks that the
/// <see cref="DocumentValueSourceDictionary"/> enumerator is immediately exhausted,
/// reporting a weight of 0 and a <c>null</c> payload.
/// </summary>
public void TestEmptyReader()
{
    Directory directory = NewDirectory();
    IndexWriterConfig config = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    config.SetMergePolicy(NewLogMergePolicy());

    // Make sure the index is created?
    RandomIndexWriter indexWriter = new RandomIndexWriter(Random, directory, config);
    indexWriter.Commit();
    indexWriter.Dispose();

    IndexReader reader = DirectoryReader.Open(directory);
    IDictionary dictionary = new DocumentValueSourceDictionary(reader, FIELD_NAME, new DoubleConstValueSource(10), PAYLOAD_FIELD_NAME);
    IInputEnumerator entries = dictionary.GetEntryEnumerator();

    assertFalse(entries.MoveNext());
    assertEquals(entries.Weight, 0);
    assertNull(entries.Payload);

    reader.Dispose();
    directory.Dispose();
}
/// <summary>
/// Feeds a <see cref="FileDictionary"/> with generated term-only input and checks that every
/// entry matches the generated term, has weight 1 and a <c>null</c> payload, and that the
/// dictionary reports no payloads.
/// </summary>
public void TestFileWithTerm()
{
    KeyValuePair<IList<IList<string>>, string> generated = generateFileInput(AtLeast(100), FileDictionary.DEFAULT_FIELD_DELIMITER, false, false);
    Stream stream = new MemoryStream(generated.Value.getBytes(Encoding.UTF8));
    FileDictionary dictionary = new FileDictionary(stream);
    IList<IList<string>> expectedRows = generated.Key;
    IInputEnumerator iterator = dictionary.GetEntryEnumerator();
    assertFalse(iterator.HasPayloads);

    int seen = 0;
    while (iterator.MoveNext())
    {
        assertTrue(expectedRows.Count > seen);
        IList<string> row = expectedRows[seen];
        assertTrue(row.Count >= 1); // at least a term
        assertEquals(row[0], iterator.Current.Utf8ToString());
        assertEquals(1, iterator.Weight);
        assertNull(iterator.Payload);
        seen++;
    }

    assertEquals(seen, expectedRows.Count);
}
/// <summary>
/// Build the suggest index, using up to the specified
/// amount of temporary RAM while building. Note that
/// the weights for the suggestions are ignored.
/// </summary>
/// <remarks>
/// The surface forms are first indexed into a temporary on-disk index; the terms of its
/// "body" field are then moved into an FST. The temporary index directory is deleted
/// before the method returns, whether or not the build succeeded.
/// </remarks>
/// <param name="enumerator">Source of surface forms; must not report payloads or contexts.</param>
/// <param name="ramBufferSizeMB">RAM buffer size, in MB, passed to the temporary index writer's configuration.</param>
/// <exception cref="ArgumentNullException"><paramref name="enumerator"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">
/// The enumerator reports payloads or contexts, no suggestion was supplied, or a term
/// contains more grams than the configured maximum.
/// </exception>
/// <exception cref="InvalidOperationException">The temporary index directory could not be removed.</exception>
public virtual void Build(IInputEnumerator enumerator, double ramBufferSizeMB)
{
    // Guard against null so callers get a descriptive exception, as the other suggesters do.
    if (enumerator is null)
    {
        throw new ArgumentNullException(nameof(enumerator));
    }

    if (enumerator.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }

    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }

    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();

    // LUCENENET specific - using GetRandomFileName() instead of picking a random int
    DirectoryInfo tempIndexPath;
    do
    {
        tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + Path.GetFileNameWithoutExtension(Path.GetRandomFileName())));
        tempIndexPath.Create();
    }
    while (!System.IO.Directory.Exists(tempIndexPath.FullName));

    Directory dir = FSDirectory.Open(tempIndexPath);
    try
    {
#pragma warning disable 612, 618
        IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer);
#pragma warning restore 612, 618
        iwc.SetOpenMode(OpenMode.CREATE);
        iwc.SetRAMBufferSizeMB(ramBufferSizeMB);
        IndexWriter writer = new IndexWriter(dir, iwc);

        var ft = new FieldType(TextField.TYPE_NOT_STORED);
        // TODO: if only we had IndexOptions.TERMS_ONLY...
        ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
        ft.OmitNorms = true;
        ft.Freeze();

        // A single document/field pair is reused for every surface form.
        Document doc = new Document();
        Field field = new Field("body", "", ft);
        doc.Add(field);

        totTokens = 0;
        IndexReader reader = null;

        bool success = false;
        count = 0;
        try
        {
            while (enumerator.MoveNext())
            {
                BytesRef surfaceForm = enumerator.Current;
                field.SetStringValue(surfaceForm.Utf8ToString());
                writer.AddDocument(doc);
                count++;
            }

            reader = DirectoryReader.Open(writer, false);

            Terms terms = MultiFields.GetTerms(reader, "body");
            if (terms == null)
            {
                throw new ArgumentException("need at least one suggestion");
            }

            // Move all ngrams into an FST:
            TermsEnum termsEnum = terms.GetEnumerator(null);

            Outputs<long?> outputs = PositiveInt32Outputs.Singleton;
            Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);

            Int32sRef scratchInts = new Int32sRef();
            while (termsEnum.MoveNext())
            {
                BytesRef term = termsEnum.Term;
                int ngramCount = CountGrams(term);
                if (ngramCount > grams)
                {
                    throw new ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
                }

                // Only unigrams contribute to the total token count.
                if (ngramCount == 1)
                {
                    totTokens += termsEnum.TotalTermFreq;
                }

                builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq));
            }

            fst = builder.Finish();
            if (fst == null)
            {
                throw new ArgumentException("need at least one suggestion");
            }

            success = true;
        }
        finally
        {
            if (success)
            {
                IOUtils.Dispose(writer, reader);
            }
            else
            {
                IOUtils.DisposeWhileHandlingException(writer, reader);
            }
        }
    }
    finally
    {
        try
        {
            IOUtils.Dispose(dir);
        }
        finally
        {
            // LUCENENET specific - since we are removing the entire directory anyway,
            // it doesn't make sense to first do a loop in order remove the files.
            // Let the System.IO.Directory.Delete() method handle that.
            // We also need to dispose the Directory instance first before deleting from disk.
            try
            {
                System.IO.Directory.Delete(tempIndexPath.FullName, true);
            }
            catch (Exception e)
            {
                throw new InvalidOperationException("failed to remove " + tempIndexPath, e);
            }
        }
    }
}
/// <summary>
/// Builds the suggest index using <see cref="IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB"/>
/// as the RAM buffer size.
/// </summary>
/// <param name="enumerator">Source of the suggestions, passed through unchanged.</param>
public override void Build(IInputEnumerator enumerator)
    => Build(enumerator, IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB);
/// <summary>
/// Indexes generated documents, deletes a random subset of the valid ones by term, and
/// verifies that a <see cref="DocumentDictionary"/> over the resulting index returns only the
/// surviving documents with their term and weight and a <c>null</c> payload. Afterwards only
/// the terms the generator reported as invalid may be left over.
/// </summary>
public void TestWithDeletions()
{
    Directory directory = NewDirectory();
    IndexWriterConfig config = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    config.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter indexWriter = new RandomIndexWriter(Random, directory, config);

    KeyValuePair<IList<string>, IDictionary<string, Document>> generated = GenerateIndexDocuments(AtLeast(1000), false, false);
    IDictionary<string, Document> pending = generated.Value;
    IList<String> invalidTerms = generated.Key;

    Random random = Random;
    IList<string> deletedTerms = new JCG.List<string>();
    foreach (Document document in pending.Values)
    {
        // Randomly mark valid documents for deletion; every document is still indexed first.
        IIndexableField termField = document.GetField(FIELD_NAME);
        if (random.nextBoolean() && termField != null && !invalidTerms.Contains(termField.GetStringValue()))
        {
            deletedTerms.Add(document.Get(FIELD_NAME));
        }
        indexWriter.AddDocument(document);
    }
    indexWriter.Commit();

    Term[] deletions = new Term[deletedTerms.Count];
    for (int index = 0; index < deletedTerms.Count; index++)
    {
        deletions[index] = new Term(FIELD_NAME, deletedTerms[index]);
    }

    for (int index = 0; index < deletions.Length; index++)
    {
        indexWriter.DeleteDocuments(deletions[index]);
    }
    indexWriter.Commit();
    indexWriter.Dispose();

    // Deleted documents are no longer expected from the dictionary.
    foreach (string deletedTerm in deletedTerms)
    {
        var removed = pending[deletedTerm];
        assertTrue(removed != null);
        pending.Remove(deletedTerm);
    }

    IndexReader reader = DirectoryReader.Open(directory);
    assertEquals(reader.NumDocs, pending.Count);

    IDictionary dictionary = new DocumentDictionary(reader, FIELD_NAME, WEIGHT_FIELD_NAME);
    IInputEnumerator entries = dictionary.GetEntryEnumerator();
    while (entries.MoveNext())
    {
        var term = entries.Current.Utf8ToString();
        Document expected = pending[term];
        pending.Remove(term);

        assertTrue(entries.Current.equals(new BytesRef(expected.Get(FIELD_NAME))));

        // A document without a weight field is expected to report weight 0.
        IIndexableField weightField = expected.GetField(WEIGHT_FIELD_NAME);
        var expectedWeight = (weightField != null) ? weightField.GetInt64ValueOrDefault() : 0;
        assertEquals(entries.Weight, expectedWeight);

        assertEquals(entries.Payload, null);
    }

    // Whatever the dictionary skipped must be one of the invalid documents.
    foreach (string invalidTerm in invalidTerms)
    {
        var leftover = pending[invalidTerm];
        pending.Remove(invalidTerm);
        assertNotNull(leftover);
    }

    assertTrue(pending.Count == 0);
    reader.Dispose();
    directory.Dispose();
}
/// <summary>
/// Builds up a new internal <see cref="Lookup"/> representation from the entries
/// supplied by the given <see cref="IInputEnumerator"/>.
/// Implementations may re-sort the data internally.
/// </summary>
/// <param name="inputEnumerator">The entries to build the lookup from.</param>
public abstract void Build(IInputEnumerator inputEnumerator);
/// <summary>
/// Builds the completion automaton from the supplied entries. Each entry is written to a
/// temporary file prefixed by its encoded weight, the file is sorted, and the sorted entries
/// are assigned to buckets by their position in the sorted file before being added to the
/// <see cref="FSTCompletionBuilder"/>. Both temporary files are deleted before returning.
/// </summary>
/// <param name="enumerator">Source of terms and weights; must not report payloads or contexts.</param>
/// <exception cref="ArgumentNullException"><paramref name="enumerator"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">The enumerator reports payloads or contexts.</exception>
public override void Build(IInputEnumerator enumerator)
{
    // LUCENENET: Added guard clause for null
    if (enumerator is null)
    {
        throw new ArgumentNullException(nameof(enumerator));
    }

    if (enumerator.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }

    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }

    string tempPrefix = nameof(FSTCompletionLookup);
    FileInfo unsortedFile = FileSupport.CreateTempFile(tempPrefix, ".input", OfflineSorter.DefaultTempDir());
    FileInfo sortedFile = FileSupport.CreateTempFile(tempPrefix, ".sorted", OfflineSorter.DefaultTempDir());

    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(unsortedFile);
    OfflineSorter.ByteSequencesReader reader = null;
    ExternalRefSorter sorter = null;

    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
    // If negative floats are allowed some trickery needs to be done to find their byte order.
    bool success = false;
    count = 0;
    try
    {
        byte[] buffer = Arrays.Empty<byte>();
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        while (enumerator.MoveNext())
        {
            BytesRef term = enumerator.Current;

            // Record layout: 4-byte encoded weight followed by the term bytes.
            int needed = term.Length + 4;
            if (needed >= buffer.Length)
            {
                buffer = ArrayUtil.Grow(buffer, needed);
            }

            output.Reset(buffer);
            output.WriteInt32(EncodeWeight(enumerator.Weight));
            output.WriteBytes(term.Bytes, term.Offset, term.Length);
            writer.Write(buffer, 0, output.Position);
        }
        writer.Dispose();

        // We don't know the distribution of scores and we need to bucket them, so we'll sort
        // and divide into equal buckets.
        OfflineSorter.SortInfo sortInfo = (new OfflineSorter()).Sort(unsortedFile, sortedFile);
        unsortedFile.Delete();

        sorter = new ExternalRefSorter(new OfflineSorter());
        FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter, sharedTailLength);

        int inputLines = sortInfo.Lines;
        reader = new OfflineSorter.ByteSequencesReader(sortedFile);

        long line = 0;
        int previousBucket = 0;
        int previousScore = 0;
        ByteArrayDataInput input = new ByteArrayDataInput();
        BytesRef record = new BytesRef();
        BytesRef termOnly = new BytesRef();
        while (reader.Read(record))
        {
            input.Reset(record.Bytes);
            int currentScore = input.ReadInt32();

            // Entries with the same score as their predecessor stay in the predecessor's bucket.
            bool sameAsPrevious = line > 0 && currentScore == previousScore;
            int bucket = sameAsPrevious ? previousBucket : (int)(line * buckets / inputLines);

            previousScore = currentScore;
            previousBucket = bucket;

            // Only append the input, discard the weight.
            termOnly.Bytes = record.Bytes;
            termOnly.Offset = input.Position;
            termOnly.Length = record.Length - input.Position;
            builder.Add(termOnly, bucket);

            line++;
            count++;
        }

        // The two FSTCompletions share the same automaton.
        this.higherWeightsCompletion = builder.Build();
        this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);

        success = true;
    }
    finally
    {
        if (!success)
        {
            IOUtils.DisposeWhileHandlingException(reader, writer, sorter);
        }
        else
        {
            IOUtils.Dispose(reader, writer, sorter);
        }

        unsortedFile.Delete();
        sortedFile.Delete();
    }
}
/// <summary>
/// Creates a new sorted wrapper around <paramref name="source"/> that sorts with
/// <see cref="BytesRef.UTF8SortedAsUnicodeComparer"/>.
/// </summary>
/// <param name="source">The enumerator whose entries are sorted.</param>
public SortedInputEnumerator(IInputEnumerator source)
    : this(source, BytesRef.UTF8SortedAsUnicodeComparer)
{
}
/// <summary>
/// Builds the FST from the supplied entries. Every surface form is analyzed into its finite
/// set of paths; one record per path (analyzed form, encoded weight, surface form and, when
/// present, payload) is written to a temporary file, the file is sorted, and the sorted
/// records are added to the FST while duplicate surface forms of the same analyzed form are
/// skipped. Both temporary files are deleted before returning.
/// </summary>
/// <param name="enumerator">Source of surface forms, weights and optional payloads; must not report contexts.</param>
/// <exception cref="ArgumentNullException"><paramref name="enumerator"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">
/// The enumerator reports contexts, an analyzed or surface form is too long to be encoded,
/// or (with payloads) a surface form contains the reserved separator byte.
/// </exception>
public override void Build(IInputEnumerator enumerator)
{
    // Guard against null so callers get a descriptive exception, as the other suggesters do.
    if (enumerator is null)
    {
        throw new ArgumentNullException(nameof(enumerator));
    }

    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }

    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();
    var tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
    var tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

    hasPayloads = enumerator.HasPayloads;

    var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    var scratch = new BytesRef();

    TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

    bool success = false;
    count = 0;
    byte[] buffer = new byte[8];
    try
    {
        var output = new ByteArrayDataOutput(buffer);
        BytesRef surfaceForm;

        while (enumerator.MoveNext())
        {
            surfaceForm = enumerator.Current;
            ISet<Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

            maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

            foreach (Int32sRef path in paths)
            {
                Util.Fst.Util.ToBytesRef(path, scratch);

                // length of the analyzed text (FST input)
                if (scratch.Length > ushort.MaxValue - 2)
                {
                    throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) + " in length (got " + scratch.Length + ")");
                }
                ushort analyzedLength = (ushort)scratch.Length;

                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                BytesRef payload;

                if (hasPayloads)
                {
                    if (surfaceForm.Length > (ushort.MaxValue - 2))
                    {
                        throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) + " in length (got " + surfaceForm.Length + ")");
                    }
                    payload = enumerator.Payload;
                    // payload + surfaceLength (short)
                    requiredLength += payload.Length + 2;
                }
                else
                {
                    payload = null;
                }

                buffer = ArrayUtil.Grow(buffer, requiredLength);

                output.Reset(buffer);

                output.WriteInt16((short)analyzedLength);

                output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                output.WriteInt32(EncodeWeight(enumerator.Weight));

                if (hasPayloads)
                {
                    // Scan the surface form's own slice of the backing array: the slice
                    // starts at Offset, not at index 0.
                    for (int i = 0; i < surfaceForm.Length; i++)
                    {
                        if (surfaceForm.Bytes[surfaceForm.Offset + i] == PAYLOAD_SEP)
                        {
                            throw new ArgumentException(
                                "surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.WriteInt16((short)surfaceForm.Length);
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                    output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                }
                else
                {
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(output.Position == requiredLength, () => output.Position + " vs " + requiredLength);
                }

                writer.Write(buffer, 0, output.Position);
            }
            count++;
        }
        writer.Dispose();

        // Sort all input/output pairs (required by FST.Builder):
        (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

        // Free disk space:
        tempInput.Delete();

        reader = new OfflineSorter.ByteSequencesReader(tempSorted);

        var outputs = new PairOutputs<long?, BytesRef>(PositiveInt32Outputs.Singleton, ByteSequenceOutputs.Singleton);
        var builder = new Builder<PairOutputs<long?, BytesRef>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);

        // Build FST:
        BytesRef previousAnalyzed = null;
        BytesRef analyzed = new BytesRef();
        BytesRef surface = new BytesRef();
        Int32sRef scratchInts = new Int32sRef();
        var input = new ByteArrayDataInput();

        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one). We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        var seenSurfaceForms = new JCG.HashSet<BytesRef>();

        var dedup = 0;
        while (reader.Read(scratch))
        {
            input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
            ushort analyzedLength = (ushort)input.ReadInt16();
            // Two spare bytes are reserved for the separator/dedup suffix appended below.
            analyzed.Grow(analyzedLength + 2);
            input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
            analyzed.Length = analyzedLength;

            long cost = input.ReadInt32();

            surface.Bytes = scratch.Bytes;
            if (hasPayloads)
            {
                surface.Length = (ushort)input.ReadInt16();
                surface.Offset = input.Position;
            }
            else
            {
                surface.Offset = input.Position;
                surface.Length = scratch.Length - surface.Offset;
            }

            if (previousAnalyzed == null)
            {
                previousAnalyzed = new BytesRef();
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else if (analyzed.Equals(previousAnalyzed))
            {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                {
                    // More than maxSurfaceFormsPerAnalyzedForm
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.Contains(surface))
                {
                    continue;
                }
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else
            {
                dedup = 0;
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Clear();
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }

            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...

            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0;
            analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
            analyzed.Length += 2;

            Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
            if (!hasPayloads)
            {
                builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
            }
            else
            {
                // FST output: surface bytes, separator byte, payload bytes.
                int payloadOffset = input.Position + surface.Length;
                int payloadLength = scratch.Length - payloadOffset;
                BytesRef br = new BytesRef(surface.Length + 1 + payloadLength);
                Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                br.Bytes[surface.Length] = PAYLOAD_SEP;
                Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                br.Length = br.Bytes.Length;
                builder.Add(scratchInts, outputs.NewPair(cost, br));
            }
        }
        fst = builder.Finish();

        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(reader, writer);
        }
        else
        {
            IOUtils.DisposeWhileHandlingException(reader, writer);
        }

        tempInput.Delete();
        tempSorted.Delete();
    }
}
/// <summary>
/// Indexes generated documents, deletes a random subset by term (always keeping at least one),
/// and verifies that a <see cref="DocumentValueSourceDictionary"/> summing two long weight fields
/// returns only the surviving documents, each with its term, summed weight and payload.
/// </summary>
public void TestWithDeletions()
{
    Directory directory = NewDirectory();
    IndexWriterConfig config = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    config.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter indexWriter = new RandomIndexWriter(Random, directory, config);

    IDictionary<string, Document> pending = GenerateIndexDocuments(AtLeast(100));
    Random random = Random;
    IList<string> deletedTerms = new JCG.List<string>();
    foreach (Document document in pending.Values)
    {
        // Randomly mark documents for deletion, but never all of them.
        if (random.nextBoolean() && deletedTerms.Count < pending.Count - 1)
        {
            deletedTerms.Add(document.Get(FIELD_NAME));
        }
        indexWriter.AddDocument(document);
    }
    indexWriter.Commit();

    Term[] deletions = new Term[deletedTerms.Count];
    for (int index = 0; index < deletedTerms.Count; index++)
    {
        deletions[index] = new Term(FIELD_NAME, deletedTerms[index]);
    }

    for (int index = 0; index < deletions.Length; index++)
    {
        indexWriter.DeleteDocuments(deletions[index]);
    }
    indexWriter.Commit();
    indexWriter.Dispose();

    // Deleted documents are no longer expected from the dictionary.
    foreach (string deletedTerm in deletedTerms)
    {
        var removed = pending[deletedTerm];
        pending.Remove(deletedTerm);
        assertTrue(null != removed);
    }

    IndexReader reader = DirectoryReader.Open(directory);
    assertTrue("NumDocs should be > 0 but was " + reader.NumDocs, reader.NumDocs > 0);
    assertEquals(reader.NumDocs, pending.Count);

    ValueSource[] weightSources = new ValueSource[]
    {
        new Int64FieldSource(WEIGHT_FIELD_NAME_1),
        new Int64FieldSource(WEIGHT_FIELD_NAME_2)
    };
    IDictionary dictionary = new DocumentValueSourceDictionary(reader, FIELD_NAME, new SumSingleFunction(weightSources), PAYLOAD_FIELD_NAME);
    IInputEnumerator entries = dictionary.GetEntryEnumerator();
    while (entries.MoveNext())
    {
        string term = entries.Current.Utf8ToString();
        Document expected = pending[term];
        pending.Remove(term);

        long first = expected.GetField(WEIGHT_FIELD_NAME_1).GetInt64ValueOrDefault();
        long second = expected.GetField(WEIGHT_FIELD_NAME_2).GetInt64ValueOrDefault();

        assertTrue(entries.Current.equals(new BytesRef(expected.Get(FIELD_NAME))));
        assertEquals(entries.Weight, second + first);
        assertTrue(entries.Payload.equals(expected.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
    }

    assertTrue(pending.Count == 0);
    reader.Dispose();
    directory.Dispose();
}