/// <summary>
/// Creates a new iterator, buffering entries from the specified iterator
/// </summary>
public BufferedInputIterator(IInputIterator source)
{
    BytesRef spare;
    int freqIndex = 0;
    hasPayloads = source.HasPayloads;
    hasContexts = source.HasContexts;
    while ((spare = source.Next()) != null)
    {
        m_entries.Append(spare);
        if (hasPayloads)
        {
            m_payloads.Append(source.Payload);
        }
        if (hasContexts)
        {
            m_contextSets.Add(source.Contexts);
        }
        if (freqIndex >= m_freqs.Length)
        {
            m_freqs = ArrayUtil.Grow(m_freqs, m_freqs.Length + 1);
        }
        m_freqs[freqIndex++] = source.Weight;
    }
    comp = source.Comparer;
}
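// NOTE (editorial sketch, not part of the Lucene.NET sources): the constructor above drains
// the source iterator into growable buffers -- terms, optional payloads/contexts, and a
// parallel weight array grown on demand. The stand-alone program below illustrates only that
// buffering pattern with plain BCL types; Array.Resize stands in for ArrayUtil.Grow and the
// tuple list stands in for IInputIterator.
using System;
using System.Collections.Generic;

class WeightBufferDemo
{
    static void Main()
    {
        // A stand-in for the (term, weight) pairs an IInputIterator would produce.
        var source = new List<(string Term, long Weight)>
        {
            ("apple", 5), ("banana", 3), ("cherry", 7)
        };

        var terms = new List<string>();
        long[] weights = new long[1];
        int freqIndex = 0;

        foreach (var (term, weight) in source)
        {
            terms.Add(term);
            if (freqIndex >= weights.Length)
            {
                Array.Resize(ref weights, weights.Length * 2); // grow the parallel weight array on demand
            }
            weights[freqIndex++] = weight;
        }

        for (int i = 0; i < freqIndex; i++)
        {
            Console.WriteLine($"{terms[i]} -> {weights[i]}");
        }
    }
}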
public void TestFileWithDifferentDelimiter()
{
    KeyValuePair<List<List<string>>, string> fileInput = generateFileInput(AtLeast(100), " , ", true, true);
    Stream inputReader = new MemoryStream(fileInput.Value.getBytes(Encoding.UTF8));
    FileDictionary dictionary = new FileDictionary(inputReader, " , ");
    List<List<string>> entries = fileInput.Key;
    IInputIterator inputIter = dictionary.GetEntryIterator();
    assertTrue(inputIter.HasPayloads);
    BytesRef term;
    int count = 0;
    while ((term = inputIter.Next()) != null)
    {
        assertTrue(entries.size() > count);
        List<string> entry = entries[count];
        assertTrue(entry.size() >= 2); // at least term and weight
        assertEquals(entry[0], term.Utf8ToString());
        assertEquals(long.Parse(entry[1], CultureInfo.InvariantCulture), inputIter.Weight);
        if (entry.size() == 3)
        {
            assertEquals(entry[2], inputIter.Payload.Utf8ToString());
        }
        else
        {
            assertEquals(inputIter.Payload.Length, 0);
        }
        count++;
    }
    assertEquals(count, entries.size());
}
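// NOTE (editorial sketch, not part of the Lucene.NET sources): the test above feeds
// FileDictionary a delimited text stream -- one entry per line in the form
// term <delimiter> weight <delimiter> optional payload. The hand-rolled parser below only
// illustrates that format; generateFileInput and FileDictionary itself are not reproduced here.
using System;
using System.Globalization;

class FileDictionaryFormatDemo
{
    static void Main()
    {
        const string delimiter = " , "; // the non-default delimiter the test uses
        string input =
            "apple" + delimiter + "10" + delimiter + "fruit" + Environment.NewLine +
            "banana" + delimiter + "5" + Environment.NewLine; // payload is optional

        foreach (string line in input.Split(new[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries))
        {
            string[] fields = line.Split(new[] { delimiter }, StringSplitOptions.None);
            string term = fields[0];
            long weight = fields.Length > 1 ? long.Parse(fields[1], CultureInfo.InvariantCulture) : 1;
            string payload = fields.Length > 2 ? fields[2] : "";
            Console.WriteLine($"{term} -> weight={weight}, payload='{payload}'");
        }
    }
}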
/// <summary>
/// Creates a new iterator, buffering entries from the specified iterator
/// </summary>
public BufferedInputIterator(IInputIterator source)
{
    BytesRef spare;
    int freqIndex = 0;
    hasPayloads = source.HasPayloads;
    hasContexts = source.HasContexts;
    while ((spare = source.Next()) != null)
    {
        entries.Append(spare);
        if (hasPayloads)
        {
            payloads.Append(source.Payload);
        }
        if (hasContexts)
        {
            contextSets.Add(source.Contexts);
        }
        if (freqIndex >= freqs.Length)
        {
            freqs = ArrayUtil.Grow(freqs, freqs.Length + 1);
        }
        freqs[freqIndex++] = source.Weight;
    }
    comp = source.Comparator;
}
public void TestWithValueSource()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, iwc);
    IDictionary<string, Document> docs = GenerateIndexDocuments(AtLeast(100));
    foreach (Document doc in docs.Values)
    {
        writer.AddDocument(doc);
    }
    writer.Commit();
    writer.Dispose();

    IndexReader ir = DirectoryReader.Open(dir);
    IDictionary dictionary = new DocumentValueSourceDictionary(ir, FIELD_NAME, new DoubleConstValueSource(10), PAYLOAD_FIELD_NAME);
    IInputIterator inputIterator = dictionary.GetEntryIterator();
    BytesRef f;
    while ((f = inputIterator.Next()) != null)
    {
        string field = f.Utf8ToString();
        Document doc = docs.ContainsKey(field) ? docs[field] : null;
        docs.Remove(field);
        assertTrue(f.equals(new BytesRef(doc.Get(FIELD_NAME))));
        assertEquals(inputIterator.Weight, 10);
        assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
    }
    assertTrue(!docs.Any());
    ir.Dispose();
    dir.Dispose();
}
public override void Build(IInputIterator iter)
{
    if (m_searcherMgr != null)
    {
        m_searcherMgr.Dispose();
        m_searcherMgr = null;
    }
    if (writer != null)
    {
        writer.Dispose();
        writer = null;
    }

    AtomicReader r = null;
    bool success = false;
    try
    {
        // First pass: build a temporary normal Lucene index,
        // just indexing the suggestions as they iterate:
        writer = new IndexWriter(dir, GetIndexWriterConfig(matchVersion, GramAnalyzer, OpenMode.CREATE));
        //long t0 = System.nanoTime();

        // TODO: use threads?
        BytesRef text;
        while ((text = iter.Next()) != null)
        {
            BytesRef payload;
            if (iter.HasPayloads)
            {
                payload = iter.Payload;
            }
            else
            {
                payload = null;
            }
            Add(text, iter.Contexts, iter.Weight, payload);
        }
        //System.out.println("initial indexing time: " + ((System.nanoTime()-t0)/1000000) + " msec");

        m_searcherMgr = new SearcherManager(writer, true, null);
        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Close(r);
        }
        else
        {
            IOUtils.CloseWhileHandlingException(writer, r);
            writer = null;
        }
    }
}
public void TestWithContexts()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, iwc);
    KeyValuePair<List<string>, IDictionary<string, Document>> res = GenerateIndexDocuments(AtLeast(1000), true, true);
    IDictionary<string, Document> docs = res.Value;
    List<string> invalidDocTerms = res.Key;
    foreach (Document doc in docs.Values)
    {
        writer.AddDocument(doc);
    }
    writer.Commit();
    writer.Dispose();

    IndexReader ir = DirectoryReader.Open(dir);
    IDictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME, CONTEXT_FIELD_NAME);
    IInputIterator inputIterator = dictionary.GetEntryIterator();
    BytesRef f;
    while ((f = inputIterator.Next()) != null)
    {
        string field = f.Utf8ToString();
        Document doc = docs.ContainsKey(field) ? docs[field] : null;
        docs.Remove(field);
        //Document doc = docs.remove(f.utf8ToString());
        assertTrue(f.equals(new BytesRef(doc.Get(FIELD_NAME))));
        IIndexableField weightField = doc.GetField(WEIGHT_FIELD_NAME);
        assertEquals(inputIterator.Weight, (weightField != null) ? Convert.ToInt64(weightField.GetNumericValue()) : 0);
        assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
        ISet<BytesRef> oriCtxs = new HashSet<BytesRef>();
        IEnumerable<BytesRef> contextSet = inputIterator.Contexts;
        foreach (IIndexableField ctxf in doc.GetFields(CONTEXT_FIELD_NAME))
        {
            oriCtxs.add(ctxf.GetBinaryValue());
        }
        assertEquals(oriCtxs.size(), contextSet.Count());
    }
    foreach (string invalidTerm in invalidDocTerms)
    {
        var invalid = docs[invalidTerm];
        docs.Remove(invalidTerm);
        assertNotNull(invalid);
    }
    assertTrue(!docs.Any());
    ir.Dispose();
    dir.Dispose();
}
private OfflineSorter.ByteSequencesReader Sort()
{
    string prefix = this.GetType().Name;
    DirectoryInfo directory = OfflineSorter.DefaultTempDir();
    tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
    tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

    var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    bool success = false;
    try
    {
        BytesRef spare;
        byte[] buffer = new byte[0];
        var output = new ByteArrayDataOutput(buffer);
        while ((spare = source.Next()) != null)
        {
            Encode(writer, output, buffer, spare, source.Payload, source.Contexts, source.Weight);
        }
        writer.Dispose();
        (new OfflineSorter(tieBreakByCostComparer)).Sort(tempInput, tempSorted);
        var reader = new OfflineSorter.ByteSequencesReader(tempSorted);
        success = true;
        return reader;
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(writer);
        }
        else
        {
            try
            {
                IOUtils.DisposeWhileHandlingException(writer);
            }
            finally
            {
                Dispose();
            }
        }
    }
}
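// NOTE (editorial sketch, not part of the Lucene.NET sources): Sort() above follows an
// encode -> offline sort -> read-back flow over temp files. The toy program below mimics
// that flow with plain text lines and an in-memory sort; the real code uses OfflineSorter's
// length-prefixed byte sequences and a tie-breaking comparer, neither of which is shown here.
using System;
using System.IO;
using System.Linq;

class OfflineSortSketch
{
    static StreamReader SortToTempFile(string[] entries)
    {
        string tempInput = Path.GetTempFileName();
        string tempSorted = Path.GetTempFileName();

        File.WriteAllLines(tempInput, entries); // "encode" phase: spill entries to disk
        File.WriteAllLines(tempSorted, File.ReadAllLines(tempInput).OrderBy(e => e, StringComparer.Ordinal)); // "sort" phase
        File.Delete(tempInput);

        return new StreamReader(tempSorted); // caller consumes the sorted copy
    }

    static void Main()
    {
        using (StreamReader reader = SortToTempFile(new[] { "pear", "apple", "orange" }))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                Console.WriteLine(line);
            }
        }
    }
}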
public void TestWithoutPayload()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random, dir, iwc);
    KeyValuePair<List<string>, IDictionary<string, Document>> res = GenerateIndexDocuments(AtLeast(1000), false, false);
    IDictionary<string, Document> docs = res.Value;
    List<string> invalidDocTerms = res.Key;
    foreach (Document doc in docs.Values)
    {
        writer.AddDocument(doc);
    }
    writer.Commit();
    writer.Dispose();

    IndexReader ir = DirectoryReader.Open(dir);
    IDictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME);
    IInputIterator inputIterator = dictionary.GetEntryIterator();
    BytesRef f;
    while ((f = inputIterator.Next()) != null)
    {
        var field = f.Utf8ToString();
        Document doc = docs[field];
        docs.Remove(field);
        assertTrue(f.equals(new BytesRef(doc.Get(FIELD_NAME))));
        IIndexableField weightField = doc.GetField(WEIGHT_FIELD_NAME);
        assertEquals(inputIterator.Weight, (weightField != null) ? weightField.GetInt64ValueOrDefault() : 0);
        assertEquals(inputIterator.Payload, null);
    }
    foreach (string invalidTerm in invalidDocTerms)
    {
        var invalid = docs[invalidTerm];
        docs.Remove(invalidTerm);
        assertNotNull(invalid);
    }
    assertTrue(!docs.Any());
    ir.Dispose();
    dir.Dispose();
}
public void TestWithContext()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, iwc);
    IDictionary<string, Document> docs = GenerateIndexDocuments(AtLeast(100));
    foreach (Document doc in docs.Values)
    {
        writer.AddDocument(doc);
    }
    writer.Commit();
    writer.Dispose();

    IndexReader ir = DirectoryReader.Open(dir);
    ValueSource[] toAdd = new ValueSource[]
    {
        new Int64FieldSource(WEIGHT_FIELD_NAME_1),
        new Int64FieldSource(WEIGHT_FIELD_NAME_2),
        new Int64FieldSource(WEIGHT_FIELD_NAME_3)
    };
    IDictionary dictionary = new DocumentValueSourceDictionary(ir, FIELD_NAME, new SumSingleFunction(toAdd), PAYLOAD_FIELD_NAME, CONTEXTS_FIELD_NAME);
    IInputIterator inputIterator = dictionary.GetEntryIterator();
    BytesRef f;
    while ((f = inputIterator.Next()) != null)
    {
        string field = f.Utf8ToString();
        Document doc = docs.ContainsKey(field) ? docs[field] : null;
        docs.Remove(field);
        long w1 = doc.GetField(WEIGHT_FIELD_NAME_1).GetInt64ValueOrDefault();
        long w2 = doc.GetField(WEIGHT_FIELD_NAME_2).GetInt64ValueOrDefault();
        long w3 = doc.GetField(WEIGHT_FIELD_NAME_3).GetInt64ValueOrDefault();
        assertTrue(f.equals(new BytesRef(doc.Get(FIELD_NAME))));
        assertEquals(inputIterator.Weight, (w1 + w2 + w3));
        assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
        ISet<BytesRef> originalCtxs = new HashSet<BytesRef>();
        foreach (IIndexableField ctxf in doc.GetFields(CONTEXTS_FIELD_NAME))
        {
            originalCtxs.add(ctxf.GetBinaryValue());
        }
        assertEquals(originalCtxs, inputIterator.Contexts);
    }
    assertTrue(!docs.Any());
    ir.Dispose();
    dir.Dispose();
}
public void TestBasic()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, iwc);
    IDictionary<string, Document> docs = GenerateIndexDocuments(AtLeast(100));
    foreach (Document doc in docs.Values)
    {
        writer.AddDocument(doc);
    }
    writer.Commit();
    writer.Dispose();

    IndexReader ir = DirectoryReader.Open(dir);
    ValueSource[] toAdd = new ValueSource[]
    {
        new LongFieldSource(WEIGHT_FIELD_NAME_1),
        new LongFieldSource(WEIGHT_FIELD_NAME_2),
        new LongFieldSource(WEIGHT_FIELD_NAME_3)
    };
    IDictionary dictionary = new DocumentValueSourceDictionary(ir, FIELD_NAME, new SumFloatFunction(toAdd), PAYLOAD_FIELD_NAME);
    IInputIterator inputIterator = dictionary.EntryIterator;
    BytesRef f;
    while ((f = inputIterator.Next()) != null)
    {
        string field = f.Utf8ToString();
        Document doc = docs.ContainsKey(field) ? docs[field] : null;
        docs.Remove(field);
        //Document doc = docs.remove(f.utf8ToString());
        long w1 = Convert.ToInt64(doc.GetField(WEIGHT_FIELD_NAME_1).NumericValue);
        long w2 = Convert.ToInt64(doc.GetField(WEIGHT_FIELD_NAME_2).NumericValue);
        long w3 = Convert.ToInt64(doc.GetField(WEIGHT_FIELD_NAME_3).NumericValue);
        assertTrue(f.equals(new BytesRef(doc.Get(FIELD_NAME))));
        assertEquals(inputIterator.Weight, (w1 + w2 + w3));
        assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).BinaryValue));
    }
    assertTrue(!docs.Any());
    ir.Dispose();
    dir.Dispose();
}
public void TestEmptyReader()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    iwc.SetMergePolicy(NewLogMergePolicy());
    // Make sure the index is created?
    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, iwc);
    writer.Commit();
    writer.Dispose();

    IndexReader ir = DirectoryReader.Open(dir);
    IDictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME);
    IInputIterator inputIterator = dictionary.GetEntryIterator();

    assertNull(inputIterator.Next());
    assertEquals(inputIterator.Weight, 0);
    assertNull(inputIterator.Payload);

    ir.Dispose();
    dir.Dispose();
}
public void TestWithoutPayload()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random, dir, iwc);
    IDictionary<string, Document> docs = GenerateIndexDocuments(AtLeast(100));
    foreach (Document doc in docs.Values)
    {
        writer.AddDocument(doc);
    }
    writer.Commit();
    writer.Dispose();

    IndexReader ir = DirectoryReader.Open(dir);
    ValueSource[] toAdd = new ValueSource[]
    {
        new Int64FieldSource(WEIGHT_FIELD_NAME_1),
        new Int64FieldSource(WEIGHT_FIELD_NAME_2),
        new Int64FieldSource(WEIGHT_FIELD_NAME_3)
    };
    IDictionary dictionary = new DocumentValueSourceDictionary(ir, FIELD_NAME, new SumSingleFunction(toAdd));
    IInputIterator inputIterator = dictionary.GetEntryIterator();
    BytesRef f;
    while ((f = inputIterator.Next()) != null)
    {
        string field = f.Utf8ToString();
        Document doc = docs[field];
        docs.Remove(field);
        long w1 = doc.GetField(WEIGHT_FIELD_NAME_1).GetInt64ValueOrDefault();
        long w2 = doc.GetField(WEIGHT_FIELD_NAME_2).GetInt64ValueOrDefault();
        long w3 = doc.GetField(WEIGHT_FIELD_NAME_3).GetInt64ValueOrDefault();
        assertTrue(f.equals(new BytesRef(doc.Get(FIELD_NAME))));
        assertEquals(inputIterator.Weight, (w1 + w2 + w3));
        assertEquals(inputIterator.Payload, null);
    }
    assertTrue(docs.Count == 0);
    ir.Dispose();
    dir.Dispose();
}
public void TestFileWithTerm()
{
    KeyValuePair<List<List<string>>, string> fileInput = generateFileInput(AtLeast(100), FileDictionary.DEFAULT_FIELD_DELIMITER, false, false);
    Stream inputReader = new MemoryStream(fileInput.Value.getBytes(Encoding.UTF8));
    FileDictionary dictionary = new FileDictionary(inputReader);
    List<List<string>> entries = fileInput.Key;
    IInputIterator inputIter = dictionary.GetEntryIterator();
    assertFalse(inputIter.HasPayloads);
    BytesRef term;
    int count = 0;
    while ((term = inputIter.Next()) != null)
    {
        assertTrue(entries.size() > count);
        List<string> entry = entries[count];
        assertTrue(entry.size() >= 1); // at least a term
        assertEquals(entry[0], term.Utf8ToString());
        assertEquals(1, inputIter.Weight);
        assertNull(inputIter.Payload);
        count++;
    }
    assertEquals(count, entries.size());
}
public override void Build(IInputIterator iterator)
{
    if (iterator.HasPayloads)
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }
    FileInfo tempInput = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
    FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());

    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    ExternalRefSorter sorter = null;

    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
    // If negative floats are allowed some trickery needs to be done to find their byte order.
    bool success = false;
    count = 0;
    try
    {
        byte[] buffer = new byte[0];
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        BytesRef spare;
        while ((spare = iterator.Next()) != null)
        {
            if (spare.Length + 4 >= buffer.Length)
            {
                buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
            }
            output.Reset(buffer);
            output.WriteInt(EncodeWeight(iterator.Weight));
            output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
            writer.Write(buffer, 0, output.Position);
        }
        writer.Dispose();

        // We don't know the distribution of scores and we need to bucket them, so we'll sort
        // and divide into equal buckets.
        OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
        tempInput.Delete();

        FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);

        int inputLines = info.Lines;
        reader = new OfflineSorter.ByteSequencesReader(tempSorted);
        long line = 0;
        int previousBucket = 0;
        int previousScore = 0;
        ByteArrayDataInput input = new ByteArrayDataInput();
        BytesRef tmp1 = new BytesRef();
        BytesRef tmp2 = new BytesRef();
        while (reader.Read(tmp1))
        {
            input.Reset(tmp1.Bytes);
            int currentScore = input.ReadInt();

            int bucket;
            if (line > 0 && currentScore == previousScore)
            {
                bucket = previousBucket;
            }
            else
            {
                bucket = (int)(line * buckets / inputLines);
            }
            previousScore = currentScore;
            previousBucket = bucket;

            // Only append the input, discard the weight.
            tmp2.Bytes = tmp1.Bytes;
            tmp2.Offset = input.Position;
            tmp2.Length = tmp1.Length - input.Position;
            builder.Add(tmp2, bucket);

            line++;
            count++;
        }

        // The two FSTCompletions share the same automaton.
        this.higherWeightsCompletion = builder.Build();
        this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);

        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Close(reader, writer, sorter);
        }
        else
        {
            IOUtils.CloseWhileHandlingException(reader, writer, sorter);
        }
        tempInput.Delete();
        tempSorted.Delete();
    }
}
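// NOTE (editorial sketch, not part of the Lucene.NET sources): after the offline sort, the
// build above spreads entries across a fixed number of buckets by their rank in weight order,
// and consecutive equal scores are forced into the same bucket. The toy program below shows
// just that bucket assignment with made-up scores.
using System;

class BucketingDemo
{
    static void Main()
    {
        int[] sortedScores = { 1, 1, 2, 5, 5, 5, 9, 12 }; // ascending, as after the offline sort
        int buckets = 4;
        int inputLines = sortedScores.Length;

        int previousBucket = 0;
        int previousScore = 0;
        for (int line = 0; line < inputLines; line++)
        {
            int currentScore = sortedScores[line];
            int bucket = (line > 0 && currentScore == previousScore)
                ? previousBucket                            // ties stay in the previous bucket
                : (int)((long)line * buckets / inputLines); // otherwise bucket by rank
            previousScore = currentScore;
            previousBucket = bucket;
            Console.WriteLine($"score={currentScore} -> bucket={bucket}");
        }
    }
}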
public override void Build(IInputIterator iter)
{
    if (searcherMgr != null)
    {
        searcherMgr.Dispose();
        searcherMgr = null;
    }
    if (writer != null)
    {
        writer.Dispose();
        writer = null;
    }

    AtomicReader r = null;
    bool success = false;
    try
    {
        // First pass: build a temporary normal Lucene index,
        // just indexing the suggestions as they iterate:
        writer = new IndexWriter(dir, GetIndexWriterConfig(matchVersion, GramAnalyzer, IndexWriterConfig.OpenMode_e.CREATE));
        //long t0 = System.nanoTime();

        // TODO: use threads?
        BytesRef text;
        while ((text = iter.Next()) != null)
        {
            BytesRef payload;
            if (iter.HasPayloads)
            {
                payload = iter.Payload;
            }
            else
            {
                payload = null;
            }
            Add(text, iter.Contexts, iter.Weight, payload);
        }
        //System.out.println("initial indexing time: " + ((System.nanoTime()-t0)/1000000) + " msec");

        searcherMgr = new SearcherManager(writer, true, null);
        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Close(r);
        }
        else
        {
            IOUtils.CloseWhileHandlingException(writer, r);
            writer = null;
        }
    }
}
public override void Build(IInputIterator iterator)
{
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }
    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();
    var tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
    var tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

    hasPayloads = iterator.HasPayloads;

    var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    var scratch = new BytesRef();

    TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

    bool success = false;
    count = 0;
    byte[] buffer = new byte[8];
    try
    {
        var output = new ByteArrayDataOutput(buffer);
        BytesRef surfaceForm;

        while ((surfaceForm = iterator.Next()) != null)
        {
            ISet<Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

            maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

            foreach (Int32sRef path in paths)
            {
                Util.Fst.Util.ToBytesRef(path, scratch);

                // length of the analyzed text (FST input)
                if (scratch.Length > ushort.MaxValue - 2)
                {
                    throw new System.ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) + " in length (got " + scratch.Length + ")");
                }
                ushort analyzedLength = (ushort)scratch.Length;

                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                BytesRef payload;
                if (hasPayloads)
                {
                    if (surfaceForm.Length > (ushort.MaxValue - 2))
                    {
                        throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) + " in length (got " + surfaceForm.Length + ")");
                    }
                    payload = iterator.Payload;
                    // payload + surfaceLength (short)
                    requiredLength += payload.Length + 2;
                }
                else
                {
                    payload = null;
                }

                buffer = ArrayUtil.Grow(buffer, requiredLength);

                output.Reset(buffer);

                output.WriteInt16((short)analyzedLength);
                output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);
                output.WriteInt32(EncodeWeight(iterator.Weight));

                if (hasPayloads)
                {
                    for (int i = 0; i < surfaceForm.Length; i++)
                    {
                        if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                        {
                            throw new ArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.WriteInt16((short)surfaceForm.Length);
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                    output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                }
                else
                {
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                }

                Debug.Assert(output.Position == requiredLength, output.Position + " vs " + requiredLength);

                writer.Write(buffer, 0, output.Position);
            }
            count++;
        }
        writer.Dispose();

        // Sort all input/output pairs (required by FST.Builder):
        (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

        // Free disk space:
        tempInput.Delete();

        reader = new OfflineSorter.ByteSequencesReader(tempSorted);

        var outputs = new PairOutputs<long?, BytesRef>(PositiveInt32Outputs.Singleton, ByteSequenceOutputs.Singleton);
        var builder = new Builder<PairOutputs<long?, BytesRef>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);

        // Build FST:
        BytesRef previousAnalyzed = null;
        BytesRef analyzed = new BytesRef();
        BytesRef surface = new BytesRef();
        Int32sRef scratchInts = new Int32sRef();
        var input = new ByteArrayDataInput();

        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one).  We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        var seenSurfaceForms = new JCG.HashSet<BytesRef>();

        var dedup = 0;
        while (reader.Read(scratch))
        {
            input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
            ushort analyzedLength = (ushort)input.ReadInt16();
            analyzed.Grow(analyzedLength + 2);
            input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
            analyzed.Length = analyzedLength;

            long cost = input.ReadInt32();

            surface.Bytes = scratch.Bytes;
            if (hasPayloads)
            {
                surface.Length = (ushort)input.ReadInt16();
                surface.Offset = input.Position;
            }
            else
            {
                surface.Offset = input.Position;
                surface.Length = scratch.Length - surface.Offset;
            }

            if (previousAnalyzed == null)
            {
                previousAnalyzed = new BytesRef();
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else if (analyzed.Equals(previousAnalyzed))
            {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                {
                    // More than maxSurfaceFormsPerAnalyzedForm
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.Contains(surface))
                {
                    continue;
                }
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else
            {
                dedup = 0;
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Clear();
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }

            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...

            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0;
            analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
            analyzed.Length += 2;

            Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads)
            {
                builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
            }
            else
            {
                int payloadOffset = input.Position + surface.Length;
                int payloadLength = scratch.Length - payloadOffset;
                BytesRef br = new BytesRef(surface.Length + 1 + payloadLength);
                Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                br.Bytes[surface.Length] = PAYLOAD_SEP;
                Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                br.Length = br.Bytes.Length;
                builder.Add(scratchInts, outputs.NewPair(cost, br));
            }
        }
        fst = builder.Finish();

        //Util.dotToFile(fst, "/tmp/suggest.dot");

        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(reader, writer);
        }
        else
        {
            IOUtils.DisposeWhileHandlingException(reader, writer);
        }

        tempInput.Delete();
        tempSorted.Delete();
    }
}
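// NOTE (editorial sketch, not part of the Lucene.NET sources): in the build above, duplicate
// analyzed forms are disambiguated by appending a 0 byte plus a per-form dedup counter to the
// analyzed bytes before they become FST input keys. The toy program below shows only that
// keying idea with plain byte arrays; BytesRef, the FST builder, and the payload handling are
// not reproduced.
using System;
using System.Collections.Generic;
using System.Text;

class DedupKeyDemo
{
    static void Main()
    {
        var surfaceFormsByAnalyzed = new Dictionary<string, string[]>
        {
            ["foo bar"] = new[] { "Foo Bar", "foo-bar", "FOO BAR" }, // three surface forms, one analyzed form
            ["baz"] = new[] { "baz" }
        };

        foreach (var kvp in surfaceFormsByAnalyzed)
        {
            byte[] analyzed = Encoding.UTF8.GetBytes(kvp.Key);
            for (int dedup = 0; dedup < kvp.Value.Length; dedup++)
            {
                byte[] key = new byte[analyzed.Length + 2];
                Array.Copy(analyzed, key, analyzed.Length);
                key[analyzed.Length] = 0;               // must be byte 0 so duplicates sort together, first
                key[analyzed.Length + 1] = (byte)dedup; // per-analyzed-form counter keeps keys unique
                Console.WriteLine($"{kvp.Value[dedup]} -> key of {key.Length} bytes (dedup={dedup})");
            }
        }
    }
}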
public void TestWithDeletions()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, iwc);
    KeyValuePair<List<string>, IDictionary<string, Document>> res = GenerateIndexDocuments(AtLeast(1000), false, false);
    IDictionary<string, Document> docs = res.Value;
    List<string> invalidDocTerms = res.Key;
    Random rand = Random();
    List<string> termsToDel = new List<string>();
    foreach (Document doc in docs.Values)
    {
        IIndexableField f2 = doc.GetField(FIELD_NAME);
        if (rand.nextBoolean() && f2 != null && !invalidDocTerms.Contains(f2.GetStringValue()))
        {
            termsToDel.Add(doc.Get(FIELD_NAME));
        }
        writer.AddDocument(doc);
    }
    writer.Commit();

    Term[] delTerms = new Term[termsToDel.size()];
    for (int i = 0; i < termsToDel.size(); i++)
    {
        delTerms[i] = new Term(FIELD_NAME, termsToDel[i]);
    }
    foreach (Term delTerm in delTerms)
    {
        writer.DeleteDocuments(delTerm);
    }
    writer.Commit();
    writer.Dispose();

    foreach (string termToDel in termsToDel)
    {
        var toDel = docs[termToDel];
        assertTrue(toDel != null);
        docs.Remove(termToDel);
    }

    IndexReader ir = DirectoryReader.Open(dir);
    assertEquals(ir.NumDocs, docs.size());
    IDictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME);
    IInputIterator inputIterator = dictionary.GetEntryIterator();
    BytesRef f;
    while ((f = inputIterator.Next()) != null)
    {
        var field = f.Utf8ToString();
        Document doc = docs.ContainsKey(field) ? docs[field] : null;
        docs.Remove(field);
        assertTrue(f.equals(new BytesRef(doc.Get(FIELD_NAME))));
        IIndexableField weightField = doc.GetField(WEIGHT_FIELD_NAME);
        assertEquals(inputIterator.Weight, (weightField != null) ? Convert.ToInt64(weightField.GetNumericValue()) : 0);
        assertEquals(inputIterator.Payload, null);
    }
    foreach (string invalidTerm in invalidDocTerms)
    {
        var invalid = docs[invalidTerm];
        docs.Remove(invalidTerm);
        assertNotNull(invalid);
    }
    assertTrue(!docs.Any());
    ir.Dispose();
    dir.Dispose();
}
/// <summary>
/// Build the suggest index, using up to the specified
/// amount of temporary RAM while building. Note that
/// the weights for the suggestions are ignored.
/// </summary>
public virtual void Build(IInputIterator iterator, double ramBufferSizeMB)
{
    if (iterator.HasPayloads)
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();

    // TODO: messy ... java7 has Files.createTempDirectory
    // ... but 4.x is java6:
    DirectoryInfo tempIndexPath = null;
    Random random = new Random();
    while (true)
    {
        tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + random.Next(int.MaxValue)));
        tempIndexPath.Create();
        if (System.IO.Directory.Exists(tempIndexPath.FullName))
        {
            break;
        }
    }

    using (Directory dir = FSDirectory.Open(tempIndexPath))
    {
#pragma warning disable 612, 618
        IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer);
#pragma warning restore 612, 618
        iwc.SetOpenMode(OpenMode.CREATE);
        iwc.SetRAMBufferSizeMB(ramBufferSizeMB);
        IndexWriter writer = new IndexWriter(dir, iwc);

        var ft = new FieldType(TextField.TYPE_NOT_STORED);
        // TODO: if only we had IndexOptions.TERMS_ONLY...
        ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
        ft.OmitNorms = true;
        ft.Freeze();

        Document doc = new Document();
        Field field = new Field("body", "", ft);
        doc.Add(field);

        totTokens = 0;
        IndexReader reader = null;

        bool success = false;
        count = 0;
        try
        {
            while (true)
            {
                BytesRef surfaceForm = iterator.Next();
                if (surfaceForm == null)
                {
                    break;
                }
                field.SetStringValue(surfaceForm.Utf8ToString());
                writer.AddDocument(doc);
                count++;
            }
            reader = DirectoryReader.Open(writer, false);

            Terms terms = MultiFields.GetTerms(reader, "body");
            if (terms == null)
            {
                throw new System.ArgumentException("need at least one suggestion");
            }

            // Move all ngrams into an FST:
            TermsEnum termsEnum = terms.GetIterator(null);

            Outputs<long?> outputs = PositiveInt32Outputs.Singleton;
            Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);

            Int32sRef scratchInts = new Int32sRef();
            while (true)
            {
                BytesRef term = termsEnum.Next();
                if (term == null)
                {
                    break;
                }
                int ngramCount = CountGrams(term);
                if (ngramCount > grams)
                {
                    throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
                }
                if (ngramCount == 1)
                {
                    totTokens += termsEnum.TotalTermFreq;
                }

                builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq));
            }

            fst = builder.Finish();
            if (fst == null)
            {
                throw new System.ArgumentException("need at least one suggestion");
            }
            //System.out.println("FST: " + fst.getNodeCount() + " nodes");

            /*
             * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
             * Util.toDot(fst, pw, true, true);
             * pw.close();
             */

            success = true;
        }
        finally
        {
            try
            {
                if (success)
                {
                    IOUtils.Dispose(writer, reader);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(writer, reader);
                }
            }
            finally
            {
                foreach (string file in dir.ListAll())
                {
                    FileInfo path = new FileInfo(Path.Combine(tempIndexPath.FullName, file));
                    try
                    {
                        path.Delete();
                    }
                    catch (Exception e)
                    {
                        throw new InvalidOperationException("failed to remove " + path, e);
                    }
                }

                try
                {
                    tempIndexPath.Delete();
                }
                catch (Exception e)
                {
                    throw new InvalidOperationException("failed to remove " + tempIndexPath, e);
                }
            }
        }
    }
}
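// NOTE (editorial sketch, not part of the Lucene.NET sources): the build above relies on
// CountGrams(term) and expects it to stay within the configured gram size; the error message
// implies that shingled tokens are stored as token bytes joined by a reserved separator byte,
// so a term's gram count is (number of separator bytes) + 1. The separator value used below
// (0x1E) is an assumption for the demo, not a quoted constant.
using System;
using System.Linq;
using System.Text;

class CountGramsDemo
{
    const byte Separator = 0x1E; // assumed reserved separator byte

    static int CountGrams(byte[] term) => term.Count(b => b == Separator) + 1;

    static void Main()
    {
        byte[] unigram = Encoding.UTF8.GetBytes("lucene");
        byte[] bigram = Encoding.UTF8.GetBytes("lucene")
            .Concat(new[] { Separator })
            .Concat(Encoding.UTF8.GetBytes("suggest"))
            .ToArray();

        Console.WriteLine(CountGrams(unigram)); // 1
        Console.WriteLine(CountGrams(bigram));  // 2
    }
}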
public override void Build(IInputIterator iterator)
{
    if (iterator.HasPayloads)
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }
    FileInfo tempInput = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
    FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());

    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    ExternalRefSorter sorter = null;

    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
    // If negative floats are allowed some trickery needs to be done to find their byte order.
    bool success = false;
    count = 0;
    try
    {
        byte[] buffer = new byte[0];
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        BytesRef spare;
        while ((spare = iterator.Next()) != null)
        {
            if (spare.Length + 4 >= buffer.Length)
            {
                buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
            }
            output.Reset(buffer);
            output.WriteInt32(EncodeWeight(iterator.Weight));
            output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
            writer.Write(buffer, 0, output.Position);
        }
        writer.Dispose();

        // We don't know the distribution of scores and we need to bucket them, so we'll sort
        // and divide into equal buckets.
        OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
        tempInput.Delete();

        FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);

        int inputLines = info.Lines;
        reader = new OfflineSorter.ByteSequencesReader(tempSorted);
        long line = 0;
        int previousBucket = 0;
        int previousScore = 0;
        ByteArrayDataInput input = new ByteArrayDataInput();
        BytesRef tmp1 = new BytesRef();
        BytesRef tmp2 = new BytesRef();
        while (reader.Read(tmp1))
        {
            input.Reset(tmp1.Bytes);
            int currentScore = input.ReadInt32();

            int bucket;
            if (line > 0 && currentScore == previousScore)
            {
                bucket = previousBucket;
            }
            else
            {
                bucket = (int)(line * buckets / inputLines);
            }
            previousScore = currentScore;
            previousBucket = bucket;

            // Only append the input, discard the weight.
            tmp2.Bytes = tmp1.Bytes;
            tmp2.Offset = input.Position;
            tmp2.Length = tmp1.Length - input.Position;
            builder.Add(tmp2, bucket);

            line++;
            count++;
        }

        // The two FSTCompletions share the same automaton.
        this.higherWeightsCompletion = builder.Build();
        this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);

        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Close(reader, writer, sorter);
        }
        else
        {
            IOUtils.CloseWhileHandlingException(reader, writer, sorter);
        }
        tempInput.Delete();
        tempSorted.Delete();
    }
}
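// NOTE (editorial sketch, not part of the Lucene.NET sources): as the comment in the build
// above says, the (assumed non-negative) weight is written as a 4-byte prefix in front of the
// term bytes so the offline byte sort orders records by score first. The toy program below
// checks that property for a big-endian prefix using plain .NET types only.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

class WeightPrefixDemo
{
    static byte[] Encode(int weight, string term)
    {
        byte[] prefix =
        {
            (byte)(weight >> 24), (byte)(weight >> 16),
            (byte)(weight >> 8), (byte)weight // big-endian: high byte first
        };
        return prefix.Concat(Encoding.UTF8.GetBytes(term)).ToArray();
    }

    static int CompareBytes(byte[] a, byte[] b)
    {
        int len = Math.Min(a.Length, b.Length);
        for (int i = 0; i < len; i++)
        {
            int cmp = a[i].CompareTo(b[i]);
            if (cmp != 0)
            {
                return cmp;
            }
        }
        return a.Length.CompareTo(b.Length);
    }

    static void Main()
    {
        var records = new[] { Encode(300, "zebra"), Encode(7, "apple"), Encode(7, "pear") };
        foreach (byte[] r in records.OrderBy(x => x, Comparer<byte[]>.Create(CompareBytes)))
        {
            int weight = (r[0] << 24) | (r[1] << 16) | (r[2] << 8) | r[3];
            Console.WriteLine($"{weight} {Encoding.UTF8.GetString(r, 4, r.Length - 4)}");
        }
    }
}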
public void TestWithDeletions()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, iwc);
    IDictionary<string, Document> docs = GenerateIndexDocuments(AtLeast(100));
    Random rand = Random();
    List<string> termsToDel = new List<string>();
    foreach (Document doc in docs.Values)
    {
        if (rand.nextBoolean() && termsToDel.size() < docs.size() - 1)
        {
            termsToDel.Add(doc.Get(FIELD_NAME));
        }
        writer.AddDocument(doc);
    }
    writer.Commit();

    Term[] delTerms = new Term[termsToDel.size()];
    for (int i = 0; i < termsToDel.size(); i++)
    {
        delTerms[i] = new Term(FIELD_NAME, termsToDel[i]);
    }
    foreach (Term delTerm in delTerms)
    {
        writer.DeleteDocuments(delTerm);
    }
    writer.Commit();
    writer.Dispose();

    foreach (string termToDel in termsToDel)
    {
        var toDel = docs[termToDel];
        docs.Remove(termToDel);
        assertTrue(null != toDel);
    }

    IndexReader ir = DirectoryReader.Open(dir);
    assertTrue("NumDocs should be > 0 but was " + ir.NumDocs, ir.NumDocs > 0);
    assertEquals(ir.NumDocs, docs.size());
    ValueSource[] toAdd = new ValueSource[]
    {
        new Int64FieldSource(WEIGHT_FIELD_NAME_1),
        new Int64FieldSource(WEIGHT_FIELD_NAME_2)
    };
    IDictionary dictionary = new DocumentValueSourceDictionary(ir, FIELD_NAME, new SumSingleFunction(toAdd), PAYLOAD_FIELD_NAME);
    IInputIterator inputIterator = dictionary.GetEntryIterator();
    BytesRef f;
    while ((f = inputIterator.Next()) != null)
    {
        string field = f.Utf8ToString();
        Document doc = docs.ContainsKey(field) ? docs[field] : null;
        docs.Remove(field);
        long w1 = doc.GetField(WEIGHT_FIELD_NAME_1).GetInt64ValueOrDefault();
        long w2 = doc.GetField(WEIGHT_FIELD_NAME_2).GetInt64ValueOrDefault();
        assertTrue(f.equals(new BytesRef(doc.Get(FIELD_NAME))));
        assertEquals(inputIterator.Weight, w2 + w1);
        assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
    }
    assertTrue(!docs.Any());
    ir.Dispose();
    dir.Dispose();
}