Esempio n. 1
0
        public override void Build(IInputEnumerator enumerator)
        {
            if (m_searcherMgr != null)
            {
                m_searcherMgr.Dispose();
                m_searcherMgr = null;
            }

            if (writer != null)
            {
                writer.Dispose();
                writer = null;
            }

            AtomicReader r       = null;
            bool         success = false;

            try
            {
                // First pass: build a temporary normal Lucene index,
                // just indexing the suggestions as they iterate:
                writer = new IndexWriter(dir, GetIndexWriterConfig(matchVersion, GetGramAnalyzer(), OpenMode.CREATE));
                //long t0 = System.nanoTime();

                // TODO: use threads?
                BytesRef text;
                while (enumerator.MoveNext())
                {
                    text = enumerator.Current;
                    BytesRef payload;
                    if (enumerator.HasPayloads)
                    {
                        payload = enumerator.Payload;
                    }
                    else
                    {
                        payload = null;
                    }

                    Add(text, enumerator.Contexts, enumerator.Weight, payload);
                }

                //System.out.println("initial indexing time: " + ((System.nanoTime()-t0)/1000000) + " msec");

                m_searcherMgr = new SearcherManager(writer, true, null);
                success       = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(r);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(writer, r);
                    writer = null;
                }
            }
        }
Esempio n. 2
0
        public void TestWithValueSource()
        {
            Directory         dir = NewDirectory();
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));

            iwc.SetMergePolicy(NewLogMergePolicy());
            RandomIndexWriter writer            = new RandomIndexWriter(Random, dir, iwc);
            IDictionary <string, Document> docs = GenerateIndexDocuments(AtLeast(100));

            foreach (Document doc in docs.Values)
            {
                writer.AddDocument(doc);
            }
            writer.Commit();
            writer.Dispose();

            IndexReader      ir            = DirectoryReader.Open(dir);
            IDictionary      dictionary    = new DocumentValueSourceDictionary(ir, FIELD_NAME, new DoubleConstValueSource(10), PAYLOAD_FIELD_NAME);
            IInputEnumerator inputIterator = dictionary.GetEntryEnumerator();

            while (inputIterator.MoveNext())
            {
                string   field = inputIterator.Current.Utf8ToString();
                Document doc   = docs[field];
                docs.Remove(field);
                assertTrue(inputIterator.Current.equals(new BytesRef(doc.Get(FIELD_NAME))));
                assertEquals(inputIterator.Weight, 10);
                assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
            }
            assertTrue(docs.Count == 0);
            ir.Dispose();
            dir.Dispose();
        }
Esempio n. 3
0
        public void TestFileWithDifferentDelimiter()
        {
            KeyValuePair <IList <IList <string> >, string> fileInput = generateFileInput(AtLeast(100), " , ", true, true);
            Stream                  inputReader = new MemoryStream(fileInput.Value.getBytes(Encoding.UTF8));
            FileDictionary          dictionary  = new FileDictionary(inputReader, " , ");
            IList <IList <string> > entries     = fileInput.Key;
            IInputEnumerator        inputIter   = dictionary.GetEntryEnumerator();

            assertTrue(inputIter.HasPayloads);
            int count = 0;

            while (inputIter.MoveNext())
            {
                assertTrue(entries.size() > count);
                IList <string> entry = entries[count];
                assertTrue(entry.size() >= 2); // at least term and weight
                assertEquals(entry[0], inputIter.Current.Utf8ToString());
                assertEquals(long.Parse(entry[1], CultureInfo.InvariantCulture), inputIter.Weight);
                if (entry.size() == 3)
                {
                    assertEquals(entry[2], inputIter.Payload.Utf8ToString());
                }
                else
                {
                    assertEquals(inputIter.Payload.Length, 0);
                }
                count++;
            }
            assertEquals(count, entries.size());
        }
Esempio n. 4
0
        /// <summary>
        /// Creates a new iterator, buffering entries from the specified iterator </summary>
        public BufferedInputEnumerator(IInputEnumerator source)
        {
            int freqIndex = 0;

            hasPayloads = source.HasPayloads;
            hasContexts = source.HasContexts;
            while (source.MoveNext())
            {
                m_entries.Append(source.Current);
                if (hasPayloads)
                {
                    m_payloads.Append(source.Payload);
                }
                if (hasContexts)
                {
                    m_contextSets.Add(source.Contexts);
                }
                if (freqIndex >= m_freqs.Length)
                {
                    m_freqs = ArrayUtil.Grow(m_freqs, m_freqs.Length + 1);
                }
                m_freqs[freqIndex++] = source.Weight;
            }
            comp = source.Comparer;
        }
Esempio n. 5
0
        public override void Build(IInputEnumerator enumerator)
        {
            if (enumerator.HasPayloads)
            {
                throw new ArgumentException("this suggester doesn't support payloads");
            }
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            root = new TernaryTreeNode();
            // buffer first
#pragma warning disable 612, 618
            if (enumerator.Comparer != BytesRef.UTF8SortedAsUTF16Comparer)
            {
                // make sure it's sorted and the comparer uses UTF16 sort order
                enumerator = new SortedInputEnumerator(enumerator, BytesRef.UTF8SortedAsUTF16Comparer);
            }
#pragma warning restore 612, 618

            List <string> tokens = new List <string>();
            List <object> vals   = new List <object>();
            BytesRef      spare;
            CharsRef      charsSpare = new CharsRef();
            while (enumerator.MoveNext())
            {
                spare = enumerator.Current;
                charsSpare.Grow(spare.Length);
                UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
                tokens.Add(charsSpare.ToString());
                vals.Add(enumerator.Weight);
            }
            autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count - 1, root);
        }
Esempio n. 6
0
 internal WFSTInputEnumerator(IInputEnumerator source)
     : base(source)
 {
     if (Debugging.AssertsEnabled)
     {
         Debugging.Assert(source.HasPayloads == false);
     }
 }
Esempio n. 7
0
 public XoGame(GraphicsDevice device, GameControler gameController, Rectangle rect, ContentManager content, IInputEnumerator input)
 {
     this.gameController = gameController;
     this.screenRect = rect;
     this.content = content;
     this.device = device;
     this.inputEnumerator = input;
     gameLayers = new LayersCollection();
     scrollLayer = new ScrollLayer(gameLayers, screenRect);
 }
Esempio n. 8
0
        /// <summary>
        /// Creates a new sorted wrapper, sorting by BytesRef
        /// (ascending) then cost (ascending).
        /// </summary>
        public SortedInputEnumerator(IInputEnumerator source, IComparer <BytesRef> comparer)
        {
            this.tieBreakByCostComparer = Comparer <BytesRef> .Create((left, right) =>
            {
                BytesRef leftScratch     = new BytesRef();
                BytesRef rightScratch    = new BytesRef();
                ByteArrayDataInput input = new ByteArrayDataInput();
                // Make shallow copy in case decode changes the BytesRef:
                leftScratch.Bytes   = left.Bytes;
                leftScratch.Offset  = left.Offset;
                leftScratch.Length  = left.Length;
                rightScratch.Bytes  = right.Bytes;
                rightScratch.Offset = right.Offset;
                rightScratch.Length = right.Length;
                long leftCost       = Decode(leftScratch, input);
                long rightCost      = Decode(rightScratch, input);
                if (HasPayloads)
                {
                    DecodePayload(leftScratch, input);
                    DecodePayload(rightScratch, input);
                }
                if (HasContexts)
                {
                    DecodeContexts(leftScratch, input);
                    DecodeContexts(rightScratch, input);
                }
                // LUCENENET NOTE: outerInstance.Comparer != outerInstance.comparer!!
                int cmp = this.comparer.Compare(leftScratch, rightScratch);
                if (cmp != 0)
                {
                    return(cmp);
                }
                if (leftCost < rightCost)
                {
                    return(-1);
                }
                else if (leftCost > rightCost)
                {
                    return(1);
                }
                else
                {
                    return(0);
                }
            });

            this.hasPayloads = source.HasPayloads;
            this.hasContexts = source.HasContexts;
            this.source      = source;
            this.comparer    = comparer;
            this.reader      = Sort();
        }
Esempio n. 9
0
        public void TestWithContexts()
        {
            Directory         dir = NewDirectory();
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));

            iwc.SetMergePolicy(NewLogMergePolicy());
            RandomIndexWriter writer = new RandomIndexWriter(Random, dir, iwc);
            KeyValuePair <IList <string>, IDictionary <string, Document> > res = GenerateIndexDocuments(AtLeast(1000), true, true);
            IDictionary <string, Document> docs = res.Value;
            IList <string> invalidDocTerms      = res.Key;

            foreach (Document doc in docs.Values)
            {
                writer.AddDocument(doc);
            }
            writer.Commit();
            writer.Dispose();
            IndexReader      ir            = DirectoryReader.Open(dir);
            IDictionary      dictionary    = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME, CONTEXT_FIELD_NAME);
            IInputEnumerator inputIterator = dictionary.GetEntryEnumerator();

            while (inputIterator.MoveNext())
            {
                string   field = inputIterator.Current.Utf8ToString();
                Document doc   = docs[field];
                docs.Remove(field);
                //Document doc = docs.remove(f.utf8ToString());
                assertTrue(inputIterator.Current.equals(new BytesRef(doc.Get(FIELD_NAME))));
                IIndexableField weightField = doc.GetField(WEIGHT_FIELD_NAME);
                assertEquals(inputIterator.Weight, (weightField != null) ? weightField.GetInt64ValueOrDefault() : 0);
                assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
                ISet <BytesRef>        oriCtxs    = new JCG.HashSet <BytesRef>();
                ICollection <BytesRef> contextSet = inputIterator.Contexts;
                foreach (IIndexableField ctxf in doc.GetFields(CONTEXT_FIELD_NAME))
                {
                    oriCtxs.add(ctxf.GetBinaryValue());
                }
                assertEquals(oriCtxs.size(), contextSet.Count);
            }

            foreach (string invalidTerm in invalidDocTerms)
            {
                var invalid = docs[invalidTerm];
                docs.Remove(invalidTerm);
                assertNotNull(invalid);
            }
            assertTrue(docs.Count == 0);

            ir.Dispose();
            dir.Dispose();
        }
Esempio n. 10
0
        public void TestWithContext()
        {
            Directory         dir = NewDirectory();
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));

            iwc.SetMergePolicy(NewLogMergePolicy());
            RandomIndexWriter writer            = new RandomIndexWriter(Random, dir, iwc);
            IDictionary <string, Document> docs = GenerateIndexDocuments(AtLeast(100));

            foreach (Document doc in docs.Values)
            {
                writer.AddDocument(doc);
            }
            writer.Commit();
            writer.Dispose();

            IndexReader ir = DirectoryReader.Open(dir);

            ValueSource[]    toAdd         = new ValueSource[] { new Int64FieldSource(WEIGHT_FIELD_NAME_1), new Int64FieldSource(WEIGHT_FIELD_NAME_2), new Int64FieldSource(WEIGHT_FIELD_NAME_3) };
            IDictionary      dictionary    = new DocumentValueSourceDictionary(ir, FIELD_NAME, new SumSingleFunction(toAdd), PAYLOAD_FIELD_NAME, CONTEXTS_FIELD_NAME);
            IInputEnumerator inputIterator = dictionary.GetEntryEnumerator();

            while (inputIterator.MoveNext())
            {
                string   field = inputIterator.Current.Utf8ToString();
                Document doc   = docs[field];
                docs.Remove(field);
                long w1 = doc.GetField(WEIGHT_FIELD_NAME_1).GetInt64ValueOrDefault();
                long w2 = doc.GetField(WEIGHT_FIELD_NAME_2).GetInt64ValueOrDefault();
                long w3 = doc.GetField(WEIGHT_FIELD_NAME_3).GetInt64ValueOrDefault();
                assertTrue(inputIterator.Current.equals(new BytesRef(doc.Get(FIELD_NAME))));
                assertEquals(inputIterator.Weight, (w1 + w2 + w3));
                assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));

                // LUCENENET NOTE: This test was once failing because we used SCG.HashSet<T> whose
                // Equals() implementation does not check for set equality. As a result SortedInputEnumerator
                // had been modified to reverse the results to get the test to pass. However, using JCG.HashSet<T>
                // ensures that set equality (that is equality that doesn't care about order of items) is respected.
                // SortedInputEnumerator has also had the specific sorting removed.
                ISet <BytesRef> originalCtxs = new JCG.HashSet <BytesRef>();
                foreach (IIndexableField ctxf in doc.GetFields(CONTEXTS_FIELD_NAME))
                {
                    originalCtxs.add(ctxf.GetBinaryValue());
                }
                assertEquals(originalCtxs, inputIterator.Contexts);
            }
            assertTrue(docs.Count == 0);
            ir.Dispose();
            dir.Dispose();
        }
Esempio n. 11
0
        public void TestWithoutPayload()
        {
            Directory         dir = NewDirectory();
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));

            iwc.SetMergePolicy(NewLogMergePolicy());
            RandomIndexWriter writer = new RandomIndexWriter(Random, dir, iwc);
            KeyValuePair <IList <string>, IDictionary <string, Document> > res = GenerateIndexDocuments(AtLeast(1000), false, false);
            IDictionary <string, Document> docs = res.Value;
            IList <string> invalidDocTerms      = res.Key;

            foreach (Document doc in docs.Values)
            {
                writer.AddDocument(doc);
            }
            writer.Commit();
            writer.Dispose();
            IndexReader      ir            = DirectoryReader.Open(dir);
            IDictionary      dictionary    = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME);
            IInputEnumerator inputIterator = dictionary.GetEntryEnumerator();

            while (inputIterator.MoveNext())
            {
                var      field = inputIterator.Current.Utf8ToString();
                Document doc   = docs[field];
                docs.Remove(field);
                assertTrue(inputIterator.Current.equals(new BytesRef(doc.Get(FIELD_NAME))));
                IIndexableField weightField = doc.GetField(WEIGHT_FIELD_NAME);
                assertEquals(inputIterator.Weight, (weightField != null) ? weightField.GetInt64ValueOrDefault() : 0);
                assertEquals(inputIterator.Payload, null);
            }

            foreach (string invalidTerm in invalidDocTerms)
            {
                var invalid = docs[invalidTerm];
                docs.Remove(invalidTerm);
                assertNotNull(invalid);
            }


            assertTrue(docs.Count == 0);

            ir.Dispose();
            dir.Dispose();
        }
Esempio n. 12
0
        public override void Build(IInputEnumerator enumerator)
        {
            // LUCENENET: Added guard clause for null
            if (enumerator is null)
            {
                throw new ArgumentNullException(nameof(enumerator));
            }

            if (enumerator.HasPayloads)
            {
                throw new ArgumentException("this suggester doesn't support payloads");
            }
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            count = 0;
            BytesRef         scratch;
            IInputEnumerator iter = new WFSTInputEnumerator(enumerator);
            var      scratchInts  = new Int32sRef();
            BytesRef previous     = null;
            var      outputs      = PositiveInt32Outputs.Singleton;
            var      builder      = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs);

            while (iter.MoveNext())
            {
                scratch = iter.Current;
                long cost = iter.Weight;

                if (previous == null)
                {
                    previous = new BytesRef();
                }
                else if (scratch.Equals(previous))
                {
                    continue; // for duplicate suggestions, the best weight is actually
                    // added
                }
                Lucene.Net.Util.Fst.Util.ToInt32sRef(scratch, scratchInts);
                builder.Add(scratchInts, cost);
                previous.CopyBytes(scratch);
                count++;
            }
            fst = builder.Finish();
        }
Esempio n. 13
0
        /// <summary>
        /// Creates a new iterator, wrapping the specified iterator and
        /// returning elements in a random order.
        /// </summary>
        public UnsortedInputEnumerator(IInputEnumerator source)
            : base(source)
        {
            ords = new int[m_entries.Length];
            Random random = new J2N.Randomizer();

            for (int i = 0; i < ords.Length; i++)
            {
                ords[i] = i;
            }
            for (int i = 0; i < ords.Length; i++)
            {
                int randomPosition = random.Next(ords.Length);
                int temp           = ords[i];
                ords[i] = ords[randomPosition];
                ords[randomPosition] = temp;
            }
        }
        public void TestWithContext()
        {
            Directory         dir = NewDirectory();
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));

            iwc.SetMergePolicy(NewLogMergePolicy());
            RandomIndexWriter writer            = new RandomIndexWriter(Random, dir, iwc);
            IDictionary <string, Document> docs = GenerateIndexDocuments(AtLeast(100));

            foreach (Document doc in docs.Values)
            {
                writer.AddDocument(doc);
            }
            writer.Commit();
            writer.Dispose();

            IndexReader ir = DirectoryReader.Open(dir);

            ValueSource[]    toAdd         = new ValueSource[] { new Int64FieldSource(WEIGHT_FIELD_NAME_1), new Int64FieldSource(WEIGHT_FIELD_NAME_2), new Int64FieldSource(WEIGHT_FIELD_NAME_3) };
            IDictionary      dictionary    = new DocumentValueSourceDictionary(ir, FIELD_NAME, new SumSingleFunction(toAdd), PAYLOAD_FIELD_NAME, CONTEXTS_FIELD_NAME);
            IInputEnumerator inputIterator = dictionary.GetEntryEnumerator();

            while (inputIterator.MoveNext())
            {
                string   field = inputIterator.Current.Utf8ToString();
                Document doc   = docs[field];
                docs.Remove(field);
                long w1 = doc.GetField(WEIGHT_FIELD_NAME_1).GetInt64ValueOrDefault();
                long w2 = doc.GetField(WEIGHT_FIELD_NAME_2).GetInt64ValueOrDefault();
                long w3 = doc.GetField(WEIGHT_FIELD_NAME_3).GetInt64ValueOrDefault();
                assertTrue(inputIterator.Current.equals(new BytesRef(doc.Get(FIELD_NAME))));
                assertEquals(inputIterator.Weight, (w1 + w2 + w3));
                assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
                ISet <BytesRef> originalCtxs = new JCG.HashSet <BytesRef>();
                foreach (IIndexableField ctxf in doc.GetFields(CONTEXTS_FIELD_NAME))
                {
                    originalCtxs.add(ctxf.GetBinaryValue());
                }
                assertEquals(originalCtxs, inputIterator.Contexts);
            }
            assertTrue(docs.Count == 0);
            ir.Dispose();
            dir.Dispose();
        }
Esempio n. 15
0
        public override void Build(IInputEnumerator enumerator)
        {
            // LUCENENET: Added guard clause for null
            if (enumerator is null)
            {
                throw new ArgumentNullException(nameof(enumerator));
            }

            if (enumerator.HasPayloads)
            {
                throw new ArgumentException("this suggester doesn't support payloads");
            }
            if (enumerator.Comparer != null)
            {
                // make sure it's unsorted
                // WTF - this could result in yet another sorted iteration....
                enumerator = new UnsortedInputEnumerator(enumerator);
            }
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            count = 0;
            trie  = new JaspellTernarySearchTrie {
                MatchAlmostDiff = editDistance
            };
            BytesRef spare;

            var charsSpare = new CharsRef();

            while (enumerator.MoveNext())
            {
                spare = enumerator.Current;
                long weight = enumerator.Weight;
                if (spare.Length == 0)
                {
                    continue;
                }
                charsSpare.Grow(spare.Length);
                UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
                trie.Put(charsSpare.ToString(), weight);
            }
        }
Esempio n. 16
0
        public void TestEmptyReader()
        {
            Directory         dir = NewDirectory();
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));

            iwc.SetMergePolicy(NewLogMergePolicy());
            // Make sure the index is created?
            RandomIndexWriter writer = new RandomIndexWriter(Random, dir, iwc);

            writer.Commit();
            writer.Dispose();
            IndexReader      ir            = DirectoryReader.Open(dir);
            IDictionary      dictionary    = new DocumentValueSourceDictionary(ir, FIELD_NAME, new DoubleConstValueSource(10), PAYLOAD_FIELD_NAME);
            IInputEnumerator inputIterator = dictionary.GetEntryEnumerator();

            assertFalse(inputIterator.MoveNext());
            assertEquals(inputIterator.Weight, 0);
            assertNull(inputIterator.Payload);

            ir.Dispose();
            dir.Dispose();
        }
Esempio n. 17
0
        public void TestFileWithTerm()
        {
            KeyValuePair <IList <IList <string> >, string> fileInput = generateFileInput(AtLeast(100), FileDictionary.DEFAULT_FIELD_DELIMITER, false, false);
            Stream                  inputReader = new MemoryStream(fileInput.Value.getBytes(Encoding.UTF8));
            FileDictionary          dictionary  = new FileDictionary(inputReader);
            IList <IList <string> > entries     = fileInput.Key;
            IInputEnumerator        inputIter   = dictionary.GetEntryEnumerator();

            assertFalse(inputIter.HasPayloads);
            int count = 0;

            while (inputIter.MoveNext())
            {
                assertTrue(entries.size() > count);
                IList <string> entry = entries[count];
                assertTrue(entry.size() >= 1); // at least a term
                assertEquals(entry[0], inputIter.Current.Utf8ToString());
                assertEquals(1, inputIter.Weight);
                assertNull(inputIter.Payload);
                count++;
            }
            assertEquals(count, entries.size());
        }
Esempio n. 18
0
        /// <summary>
        /// Build the suggest index, using up to the specified
        /// amount of temporary RAM while building.  Note that
        /// the weights for the suggestions are ignored.
        /// </summary>
        public virtual void Build(IInputEnumerator enumerator, double ramBufferSizeMB)
        {
            if (enumerator.HasPayloads)
            {
                throw new ArgumentException("this suggester doesn't support payloads");
            }
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }

            string prefix    = this.GetType().Name;
            var    directory = OfflineSorter.DefaultTempDir();

            // LUCENENET specific - using GetRandomFileName() instead of picking a random int
            DirectoryInfo tempIndexPath = null;

            while (true)
            {
                tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + Path.GetFileNameWithoutExtension(Path.GetRandomFileName())));
                tempIndexPath.Create();
                if (System.IO.Directory.Exists(tempIndexPath.FullName))
                {
                    break;
                }
            }

            Directory dir = FSDirectory.Open(tempIndexPath);

            try
            {
#pragma warning disable 612, 618
                IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer);
#pragma warning restore 612, 618
                iwc.SetOpenMode(OpenMode.CREATE);
                iwc.SetRAMBufferSizeMB(ramBufferSizeMB);
                IndexWriter writer = new IndexWriter(dir, iwc);

                var ft = new FieldType(TextField.TYPE_NOT_STORED);
                // TODO: if only we had IndexOptions.TERMS_ONLY...
                ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
                ft.OmitNorms    = true;
                ft.Freeze();

                Document doc   = new Document();
                Field    field = new Field("body", "", ft);
                doc.Add(field);

                totTokens = 0;
                IndexReader reader = null;

                bool success = false;
                count = 0;
                try
                {
                    while (enumerator.MoveNext())
                    {
                        BytesRef surfaceForm = enumerator.Current;
                        field.SetStringValue(surfaceForm.Utf8ToString());
                        writer.AddDocument(doc);
                        count++;
                    }

                    reader = DirectoryReader.Open(writer, false);

                    Terms terms = MultiFields.GetTerms(reader, "body");
                    if (terms == null)
                    {
                        throw new ArgumentException("need at least one suggestion");
                    }

                    // Move all ngrams into an FST:
                    TermsEnum termsEnum = terms.GetEnumerator(null);

                    Outputs <long?> outputs = PositiveInt32Outputs.Singleton;
                    Builder <long?> builder = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs);

                    Int32sRef scratchInts = new Int32sRef();
                    while (termsEnum.MoveNext())
                    {
                        BytesRef term       = termsEnum.Term;
                        int      ngramCount = CountGrams(term);
                        if (ngramCount > grams)
                        {
                            throw new ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
                        }
                        if (ngramCount == 1)
                        {
                            totTokens += termsEnum.TotalTermFreq;
                        }

                        builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq));
                    }

                    fst = builder.Finish();
                    if (fst == null)
                    {
                        throw new ArgumentException("need at least one suggestion");
                    }
                    //System.out.println("FST: " + fst.getNodeCount() + " nodes");

                    /*
                     * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
                     * Util.toDot(fst, pw, true, true);
                     * pw.close();
                     */

                    success = true;
                }
                finally
                {
                    if (success)
                    {
                        IOUtils.Dispose(writer, reader);
                    }
                    else
                    {
                        IOUtils.DisposeWhileHandlingException(writer, reader);
                    }
                }
            }
            finally
            {
                try
                {
                    IOUtils.Dispose(dir);
                }
                finally
                {
                    // LUCENENET specific - since we are removing the entire directory anyway,
                    // it doesn't make sense to first do a loop in order remove the files.
                    // Let the System.IO.Directory.Delete() method handle that.
                    // We also need to dispose the Directory instance first before deleting from disk.
                    try
                    {
                        System.IO.Directory.Delete(tempIndexPath.FullName, true);
                    }
                    catch (Exception e)
                    {
                        throw new InvalidOperationException("failed to remove " + tempIndexPath, e);
                    }
                }
            }
        }
Esempio n. 19
0
 public override void Build(IInputEnumerator enumerator)
 {
     Build(enumerator, IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB);
 }
Esempio n. 20
0
        public void TestWithDeletions()
        {
            Directory         dir = NewDirectory();
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));

            iwc.SetMergePolicy(NewLogMergePolicy());
            RandomIndexWriter writer = new RandomIndexWriter(Random, dir, iwc);
            KeyValuePair <IList <string>, IDictionary <string, Document> > res = GenerateIndexDocuments(AtLeast(1000), false, false);
            IDictionary <string, Document> docs = res.Value;
            IList <String> invalidDocTerms      = res.Key;
            Random         rand       = Random;
            IList <string> termsToDel = new JCG.List <string>();

            foreach (Document doc in docs.Values)
            {
                IIndexableField f2 = doc.GetField(FIELD_NAME);
                if (rand.nextBoolean() && f2 != null && !invalidDocTerms.Contains(f2.GetStringValue()))
                {
                    termsToDel.Add(doc.Get(FIELD_NAME));
                }
                writer.AddDocument(doc);
            }
            writer.Commit();

            Term[] delTerms = new Term[termsToDel.size()];
            for (int i = 0; i < termsToDel.size(); i++)
            {
                delTerms[i] = new Term(FIELD_NAME, termsToDel[i]);
            }

            foreach (Term delTerm in delTerms)
            {
                writer.DeleteDocuments(delTerm);
            }
            writer.Commit();
            writer.Dispose();

            foreach (string termToDel in termsToDel)
            {
                var toDel = docs[termToDel];
                assertTrue(toDel != null);
                docs.Remove(termToDel);
            }

            IndexReader ir = DirectoryReader.Open(dir);

            assertEquals(ir.NumDocs, docs.size());
            IDictionary      dictionary    = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME);
            IInputEnumerator inputIterator = dictionary.GetEntryEnumerator();

            while (inputIterator.MoveNext())
            {
                var      field = inputIterator.Current.Utf8ToString();
                Document doc   = docs[field];
                docs.Remove(field);
                assertTrue(inputIterator.Current.equals(new BytesRef(doc.Get(FIELD_NAME))));
                IIndexableField weightField = doc.GetField(WEIGHT_FIELD_NAME);
                assertEquals(inputIterator.Weight, (weightField != null) ? weightField.GetInt64ValueOrDefault() : 0);
                assertEquals(inputIterator.Payload, null);
            }

            foreach (string invalidTerm in invalidDocTerms)
            {
                var invalid = docs[invalidTerm];
                docs.Remove(invalidTerm);
                assertNotNull(invalid);
            }
            assertTrue(docs.Count == 0);

            ir.Dispose();
            dir.Dispose();
        }
Esempio n. 21
0
 /// <summary>
 /// Builds up a new internal <see cref="Lookup"/> representation based on the given <see cref="IInputEnumerator"/>.
 /// The implementation might re-sort the data internally.
 /// </summary>
 public abstract void Build(IInputEnumerator inputEnumerator);
Esempio n. 22
0
        public override void Build(IInputEnumerator enumerator)
        {
            // LUCENENET: Added guard clause for null
            if (enumerator is null)
            {
                throw new ArgumentNullException(nameof(enumerator));
            }

            if (enumerator.HasPayloads)
            {
                throw new ArgumentException("this suggester doesn't support payloads");
            }
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            FileInfo tempInput  = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
            FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());

            OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
            OfflineSorter.ByteSequencesReader reader = null;
            ExternalRefSorter sorter = null;

            // Push floats up front before sequences to sort them. For now, assume they are non-negative.
            // If negative floats are allowed some trickery needs to be done to find their byte order.
            bool success = false;

            count = 0;
            try
            {
                byte[] buffer = Arrays.Empty <byte>();
                ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
                BytesRef            spare;
                while (enumerator.MoveNext())
                {
                    spare = enumerator.Current;
                    if (spare.Length + 4 >= buffer.Length)
                    {
                        buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
                    }

                    output.Reset(buffer);
                    output.WriteInt32(EncodeWeight(enumerator.Weight));
                    output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
                    writer.Write(buffer, 0, output.Position);
                }
                writer.Dispose();

                // We don't know the distribution of scores and we need to bucket them, so we'll sort
                // and divide into equal buckets.
                OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
                tempInput.Delete();
                FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);

                int inputLines = info.Lines;
                reader = new OfflineSorter.ByteSequencesReader(tempSorted);
                long line                = 0;
                int  previousBucket      = 0;
                int  previousScore       = 0;
                ByteArrayDataInput input = new ByteArrayDataInput();
                BytesRef           tmp1  = new BytesRef();
                BytesRef           tmp2  = new BytesRef();
                while (reader.Read(tmp1))
                {
                    input.Reset(tmp1.Bytes);
                    int currentScore = input.ReadInt32();

                    int bucket;
                    if (line > 0 && currentScore == previousScore)
                    {
                        bucket = previousBucket;
                    }
                    else
                    {
                        bucket = (int)(line * buckets / inputLines);
                    }
                    previousScore  = currentScore;
                    previousBucket = bucket;

                    // Only append the input, discard the weight.
                    tmp2.Bytes  = tmp1.Bytes;
                    tmp2.Offset = input.Position;
                    tmp2.Length = tmp1.Length - input.Position;
                    builder.Add(tmp2, bucket);

                    line++;
                    count++;
                }

                // The two FSTCompletions share the same automaton.
                this.higherWeightsCompletion = builder.Build();
                this.normalCompletion        = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);

                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(reader, writer, sorter);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(reader, writer, sorter);
                }

                tempInput.Delete();
                tempSorted.Delete();
            }
        }
Esempio n. 23
0
 /// <summary>
 /// Creates a new sorted wrapper, using <see cref="BytesRef.UTF8SortedAsUnicodeComparer"/>
 /// for sorting.
 /// </summary>
 public SortedInputEnumerator(IInputEnumerator source)
     : this(source, BytesRef.UTF8SortedAsUnicodeComparer)
 {
 }
Esempio n. 24
0
        public override void Build(IInputEnumerator enumerator)
        {
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            string prefix     = this.GetType().Name;
            var    directory  = OfflineSorter.DefaultTempDir();
            var    tempInput  = FileSupport.CreateTempFile(prefix, ".input", directory);
            var    tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

            hasPayloads = enumerator.HasPayloads;

            var writer = new OfflineSorter.ByteSequencesWriter(tempInput);

            OfflineSorter.ByteSequencesReader reader = null;
            var scratch = new BytesRef();

            TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

            bool success = false;

            count = 0;
            byte[] buffer = new byte[8];
            try
            {
                var      output = new ByteArrayDataOutput(buffer);
                BytesRef surfaceForm;

                while (enumerator.MoveNext())
                {
                    surfaceForm = enumerator.Current;
                    ISet <Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

                    maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

                    foreach (Int32sRef path in paths)
                    {
                        Util.Fst.Util.ToBytesRef(path, scratch);

                        // length of the analyzed text (FST input)
                        if (scratch.Length > ushort.MaxValue - 2)
                        {
                            throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) +
                                                        " in length (got " + scratch.Length + ")");
                        }
                        ushort analyzedLength = (ushort)scratch.Length;

                        // compute the required length:
                        // analyzed sequence + weight (4) + surface + analyzedLength (short)
                        int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                        BytesRef payload;

                        if (hasPayloads)
                        {
                            if (surfaceForm.Length > (ushort.MaxValue - 2))
                            {
                                throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) +
                                                            " in length (got " + surfaceForm.Length + ")");
                            }
                            payload = enumerator.Payload;
                            // payload + surfaceLength (short)
                            requiredLength += payload.Length + 2;
                        }
                        else
                        {
                            payload = null;
                        }

                        buffer = ArrayUtil.Grow(buffer, requiredLength);

                        output.Reset(buffer);

                        output.WriteInt16((short)analyzedLength);

                        output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                        output.WriteInt32(EncodeWeight(enumerator.Weight));

                        if (hasPayloads)
                        {
                            for (int i = 0; i < surfaceForm.Length; i++)
                            {
                                if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                                {
                                    throw new ArgumentException(
                                              "surface form cannot contain unit separator character U+001F; this character is reserved");
                                }
                            }
                            output.WriteInt16((short)surfaceForm.Length);
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                            output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                        }
                        else
                        {
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                        }

                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(output.Position == requiredLength, () => output.Position + " vs " + requiredLength);
                        }

                        writer.Write(buffer, 0, output.Position);
                    }
                    count++;
                }
                writer.Dispose();

                // Sort all input/output pairs (required by FST.Builder):
                (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

                // Free disk space:
                tempInput.Delete();

                reader = new OfflineSorter.ByteSequencesReader(tempSorted);

                var outputs = new PairOutputs <long?, BytesRef>(PositiveInt32Outputs.Singleton,
                                                                ByteSequenceOutputs.Singleton);
                var builder = new Builder <PairOutputs <long?, BytesRef> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);

                // Build FST:
                BytesRef  previousAnalyzed = null;
                BytesRef  analyzed         = new BytesRef();
                BytesRef  surface          = new BytesRef();
                Int32sRef scratchInts      = new Int32sRef();
                var       input            = new ByteArrayDataInput();

                // Used to remove duplicate surface forms (but we
                // still index the hightest-weight one).  We clear
                // this when we see a new analyzed form, so it cannot
                // grow unbounded (at most 256 entries):
                var seenSurfaceForms = new JCG.HashSet <BytesRef>();

                var dedup = 0;
                while (reader.Read(scratch))
                {
                    input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
                    ushort analyzedLength = (ushort)input.ReadInt16();
                    analyzed.Grow(analyzedLength + 2);
                    input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
                    analyzed.Length = analyzedLength;

                    long cost = input.ReadInt32();

                    surface.Bytes = scratch.Bytes;
                    if (hasPayloads)
                    {
                        surface.Length = (ushort)input.ReadInt16();
                        surface.Offset = input.Position;
                    }
                    else
                    {
                        surface.Offset = input.Position;
                        surface.Length = scratch.Length - surface.Offset;
                    }

                    if (previousAnalyzed == null)
                    {
                        previousAnalyzed = new BytesRef();
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else if (analyzed.Equals(previousAnalyzed))
                    {
                        dedup++;
                        if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                        {
                            // More than maxSurfaceFormsPerAnalyzedForm
                            // dups: skip the rest:
                            continue;
                        }
                        if (seenSurfaceForms.Contains(surface))
                        {
                            continue;
                        }
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else
                    {
                        dedup = 0;
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Clear();
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }

                    // TODO: I think we can avoid the extra 2 bytes when
                    // there is no dup (dedup==0), but we'd have to fix
                    // the exactFirst logic ... which would be sort of
                    // hairy because we'd need to special case the two
                    // (dup/not dup)...

                    // NOTE: must be byte 0 so we sort before whatever
                    // is next
                    analyzed.Bytes[analyzed.Offset + analyzed.Length]     = 0;
                    analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
                    analyzed.Length += 2;

                    Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
                    //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
                    if (!hasPayloads)
                    {
                        builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
                    }
                    else
                    {
                        int      payloadOffset = input.Position + surface.Length;
                        int      payloadLength = scratch.Length - payloadOffset;
                        BytesRef br            = new BytesRef(surface.Length + 1 + payloadLength);
                        Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                        br.Bytes[surface.Length] = PAYLOAD_SEP;
                        Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                        br.Length = br.Bytes.Length;
                        builder.Add(scratchInts, outputs.NewPair(cost, br));
                    }
                }
                fst = builder.Finish();

                //Util.dotToFile(fst, "/tmp/suggest.dot");

                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(reader, writer);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(reader, writer);
                }

                tempInput.Delete();
                tempSorted.Delete();
            }
        }
Esempio n. 25
0
        public void TestWithDeletions()
        {
            Directory         dir = NewDirectory();
            IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));

            iwc.SetMergePolicy(NewLogMergePolicy());
            RandomIndexWriter writer            = new RandomIndexWriter(Random, dir, iwc);
            IDictionary <string, Document> docs = GenerateIndexDocuments(AtLeast(100));
            Random         rand       = Random;
            IList <string> termsToDel = new JCG.List <string>();

            foreach (Document doc in docs.Values)
            {
                if (rand.nextBoolean() && termsToDel.size() < docs.size() - 1)
                {
                    termsToDel.Add(doc.Get(FIELD_NAME));
                }
                writer.AddDocument(doc);
            }
            writer.Commit();

            Term[] delTerms = new Term[termsToDel.size()];
            for (int i = 0; i < termsToDel.size(); i++)
            {
                delTerms[i] = new Term(FIELD_NAME, termsToDel[i]);
            }

            foreach (Term delTerm in delTerms)
            {
                writer.DeleteDocuments(delTerm);
            }
            writer.Commit();
            writer.Dispose();

            foreach (string termToDel in termsToDel)
            {
                var toDel = docs[termToDel];
                docs.Remove(termToDel);
                assertTrue(null != toDel);
            }

            IndexReader ir = DirectoryReader.Open(dir);

            assertTrue("NumDocs should be > 0 but was " + ir.NumDocs, ir.NumDocs > 0);
            assertEquals(ir.NumDocs, docs.size());
            ValueSource[] toAdd = new ValueSource[] { new Int64FieldSource(WEIGHT_FIELD_NAME_1), new Int64FieldSource(WEIGHT_FIELD_NAME_2) };

            IDictionary      dictionary    = new DocumentValueSourceDictionary(ir, FIELD_NAME, new SumSingleFunction(toAdd), PAYLOAD_FIELD_NAME);
            IInputEnumerator inputIterator = dictionary.GetEntryEnumerator();

            while (inputIterator.MoveNext())
            {
                string   field = inputIterator.Current.Utf8ToString();
                Document doc   = docs[field];
                docs.Remove(field);
                long w1 = doc.GetField(WEIGHT_FIELD_NAME_1).GetInt64ValueOrDefault();
                long w2 = doc.GetField(WEIGHT_FIELD_NAME_2).GetInt64ValueOrDefault();
                assertTrue(inputIterator.Current.equals(new BytesRef(doc.Get(FIELD_NAME))));
                assertEquals(inputIterator.Weight, w2 + w1);
                assertTrue(inputIterator.Payload.equals(doc.GetField(PAYLOAD_FIELD_NAME).GetBinaryValue()));
            }
            assertTrue(docs.Count == 0);
            ir.Dispose();
            dir.Dispose();
        }