Пример #1
0
 public override BytesRef Encode(char[] buffer, int offset, int length)
 {
     float payload = float.Parse(new string(buffer, offset, length)); //TODO: improve this so that we don't have to new Strings
     byte[] bytes = PayloadHelper.EncodeFloat(payload);
     BytesRef result = new BytesRef(bytes);
     return result;
 }
Пример #2
0
 public override BytesRef Encode(char[] buffer, int offset, int length)
 {
     int payload = ArrayUtil.ParseInt(buffer, offset, length); //TODO: improve this so that we don't have to new Strings
     byte[] bytes = PayloadHelper.EncodeInt(payload);
     BytesRef result = new BytesRef(bytes);
     return result;
 }
 internal virtual byte[] Decompress(byte[] compressed, int originalLength, int offset, int length)
 {
     Decompressor decompressor = Mode.NewDecompressor();
     BytesRef bytes = new BytesRef();
     decompressor.Decompress(new ByteArrayDataInput(compressed), originalLength, offset, length, bytes);
     return Arrays.CopyOfRange(bytes.Bytes, bytes.Offset, bytes.Offset + bytes.Length);
 }
        public void TestBlendingType()
        {

            BytesRef pl = new BytesRef("lake");
            long w = 20;

            Input[] keys = new Input[]{
                new Input("top of the lake", w, pl)
            };

            DirectoryInfo tempDir = CreateTempDir("BlendedInfixSuggesterTest");
            Analyzer a = new StandardAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);

            // BlenderType.LINEAR is used by default (remove position*10%)
            BlendedInfixSuggester suggester = new BlendedInfixSuggester(TEST_VERSION_CURRENT, NewFSDirectory(tempDir), a);
            suggester.Build(new InputArrayIterator(keys));

            assertEquals(w, GetInResults(suggester, "top", pl, 1));
            assertEquals((int)(w * (1 - 0.10 * 2)), GetInResults(suggester, "the", pl, 1));
            assertEquals((int)(w * (1 - 0.10 * 3)), GetInResults(suggester, "lake", pl, 1));

            suggester.Dispose();

            // BlenderType.RECIPROCAL is using 1/(1+p) * w where w is weight and p the position of the word
            suggester = new BlendedInfixSuggester(TEST_VERSION_CURRENT, NewFSDirectory(tempDir), a, a,
                                                  AnalyzingInfixSuggester.DEFAULT_MIN_PREFIX_CHARS, BlendedInfixSuggester.BlenderType.POSITION_RECIPROCAL, 1);
            suggester.Build(new InputArrayIterator(keys));

            assertEquals(w, GetInResults(suggester, "top", pl, 1));
            assertEquals((int)(w * 1 / (1 + 2)), GetInResults(suggester, "the", pl, 1));
            assertEquals((int)(w * 1 / (1 + 3)), GetInResults(suggester, "lake", pl, 1));

            suggester.Dispose();
        }
Пример #5
0
 public DocFreqValueSource(string field, string val, string indexedField, BytesRef indexedBytes)
 {
     this.field = field;
     this.val = val;
     this.indexedField = indexedField;
     this.indexedBytes = indexedBytes;
 }
Пример #6
0
        public virtual void TestSimpleDictionary()
        {
            using (System.IO.Stream affixStream = this.GetType().getResourceAsStream("simple.aff"))
            {
                using (System.IO.Stream dictStream = this.GetType().getResourceAsStream("simple.dic"))
                {

                    Dictionary dictionary = new Dictionary(affixStream, dictStream);
                    assertEquals(3, dictionary.LookupSuffix(new char[] { 'e' }, 0, 1).Length);
                    assertEquals(1, dictionary.LookupPrefix(new char[] { 's' }, 0, 1).Length);
                    IntsRef ordList = dictionary.LookupWord(new char[] { 'o', 'l', 'r' }, 0, 3);
                    assertNotNull(ordList);
                    assertEquals(1, ordList.Length);

                    BytesRef @ref = new BytesRef();
                    dictionary.flagLookup.Get(ordList.Ints[0], @ref);
                    char[] flags = Dictionary.DecodeFlags(@ref);
                    assertEquals(1, flags.Length);

                    ordList = dictionary.LookupWord(new char[] { 'l', 'u', 'c', 'e', 'n' }, 0, 5);
                    assertNotNull(ordList);
                    assertEquals(1, ordList.Length);
                    dictionary.flagLookup.Get(ordList.Ints[0], @ref);
                    flags = Dictionary.DecodeFlags(@ref);
                    assertEquals(1, flags.Length);
                }
            }
        }
Пример #7
0
 internal TermStats(string field, BytesRef termtext, int df, long tf)
 {
     this.termtext = BytesRef.DeepCopyOf(termtext);
     this.Field = field;
     this.DocFreq = df;
     this.TotalTermFreq = tf;
 }
Пример #8
0
        public virtual void TestFarsiRangeFilterCollating(Analyzer analyzer, BytesRef firstBeg, BytesRef firstEnd, BytesRef secondBeg, BytesRef secondEnd)
        {
            Directory dir = NewDirectory();
            IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
            Document doc = new Document();
            doc.Add(new TextField("content", "\u0633\u0627\u0628", Field.Store.YES));
            doc.Add(new StringField("body", "body", Field.Store.YES));
            writer.AddDocument(doc);
            writer.Dispose();
            IndexReader reader = DirectoryReader.Open(dir);
            IndexSearcher searcher = new IndexSearcher(reader);
            Query query = new TermQuery(new Term("body", "body"));

            // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
            // orders the U+0698 character before the U+0633 character, so the single
            // index Term below should NOT be returned by a TermRangeFilter with a Farsi
            // Collator (or an Arabic one for the case when Farsi searcher not
            // supported).
            ScoreDoc[] result = searcher.Search(query, new TermRangeFilter("content", firstBeg, firstEnd, true, true), 1).ScoreDocs;
            Assert.AreEqual(0, result.Length, "The index Term should not be included.");

            result = searcher.Search(query, new TermRangeFilter("content", secondBeg, secondEnd, true, true), 1).ScoreDocs;
            Assert.AreEqual(1, result.Length, "The index Term should be included.");

            reader.Dispose();
            dir.Dispose();
        }
 public MockVariableLengthPayloadFilter(Random random, TokenStream @in)
     : base(@in)
 {
     this.Random = random;
     this.Payload = new BytesRef(Bytes);
     this.PayloadAtt = AddAttribute<IPayloadAttribute>();
 }
        private void SumValues(IList<FacetsCollector.MatchingDocs> matchingDocs)
        {
            //System.out.println("count matchingDocs=" + matchingDocs + " facetsField=" + facetsFieldName);
            foreach (FacetsCollector.MatchingDocs hits in matchingDocs)
            {
                BinaryDocValues dv = hits.context.AtomicReader.GetBinaryDocValues(IndexFieldName);
                if (dv == null) // this reader does not have DocValues for the requested category list
                {
                    continue;
                }

                DocIdSetIterator docs = hits.bits.GetIterator();

                int doc;
                while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    //System.out.println("  doc=" + doc);
                    // TODO: use OrdinalsReader?  we'd need to add a
                    // BytesRef getAssociation()?
                    BytesRef bytesRef = new BytesRef();
                    dv.Get(doc, bytesRef);
                    byte[] bytes = bytesRef.Bytes;
                    int end = bytesRef.Offset + bytesRef.Length;
                    int offset = bytesRef.Offset;
                    while (offset < end)
                    {
                        int ord = ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF);
                        offset += 4;
                        int value = ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF);
                        offset += 4;
                        values[ord] += value;
                    }
                }
            }
        }
Пример #11
0
 public virtual void TestEmpty()
 {
     BytesRef b = new BytesRef();
     Assert.AreEqual(BytesRef.EMPTY_BYTES, b.Bytes);
     Assert.AreEqual(0, b.Offset);
     Assert.AreEqual(0, b.Length);
 }
Пример #12
0
 public virtual void TestAppend()
 {
     var bytes = new[] { (byte)'a', (byte)'b', (byte)'c', (byte)'d' };
     BytesRef b = new BytesRef(bytes, 1, 3); // bcd
     b.Append(new BytesRef("e"));
     Assert.AreEqual("bcde", b.Utf8ToString());
 }
 /// <summary>
 /// Creates a new iterator, buffering entries from the specified iterator </summary>
 public BufferedInputIterator(InputIterator source)
 {
     BytesRef spare;
     int freqIndex = 0;
     hasPayloads = source.HasPayloads;
     hasContexts_Renamed = source.HasContexts;
     while ((spare = source.Next()) != null)
     {
         entries.Append(spare);
         if (hasPayloads)
         {
             payloads.Append(source.Payload);
         }
         if (hasContexts_Renamed)
         {
             contextSets.Add(source.Contexts);
         }
         if (freqIndex >= freqs.Length)
         {
             freqs = ArrayUtil.Grow(freqs, freqs.Length + 1);
         }
         freqs[freqIndex++] = source.Weight;
     }
     comp = source.Comparator;
 }
Пример #14
0
 public virtual void TestCopyBytes()
 {
     sbyte[] bytes = new sbyte[] { (sbyte)'a', (sbyte)'b', (sbyte)'c', (sbyte)'d' };
     BytesRef b = new BytesRef(bytes, 1, 3); // bcd
     b.CopyBytes(new BytesRef("bcde"));
     Assert.AreEqual("bcde", b.Utf8ToString());
 }
Пример #15
0
 public SrndPrefixQuery(string prefix, bool quoted, char truncator)
     : base(quoted)
 {
     this.prefix = prefix;
     prefixRef = new BytesRef(prefix);
     this.truncator = truncator;
 }
Пример #16
0
 public virtual void TestSize()
 {
     BytesRef @ref = new BytesRef();
     int num = AtLeast(2);
     for (int j = 0; j < num; j++)
     {
         int mod = 1 + Random().Next(39);
         for (int i = 0; i < 797; i++)
         {
             string str;
             do
             {
                 str = TestUtil.RandomRealisticUnicodeString(Random(), 1000);
             } while (str.Length == 0);
             @ref.CopyChars(str);
             int count = Hash.Size();
             int key = Hash.Add(@ref);
             if (key < 0)
             {
                 Assert.AreEqual(Hash.Size(), count);
             }
             else
             {
                 Assert.AreEqual(Hash.Size(), count + 1);
             }
             if (i % mod == 0)
             {
                 Hash.Clear();
                 Assert.AreEqual(0, Hash.Size());
                 Hash.Reinit();
             }
         }
     }
 }
Пример #17
0
        public static void Main(string[] args)
        {
            FileInfo input = new FileInfo("/home/dweiss/tmp/shuffled.dict");

            int buckets = 20;
            int shareMaxTail = 10;

            ExternalRefSorter sorter = new ExternalRefSorter(new OfflineSorter());
            FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter, shareMaxTail);

            TextReader reader =
                new StreamReader(
                    new FileStream(input.FullName, FileMode.Open), Encoding.UTF8);

            BytesRef scratch = new BytesRef();
            string line;
            int count = 0;
            while ((line = reader.ReadLine()) != null)
            {
                scratch.CopyChars(line);
                builder.Add(scratch, count % buckets);
                if ((count++ % 100000) == 0)
                {
                    Console.WriteLine("Line: " + count);
                }
            }

            Console.WriteLine("Building FSTCompletion.");
            FSTCompletion completion = builder.Build();

            FileInfo fstFile = new FileInfo("completion.fst");
            Console.WriteLine("Done. Writing automaton: " + fstFile.FullName);
            completion.FST.Save(fstFile);
            sorter.Dispose();
        }
Пример #18
0
        protected virtual void TruncatedToPrefixAndPattern()
        {
            int i = 0;
            while ((i < truncated.Length) && MatchingChar(truncated[i]))
            {
                i++;
            }
            prefix = truncated.Substring(0, i);
            prefixRef = new BytesRef(prefix);

            StringBuilder re = new StringBuilder();
            // LUCENENET NOTE: To mimic Java's matches() method, we alter
            // the Regex to match the entire string. This makes the Regex
            // fail fast when not at the beginning of the string, which is
            // more efficient than testing the length after a successful match.
            // http://stackoverflow.com/a/12547528/181087
            re.Append(@"\A(?:");
            while (i < truncated.Length)
            {
                AppendRegExpForChar(truncated[i], re);
                i++;
            }
            re.Append(@")\z");
            pattern = new Regex(re.ToString(), RegexOptions.Compiled);
        }
        public void TestBlendedSort()
        {

            BytesRef payload = new BytesRef("star");

            Input[] keys = new Input[]{
                new Input("star wars: episode v - the empire strikes back", 8, payload)
            };

            DirectoryInfo tempDir = CreateTempDir("BlendedInfixSuggesterTest");

            Analyzer a = new StandardAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
            BlendedInfixSuggester suggester = new BlendedInfixSuggester(TEST_VERSION_CURRENT, NewFSDirectory(tempDir), a, a,
                                                                        AnalyzingInfixSuggester.DEFAULT_MIN_PREFIX_CHARS,
                                                                        BlendedInfixSuggester.BlenderType.POSITION_LINEAR,
                                                                        BlendedInfixSuggester.DEFAULT_NUM_FACTOR);
            suggester.Build(new InputArrayIterator(keys));

            // we query for star wars and check that the weight
            // is smaller when we search for tokens that are far from the beginning

            long w0 = GetInResults(suggester, "star ", payload, 1);
            long w1 = GetInResults(suggester, "war", payload, 1);
            long w2 = GetInResults(suggester, "empire ba", payload, 1);
            long w3 = GetInResults(suggester, "back", payload, 1);
            long w4 = GetInResults(suggester, "bacc", payload, 1);

            assertTrue(w0 > w1);
            assertTrue(w1 > w2);
            assertTrue(w2 > w3);

            assertTrue(w4 < 0);

            suggester.Dispose();
        }
        public virtual void AddValue(int docID, BytesRef value)
        {
            if (value == null)
            {
                throw new System.ArgumentException("field \"" + FieldInfo.Name + "\": null value not allowed");
            }
            if (value.Length > (ByteBlockPool.BYTE_BLOCK_SIZE - 2))
            {
                throw new System.ArgumentException("DocValuesField \"" + FieldInfo.Name + "\" is too large, must be <= " + (ByteBlockPool.BYTE_BLOCK_SIZE - 2));
            }

            if (docID != CurrentDoc)
            {
                FinishCurrentDoc();
            }

            // Fill in any holes:
            while (CurrentDoc < docID)
            {
                PendingCounts.Add(0); // no values
                CurrentDoc++;
            }

            AddOneValue(value);
            UpdateBytesUsed();
        }
Пример #21
0
        public virtual void TestFixedSorted([ValueSource(typeof(ConcurrentMergeSchedulers), "Values")]IConcurrentMergeScheduler scheduler)
        {
            BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BFixedSorted"));
            if (dir is MockDirectoryWrapper)
            {
                ((MockDirectoryWrapper)dir).Throttling = MockDirectoryWrapper.Throttling_e.NEVER;
            }

            IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()))
                                .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
                                .SetRAMBufferSizeMB(256.0)
                                .SetMergeScheduler(scheduler)
                                .SetMergePolicy(NewLogMergePolicy(false, 10))
                                .SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE));

            Document doc = new Document();
            var bytes = new byte[2];
            BytesRef data = new BytesRef(bytes);
            SortedDocValuesField dvField = new SortedDocValuesField("dv", data);
            doc.Add(dvField);

            for (int i = 0; i < int.MaxValue; i++)
            {
                bytes[0] = (byte)(i >> 8);
                bytes[1] = (byte)i;
                w.AddDocument(doc);
                if (i % 100000 == 0)
                {
                    Console.WriteLine("indexed: " + i);
                    Console.Out.Flush();
                }
            }

            w.ForceMerge(1);
            w.Dispose();

            Console.WriteLine("verifying...");
            Console.Out.Flush();

            DirectoryReader r = DirectoryReader.Open(dir);
            int expectedValue = 0;
            foreach (AtomicReaderContext context in r.Leaves)
            {
                AtomicReader reader = context.AtomicReader;
                BytesRef scratch = new BytesRef();
                BinaryDocValues dv = reader.GetSortedDocValues("dv");
                for (int i = 0; i < reader.MaxDoc; i++)
                {
                    bytes[0] = (byte)(expectedValue >> 8);
                    bytes[1] = (byte)expectedValue;
                    dv.Get(i, scratch);
                    Assert.AreEqual(data, scratch);
                    expectedValue++;
                }
            }

            r.Dispose();
            dir.Dispose();
        }
Пример #22
0
 public TermStatistics(BytesRef term, long docFreq, long totalTermFreq)
 {
     Debug.Assert(docFreq >= 0);
     Debug.Assert(totalTermFreq == -1 || totalTermFreq >= docFreq); // #positions must be >= #postings
     this.Term_Renamed = term;
     this.DocFreq_Renamed = docFreq;
     this.TotalTermFreq_Renamed = totalTermFreq;
 }
Пример #23
0
 /// <summary>
 /// Constructs a query selecting all terms greater/equal than <code>lowerTerm</code>
 /// but less/equal than <code>upperTerm</code>.
 ///
 /// <p>
 /// If an endpoint is null, it is said
 /// to be "open". Either or both endpoints may be open.  Open endpoints may not
 /// be exclusive (you can't select all but the first or last term without
 /// explicitly specifying the term to exclude.)
 /// </summary>
 /// <param name="field"> The field that holds both lower and upper terms. </param>
 /// <param name="lowerTerm">
 ///          The term text at the lower end of the range </param>
 /// <param name="upperTerm">
 ///          The term text at the upper end of the range </param>
 /// <param name="includeLower">
 ///          If true, the <code>lowerTerm</code> is
 ///          included in the range. </param>
 /// <param name="includeUpper">
 ///          If true, the <code>upperTerm</code> is
 ///          included in the range. </param>
 public TermRangeQuery(string field, BytesRef lowerTerm, BytesRef upperTerm, bool includeLower, bool includeUpper)
     : base(field)
 {
     this.LowerTerm_Renamed = lowerTerm;
     this.UpperTerm_Renamed = upperTerm;
     this.IncludeLower = includeLower;
     this.IncludeUpper = includeUpper;
 }
Пример #24
0
 private DocTermOrdsRangeFilter(string field, BytesRef lowerVal, BytesRef upperVal, bool includeLower, bool includeUpper)
 {
     this.Field_Renamed = field;
     this.LowerVal_Renamed = lowerVal;
     this.UpperVal_Renamed = upperVal;
     this.IncludeLower = includeLower;
     this.IncludeUpper = includeUpper;
 }
Пример #25
0
 public void Add(BytesRef utf8)
 {
     if (closed)
     {
         throw new InvalidOperationException();
     }
     buffer.Append(utf8);
 }
Пример #26
0
 public void Add(BytesRef utf8)
 {
     if (writer == null)
     {
         throw new InvalidOperationException();
     }
     writer.Write(utf8);
 }
Пример #27
0
 public DocTermOrdsRangeFilterAnonymousInnerClassHelper(string field, BytesRef lowerVal, BytesRef upperVal, bool includeLower, bool includeUpper)
     : base(field, lowerVal, upperVal, includeLower, includeUpper)
 {
     this.Field = field;
     this.LowerVal = lowerVal;
     this.UpperVal = upperVal;
     this.IncludeLower = includeLower;
     this.IncludeUpper = includeUpper;
 }
 internal Iterator(int size, PagedGrowableWriter offsets, PagedGrowableWriter lengths, PagedMutable docs, BytesRef values, FixedBitSet docsWithField)
 {
     this.Offsets = offsets;
     this.Size = size;
     this.Lengths = lengths;
     this.Docs = docs;
     this.DocsWithField = docsWithField;
     Value_Renamed = (BytesRef)values.Clone();
 }
Пример #29
0
 public Input(BytesRef term, long v, BytesRef payload, bool hasPayloads, IEnumerable<BytesRef> contexts,
     bool hasContexts)
 {
     this.term = term;
     this.v = v;
     this.payload = payload;
     this.hasPayloads = hasPayloads;
     this.contexts = contexts;
     this.hasContexts = hasContexts;
 }
Пример #30
0
		/// <summary>
		/// Factory method for creating the right implementation based on the fact whether the facet field contains
		/// multiple tokens per documents.
		/// </summary>
		/// <remarks>
		/// Factory method for creating the right implementation based on the fact whether the facet field contains
		/// multiple tokens per documents.
		/// </remarks>
		/// <param name="groupField">The group field</param>
		/// <param name="facetField">The facet field</param>
		/// <param name="facetFieldMultivalued">Whether the facet field has multiple tokens per document
		/// 	</param>
		/// <param name="facetPrefix">The facet prefix a facet entry should start with to be included.
		/// 	</param>
		/// <param name="initialSize">
		/// The initial allocation size of the internal int set and group facet list which should roughly
		/// match the total number of expected unique groups. Be aware that the heap usage is
		/// 4 bytes * initialSize.
		/// </param>
		/// <returns><code>TermGroupFacetCollector</code> implementation</returns>
		public static TermGroupFacetCollector CreateTermGroupFacetCollector
			(string groupField, string facetField, bool facetFieldMultivalued, BytesRef facetPrefix
			, int initialSize)
		{
		    if (facetFieldMultivalued)
			{
				return new MV(groupField, facetField, facetPrefix, initialSize);
			}
		    return new SV(groupField, facetField, facetPrefix, initialSize);
		}
Пример #31
0
        /// <summary>
        /// Call this only once (if you subclass!) </summary>
        protected virtual void Uninvert(AtomicReader reader, IBits liveDocs, BytesRef termPrefix)
        {
            FieldInfo info = reader.FieldInfos.FieldInfo(m_field);

            if (info != null && info.HasDocValues)
            {
                throw new InvalidOperationException("Type mismatch: " + m_field + " was indexed as " + info.DocValuesType);
            }
            //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
            long startTime = Environment.TickCount;

            m_prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix);

            int maxDoc = reader.MaxDoc;

            int[] index    = new int[maxDoc];     // immediate term numbers, or the index into the byte[] representing the last number
            int[] lastTerm = new int[maxDoc];     // last term we saw for this document
            var   bytes    = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

            Fields fields = reader.Fields;

            if (fields == null)
            {
                // No terms
                return;
            }
            Terms terms = fields.GetTerms(m_field);

            if (terms == null)
            {
                // No terms
                return;
            }

            TermsEnum te        = terms.GetIterator(null);
            BytesRef  seekStart = termPrefix != null ? termPrefix : new BytesRef();

            //System.out.println("seekStart=" + seekStart.utf8ToString());
            if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END)
            {
                // No terms match
                return;
            }

            // If we need our "term index wrapper", these will be
            // init'd below:
            IList <BytesRef> indexedTerms      = null;
            PagedBytes       indexedTermsBytes = null;

            bool testedOrd = false;

            // we need a minimum of 9 bytes, but round up to 12 since the space would
            // be wasted with most allocators anyway.
            var tempArr = new sbyte[12];

            //
            // enumerate all terms, and build an intermediate form of the un-inverted field.
            //
            // During this intermediate form, every document has a (potential) byte[]
            // and the int[maxDoc()] array either contains the termNumber list directly
            // or the *end* offset of the termNumber list in it's byte array (for faster
            // appending and faster creation of the final form).
            //
            // idea... if things are too large while building, we could do a range of docs
            // at a time (but it would be a fair amount slower to build)
            // could also do ranges in parallel to take advantage of multiple CPUs

            // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
            // values.  this requires going over the field first to find the most
            // frequent terms ahead of time.

            int termNum = 0;

            m_docsEnum = null;

            // Loop begins with te positioned to first term (we call
            // seek above):
            for (; ;)
            {
                BytesRef t = te.Term;
                if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix)))
                {
                    break;
                }
                //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

                if (!testedOrd)
                {
                    try
                    {
                        m_ordBase = (int)te.Ord;
                        //System.out.println("got ordBase=" + ordBase);
                    }
#pragma warning disable 168
                    catch (NotSupportedException uoe)
#pragma warning restore 168
                    {
                        // Reader cannot provide ord support, so we wrap
                        // our own support by creating our own terms index:
                        indexedTerms      = new List <BytesRef>();
                        indexedTermsBytes = new PagedBytes(15);
                        //System.out.println("NO ORDS");
                    }
                    testedOrd = true;
                }

                VisitTerm(te, termNum);

                if (indexedTerms != null && (termNum & indexIntervalMask) == 0)
                {
                    // Index this term
                    m_sizeOfIndexedStrings += t.Length;
                    BytesRef indexedTerm = new BytesRef();
                    indexedTermsBytes.Copy(t, indexedTerm);
                    // TODO: really should 1) strip off useless suffix,
                    // and 2) use FST not array/PagedBytes
                    indexedTerms.Add(indexedTerm);
                }

                int df = te.DocFreq;
                if (df <= m_maxTermDocFreq)
                {
                    m_docsEnum = te.Docs(liveDocs, m_docsEnum, DocsFlags.NONE);

                    // dF, but takes deletions into account
                    int actualDF = 0;

                    for (; ;)
                    {
                        int doc = m_docsEnum.NextDoc();
                        if (doc == DocIdSetIterator.NO_MORE_DOCS)
                        {
                            break;
                        }
                        //System.out.println("  chunk=" + chunk + " docs");

                        actualDF++;
                        m_termInstances++;

                        //System.out.println("    docID=" + doc);
                        // add TNUM_OFFSET to the term number to make room for special reserved values:
                        // 0 (end term) and 1 (index into byte array follows)
                        int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                        lastTerm[doc] = termNum;
                        int val = index[doc];

                        if ((val & 0xff) == 1)
                        {
                            // index into byte array (actually the end of
                            // the doc-specific byte[] when building)
                            int pos    = (int)((uint)val >> 8);
                            int ilen   = VInt32Size(delta);
                            var arr    = bytes[doc];
                            int newend = pos + ilen;
                            if (newend > arr.Length)
                            {
                                // We avoid a doubling strategy to lower memory usage.
                                // this faceting method isn't for docs with many terms.
                                // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                                // TODO: figure out what array lengths we can round up to w/o actually using more memory
                                // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
                                // It should be safe to round up to the nearest 32 bits in any case.
                                int newLen = (newend + 3) & unchecked ((int)0xfffffffc); // 4 byte alignment
                                var newarr = new sbyte[newLen];
                                Array.Copy(arr, 0, newarr, 0, pos);
                                arr        = newarr;
                                bytes[doc] = newarr;
                            }
                            pos        = WriteInt32(delta, arr, pos);
                            index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                        }
                        else
                        {
                            // OK, this int has data in it... find the end (a zero starting byte - not
                            // part of another number, hence not following a byte with the high bit set).
                            int ipos;
                            if (val == 0)
                            {
                                ipos = 0;
                            }
                            else if ((val & 0x0000ff80) == 0)
                            {
                                ipos = 1;
                            }
                            else if ((val & 0x00ff8000) == 0)
                            {
                                ipos = 2;
                            }
                            else if ((val & 0xff800000) == 0)
                            {
                                ipos = 3;
                            }
                            else
                            {
                                ipos = 4;
                            }

                            //System.out.println("      ipos=" + ipos);

                            int endPos = WriteInt32(delta, tempArr, ipos);
                            //System.out.println("      endpos=" + endPos);
                            if (endPos <= 4)
                            {
                                //System.out.println("      fits!");
                                // value will fit in the integer... move bytes back
                                for (int j = ipos; j < endPos; j++)
                                {
                                    val |= (tempArr[j] & 0xff) << (j << 3);
                                }
                                index[doc] = val;
                            }
                            else
                            {
                                // value won't fit... move integer into byte[]
                                for (int j = 0; j < ipos; j++)
                                {
                                    tempArr[j] = (sbyte)val;
                                    val        = (int)((uint)val >> 8);
                                }
                                // point at the end index in the byte[]
                                index[doc] = (endPos << 8) | 1;
                                bytes[doc] = tempArr;
                                tempArr    = new sbyte[12];
                            }
                        }
                    }
                    SetActualDocFreq(termNum, actualDF);
                }

                termNum++;
                if (te.Next() == null)
                {
                    break;
                }
            }

            m_numTermsInField = termNum;

            long midPoint = Environment.TickCount;

            if (m_termInstances == 0)
            {
                // we didn't invert anything
                // lower memory consumption.
                m_tnums = null;
            }
            else
            {
                this.m_index = index;

                //
                // transform intermediate form into the final form, building a single byte[]
                // at a time, and releasing the intermediate byte[]s as we go to avoid
                // increasing the memory footprint.
                //

                for (int pass = 0; pass < 256; pass++)
                {
                    var target = m_tnums[pass];
                    var pos    = 0; // end in target;
                    if (target != null)
                    {
                        pos = target.Length;
                    }
                    else
                    {
                        target = new sbyte[4096];
                    }

                    // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
                    // where pp is the pass (which array we are building), and xx is all values.
                    // each pass shares the same byte[] for termNumber lists.
                    for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24))
                    {
                        int lim = Math.Min(docbase + (1 << 16), maxDoc);
                        for (int doc = docbase; doc < lim; doc++)
                        {
                            //System.out.println("  pass="******" process docID=" + doc);
                            int val = index[doc];
                            if ((val & 0xff) == 1)
                            {
                                int len = (int)((uint)val >> 8);
                                //System.out.println("    ptr pos=" + pos);
                                index[doc] = (pos << 8) | 1; // change index to point to start of array
                                if ((pos & 0xff000000) != 0)
                                {
                                    // we only have 24 bits for the array index
                                    throw new InvalidOperationException("Too many values for UnInvertedField faceting on field " + m_field);
                                }
                                var arr = bytes[doc];

                                /*
                                 * for(byte b : arr) {
                                 * //System.out.println("      b=" + Integer.toHexString((int) b));
                                 * }
                                 */
                                bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                                if (target.Length <= pos + len)
                                {
                                    int newlen = target.Length;

                                    //* we don't have to worry about the array getting too large
                                    // since the "pos" param will overflow first (only 24 bits available)
                                    // if ((newlen<<1) <= 0) {
                                    //  // overflow...
                                    //  newlen = Integer.MAX_VALUE;
                                    //  if (newlen <= pos + len) {
                                    //    throw new SolrException(400,"Too many terms to uninvert field!");
                                    //  }
                                    // } else {
                                    //  while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                                    // }
                                    //
                                    while (newlen <= pos + len) // doubling strategy
                                    {
                                        newlen <<= 1;
                                    }
                                    var newtarget = new sbyte[newlen];
                                    Array.Copy(target, 0, newtarget, 0, pos);
                                    target = newtarget;
                                }
                                Array.Copy(arr, 0, target, pos, len);
                                pos += len + 1; // skip single byte at end and leave it 0 for terminator
                            }
                        }
                    }

                    // shrink array
                    if (pos < target.Length)
                    {
                        var newtarget = new sbyte[pos];
                        Array.Copy(target, 0, newtarget, 0, pos);
                        target = newtarget;
                    }

                    m_tnums[pass] = target;

                    if ((pass << 16) > maxDoc)
                    {
                        break;
                    }
                }
            }
            if (indexedTerms != null)
            {
                m_indexedTermsArray = new BytesRef[indexedTerms.Count];
                indexedTerms.CopyTo(m_indexedTermsArray, 0);
            }

            long endTime = Environment.TickCount;

            m_total_time  = (int)(endTime - startTime);
            m_phase1_time = (int)(midPoint - startTime);
        }
Пример #32
0
            // Swap in S, in place of E:
            private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
            {
                int savLength = term.Length;

                Debug.Assert(term.Offset == 0);

                // The 3 bytes starting at downTo make up 1
                // unicode character:
                Debug.Assert(IsHighBMPChar(term.Bytes, pos));

                // NOTE: we cannot make this assert, because
                // AutomatonQuery legitimately sends us malformed UTF8
                // (eg the UTF8 bytes with just 0xee)
                // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

                // Save the bytes && length, since we need to
                // restore this if seek "back" finds no matching
                // terms
                if (term.Bytes.Length < 4 + pos)
                {
                    term.Grow(4 + pos);
                }

                scratch[0] = (sbyte)term.Bytes[pos];
                scratch[1] = (sbyte)term.Bytes[pos + 1];
                scratch[2] = (sbyte)term.Bytes[pos + 2];

                term.Bytes[pos]     = 0xf0;
                term.Bytes[pos + 1] = 0x90;
                term.Bytes[pos + 2] = 0x80;
                term.Bytes[pos + 3] = 0x80;
                term.Length         = 4 + pos;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }

                // Seek "back":
                outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

                // Test if the term we seek'd to in fact found a
                // surrogate pair at the same position as the E:
                Term t2 = te.Term();

                // Cannot be null (or move to next field) because at
                // "worst" it'd seek to the same term we are on now,
                // unless we are being called from seek
                if (t2 == null || t2.Field != internedFieldName)
                {
                    return(false);
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(t2.Text()));
                }

                // Now test if prefix is identical and we found
                // a non-BMP char at the same position:
                BytesRef b2 = t2.Bytes;

                Debug.Assert(b2.Offset == 0);

                bool matches;

                if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos))
                {
                    matches = true;
                    for (int i = 0; i < pos; i++)
                    {
                        if (term.Bytes[i] != b2.Bytes[i])
                        {
                            matches = false;
                            break;
                        }
                    }
                }
                else
                {
                    matches = false;
                }

                // Restore term:
                term.Length         = savLength;
                term.Bytes[pos]     = (byte)scratch[0];
                term.Bytes[pos + 1] = (byte)scratch[1];
                term.Bytes[pos + 2] = (byte)scratch[2];

                return(matches);
            }
Пример #33
0
 public LiteralValueSource(string str)
 {
     this.m_str      = str;
     this.m_bytesRef = new BytesRef(str);
 }
Пример #34
0
            // Look for seek type 1 ("push"): if the newly added
            // suffix contains any S, we must try to seek to the
            // corresponding E.  If we find a match, we go there;
            // else we keep looking for additional S's in the new
            // suffix.  this "starts" the dance, at this character
            // position:
            private void DoPushes()
            {
                int upTo = newSuffixStart;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.Length);
                }

                while (upTo < scratchTerm.Length)
                {
                    if (IsNonBMPChar(scratchTerm.Bytes, upTo) && (upTo > newSuffixStart || (upTo >= prevTerm.Length || (!IsNonBMPChar(prevTerm.Bytes, upTo) && !IsHighBMPChar(prevTerm.Bytes, upTo)))))
                    {
                        // A non-BMP char (4 bytes UTF8) starts here:
                        Debug.Assert(scratchTerm.Length >= upTo + 4);

                        int savLength = scratchTerm.Length;
                        scratch[0] = (sbyte)scratchTerm.Bytes[upTo];
                        scratch[1] = (sbyte)scratchTerm.Bytes[upTo + 1];
                        scratch[2] = (sbyte)scratchTerm.Bytes[upTo + 2];

                        scratchTerm.Bytes[upTo]     = (byte)UTF8_HIGH_BMP_LEAD;
                        scratchTerm.Bytes[upTo + 1] = 0x80;
                        scratchTerm.Bytes[upTo + 2] = 0x80;
                        scratchTerm.Length          = upTo + 3;

                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("    try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length);
                        }

                        // Seek "forward":
                        // TODO: more efficient seek?
                        outerInstance.TermsDict.SeekEnum(seekTermEnum, new Term(fieldInfo.Name, scratchTerm), true);

                        scratchTerm.Bytes[upTo]     = (byte)scratch[0];
                        scratchTerm.Bytes[upTo + 1] = (byte)scratch[1];
                        scratchTerm.Bytes[upTo + 2] = (byte)scratch[2];
                        scratchTerm.Length          = savLength;

                        // Did we find a match?
                        Term t2 = seekTermEnum.Term();

                        if (DEBUG_SURROGATES)
                        {
                            if (t2 == null)
                            {
                                Console.WriteLine("      hit term=null");
                            }
                            else
                            {
                                Console.WriteLine("      hit term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + (t2 == null ? null : t2.Bytes));
                            }
                        }

                        // Since this was a seek "forward", we could hit
                        // EOF or a different field:
                        bool matches;

                        if (t2 != null && t2.Field == internedFieldName)
                        {
                            BytesRef b2 = t2.Bytes;
                            Debug.Assert(b2.Offset == 0);
                            if (b2.Length >= upTo + 3 && IsHighBMPChar(b2.Bytes, upTo))
                            {
                                matches = true;
                                for (int i = 0; i < upTo; i++)
                                {
                                    if (scratchTerm.Bytes[i] != b2.Bytes[i])
                                    {
                                        matches = false;
                                        break;
                                    }
                                }
                            }
                            else
                            {
                                matches = false;
                            }
                        }
                        else
                        {
                            matches = false;
                        }

                        if (matches)
                        {
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine("      matches!");
                            }

                            // OK seek "back"
                            // TODO: more efficient seek?
                            outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true);

                            scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);

                            // +3 because we don't need to check the char
                            // at upTo: we know it's > BMP
                            upTo += 3;

                            // NOTE: we keep iterating, now, since this
                            // can easily "recurse".  Ie, after seeking
                            // forward at a certain char position, we may
                            // find another surrogate in our [new] suffix
                            // and must then do another seek (recurse)
                        }
                        else
                        {
                            upTo++;
                        }
                    }
                    else
                    {
                        upTo++;
                    }
                }
            }
Пример #35
0
 public override void AddPosition(int position, int startOffset, int endOffset, BytesRef payload)
 {
     if (Debugging.AssertsEnabled)
     {
         Debugging.Assert(curField.flags != 0);
     }
     curField.AddPosition(position, startOffset, endOffset - startOffset, payload == null ? 0 : payload.Length);
     if (curField.hasPayloads && payload != null)
     {
         payloadBytes.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
     }
 }
Пример #36
0
 /// <summary>
 /// The default implementation returns <code>1</code>
 /// </summary>
 public override float ScorePayload(int doc, int start, int end, BytesRef payload)
 {
     return(1);
 }
Пример #37
0
            public override SeekStatus SeekCeil(BytesRef term)
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }
                skipNext = false;
                TermInfosReader tis = outerInstance.TermsDict;
                Term            t0  = new Term(fieldInfo.Name, term);

                Debug.Assert(termEnum != null);

                tis.SeekEnum(termEnum, t0, false);

                Term t = termEnum.Term();

                if (t != null && t.Field == internedFieldName && term.BytesEquals(t.Bytes))
                {
                    // If we found an exact match, no need to do the
                    // surrogate dance
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek exact match");
                    }
                    current = t.Bytes;
                    return(SeekStatus.FOUND);
                }
                else if (t == null || t.Field != internedFieldName)
                {
                    // TODO: maybe we can handle this like the next()
                    // into null?  set term as prevTerm then dance?

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek hit EOF");
                    }

                    // We hit EOF; try end-case surrogate dance: if we
                    // find an E, try swapping in S, backwards:
                    scratchTerm.CopyBytes(term);

                    Debug.Assert(scratchTerm.Offset == 0);

                    for (int i = scratchTerm.Length - 1; i >= 0; i--)
                    {
                        if (IsHighBMPChar(scratchTerm.Bytes, i))
                        {
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine("    found E pos=" + i + "; try seek");
                            }

                            if (SeekToNonBMP(seekTermEnum, scratchTerm, i))
                            {
                                scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);
                                outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), false);

                                newSuffixStart = 1 + i;

                                DoPushes();

                                // Found a match
                                // TODO: faster seek?
                                current = termEnum.Term().Bytes;
                                return(SeekStatus.NOT_FOUND);
                            }
                        }
                    }

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek END");
                    }

                    current = null;
                    return(SeekStatus.END);
                }
                else
                {
                    // We found a non-exact but non-null term; this one
                    // is fun -- just treat it like next, by pretending
                    // requested term was prev:
                    prevTerm.CopyBytes(term);

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek hit non-exact term=" + UnicodeUtil.ToHexString(t.Text()));
                    }

                    BytesRef br = t.Bytes;
                    Debug.Assert(br.Offset == 0);

                    SetNewSuffixStart(term, br);

                    SurrogateDance();

                    Term t2 = termEnum.Term();
                    if (t2 == null || t2.Field != internedFieldName)
                    {
                        // PreFlex codec interns field names; verify:
                        Debug.Assert(t2 == null || !t2.Field.Equals(internedFieldName, StringComparison.Ordinal));
                        current = null;
                        return(SeekStatus.END);
                    }
                    else
                    {
                        current = t2.Bytes;
                        Debug.Assert(!unicodeSortOrder || term.CompareTo(current) < 0, "term=" + UnicodeUtil.ToHexString(term.Utf8ToString()) + " vs current=" + UnicodeUtil.ToHexString(current.Utf8ToString()));
                        return(SeekStatus.NOT_FOUND);
                    }
                }
            }
Пример #38
0
        public void TestRandomIndex()
        {
            Directory    dir      = NewDirectory();
            MockAnalyzer analyzer = new MockAnalyzer(Random());

            analyzer.MaxTokenLength = TestUtil.NextInt(Random(), 1, IndexWriter.MAX_TERM_LENGTH);
            RandomIndexWriter w = new RandomIndexWriter(Random(), dir, analyzer, Similarity, TimeZone);

            CreateRandomIndex(AtLeast(50), w, Random().NextLong());
            DirectoryReader reader       = w.Reader;
            AtomicReader    wrapper      = SlowCompositeReaderWrapper.Wrap(reader);
            string          field        = @"body";
            Terms           terms        = wrapper.GetTerms(field);
            var             lowFreqQueue = new AnonymousPriorityQueue(this, 5);

            Util.PriorityQueue <TermAndFreq> highFreqQueue = new AnonymousPriorityQueue1(this, 5);
            try
            {
                TermsEnum iterator = terms.GetIterator(null);
                while (iterator.Next() != null)
                {
                    if (highFreqQueue.Count < 5)
                    {
                        highFreqQueue.Add(new TermAndFreq(BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq));
                        lowFreqQueue.Add(new TermAndFreq(BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq));
                    }
                    else
                    {
                        if (highFreqQueue.Top.freq < iterator.DocFreq)
                        {
                            highFreqQueue.Top.freq = iterator.DocFreq;
                            highFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term);
                            highFreqQueue.UpdateTop();
                        }

                        if (lowFreqQueue.Top.freq > iterator.DocFreq)
                        {
                            lowFreqQueue.Top.freq = iterator.DocFreq;
                            lowFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term);
                            lowFreqQueue.UpdateTop();
                        }
                    }
                }

                int lowFreq  = lowFreqQueue.Top.freq;
                int highFreq = highFreqQueue.Top.freq;
                AssumeTrue(@"unlucky index", highFreq - 1 > lowFreq);
                List <TermAndFreq> highTerms  = QueueToList(highFreqQueue);
                List <TermAndFreq> lowTerms   = QueueToList(lowFreqQueue);
                IndexSearcher      searcher   = NewSearcher(reader);
                Occur            lowFreqOccur = RandomOccur(Random());
                BooleanQuery     verifyQuery  = new BooleanQuery();
                CommonTermsQuery cq           = new CommonTermsQuery(RandomOccur(Random()), lowFreqOccur, highFreq - 1, Random().NextBoolean());
                foreach (TermAndFreq termAndFreq in lowTerms)
                {
                    cq.Add(new Term(field, termAndFreq.term));
                    verifyQuery.Add(new BooleanClause(new TermQuery(new Term(field, termAndFreq.term)), lowFreqOccur));
                }

                foreach (TermAndFreq termAndFreq in highTerms)
                {
                    cq.Add(new Term(field, termAndFreq.term));
                }

                TopDocs cqSearch     = searcher.Search(cq, reader.MaxDoc);
                TopDocs verifySearch = searcher.Search(verifyQuery, reader.MaxDoc);
                assertEquals(verifySearch.TotalHits, cqSearch.TotalHits);
                var hits = new HashSet <int>();
                foreach (ScoreDoc doc in verifySearch.ScoreDocs)
                {
                    hits.Add(doc.Doc);
                }

                foreach (ScoreDoc doc in cqSearch.ScoreDocs)
                {
                    assertTrue(hits.Remove(doc.Doc));
                }

                assertTrue(hits.Count == 0);
                w.ForceMerge(1);
                DirectoryReader reader2 = w.Reader;
                QueryUtils.Check(Random(), cq, NewSearcher(reader2), Similarity);
                reader2.Dispose();
            }
            finally
            {
                reader.Dispose();
                wrapper.Dispose();
                w.Dispose();
                dir.Dispose();
            }
        }
Пример #39
0
 public TermAndFreq(BytesRef term, int freq)
 {
     this.term = term;
     this.freq = freq;
 }
Пример #40
0
 public override bool IsIndexTerm(BytesRef term, TermStats stats)
 {
     return(rand.Next(gap) == gap / 2);
 }
Пример #41
0
            public override SeekStatus SeekCeil(BytesRef target)
            {
                // already here
                if (term != null && term.Equals(target))
                {
                    return(SeekStatus.FOUND);
                }

                int startIdx = Array.BinarySearch(outerInstance.m_indexedTermsArray, target);

                if (startIdx >= 0)
                {
                    // we hit the term exactly... lucky us!
                    TermsEnum.SeekStatus seekStatus = termsEnum.SeekCeil(target);
                    Debug.Assert(seekStatus == TermsEnum.SeekStatus.FOUND);
                    ord = startIdx << outerInstance.indexIntervalBits;
                    SetTerm();
                    Debug.Assert(term != null);
                    return(SeekStatus.FOUND);
                }

                // we didn't hit the term exactly
                startIdx = -startIdx - 1;

                if (startIdx == 0)
                {
                    // our target occurs *before* the first term
                    TermsEnum.SeekStatus seekStatus = termsEnum.SeekCeil(target);
                    Debug.Assert(seekStatus == TermsEnum.SeekStatus.NOT_FOUND);
                    ord = 0;
                    SetTerm();
                    Debug.Assert(term != null);
                    return(SeekStatus.NOT_FOUND);
                }

                // back up to the start of the block
                startIdx--;

                if ((ord >> outerInstance.indexIntervalBits) == startIdx && term != null && term.CompareTo(target) <= 0)
                {
                    // we are already in the right block and the current term is before the term we want,
                    // so we don't need to seek.
                }
                else
                {
                    // seek to the right block
                    TermsEnum.SeekStatus seekStatus = termsEnum.SeekCeil(outerInstance.m_indexedTermsArray[startIdx]);
                    Debug.Assert(seekStatus == TermsEnum.SeekStatus.FOUND);
                    ord = startIdx << outerInstance.indexIntervalBits;
                    SetTerm();
                    Debug.Assert(term != null); // should be non-null since it's in the index
                }

                while (term != null && term.CompareTo(target) < 0)
                {
                    Next();
                }

                if (term == null)
                {
                    return(SeekStatus.END);
                }
                else if (term.CompareTo(target) == 0)
                {
                    return(SeekStatus.FOUND);
                }
                else
                {
                    return(SeekStatus.NOT_FOUND);
                }
            }
Пример #42
0
        public virtual void TestSimple()
        {
            int numNodes = TestUtil.NextInt32(Random, 1, 10);

            double runTimeSec = AtLeast(3);

            int minDocsToMakeTerms = TestUtil.NextInt32(Random, 5, 20);

            int maxSearcherAgeSeconds = TestUtil.NextInt32(Random, 1, 3);

            if (Verbose)
            {
                Console.WriteLine("TEST: numNodes=" + numNodes + " runTimeSec=" + runTimeSec + " maxSearcherAgeSeconds=" + maxSearcherAgeSeconds);
            }

            Start(numNodes, runTimeSec, maxSearcherAgeSeconds);

            List <PreviousSearchState> priorSearches = new List <PreviousSearchState>();
            List <BytesRef>            terms         = null;

            while (Time.NanoTime() < endTimeNanos)
            {
                bool doFollowon = priorSearches.Count > 0 && Random.Next(7) == 1;

                // Pick a random node; we will run the query on this node:
                int myNodeID = Random.Next(numNodes);

                NodeState.ShardIndexSearcher localShardSearcher;

                PreviousSearchState prevSearchState;

                if (doFollowon)
                {
                    // Pretend user issued a followon query:
                    prevSearchState = priorSearches[Random.Next(priorSearches.Count)];

                    if (Verbose)
                    {
                        Console.WriteLine("\nTEST: follow-on query age=" + ((Time.NanoTime() - prevSearchState.SearchTimeNanos) / 1000000000.0));
                    }

                    try
                    {
                        localShardSearcher = m_nodes[myNodeID].Acquire(prevSearchState.Versions);
                    }
                    catch (SearcherExpiredException see)
                    {
                        // Expected, sometimes; in a "real" app we would
                        // either forward this error to the user ("too
                        // much time has passed; please re-run your
                        // search") or sneakily just switch to newest
                        // searcher w/o telling them...
                        if (Verbose)
                        {
                            Console.WriteLine("  searcher expired during local shard searcher init: " + see);
                        }
                        priorSearches.Remove(prevSearchState);
                        continue;
                    }
                }
                else
                {
                    if (Verbose)
                    {
                        Console.WriteLine("\nTEST: fresh query");
                    }
                    // Do fresh query:
                    localShardSearcher = m_nodes[myNodeID].Acquire();
                    prevSearchState    = null;
                }

                IndexReader[] subs = new IndexReader[numNodes];

                PreviousSearchState searchState = null;

                try
                {
                    // Mock: now make a single reader (MultiReader) from all node
                    // searchers.  In a real shard env you can't do this... we
                    // do it to confirm results from the shard searcher
                    // are correct:
                    int docCount = 0;
                    try
                    {
                        for (int nodeID = 0; nodeID < numNodes; nodeID++)
                        {
                            long          subVersion = localShardSearcher.GetNodeVersions()[nodeID];
                            IndexSearcher sub        = m_nodes[nodeID].Searchers.Acquire(subVersion);
                            if (sub == null)
                            {
                                nodeID--;
                                while (nodeID >= 0)
                                {
                                    subs[nodeID].DecRef();
                                    subs[nodeID] = null;
                                    nodeID--;
                                }
                                throw new SearcherExpiredException("nodeID=" + nodeID + " version=" + subVersion);
                            }
                            subs[nodeID] = sub.IndexReader;
                            docCount    += subs[nodeID].MaxDoc;
                        }
                    }
                    catch (SearcherExpiredException see)
                    {
                        // Expected
                        if (Verbose)
                        {
                            Console.WriteLine("  searcher expired during mock reader init: " + see);
                        }
                        continue;
                    }

                    IndexReader   mockReader   = new MultiReader(subs);
                    IndexSearcher mockSearcher = new IndexSearcher(mockReader);

                    Query query;
                    Sort  sort;

                    if (prevSearchState != null)
                    {
                        query = prevSearchState.Query;
                        sort  = prevSearchState.Sort;
                    }
                    else
                    {
                        if (terms == null && docCount > minDocsToMakeTerms)
                        {
                            // TODO: try to "focus" on high freq terms sometimes too
                            // TODO: maybe also periodically reset the terms...?
                            TermsEnum termsEnum = MultiFields.GetTerms(mockReader, "body").GetEnumerator();
                            terms = new List <BytesRef>();
                            while (termsEnum.MoveNext())
                            {
                                terms.Add(BytesRef.DeepCopyOf(termsEnum.Term));
                            }
                            if (Verbose)
                            {
                                Console.WriteLine("TEST: init terms: " + terms.Count + " terms");
                            }
                            if (terms.Count == 0)
                            {
                                terms = null;
                            }
                        }

                        if (Verbose)
                        {
                            Console.WriteLine("  maxDoc=" + mockReader.MaxDoc);
                        }

                        if (terms != null)
                        {
                            if (Random.NextBoolean())
                            {
                                query = new TermQuery(new Term("body", terms[Random.Next(terms.Count)]));
                            }
                            else
                            {
                                string t = terms[Random.Next(terms.Count)].Utf8ToString();
                                string prefix;
                                if (t.Length <= 1)
                                {
                                    prefix = t;
                                }
                                else
                                {
                                    prefix = t.Substring(0, TestUtil.NextInt32(Random, 1, 2));
                                }
                                query = new PrefixQuery(new Term("body", prefix));
                            }

                            if (Random.NextBoolean())
                            {
                                sort = null;
                            }
                            else
                            {
                                // TODO: sort by more than 1 field
                                int what = Random.Next(3);
                                if (what == 0)
                                {
                                    sort = new Sort(SortField.FIELD_SCORE);
                                }
                                else if (what == 1)
                                {
                                    // TODO: this sort doesn't merge
                                    // correctly... it's tricky because you
                                    // could have > 2.1B docs across all shards:
                                    //sort = new Sort(SortField.FIELD_DOC);
                                    sort = null;
                                }
                                else if (what == 2)
                                {
                                    sort = new Sort(new SortField[] { new SortField("docid", SortFieldType.INT32, Random.NextBoolean()) });
                                }
                                else
                                {
                                    sort = new Sort(new SortField[] { new SortField("title", SortFieldType.STRING, Random.NextBoolean()) });
                                }
                            }
                        }
                        else
                        {
                            query = null;
                            sort  = null;
                        }
                    }

                    if (query != null)
                    {
                        try
                        {
                            searchState = AssertSame(mockSearcher, localShardSearcher, query, sort, prevSearchState);
                        }
                        catch (SearcherExpiredException see)
                        {
                            // Expected; in a "real" app we would
                            // either forward this error to the user ("too
                            // much time has passed; please re-run your
                            // search") or sneakily just switch to newest
                            // searcher w/o telling them...
                            if (Verbose)
                            {
                                Console.WriteLine("  searcher expired during search: " + see);
                                Console.Out.Write(see.StackTrace);
                            }
                            // We can't do this in general: on a very slow
                            // computer it's possible the local searcher
                            // expires before we can finish our search:
                            // assert prevSearchState != null;
                            if (prevSearchState != null)
                            {
                                priorSearches.Remove(prevSearchState);
                            }
                        }
                    }
                }
                finally
                {
                    m_nodes[myNodeID].Release(localShardSearcher);
                    foreach (IndexReader sub in subs)
                    {
                        if (sub != null)
                        {
                            sub.DecRef();
                        }
                    }
                }

                if (searchState != null && searchState.SearchAfterLocal != null && Random.Next(5) == 3)
                {
                    priorSearches.Add(searchState);
                    if (priorSearches.Count > 200)
                    {
                        priorSearches.Shuffle(Random);
                        priorSearches.SubList(100, priorSearches.Count).Clear();
                    }
                }
            }

            Finish();
        }
Пример #43
0
 /// <summary>
 /// Inverts only terms starting w/ prefix, and only terms
 /// whose docFreq (not taking deletions into account) is
 /// &lt;=  <paramref name="maxTermDocFreq"/>, with a custom indexing interval
 /// (default is every 128nd term).
 /// </summary>
 public DocTermOrds(AtomicReader reader, IBits liveDocs, string field, BytesRef termPrefix, int maxTermDocFreq, int indexIntervalBits)
     : this(field, maxTermDocFreq, indexIntervalBits)
 {
     Uninvert(reader, liveDocs, termPrefix);
 }
Пример #44
0
            /// <summary>
            /// Builds an <see cref="SynonymMap"/> and returns it.
            /// </summary>
            public virtual SynonymMap Build()
            {
                ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
                // TODO: are we using the best sharing options?
                var builder = new Builder <BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

                BytesRef            scratch       = new BytesRef(64);
                ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

                ISet <int?> dedupSet;

                if (dedup)
                {
                    dedupSet = new JCG.HashSet <int?>();
                }
                else
                {
                    dedupSet = null;
                }


                var spare = new byte[5];

                ICollection <CharsRef> keys = workingSet.Keys;

                CharsRef[] sortedKeys = keys.ToArray();
#pragma warning disable 612, 618
                System.Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparer);
#pragma warning restore 612, 618


                Int32sRef scratchIntsRef = new Int32sRef();

                //System.out.println("fmap.build");
                for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++)
                {
                    CharsRef input  = sortedKeys[keyIdx];
                    MapEntry output = workingSet[input];

                    int numEntries = output.ords.Count;
                    // output size, assume the worst case
                    int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

                    scratch.Grow(estimatedSize);
                    scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
                    Debug.Assert(scratch.Offset == 0);

                    // now write our output data:
                    int count = 0;
                    for (int i = 0; i < numEntries; i++)
                    {
                        if (dedupSet != null)
                        {
                            // box once
                            int?ent = output.ords[i];
                            if (dedupSet.Contains(ent))
                            {
                                continue;
                            }
                            dedupSet.Add(ent);
                        }
                        scratchOutput.WriteVInt32(output.ords[i]);
                        count++;
                    }

                    int pos = scratchOutput.Position;
                    scratchOutput.WriteVInt32(count << 1 | (output.includeOrig ? 0 : 1));
                    int pos2    = scratchOutput.Position;
                    int vIntLen = pos2 - pos;

                    // Move the count + includeOrig to the front of the byte[]:
                    Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen);
                    Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos);
                    Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen);

                    if (dedupSet != null)
                    {
                        dedupSet.Clear();
                    }

                    scratch.Length = scratchOutput.Position - scratch.Offset;
                    //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
                    builder.Add(Lucene.Net.Util.Fst.Util.ToUTF32(input.ToString(), scratchIntsRef), BytesRef.DeepCopyOf(scratch));
                }

                FST <BytesRef> fst = builder.Finish();
                return(new SynonymMap(fst, words, maxHorizontalContext));
            }
Пример #45
0
 /// <summary>
 /// Inverts only terms starting w/ prefix, and only terms
 /// whose docFreq (not taking deletions into account) is
 /// &lt;= <paramref name="maxTermDocFreq"/>
 /// </summary>
 public DocTermOrds(AtomicReader reader, IBits liveDocs, string field, BytesRef termPrefix, int maxTermDocFreq)
     : this(reader, liveDocs, field, termPrefix, maxTermDocFreq, DEFAULT_INDEX_INTERVAL_BITS)
 {
 }
Пример #46
0
 public override void FinishTerm(BytesRef term, TermStats stats)
 {
 }
Пример #47
0
 /// <summary>
 /// Inverts only terms starting w/ prefix </summary>
 public DocTermOrds(AtomicReader reader, IBits liveDocs, string field, BytesRef termPrefix)
     : this(reader, liveDocs, field, termPrefix, int.MaxValue)
 {
 }
Пример #48
0
 public override PostingsConsumer StartTerm(BytesRef term)
 {
     return(_postingsWriter.Reset(term));
 }
Пример #49
0
            // Look for seek type 3 ("pop"): if the delta from
            // prev -> current was replacing an S with an E,
            // we must now seek to beyond that E.  this seek
            // "finishes" the dance at this character
            // position.
            private bool DoPop()
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  try pop");
                }

                Debug.Assert(newSuffixStart <= prevTerm.Length);
                Debug.Assert(newSuffixStart < scratchTerm.Length || newSuffixStart == 0);

                if (prevTerm.Length > newSuffixStart && IsNonBMPChar(prevTerm.Bytes, newSuffixStart) && IsHighBMPChar(scratchTerm.Bytes, newSuffixStart))
                {
                    // Seek type 2 -- put 0xFF at this position:
                    scratchTerm.Bytes[newSuffixStart] = 0xff;
                    scratchTerm.Length = newSuffixStart + 1;

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("    seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString());
                    }

                    // TODO: more efficient seek?  can we simply swap
                    // the enums?
                    outerInstance.TermsDict.SeekEnum(termEnum, new Term(fieldInfo.Name, scratchTerm), true);

                    Term t2 = termEnum.Term();

                    // We could hit EOF or different field since this
                    // was a seek "forward":
                    if (t2 != null && t2.Field == internedFieldName)
                    {
                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + t2.Bytes);
                        }

                        BytesRef b2 = t2.Bytes;
                        Debug.Assert(b2.Offset == 0);

                        // Set newSuffixStart -- we can't use
                        // termEnum's since the above seek may have
                        // done no scanning (eg, term was precisely
                        // and index term, or, was in the term seek
                        // cache):
                        scratchTerm.CopyBytes(b2);
                        SetNewSuffixStart(prevTerm, scratchTerm);

                        return(true);
                    }
                    else if (newSuffixStart != 0 || scratchTerm.Length != 0)
                    {
                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("      got term=null (or next field)");
                        }
                        newSuffixStart     = 0;
                        scratchTerm.Length = 0;
                        return(true);
                    }
                }

                return(false);
            }
Пример #50
0
            public override BytesRef Next()
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("TE.next()");
                }
                if (skipNext)
                {
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  skipNext=true");
                    }
                    skipNext = false;
                    if (termEnum.Term() == null)
                    {
                        return(null);
                        // PreFlex codec interns field names:
                    }
                    else if (termEnum.Term().Field != internedFieldName)
                    {
                        return(null);
                    }
                    else
                    {
                        return(current = termEnum.Term().Bytes);
                    }
                }

                // TODO: can we use STE's prevBuffer here?
                prevTerm.CopyBytes(termEnum.Term().Bytes);

                if (termEnum.Next() && termEnum.Term().Field == internedFieldName)
                {
                    newSuffixStart = termEnum.newSuffixStart;
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  newSuffixStart=" + newSuffixStart);
                    }
                    SurrogateDance();
                    Term t = termEnum.Term();
                    if (t == null || t.Field != internedFieldName)
                    {
                        // PreFlex codec interns field names; verify:
                        Debug.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal));
                        current = null;
                    }
                    else
                    {
                        current = t.Bytes;
                    }
                    return(current);
                }
                else
                {
                    // this field is exhausted, but we have to give
                    // surrogateDance a chance to seek back:
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  force cont");
                    }
                    //newSuffixStart = prevTerm.length;
                    newSuffixStart = 0;
                    SurrogateDance();

                    Term t = termEnum.Term();
                    if (t == null || t.Field != internedFieldName)
                    {
                        // PreFlex codec interns field names; verify:
                        Debug.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal));
                        return(null);
                    }
                    else
                    {
                        current = t.Bytes;
                        return(current);
                    }
                }
            }
Пример #51
0
 public TermFreqValueSource(string field, string val, string indexedField, BytesRef indexedBytes)
     : base(field, val, indexedField, indexedBytes)
 {
 }
Пример #52
0
 public override void LookupOrd(int ord, BytesRef result)
 {
     @in.LookupOrd(ord, result);
 }
Пример #53
0
 private void InitializeInstanceFields()
 {
     p = new BytesRef(data, 0, 1);
 }
Пример #54
0
        public virtual void Test()
        {
            int[]   ints  = new int[7];
            IntsRef input = new IntsRef(ints, 0, ints.Length);
            int     seed  = Random().Next();

            Directory dir = new MMapDirectory(CreateTempDir("2BFST"));

            for (int doPackIter = 0; doPackIter < 2; doPackIter++)
            {
                bool doPack = doPackIter == 1;

                // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
                if (!doPack)
                {
                    Console.WriteLine("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
                    Outputs <object> outputs   = NoOutputs.Singleton;
                    object           NO_OUTPUT = outputs.NoOutput;
                    Builder <object> b         = new Builder <object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

                    int     count  = 0;
                    Random  r      = new Random(seed);
                    int[]   ints2  = new int[200];
                    IntsRef input2 = new IntsRef(ints2, 0, ints2.Length);
                    while (true)
                    {
                        //System.out.println("add: " + input + " -> " + output);
                        for (int i = 10; i < ints2.Length; i++)
                        {
                            ints2[i] = r.Next(256);
                        }
                        b.Add(input2, NO_OUTPUT);
                        count++;
                        if (count % 100000 == 0)
                        {
                            Console.WriteLine(count + ": " + b.FstSizeInBytes() + " bytes; " + b.TotStateCount + " nodes");
                        }
                        if (b.TotStateCount > int.MaxValue + 100L * 1024 * 1024)
                        {
                            break;
                        }
                        NextInput(r, ints2);
                    }

                    FST <object> fst = b.Finish();

                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        Arrays.Fill(ints2, 0);
                        r = new Random(seed);

                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }
                            for (int j = 10; j < ints2.Length; j++)
                            {
                                ints2[j] = r.Next(256);
                            }
                            Assert.AreEqual(NO_OUTPUT, Util.Get(fst, input2));
                            NextInput(r, ints2);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        IntsRefFSTEnum <object> fstEnum = new IntsRefFSTEnum <object>(fst);

                        Arrays.Fill(ints2, 0);
                        r = new Random(seed);
                        int upto = 0;
                        while (true)
                        {
                            IntsRefFSTEnum <object> .InputOutput <object> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            for (int j = 10; j < ints2.Length; j++)
                            {
                                ints2[j] = r.Next(256);
                            }
                            Assert.AreEqual(input2, pair.Input);
                            Assert.AreEqual(NO_OUTPUT, pair.Output);
                            upto++;
                            NextInput(r, ints2);
                        }
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST <object>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }

                // Build FST w/ ByteSequenceOutputs and stop when FST
                // size = 3GB
                {
                    Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes");
                    Outputs <BytesRef> outputs = ByteSequenceOutputs.Singleton;
                    Builder <BytesRef> b       = new Builder <BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

                    var      outputBytes = new byte[20];
                    BytesRef output      = new BytesRef(outputBytes);
                    Arrays.Fill(ints, 0);
                    int    count = 0;
                    Random r     = new Random(seed);
                    while (true)
                    {
                        r.NextBytes(outputBytes);
                        //System.out.println("add: " + input + " -> " + output);
                        b.Add(input, BytesRef.DeepCopyOf(output));
                        count++;
                        if (count % 1000000 == 0)
                        {
                            Console.WriteLine(count + "...: " + b.FstSizeInBytes() + " bytes");
                        }
                        if (b.FstSizeInBytes() > LIMIT)
                        {
                            break;
                        }
                        NextInput(r, ints);
                    }

                    FST <BytesRef> fst = b.Finish();
                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        r = new Random(seed);
                        Arrays.Fill(ints, 0);

                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }
                            r.NextBytes((byte[])(Array)outputBytes);
                            Assert.AreEqual(output, Util.Get(fst, input));
                            NextInput(r, ints);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        IntsRefFSTEnum <BytesRef> fstEnum = new IntsRefFSTEnum <BytesRef>(fst);

                        Arrays.Fill(ints, 0);
                        r = new Random(seed);
                        int upto = 0;
                        while (true)
                        {
                            IntsRefFSTEnum <BytesRef> .InputOutput <BytesRef> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            Assert.AreEqual(input, pair.Input);
                            r.NextBytes((byte[])(Array)outputBytes);
                            Assert.AreEqual(output, pair.Output);
                            upto++;
                            NextInput(r, ints);
                        }
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST <BytesRef>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }

                // Build FST w/ PositiveIntOutputs and stop when FST
                // size = 3GB
                {
                    Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long");
                    Outputs <long?> outputs = PositiveIntOutputs.Singleton;
                    Builder <long?> b       = new Builder <long?>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

                    long output = 1;

                    Arrays.Fill(ints, 0);
                    int    count = 0;
                    Random r     = new Random(seed);
                    while (true)
                    {
                        //System.out.println("add: " + input + " -> " + output);
                        b.Add(input, output);
                        output += 1 + r.Next(10);
                        count++;
                        if (count % 1000000 == 0)
                        {
                            Console.WriteLine(count + "...: " + b.FstSizeInBytes() + " bytes");
                        }
                        if (b.FstSizeInBytes() > LIMIT)
                        {
                            break;
                        }
                        NextInput(r, ints);
                    }

                    FST <long?> fst = b.Finish();

                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        Arrays.Fill(ints, 0);

                        output = 1;
                        r      = new Random(seed);
                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }

                            // forward lookup:
                            Assert.AreEqual(output, (long)Util.Get(fst, input));
                            // reverse lookup:
                            Assert.AreEqual(input, Util.GetByOutput(fst, output));
                            output += 1 + r.Next(10);
                            NextInput(r, ints);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        IntsRefFSTEnum <long?> fstEnum = new IntsRefFSTEnum <long?>(fst);

                        Arrays.Fill(ints, 0);
                        r = new Random(seed);
                        int upto = 0;
                        output = 1;
                        while (true)
                        {
                            IntsRefFSTEnum <long?> .InputOutput <long?> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            Assert.AreEqual(input, pair.Input);
                            Assert.AreEqual(output, pair.Output.Value);
                            output += 1 + r.Next(10);
                            upto++;
                            NextInput(r, ints);
                        }
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST <long?>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }
            }
            dir.Dispose();
        }
Пример #55
0
        public virtual void Test2BOrds()
        {
            BaseDirectoryWrapper dir = NewFSDirectory(CreateTempDir("2BOrds"));

            if (dir is MockDirectoryWrapper)
            {
                ((MockDirectoryWrapper)dir).Throttling = MockDirectoryWrapper.Throttling_e.NEVER;
            }

            IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()))
                                            .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH).SetRAMBufferSizeMB(256.0).SetMergeScheduler(new ConcurrentMergeScheduler()).SetMergePolicy(NewLogMergePolicy(false, 10)).SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE));

            Document doc = new Document();

            sbyte[]              bytes   = new sbyte[4];
            BytesRef             data    = new BytesRef(bytes);
            SortedDocValuesField dvField = new SortedDocValuesField("dv", data);

            doc.Add(dvField);

            for (int i = 0; i < int.MaxValue; i++)
            {
                bytes[0] = (sbyte)(i >> 24);
                bytes[1] = (sbyte)(i >> 16);
                bytes[2] = (sbyte)(i >> 8);
                bytes[3] = (sbyte)i;
                w.AddDocument(doc);
                if (i % 100000 == 0)
                {
                    Console.WriteLine("indexed: " + i);
                    Console.Out.Flush();
                }
            }

            w.ForceMerge(1);
            w.Dispose();

            Console.WriteLine("verifying...");
            Console.Out.Flush();

            DirectoryReader r       = DirectoryReader.Open(dir);
            int             counter = 0;

            foreach (AtomicReaderContext context in r.Leaves)
            {
                AtomicReader    reader  = context.AtomicReader;
                BytesRef        scratch = new BytesRef();
                BinaryDocValues dv      = reader.GetSortedDocValues("dv");
                for (int i = 0; i < reader.MaxDoc; i++)
                {
                    bytes[0] = (sbyte)(counter >> 24);
                    bytes[1] = (sbyte)(counter >> 16);
                    bytes[2] = (sbyte)(counter >> 8);
                    bytes[3] = (sbyte)counter;
                    counter++;
                    dv.Get(i, scratch);
                    Assert.AreEqual(data, scratch);
                }
            }

            r.Dispose();
            dir.Dispose();
        }
Пример #56
0
 public override int LookupTerm(BytesRef key)
 {
     return((int)@in.LookupTerm(key));
 }
Пример #57
0
            public override bool Collect(BytesRef bytes)
            {
                float boost = boostAtt.Boost;

                // make sure within a single seg we always collect
                // terms in order
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(CompareToLastTerm(bytes));
                }

                //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);
                // ignore uncompetitive hits
                if (stQueue.Count == maxSize)
                {
                    ScoreTerm t = stQueue.Peek();
                    if (boost < t.Boost)
                    {
                        return(true);
                    }
                    if (boost == t.Boost && termComp.Compare(bytes, t.Bytes) > 0)
                    {
                        return(true);
                    }
                }
                TermState state = termsEnum.GetTermState();

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(state != null);
                }
                if (visitedTerms.TryGetValue(bytes, out ScoreTerm t2))
                {
                    // if the term is already in the PQ, only update docFreq of term in PQ
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(t2.Boost == boost, "boost should be equal in all segment TermsEnums");
                    }
                    t2.TermState.Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
                }
                else
                {
                    // add new entry in PQ, we must clone the term, else it may get overwritten!
                    st.Bytes.CopyBytes(bytes);
                    st.Boost = boost;
                    visitedTerms[st.Bytes] = st;
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(st.TermState.DocFreq == 0);
                    }
                    st.TermState.Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
                    stQueue.Add(st);
                    // possibly drop entries from queue
                    if (stQueue.Count > maxSize)
                    {
                        st = stQueue.Dequeue();
                        visitedTerms.Remove(st.Bytes);
                        st.TermState.Clear(); // reset the termstate!
                    }
                    else
                    {
                        st = new ScoreTerm(termComp, new TermContext(m_topReaderContext));
                    }
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(stQueue.Count <= maxSize, "the PQ size must be limited to maxSize");
                    }
                    // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                    if (stQueue.Count == maxSize)
                    {
                        t2 = stQueue.Peek();
                        maxBoostAtt.MaxNonCompetitiveBoost = t2.Boost;
                        maxBoostAtt.CompetitiveTerm        = t2.Bytes;
                    }
                }

                return(true);
            }
Пример #58
0
 public override bool BytesVal(int doc, BytesRef target)
 {
     target.CopyBytes(outerInstance.m_bytesRef);
     return(true);
 }
Пример #59
0
        public virtual void Test([ValueSource(typeof(ConcurrentMergeSchedulers), "Values")] IConcurrentMergeScheduler scheduler)
        {
            MockDirectoryWrapper dir = new MockDirectoryWrapper(Random(), new MMapDirectory(CreateTempDir("4GBStoredFields")));

            dir.Throttling = MockDirectoryWrapper.Throttling_e.NEVER;

            var config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()))
                         .SetMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
                         .SetRAMBufferSizeMB(256.0)
                         .SetMergeScheduler(scheduler)
                         .SetMergePolicy(NewLogMergePolicy(false, 10))
                         .SetOpenMode(IndexWriterConfig.OpenMode_e.CREATE);
            IndexWriter w = new IndexWriter(dir, config);

            MergePolicy mp = w.Config.MergePolicy;

            if (mp is LogByteSizeMergePolicy)
            {
                // 1 petabyte:
                ((LogByteSizeMergePolicy)mp).MaxMergeMB = 1024 * 1024 * 1024;
            }

            Document  doc = new Document();
            FieldType ft  = new FieldType();

            ft.Indexed = false;
            ft.Stored  = true;
            ft.Freeze();
            int valueLength = RandomInts.NextIntBetween(Random(), 1 << 13, 1 << 20);
            var value       = new byte[valueLength];

            for (int i = 0; i < valueLength; ++i)
            {
                // random so that even compressing codecs can't compress it
                value[i] = (byte)Random().Next(256);
            }
            Field f = new Field("fld", value, ft);

            doc.Add(f);

            int numDocs = (int)((1L << 32) / valueLength + 100);

            for (int i = 0; i < numDocs; ++i)
            {
                w.AddDocument(doc);
                if (VERBOSE && i % (numDocs / 10) == 0)
                {
                    Console.WriteLine(i + " of " + numDocs + "...");
                }
            }
            w.ForceMerge(1);
            w.Dispose();
            if (VERBOSE)
            {
                bool found = false;
                foreach (string file in dir.ListAll())
                {
                    if (file.EndsWith(".fdt"))
                    {
                        long fileLength = dir.FileLength(file);
                        if (fileLength >= 1L << 32)
                        {
                            found = true;
                        }
                        Console.WriteLine("File length of " + file + " : " + fileLength);
                    }
                }
                if (!found)
                {
                    Console.WriteLine("No .fdt file larger than 4GB, test bug?");
                }
            }

            DirectoryReader rd = DirectoryReader.Open(dir);
            Document        sd = rd.Document(numDocs - 1);

            Assert.IsNotNull(sd);
            Assert.AreEqual(1, sd.Fields.Count);
            BytesRef valueRef = sd.GetBinaryValue("fld");

            Assert.IsNotNull(valueRef);
            Assert.AreEqual(new BytesRef(value), valueRef);
            rd.Dispose();

            dir.Dispose();
        }
Пример #60
0
        /// <summary>
        /// Provide spelling corrections based on several parameters.
        /// </summary>
        /// <param name="term"> The term to suggest spelling corrections for </param>
        /// <param name="numSug"> The maximum number of spelling corrections </param>
        /// <param name="ir"> The index reader to fetch the candidate spelling corrections from </param>
        /// <param name="docfreq"> The minimum document frequency a potential suggestion need to have in order to be included </param>
        /// <param name="editDistance"> The maximum edit distance candidates are allowed to have </param>
        /// <param name="accuracy"> The minimum accuracy a suggested spelling correction needs to have in order to be included </param>
        /// <param name="spare"> a chars scratch </param>
        /// <returns> a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. </returns>
        /// <exception cref="System.IO.IOException"> If I/O related errors occur </exception>
        protected internal virtual IEnumerable <ScoreTerm> SuggestSimilar(Term term, int numSug, IndexReader ir,
                                                                          int docfreq, int editDistance, float accuracy, CharsRef spare)
        {
            var atts = new AttributeSource();
            IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>();
            Terms terms = MultiFields.GetTerms(ir, term.Field);

            if (terms == null)
            {
                return(new List <ScoreTerm>());
            }
            FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.Max(minPrefix, editDistance - 1), true);

            var stQueue = new Support.PriorityQueue <ScoreTerm>();

            BytesRef        queryTerm = new BytesRef(term.Text());
            BytesRef        candidateTerm;
            ScoreTerm       st       = new ScoreTerm();
            IBoostAttribute boostAtt = e.Attributes.AddAttribute <IBoostAttribute>();

            while ((candidateTerm = e.Next()) != null)
            {
                float boost = boostAtt.Boost;
                // ignore uncompetitive hits
                if (stQueue.Count >= numSug && boost <= stQueue.Peek().Boost)
                {
                    continue;
                }

                // ignore exact match of the same term
                if (queryTerm.BytesEquals(candidateTerm))
                {
                    continue;
                }

                int df = e.DocFreq;

                // check docFreq if required
                if (df <= docfreq)
                {
                    continue;
                }

                float  score;
                string termAsString;
                if (distance == INTERNAL_LEVENSHTEIN)
                {
                    // delay creating strings until the end
                    termAsString = null;
                    // undo FuzzyTermsEnum's scale factor for a real scaled lev score
                    score = boost / e.ScaleFactor + e.MinSimilarity;
                }
                else
                {
                    UnicodeUtil.UTF8toUTF16(candidateTerm, spare);
                    termAsString = spare.ToString();
                    score        = distance.GetDistance(term.Text(), termAsString);
                }

                if (score < accuracy)
                {
                    continue;
                }

                // add new entry in PQ
                st.Term         = BytesRef.DeepCopyOf(candidateTerm);
                st.Boost        = boost;
                st.Docfreq      = df;
                st.TermAsString = termAsString;
                st.Score        = score;
                stQueue.Offer(st);
                // possibly drop entries from queue
                st = (stQueue.Count > numSug) ? stQueue.Poll() : new ScoreTerm();
                maxBoostAtt.MaxNonCompetitiveBoost = (stQueue.Count >= numSug) ? stQueue.Peek().Boost : float.NegativeInfinity;
            }

            return(stQueue);
        }