Beispiel #1
0
        /// <summary>
        /// Computes the stem(s) of the provided word.
        /// </summary>
        /// <param name="word"> Character buffer containing the word to find the stems for </param>
        /// <param name="length"> Number of characters of <paramref name="word"/> to use </param>
        /// <returns>
        /// List of stems for the word: one entry per form returned by the dictionary lookup,
        /// followed by everything the ten-argument <c>Stem</c> overload produces.
        /// </returns>
        public IList <CharsRef> Stem(char[] word, int length)
        {
            char[] input       = word;
            int    inputLength = length;

            if (dictionary.needsInputCleaning)
            {
                // Pass the raw characters through the dictionary's input cleaning and
                // carry on with the content of 'segment' copied into the reusable buffer.
                scratchSegment.Length = 0;
                scratchSegment.Append(word, 0, length);
                string cleanedText = dictionary.CleanInput(scratchSegment.ToString(), segment);
                scratchBuffer = ArrayUtil.Grow(scratchBuffer, cleanedText.Length);
                inputLength   = segment.Length;
                segment.CopyTo(0, scratchBuffer, 0, inputLength);
                input = scratchBuffer;
            }

            var     result     = new List <CharsRef>();
            IntsRef exactForms = dictionary.LookupWord(input, 0, inputLength);

            if (exactForms != null)
            {
                // TODO: some forms should not be added, e.g. ONLYINCOMPOUND
                // just because it exists, does not make it valid...
                int formIndex = 0;
                while (formIndex < exactForms.Length)
                {
                    result.Add(NewStem(input, inputLength));
                    formIndex++;
                }
            }

            result.AddRange(Stem(input, inputLength, -1, -1, -1, 0, true, true, false, false));
            return result;
        }
Beispiel #2
0
            /// <summary>
            /// Decides whether a completion path is kept: surface forms already seen are
            /// rejected, and in exactFirst mode so is any path whose surface form matches the key.
            /// </summary>
            protected override bool AcceptResult(IntsRef input, PairOutputs <long?, BytesRef> .Pair output)
            {
                var surfaceForm = output.Output2;

                // Dedup: when the input analyzes to a graph we
                // can get duplicate surface forms:
                if (seen.Contains(surfaceForm))
                {
                    return false;
                }
                seen.Add(surfaceForm);

                if (!outerInstance.exactFirst)
                {
                    return true;
                }

                // In exactFirst mode, don't accept any paths
                // matching the surface form since that will
                // create duplicate results:
                if (!outerInstance.SameSurfaceForm(utf8Key, surfaceForm))
                {
                    return true;
                }

                // We found exact match, which means we should
                // have already found it in the first search:
                Debug.Assert(results.Count == 1);
                return false;
            }
Beispiel #3
0
            /// <summary>
            /// Builds the <see cref="NormalizeCharMap"/> from the pending pairs; call this once
            /// you are done adding mappings. The pending pairs are cleared after a successful build.
            /// </summary>
            /// <returns> A new <see cref="NormalizeCharMap"/> wrapping the FST that was built </returns>
            /// <exception cref="Exception"> Wraps an <see cref="IOException"/> raised while building the FST </exception>
            public virtual NormalizeCharMap build()
            {
                FST <CharsRef> map;

                try
                {
                    Outputs <CharsRef> outputs = CharSequenceOutputs.Singleton;
                    Builder <CharsRef> builder = new Builder <CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
                    IntsRef            scratch = new IntsRef();
                    foreach (KeyValuePair <string, string> ent in pendingPairs.SetOfKeyValuePairs())
                    {
                        builder.add(Util.toUTF16(ent.Key, scratch), new CharsRef(ent.Value));
                    }
                    map = builder.finish();
                    pendingPairs.Clear();
                }
                catch (IOException ioe)
                {
                    // Bogus FST IOExceptions!!  (will never happen)
                    // System.Exception has no (Exception) constructor; pass the message and
                    // keep the original exception as InnerException so its stack trace survives.
                    throw new Exception(ioe.Message, ioe);
                }

                return(new NormalizeCharMap(map));
            }
Beispiel #4
0
        /// <summary>
        /// Adds the same input several times through a <c>ListOfOutputs</c> wrapper and checks
        /// that the lookup returns every output for that input, in insertion order.
        /// </summary>
        public void TestListOfOutputs()
        {
            PositiveIntOutputs innerOutputs = PositiveIntOutputs.Singleton;
            var listOutputs = new ListOfOutputs<long?>(innerOutputs);
            var fstBuilder  = new Builder<object>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE1, listOutputs);
            var scratchInts = new IntsRef();

            // Add the same input more than once and the outputs
            // are merged:
            foreach (long value in new long[] { 1L, 3L, 0L })
            {
                fstBuilder.Add(Util.ToIntsRef(new BytesRef("a"), scratchInts), value);
            }
            fstBuilder.Add(Util.ToIntsRef(new BytesRef("b"), scratchInts), 17L);
            FST<object> fst = fstBuilder.Finish();

            // "a" carries all three outputs.
            object outputForA = Util.Get(fst, new BytesRef("a"));
            assertNotNull(outputForA);
            IList<long?> valuesForA = listOutputs.AsList(outputForA);
            assertEquals(3, valuesForA.size());
            assertEquals(1L, valuesForA[0]);
            assertEquals(3L, valuesForA[1]);
            assertEquals(0L, valuesForA[2]);

            // "b" carries its single output.
            object outputForB = Util.Get(fst, new BytesRef("b"));
            assertNotNull(outputForB);
            IList<long?> valuesForB = listOutputs.AsList(outputForB);
            assertEquals(1, valuesForB.size());
            assertEquals(17L, valuesForB[0]);
        }
Beispiel #5
0
        /// <summary>
        /// Runs <c>DoTest</c> on randomly generated sets of distinct terms.
        /// </summary>
        /// <param name="maxNumWords"> Inclusive upper bound for the number of terms generated per run </param>
        /// <param name="numIter"> Number of iterations; each iteration covers input modes 0 and 1 </param>
        private void TestRandomWords(int maxNumWords, int numIter)
        {
            Random random = new Random(Random().Next());

            for (int iter = 0; iter < numIter; iter++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST: iter " + iter);
                }
                for (int inputMode = 0; inputMode < 2; inputMode++)
                {
                    int            numWords = random.nextInt(maxNumWords + 1);
                    ISet <IntsRef> termsSet = new HashSet <IntsRef>();

                    // Keep drawing random strings until the set holds numWords distinct terms.
                    // (The original also allocated an IntsRef[numWords] that was never read.)
                    while (termsSet.size() < numWords)
                    {
                        string term = FSTTester <object> .GetRandomString(random);

                        termsSet.Add(FSTTester <object> .ToIntsRef(term, inputMode));
                    }
                    DoTest(inputMode, termsSet.ToArray());
                }
            }
        }
Beispiel #6
0
 /// <summary>
 /// Seeks to the biggest term that is &lt;= <paramref name="target"/>.
 /// </summary>
 /// <param name="target"> Term to seek to; its length is recorded as the target length </param>
 /// <returns> Whatever <c>SetResult()</c> yields after the base floor seek has run </returns>
 public IntsRefFSTEnum.InputOutput <T> SeekFloor(IntsRef target)
 {
     // Record the seek target before delegating to the shared seek logic.
     this.target  = target;
     targetLength = target.Length;

     base.DoSeekFloor();

     var result = SetResult();
     return result;
 }
Beispiel #7
0
	  /// <summary>
	  /// Computes the stem(s) of the provided word.
	  /// </summary>
	  /// <param name="word"> Character buffer containing the word to find the stems for </param>
	  /// <param name="length"> Number of characters of <paramref name="word"/> to use </param>
	  /// <returns> List of stems for the word </returns>
	  public IList<CharsRef> stem(char[] word, int length)
	  {
		char[] input = word;
		int inputLength = length;

		if (dictionary.needsInputCleaning)
		{
		  // Pass the raw characters through the dictionary's input cleaning and
		  // carry on with the content of 'segment' copied into the reusable buffer.
		  scratchSegment.Length = 0;
		  scratchSegment.Append(word, 0, length);
		  CharSequence cleanedText = dictionary.cleanInput(scratchSegment, segment);
		  scratchBuffer = ArrayUtil.grow(scratchBuffer, cleanedText.length());
		  inputLength = segment.Length;
		  segment.getChars(0, inputLength, scratchBuffer, 0);
		  input = scratchBuffer;
		}

		IList<CharsRef> result = new List<CharsRef>();
		IntsRef exactForms = dictionary.lookupWord(input, 0, inputLength);
		if (exactForms != null)
		{
		  // TODO: some forms should not be added, e.g. ONLYINCOMPOUND
		  // just because it exists, does not make it valid...
		  int formIndex = 0;
		  while (formIndex < exactForms.length)
		  {
			result.Add(newStem(input, inputLength));
			formIndex++;
		  }
		}
		result.AddRange(stem(input, inputLength, -1, -1, -1, 0, true, true, false, false));
		return result;
	  }
        /// <summary>
        /// Returns the sorted doc values for <paramref name="field"/>. The field's FST is read
        /// from <c>data</c> the first time it is requested and cached in <c>fstInstances</c>.
        /// </summary>
        /// <param name="field"> Field whose number selects the FST entry </param>
        /// <returns> <c>DocValues.EMPTY_SORTED</c> when the entry has no ords, otherwise a new helper instance </returns>
        public override SortedDocValues GetSorted(FieldInfo field)
        {
            FSTEntry entry = fsts[field.Number];

            if (entry.numOrds == 0)
            {
                return(DocValues.EMPTY_SORTED);
            }
            FST <long?> instance;

            lock (this)
            {
                // A dictionary indexer throws KeyNotFoundException for a field that has not been
                // loaded yet (unlike Java's Map.get, which returns null), so probe with TryGetValue.
                if (!fstInstances.TryGetValue(field.Number, out instance) || instance == null)
                {
                    data.Seek(entry.offset);
                    instance = new FST <long?>(data, PositiveIntOutputs.Singleton);
                    ramBytesUsed.AddAndGet(instance.SizeInBytes());
                    fstInstances[field.Number] = instance;
                }
            }
            var docToOrd = GetNumeric(field);
            var fst      = instance;

            // per-thread resources
            var @in         = fst.BytesReader;
            var firstArc    = new FST.Arc <long?>();
            var scratchArc  = new FST.Arc <long?>();
            var scratchInts = new IntsRef();
            var fstEnum     = new BytesRefFSTEnum <long?>(fst);

            return(new SortedDocValuesAnonymousInnerClassHelper(entry, docToOrd, fst, @in, firstArc, scratchArc,
                                                                scratchInts, fstEnum));
        }
Beispiel #9
0
 /// <summary>
 /// Collects into <paramref name="strings"/> the strings that can be produced from the given
 /// state. Returns false if a transition leads back to a state on the current path, or if
 /// more than <code>limit</code> strings are found.
 /// <code>limit</code>&lt;0 means "infinite".
 /// </summary>
 private static bool GetFiniteStrings(State s, HashSet <State> pathstates, HashSet <IntsRef> strings, IntsRef path, int limit)
 {
     pathstates.Add(s);
     foreach (Transition transition in s.Transitions)
     {
         var next = transition.To;

         // The destination is already on the current path: give up.
         if (pathstates.Contains(next))
         {
             return false;
         }

         for (int label = transition.Min_Renamed; label <= transition.Max_Renamed; label++)
         {
             // Append this label to the path.
             path.Grow(path.Length + 1);
             path.Ints[path.Length++] = label;

             if (next.accept)
             {
                 strings.Add(IntsRef.DeepCopyOf(path));
                 if (limit >= 0 && strings.Count > limit)
                 {
                     return false;
                 }
             }

             bool finite = GetFiniteStrings(next, pathstates, strings, path, limit);
             if (!finite)
             {
                 return false;
             }

             // Drop the label again before trying the next one.
             path.Length--;
         }
     }
     pathstates.Remove(s);
     return true;
 }
        /// <summary>
        /// Creates the sorted doc values view for <paramref name="field"/>, reading the field's
        /// FST from <c>Data</c> on first use and reusing the cached instance afterwards.
        /// </summary>
        public override SortedDocValues GetSorted(FieldInfo field)
        {
            FSTEntry   fstEntry = Fsts[field.Number];
            FST<long?> fieldFst;

            lock (this)
            {
                bool cached = FstInstances.TryGetValue(field.Number, out fieldFst);
                if (!cached)
                {
                    Data.Seek(fstEntry.Offset);
                    fieldFst = new FST<long?>(Data, PositiveIntOutputs.Singleton);
                    RamBytesUsed_Renamed.AddAndGet(fieldFst.SizeInBytes());
                    FstInstances[field.Number] = fieldFst;
                }
            }

            var ordinals = GetNumeric(field);

            // per-thread resources
            var bytesReader = fieldFst.BytesReader;
            var first       = new FST.Arc<long?>();
            var scratch     = new FST.Arc<long?>();
            var ints        = new IntsRef();
            var termsEnum   = new BytesRefFSTEnum<long?>(fieldFst);

            return new SortedDocValuesAnonymousInnerClassHelper(
                fstEntry, ordinals, fieldFst, bytesReader, first, scratch, ints, termsEnum);
        }
Beispiel #11
0
 /// <summary>
 /// Seeks to the smallest term that is >= <paramref name="target"/>.
 /// </summary>
 /// <param name="target"> Term to seek to; its length is recorded as the target length </param>
 /// <returns> Whatever <c>SetResult()</c> yields after the base ceiling seek has run </returns>
 public InputOutput <T> SeekCeil(IntsRef target)
 {
     // Record the seek target before delegating to the shared seek logic.
     this.Target  = target;
     TargetLength = target.Length;

     base.DoSeekCeil();

     var result = SetResult();
     return result;
 }
        /// <summary>
        /// Creates the sorted-set doc values view for <paramref name="field"/>, reading the
        /// field's FST from <c>Data</c> on first use and reusing the cached instance afterwards.
        /// </summary>
        public override SortedSetDocValues GetSortedSet(FieldInfo field)
        {
            FSTEntry fstEntry = Fsts[field.Number];

            if (fstEntry.NumOrds == 0)
            {
                // empty FST!
                return DocValues.EMPTY_SORTED_SET;
            }

            FST<long?> fieldFst;

            lock (this)
            {
                bool cached = FstInstances.TryGetValue(field.Number, out fieldFst);
                if (!cached)
                {
                    Data.Seek(fstEntry.Offset);
                    fieldFst = new FST<long?>((DataInput)Data, Lucene.Net.Util.Fst.PositiveIntOutputs.Singleton);
                    RamBytesUsed_Renamed.AddAndGet(fieldFst.SizeInBytes());
                    FstInstances[field.Number] = fieldFst;
                }
            }

            BinaryDocValues ordinals = GetBinary(field);

            // per-thread resources
            var bytesReader = fieldFst.BytesReader;
            var first       = new FST.Arc<long?>();
            var scratch     = new FST.Arc<long?>();
            var ints        = new IntsRef();
            var termsEnum   = new BytesRefFSTEnum<long?>(fieldFst);
            var bytes       = new BytesRef();
            var dataInput   = new ByteArrayDataInput();

            return new SortedSetDocValuesAnonymousInnerClassHelper(
                fstEntry, ordinals, fieldFst, bytesReader, first, scratch, ints, termsEnum, bytes, dataInput);
        }
Beispiel #13
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testSimpleDictionary() throws Exception
        /// <summary>
        /// Loads the "simple.aff" / "simple.dic" resources into a <c>Dictionary</c> and checks
        /// suffix, prefix and word lookups together with the decoded flags of two words.
        /// </summary>
        public virtual void testSimpleDictionary()
        {
            // 'using' releases both streams even when an assertion below fails;
            // the explicit Close() calls used to be reached only on success.
            using (System.IO.Stream affixStream = this.GetType().getResourceAsStream("simple.aff"))
            using (System.IO.Stream dictStream = this.GetType().getResourceAsStream("simple.dic"))
            {
                Dictionary dictionary = new Dictionary(affixStream, dictStream);

                assertEquals(3, dictionary.lookupSuffix(new char[] { 'e' }, 0, 1).length);
                assertEquals(1, dictionary.lookupPrefix(new char[] { 's' }, 0, 1).length);
                IntsRef ordList = dictionary.lookupWord(new char[] { 'o', 'l', 'r' }, 0, 3);

                assertNotNull(ordList);
                assertEquals(1, ordList.length);

                BytesRef @ref = new BytesRef();

                dictionary.flagLookup.get(ordList.ints[0], @ref);
                char[] flags = Dictionary.decodeFlags(@ref);
                assertEquals(1, flags.Length);

                ordList = dictionary.lookupWord(new char[] { 'l', 'u', 'c', 'e', 'n' }, 0, 5);
                assertNotNull(ordList);
                assertEquals(1, ordList.length);
                dictionary.flagLookup.get(ordList.ints[0], @ref);
                flags = Dictionary.decodeFlags(@ref);
                assertEquals(1, flags.Length);
            }
        }
Beispiel #14
0
        /// <summary>
        /// Converts two fixed word lists to sorted <c>IntsRef</c> terms and runs
        /// <c>FSTTester</c> with no-output pairs built from the first list.
        /// </summary>
        public void testBasicFSA()
        {
            string[] words = { "station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat" };
            string[] words2 = { "station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation" };
            var terms = new IntsRef[words.Length];
            var terms2 = new IntsRef[words2.Length];

            for (int inputMode = 0; inputMode < 1; inputMode++) //TODO: inputMode=2
            {
                log.Debug("> inputMode={inputMode}", inputMode);

                int slot = 0;
                foreach (string word in words)
                {
                    terms[slot++] = toIntsRef(word, inputMode);
                }

                slot = 0;
                foreach (string word in words2)
                {
                    terms2[slot++] = toIntsRef(word, inputMode);
                }

                Array.Sort(terms);
                Array.Sort(terms2);

                Outputs<object> outputs = NoOutputs.getSingleton();
                object noOutput = outputs.getNoOutput();

                // Pair every term of the first list with the shared "no output" marker.
                var pairs = new List<InputOutput<object>>();
                for (int k = 0; k < terms.Length; k++)
                {
                    pairs.Add(new InputOutput<object>(terms[k], noOutput));
                }

                new FSTTester<object>(r, inputMode, pairs, outputs, false).doTest();
            }
        }
Beispiel #15
0
 /// <summary>
 /// A default-constructed <c>IntsRef</c> uses the shared empty array with zero offset and length.
 /// </summary>
 public virtual void TestEmpty()
 {
     var empty = new IntsRef();

     Assert.AreEqual(IntsRef.EMPTY_INTS, empty.Ints);
     Assert.AreEqual(0, empty.Offset);
     Assert.AreEqual(0, empty.Length);
 }
Beispiel #16
0
        /// <summary>
        /// Looks up the output for this input, or returns <c>default(T)</c> if the
        ///  input is not accepted.
        /// </summary>
        /// <param name="fst"> FST to search </param>
        /// <param name="input"> Labels to follow, taken from <c>input.Offset</c> for <c>input.Length</c> entries </param>
        public static T Get <T>(FST <T> fst, IntsRef input)
        {
            // TODO: would be nice not to alloc this on every lookup
            var arc    = fst.GetFirstArc(new FST <T> .Arc <T>());
            var reader = fst.BytesReader;

            // Accumulate output as we go
            T accumulated = fst.Outputs.NoOutput;

            int limit = input.Offset + input.Length;
            for (int pos = input.Offset; pos < limit; pos++)
            {
                var found = fst.FindTargetArc(input.Ints[pos], arc, arc, reader);
                if (found == null)
                {
                    // No arc for this label: the input is not in the FST.
                    return default(T);
                }
                accumulated = fst.Outputs.Add(accumulated, arc.Output);
            }

            // Only a final arc means the whole input was accepted.
            return arc.Final ? fst.Outputs.Add(accumulated, arc.NextFinalOutput) : default(T);
        }
Beispiel #17
0
        /// <summary>
        /// Reads <paramref name="num"/> lines from <paramref name="reader"/>, each expected to
        /// split into exactly three whitespace-separated parts, and builds an FST that maps
        /// the second part of each line to the third.
        /// </summary>
        /// <exception cref="Exception"> A line does not split into three parts </exception>
        /// <exception cref="System.InvalidOperationException"> <c>Put</c> returned a non-null value for a key (reported as a duplicate mapping) </exception>
        private FST <CharsRef> ParseConversions(TextReader reader, int num)
        {
            IDictionary<string, string> conversions = new SortedDictionary<string, string>();

            for (int remaining = num; remaining > 0; remaining--)
            {
                string   line   = reader.ReadLine();
                string[] fields = whitespacePattern.Split(line);

                if (fields.Length != 3)
                {
                    throw new Exception("invalid syntax: " + line /*, reader.LineNumber */); // LUCENENET TODO: LineNumberReader
                }

                var previous = conversions.Put(fields[1], fields[2]);
                if (previous != null)
                {
                    throw new System.InvalidOperationException("duplicate mapping specified for: " + fields[1]);
                }
            }

            Outputs<CharsRef> fstOutputs = CharSequenceOutputs.Singleton;
            var fstBuilder = new Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, fstOutputs);
            var scratch    = new IntsRef();

            // The sorted dictionary hands the keys to the builder in sorted order.
            foreach (KeyValuePair<string, string> conversion in conversions)
            {
                Lucene.Net.Util.Fst.Util.ToUTF16(conversion.Key, scratch);
                fstBuilder.Add(scratch, new CharsRef(conversion.Value));
            }

            return fstBuilder.Finish();
        }
        /// <summary>
        /// Builds the final automaton from the entries of <paramref name="sorter"/>, skipping
        /// any entry that compares equal to the previously added one.
        /// </summary>
        /// <returns> The finished FST, or null when the sorter produced no entries </returns>
        private FST <object> BuildAutomaton(BytesRefSorter sorter)
        {
            Outputs<object> noOutputs = NoOutputs.Singleton;
            object          noOutput  = noOutputs.NoOutput;
            var fstBuilder = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, noOutputs, null, false, PackedInts.DEFAULT, true, 15);

            var previous    = new BytesRef();
            var scratchInts = new IntsRef();
            int seen        = 0;

            BytesRefIterator entries = sorter.GetEnumerator();
            for (BytesRef current = entries.Next(); current != null; current = entries.Next())
            {
                seen++;
                if (previous.CompareTo(current) == 0)
                {
                    // Same bytes as the last entry that was added.
                    continue;
                }
                fstBuilder.Add(Util.Fst.Util.ToIntsRef(current, scratchInts), noOutput);
                previous.CopyBytes(current);
            }

            if (seen == 0)
            {
                return null;
            }
            return fstBuilder.Finish();
        }
Beispiel #19
0
 /// <summary>
 /// Sole constructor; stores the four arguments in the fields of the same name.
 /// </summary>
 /// <param name="state"> Automaton state of this path </param>
 /// <param name="fstNode"> FST arc of this path </param>
 /// <param name="output"> Output associated with this path </param>
 /// <param name="input"> Input associated with this path </param>
 public Path(State state, FST.Arc <T> fstNode, T output, IntsRef input)
 {
     this.input   = input;
     this.output  = output;
     this.fstNode = fstNode;
     this.state   = state;
 }
Beispiel #20
0
            /// <summary>
            /// If the path extended by its current arc is competitive, adds it to the queue.
            /// When the queue is already at <c>MaxQueueDepth</c>, the candidate must beat the
            /// queue's greatest entry on cost, with ties broken by comparing the inputs.
            /// After an insert that makes the queue one larger than <c>MaxQueueDepth</c>, the
            /// greatest entry is removed again.
            /// </summary>
            /// <param name="path"> Path whose current arc is being considered </param>
            protected virtual void AddIfCompetitive(FSTPath <T> path)
            {
                Debug.Assert(Queue != null);

                T cost = Fst.Outputs.Add(path.Cost, path.Arc.Output);

                //System.out.println("  addIfCompetitive queue.size()=" + queue.size() + " path=" + path + " + label=" + path.arc.label);

                if (Queue.Count == MaxQueueDepth)
                {
                    FSTPath <T> bottom = Queue.Max;
                    int         comp   = Comparator.Compare(cost, bottom.Cost);
                    if (comp > 0)
                    {
                        // Doesn't compete
                        return;
                    }
                    else if (comp == 0)
                    {
                        // Tie break by alpha sort on the input:
                        // temporarily append the arc label, compare, then restore the length.
                        path.Input.Grow(path.Input.Length + 1);
                        path.Input.Ints[path.Input.Length++] = path.Arc.Label;
                        int cmp = bottom.Input.CompareTo(path.Input);
                        path.Input.Length--;

                        // We should never see dups:
                        Debug.Assert(cmp != 0);

                        if (cmp < 0)
                        {
                            // Doesn't compete
                            return;
                        }
                    }
                    // Competes
                }
                // else: Queue isn't full yet, so any path we hit competes.

                // copy over the current input to the new input
                // and add the arc.label to the end
                IntsRef newInput = new IntsRef(path.Input.Length + 1);

                Array.Copy(path.Input.Ints, 0, newInput.Ints, 0, path.Input.Length);
                newInput.Ints[path.Input.Length] = path.Arc.Label;
                newInput.Length = path.Input.Length + 1;
                FSTPath <T> newPath = new FSTPath <T>(cost, path.Arc, newInput);

                Queue.Add(newPath);

                if (Queue.Count == MaxQueueDepth + 1)
                {
                    // Java's pollLast() removes the last element. LINQ's Last() only reads it,
                    // which let the queue grow without bound, so remove the greatest entry explicitly.
                    Queue.Remove(Queue.Max);
                }
            }
Beispiel #21
0
 /// <summary>
 /// Copies the bytes of <paramref name="input"/>, each taken as an unsigned value,
 ///  into <paramref name="scratch"/> and returns it.
 /// </summary>
 /// <param name="input"> Source bytes, read from <c>input.Offset</c> for <c>input.Length</c> entries </param>
 /// <param name="scratch"> Destination; grown to hold the values, written from index 0 </param>
 /// <returns> The same <paramref name="scratch"/> instance </returns>
 public static IntsRef ToIntsRef(BytesRef input, IntsRef scratch)
 {
     int count = input.Length;
     scratch.Grow(count);

     int source = input.Offset;
     for (int dest = 0; dest < count; dest++, source++)
     {
         // Mask to the low eight bits so the value is in 0..255.
         scratch.Ints[dest] = input.Bytes[source] & 0xFF;
     }

     scratch.Length = count;
     return scratch;
 }
 /// <summary>
 /// Stores the FST entry, ordinal source, FST and scratch objects this helper works with.
 /// </summary>
 public SortedDocValuesAnonymousInnerClassHelper(FSTEntry entry, NumericDocValues docToOrd, FST <long?> fst, FST.BytesReader @in, FST.Arc <long?> firstArc, FST.Arc <long?> scratchArc, IntsRef scratchInts, BytesRefFSTEnum <long?> fstEnum)
 {
     // Field names differ from the parameters only by case, except for @in.
     Entry    = entry;
     DocToOrd = docToOrd;
     Fst      = fst;
     FstEnum  = fstEnum;

     this.@in    = @in;
     FirstArc    = firstArc;
     ScratchArc  = scratchArc;
     ScratchInts = scratchInts;
 }
Beispiel #23
0
        /// <summary>
        /// Converts a string into an <c>IntsRef</c> holding one code point per entry.
        /// </summary>
        private static IntsRef ToIntsRef(string s)
        {
            // worst case: every UTF-16 unit is its own code point
            var result = new IntsRef(s.Length);
            int end    = s.Length;
            int index  = 0;

            while (index < end)
            {
                int codePoint = Character.CodePointAt(s, index);
                result.Ints[result.Length++] = codePoint;

                // Advance by however many chars this code point occupies.
                index += Character.CharCount(codePoint);
            }
            return result;
        }
        /// <summary>
        /// Converts a string into an <c>IntsRef</c> holding one Unicode code point per entry.
        /// </summary>
        /// <param name="s"> UTF-16 text to convert </param>
        /// <returns> A new <c>IntsRef</c> with the code points of <paramref name="s"/> in order </returns>
        private static IntsRef toIntsRef(string s)
        {
            IntsRef @ref     = new IntsRef(s.Length); // worst case
            int     utf16Len = s.Length;

            for (int i = 0; i < utf16Len;)
            {
                // System.Char has no codePointAt/charCount members (those are Java's Character
                // API), so decode surrogate pairs by hand. An unpaired surrogate is passed
                // through as its own value, which is what Java's codePointAt does.
                int cp = s[i];
                if (char.IsHighSurrogate(s[i]) && i + 1 < utf16Len && char.IsLowSurrogate(s[i + 1]))
                {
                    cp = char.ConvertToUtf32(s[i], s[i + 1]);
                }
                @ref.ints[@ref.length++] = cp;

                // Supplementary code points occupy two UTF-16 units, everything else one.
                i += cp >= 0x10000 ? 2 : 1;
            }
            return(@ref);
        }
Beispiel #25
0
 /// <summary>
 /// Stores the owning producer together with the FST entry, ordinal source, FST and
 /// scratch objects this helper works with.
 /// </summary>
 public SortedDocValuesAnonymousInnerClassHelper(Lucene42DocValuesProducer outerInstance, FSTEntry entry, NumericDocValues docToOrd, FST <long> fst, FST <long> .BytesReader @in, FST <long> .Arc <long> firstArc, FST <long> .Arc <long> scratchArc, IntsRef scratchInts, BytesRefFSTEnum <long> fstEnum)
 {
     // Field names differ from the parameters only by case, except for @in.
     OuterInstance = outerInstance;
     Entry         = entry;
     DocToOrd      = docToOrd;
     Fst           = fst;
     FstEnum       = fstEnum;

     this.@in    = @in;
     FirstArc    = firstArc;
     ScratchArc  = scratchArc;
     ScratchInts = scratchInts;
 }
Beispiel #26
0
        /// <summary>
        /// Checks that an <c>IntsRef</c> built over an existing array exposes that array,
        /// offset and length, and that equality follows the referenced slice.
        /// </summary>
        public virtual void TestFromInts()
        {
            int[] backing = { 1, 2, 3, 4 };

            // Whole array.
            var whole = new IntsRef(backing, 0, 4);
            Assert.AreEqual(backing, whole.Ints);
            Assert.AreEqual(0, whole.Offset);
            Assert.AreEqual(4, whole.Length);

            // Slice starting at index 1 equals a fresh ref with the same three values.
            var tail     = new IntsRef(backing, 1, 3);
            var expected = new IntsRef(new int[] { 2, 3, 4 }, 0, 3);
            Assert.AreEqual(expected, tail);

            Assert.IsFalse(whole.Equals(tail));
        }
Beispiel #27
0
        /// <summary>
        /// Copies the ints of <paramref name="ir"/> into a new <c>BytesRef</c>; every value is
        /// asserted to lie in 0..255.
        /// </summary>
        private static BytesRef ToBytesRef(IntsRef ir)
        {
            int count  = ir.Length;
            var result = new BytesRef(count);

            int source = ir.Offset;
            for (int dest = 0; dest < count; dest++, source++)
            {
                int value = ir.Ints[source];
                Debug.Assert(value >= 0 && value <= 255);
                result.Bytes[dest] = (sbyte)value;
            }

            result.Length = count;
            return result;
        }
 /// <summary>
 /// The minimized union of "dog" and "duck" must yield exactly those two finite strings.
 /// </summary>
 public virtual void TestFiniteStrings()
 {
     Automaton union = BasicOperations.Union(BasicAutomata.MakeString("dog"), BasicAutomata.MakeString("duck"));
     MinimizationOperations.Minimize(union);

     ISet<IntsRef> finiteStrings = SpecialOperations.GetFiniteStrings(union, -1);
     Assert.AreEqual(2, finiteStrings.Count);

     var expectedDog = new IntsRef();
     Util.ToIntsRef(new BytesRef("dog"), expectedDog);
     Assert.IsTrue(finiteStrings.Contains(expectedDog));

     var expectedDuck = new IntsRef();
     Util.ToIntsRef(new BytesRef("duck"), expectedDuck);
     Assert.IsTrue(finiteStrings.Contains(expectedDuck));
 }
Beispiel #29
0
 /// <summary>
 /// Converts <paramref name="s"/> into <paramref name="ir"/>: input mode 0 goes through
 /// a <c>BytesRef</c> of the string (utf8), any other mode through <c>ToIntsRefUTF32</c> (utf32).
 /// </summary>
 internal static IntsRef ToIntsRef(string s, int inputMode, IntsRef ir)
 {
     bool utf8 = inputMode == 0;

     return utf8
         ? ToIntsRef(new BytesRef(s), ir)
         : ToIntsRefUTF32(s, ir);
 }
Beispiel #30
0
        /// <summary>
        /// Reverse lookup (lookup by output instead of by input), for the special case where
        /// the FST's outputs are strictly ascending. Locates the input/output pair whose
        /// output equals the target and returns null if that output does not exist.
        /// <para>
        /// NOTE: this only works with <c>FST&lt;long?&gt;</c>, and only when the outputs ascend
        /// in the same order as the inputs. Simple ordinals (0, 1, 2, ...) or file offsets
        /// (when appending to a file) fit this.
        /// </para>
        /// </summary>
        /// <remarks>
        /// Allocates a bytes reader, two arcs and a result <c>IntsRef</c>, then delegates to
        /// the six-argument overload.
        /// </remarks>
        public static IntsRef GetByOutput(FST <long?> fst, long targetOutput)
        {
            var bytesReader = fst.BytesReader;

            // TODO: would be nice not to alloc this on every lookup
            FST<long?>.Arc<long?> startArc = fst.GetFirstArc(new FST<long?>.Arc<long?>());
            FST<long?>.Arc<long?> spareArc = new FST<long?>.Arc<long?>();

            return GetByOutput(fst, targetOutput, bytesReader, startArc, spareArc, new IntsRef());
        }
Beispiel #31
0
 /// <summary>
 /// Copies the ints of <paramref name="input"/> into <paramref name="scratch"/> as bytes;
 ///  the caller must ensure the int values fit into a byte.
 /// </summary>
 /// <param name="input"> Source ints, read from <c>input.Offset</c> for <c>input.Length</c> entries </param>
 /// <param name="scratch"> Destination; grown to hold the values, written from index 0 </param>
 /// <returns> The same <paramref name="scratch"/> instance </returns>
 public static BytesRef ToBytesRef(IntsRef input, BytesRef scratch)
 {
     int count = input.Length;
     scratch.Grow(count);

     int source = input.Offset;
     for (int dest = 0; dest < count; dest++, source++)
     {
         int value = input.Ints[source];
         // NOTE: we allow -128 to 255
         Debug.Assert(value >= sbyte.MinValue && value <= 255, "value " + value + " doesn't fit into byte");
         scratch.Bytes[dest] = (byte)value;
     }

     scratch.Length = count;
     return scratch;
 }
Beispiel #32
0
        /// <summary>
        /// Copies each UTF16 unit (char) of <paramref name="s"/> into
        ///  <paramref name="scratch"/> as an int.
        /// </summary>
        /// <param name="s"> Text to copy </param>
        /// <param name="scratch"> Destination; its offset is reset to 0 and its length set to the char count </param>
        /// <returns> The same <paramref name="scratch"/> instance </returns>
        public static IntsRef ToUTF16(string s, IntsRef scratch)
        {
            int unitCount = s.Length;

            scratch.Offset = 0;
            scratch.Length = unitCount;
            scratch.Grow(unitCount);

            int slot = 0;
            foreach (char unit in s)
            {
                scratch.Ints[slot++] = unit;
            }
            return scratch;
        }
Beispiel #33
0
 /// <summary>
 /// Copies the bytes of <paramref name="br"/>, each taken as an unsigned value, into
 /// <paramref name="ir"/> (growing it first when its array is too small) and returns it.
 /// </summary>
 internal static IntsRef ToIntsRef(BytesRef br, IntsRef ir)
 {
     int count = br.Length;
     if (ir.Ints.Length < count)
     {
         ir.Grow(count);
     }

     int source = br.Offset;
     for (int dest = 0; dest < count; dest++, source++)
     {
         // Mask to the low eight bits so the value is in 0..255.
         ir.Ints[dest] = br.Bytes[source] & 0xFF;
     }

     ir.Length = count;
     return ir;
 }
        /// <summary>
        /// Increments <c>values</c> once for every ordinal of every matching document,
        /// then calls <c>Rollup()</c>.
        /// </summary>
        /// <param name="matchingDocs"> Per-segment hits to count </param>
        private void Count(IList<FacetsCollector.MatchingDocs> matchingDocs)
        {
            var ordinals = new IntsRef();

            foreach (FacetsCollector.MatchingDocs segmentHits in matchingDocs)
            {
                OrdinalsReader.OrdinalsSegmentReader segmentOrds = ordinalsReader.GetReader(segmentHits.context);
                DocIdSetIterator docIds = segmentHits.bits.GetIterator();

                for (int doc = docIds.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docIds.NextDoc())
                {
                    // Load this document's ordinals into the reusable buffer.
                    segmentOrds.Get(doc, ordinals);

                    int end = ordinals.Offset + ordinals.Length;
                    for (int pos = ordinals.Offset; pos < end; pos++)
                    {
                        values[ordinals.Ints[pos]]++;
                    }
                }
            }

            Rollup();
        }
 /// <summary>
 /// Stores the FST entry, ordinal source, FST and scratch objects this helper works with.
 /// </summary>
 public SortedDocValuesAnonymousInnerClassHelper(FSTEntry fstEntry,
     NumericDocValues numericDocValues, FST<long?> fst1, FST.BytesReader @in, FST.Arc<long?> arc, FST.Arc<long?> scratchArc1,
     IntsRef intsRef, BytesRefFSTEnum<long?> bytesRefFstEnum)
 {
     this.entry = fstEntry;
     this.docToOrd = numericDocValues;
     this.fst = fst1;
     this.fstEnum = bytesRefFstEnum;

     this.@in = @in;
     this.firstArc = arc;
     this.scratchArc = scratchArc1;
     this.scratchInts = intsRef;
 }
        /// <summary>
        /// Builds the final automaton from the entries of <paramref name="sorter"/>, adding an
        /// entry only when it differs from the last one added.
        /// </summary>
        /// <returns> The finished FST, or null when the sorter produced no entries </returns>
        private FST<object> BuildAutomaton(BytesRefSorter sorter)
        {
            // Build the automaton.
            Outputs<object> fstOutputs = NoOutputs.Singleton;
            object noOutput = fstOutputs.NoOutput;
            var fstBuilder = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, fstOutputs, null, false, PackedInts.DEFAULT, true, 15);

            var lastAdded = new BytesRef();
            var intsScratch = new IntsRef();
            int entryCount = 0;
            BytesRefIterator iterator = sorter.GetEnumerator();

            BytesRef next = iterator.Next();
            while (next != null)
            {
                entryCount++;
                bool isNew = lastAdded.CompareTo(next) != 0;
                if (isNew)
                {
                    fstBuilder.Add(Util.Fst.Util.ToIntsRef(next, intsScratch), noOutput);
                    lastAdded.CopyBytes(next);
                }
                next = iterator.Next();
            }

            if (entryCount != 0)
            {
                return fstBuilder.Finish();
            }
            return null;
        }
        /// <summary>
        /// Returns the sorted-set doc values for <paramref name="field"/>, loading and caching
        /// the field's FST under a lock when it is not cached yet.
        /// </summary>
        /// <param name="field"> field whose <c>Number</c> selects the FST entry and cached instance </param>
        /// <returns> <c>DocValues.EMPTY_SORTED_SET</c> when the entry has no ords; otherwise a
        /// new helper instance wired to the field's FST </returns>
        public override SortedSetDocValues GetSortedSet(FieldInfo field)
        {
            var fstEntry = fsts[field.Number];
            if (fstEntry.numOrds == 0)
            {
                // empty FST!
                return DocValues.EMPTY_SORTED_SET;
            }

            FST<long?> fieldFst;
            lock (this)
            {
                fieldFst = fstInstances[field.Number];
                if (fieldFst == null)
                {
                    // First request for this field: read the FST from its offset and cache it.
                    data.Seek(fstEntry.offset);
                    fieldFst = new FST<long?>(data, PositiveIntOutputs.Singleton);
                    ramBytesUsed.AddAndGet(fieldFst.SizeInBytes());
                    fstInstances[field.Number] = fieldFst;
                }
            }

            var docToOrds = GetBinary(field);

            // The reader, arcs, scratch buffers and enumerator are created fresh on every call.
            return new SortedSetDocValuesAnonymousInnerClassHelper(
                fstEntry,
                docToOrds,
                fieldFst,
                fieldFst.BytesReader,
                new FST.Arc<long?>(),
                new FST.Arc<long?>(),
                new IntsRef(),
                new BytesRefFSTEnum<long?>(fieldFst),
                new BytesRef(),
                new ByteArrayDataInput());
        }
Beispiel #38
0
		/// <summary>
		/// Serializes every entry of the working set into an FST and wraps the result in a
		/// <seealso cref="SynonymMap"/>.
		/// </summary>
		/// <returns> a new <seealso cref="SynonymMap"/> created from the FST, <c>words</c> and <c>maxHorizontalContext</c> </returns>
		public virtual SynonymMap Build()
		{
		  ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
		  // TODO: are we using the best sharing options?
		  var fstBuilder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

		  BytesRef scratch = new BytesRef(64);
		  ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

		  // Only allocated when de-duplication is enabled; cleared after every key.
		  HashSet<int?> dedupSet = dedup ? new HashSet<int?>() : null;

		  // Holds the header vInt while the ords are shifted behind it.
		  sbyte[] spare = new sbyte[5];

		  Dictionary<CharsRef, MapEntry>.KeyCollection keys = workingSet.Keys;
		  CharsRef[] sortedKeys = keys.ToArray();
		  Arrays.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparator);

		  IntsRef scratchIntsRef = new IntsRef();

		  //System.out.println("fmap.build");
		  foreach (CharsRef input in sortedKeys)
		  {
			MapEntry output = workingSet[input];
			int numEntries = output.ords.Count;

			// worst case: 5 bytes for the header plus 5 bytes for each ord
			int worstCaseSize = 5 + numEntries * 5;
			scratch.Grow(worstCaseSize);
			scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
			Debug.Assert(scratch.Offset == 0);

			// write the ords, skipping repeats when de-duplicating
			int written = 0;
			for (int i = 0; i < numEntries; i++)
			{
			  var ord = output.ords[i];
			  if (dedupSet != null && !dedupSet.Add(ord))
			  {
				continue;
			  }
			  scratchOutput.WriteVInt(ord);
			  written++;
			}

			// header: number of written ords shifted left by one; low bit is 1 when includeOrig is false
			int headerStart = scratchOutput.Position;
			scratchOutput.WriteVInt(written << 1 | (output.includeOrig ? 0 : 1));
			int headerLength = scratchOutput.Position - headerStart;

			// Move the count + includeOrig to the front of the byte[]:
			Array.Copy(scratch.Bytes, headerStart, spare, 0, headerLength);
			Array.Copy(scratch.Bytes, 0, scratch.Bytes, headerLength, headerStart);
			Array.Copy(spare, 0, scratch.Bytes, 0, headerLength);

			if (dedupSet != null)
			{
			  dedupSet.Clear();
			}

			scratch.Length = scratchOutput.Position - scratch.Offset;
			//System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
			fstBuilder.Add(Util.ToUTF32(input, scratchIntsRef), BytesRef.DeepCopyOf(scratch));
		  }

		  FST<BytesRef> fst = fstBuilder.Finish();
		  return new SynonymMap(fst, words, maxHorizontalContext);
		}
            /// <summary>
            /// Loads the terms index FST on first use; does nothing when <c>Fst</c> is already set.
            /// </summary>
            /// <remarks>
            /// When <c>_vgtir._indexDivisor</c> is greater than 1, the loaded FST is replaced by a
            /// subsampled one that keeps the first entry and then every <c>_indexDivisor</c>-th entry.
            /// </remarks>
            private void LoadTermsIndex()
            {
                if (Fst != null) return;

                var clone = (IndexInput) _vgtir._input.Clone();
                try
                {
                    clone.Seek(_indexStart);
                    Fst = new FST<long?>(clone, _vgtir._fstOutputs);
                }
                finally
                {
                    // Release the clone even when seeking or reading the FST throws.
                    clone.Dispose();
                }

                /*
                final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
                Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
                Util.toDot(fst, w, false, false);
                System.out.println("FST INDEX: SAVED to " + dotFileName);
                w.close();
                */

                if (_vgtir._indexDivisor > 1)
                {
                    // subsample
                    var scratchIntsRef = new IntsRef();
                    var outputs = PositiveIntOutputs.Singleton;
                    var builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
                    var fstEnum = new BytesRefFSTEnum<long?>(Fst);

                    // Starts at the divisor so that the very first entry is kept.
                    var count = _vgtir._indexDivisor;

                    BytesRefFSTEnum<long?>.InputOutput<long?> result;
                    while ((result = fstEnum.Next()) != null)
                    {
                        if (count == _vgtir._indexDivisor)
                        {
                            builder.Add(Util.ToIntsRef(result.Input, scratchIntsRef), result.Output);
                            count = 0;
                        }
                        count++;
                    }
                    Fst = builder.Finish();
                }
            }
 /// <summary>
 /// Depth-first walk that adds to <paramref name="strings"/> a copy of the current path
 /// every time a transition from <paramref name="s"/> (or from a state reached from it)
 /// leads to an accept state.
 /// <code>limit</code>&lt;0 means "infinite".
 /// </summary>
 /// <param name="s"> state to expand </param>
 /// <param name="pathstates"> states on the current path; used to detect a transition back onto the path </param>
 /// <param name="strings"> receives a deep copy of <paramref name="path"/> for each accept state reached </param>
 /// <param name="path"> labels of the current path; extended and shrunk in place </param>
 /// <param name="limit"> maximum number of strings, or a negative value for no maximum </param>
 /// <returns> false if a transition targets a state already on the current path, or if more than
 /// <code>limit</code> strings are found; true otherwise </returns>
 private static bool GetFiniteStrings(State s, HashSet<State> pathstates, HashSet<IntsRef> strings, IntsRef path, int limit)
 {
     pathstates.Add(s);

     foreach (Transition transition in s.Transitions)
     {
         if (pathstates.Contains(transition.To))
         {
             // the target is already on the current path
             return false;
         }

         int label = transition.Min_Renamed;
         while (label <= transition.Max_Renamed)
         {
             // append this label to the path
             path.Grow(path.Length + 1);
             path.Ints[path.Length] = label;
             path.Length++;

             if (transition.To.accept)
             {
                 strings.Add(IntsRef.DeepCopyOf(path));
                 bool limited = limit >= 0;
                 if (limited && strings.Count > limit)
                 {
                     return false;
                 }
             }

             bool finite = GetFiniteStrings(transition.To, pathstates, strings, path, limit);
             if (!finite)
             {
                 return false;
             }

             // drop the label again before trying the next one
             path.Length--;
             label++;
         }
     }

     pathstates.Remove(s);
     return true;
 }
 /// <summary>
 /// Points <paramref name="ordinals"/> at the slice of the cached ordinals array that
 /// lies between the offsets stored for <paramref name="docID"/> and <c>docID + 1</c>.
 /// </summary>
 /// <param name="docID"> index into the cached offsets </param>
 /// <param name="ordinals"> reference whose <c>Ints</c>, <c>Offset</c> and <c>Length</c> are overwritten </param>
 public override void Get(int docID, IntsRef ordinals)
 {
     // The backing array is shared with the cache, not copied.
     ordinals.Ints = cachedOrds.ordinals;

     var start = cachedOrds.offsets[docID];
     ordinals.Offset = start;

     var end = cachedOrds.offsets[docID + 1];
     ordinals.Length = end - ordinals.Offset;
 }
		/// <summary>
		/// Builds the NormalizeCharMap; call this once you
		///  are done calling <c>add</c>.
		/// </summary>
		/// <returns> a new NormalizeCharMap wrapping the FST built from the pending pairs </returns>
		/// <remarks> The pending pairs are cleared once the FST has been built. </remarks>
		public virtual NormalizeCharMap build()
		{

		  FST<CharsRef> map;
		  try
		  {
			Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
			Builder<CharsRef> builder = new Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
			IntsRef scratch = new IntsRef();
			foreach (KeyValuePair<string, string> ent in pendingPairs.SetOfKeyValuePairs())
			{
			  builder.add(Util.toUTF16(ent.Key, scratch), new CharsRef(ent.Value));
			}
			map = builder.finish();
			pendingPairs.Clear();
		  }
		  catch (IOException ioe)
		  {
			// Bogus FST IOExceptions!!  (will never happen)
			// System.Exception has no constructor that takes only an inner exception,
			// so pass a message together with the cause.
			throw new Exception("Should never happen", ioe);
		  }

		  return new NormalizeCharMap(map);
		}
 /// <summary>
 /// Creates the helper by storing each supplied argument in the matching member.
 /// </summary>
 public SortedSetDocValuesAnonymousInnerClassHelper(
     Lucene42DocValuesProducer outerInstance,
     FSTEntry entry,
     BinaryDocValues docToOrds,
     FST<long> fst,
     FST<long>.BytesReader @in,
     FST<long>.Arc<long> firstArc,
     FST<long>.Arc<long> scratchArc,
     IntsRef scratchInts,
     BytesRefFSTEnum<long> fstEnum,
     BytesRef @ref,
     ByteArrayDataInput input)
 {
     OuterInstance = outerInstance;
     Entry = entry;
     DocToOrds = docToOrds;

     // FST and the objects used to read it
     Fst = fst;
     this.@in = @in;
     FirstArc = firstArc;
     ScratchArc = scratchArc;
     FstEnum = fstEnum;

     // reusable scratch objects
     ScratchInts = scratchInts;
     this.@ref = @ref;
     Input = input;
 }
        /// <summary>
        /// Returns the sorted-set doc values for <paramref name="field"/>, loading and caching
        /// the field's FST under a lock when it is not cached yet.
        /// </summary>
        /// <param name="field"> field whose <c>Number</c> selects the FST entry and cached instance </param>
        /// <returns> <c>DocValues.EMPTY_SORTED_SET</c> when the entry has no ords; otherwise a
        /// new helper instance wired to the field's FST </returns>
        public override SortedSetDocValues GetSortedSet(FieldInfo field)
        {
            FSTEntry fstEntry = Fsts[field.Number];
            if (fstEntry.NumOrds == 0)
            {
                // empty FST!
                return DocValues.EMPTY_SORTED_SET;
            }

            FST<long> fieldFst;
            lock (this)
            {
                bool cached = FstInstances.TryGetValue(field.Number, out fieldFst);
                if (!cached)
                {
                    // First request for this field: read the FST from its offset and cache it.
                    Data.Seek(fstEntry.Offset);
                    fieldFst = new FST<long>((DataInput)Data, Lucene.Net.Util.Fst.PositiveIntOutputs.Singleton);
                    RamBytesUsed_Renamed.AddAndGet(fieldFst.SizeInBytes());
                    FstInstances[field.Number] = fieldFst;
                }
            }

            BinaryDocValues docToOrds = GetBinary(field);

            // The reader, arcs, scratch buffers and enumerator are created fresh on every call.
            return new SortedSetDocValuesAnonymousInnerClassHelper(
                this,
                fstEntry,
                docToOrds,
                fieldFst,
                fieldFst.BytesReader,
                new FST<long>.Arc<long>(),
                new FST<long>.Arc<long>(),
                new IntsRef(),
                new BytesRefFSTEnum<long>(fieldFst),
                new BytesRef(),
                new ByteArrayDataInput());
        }
		/// <summary>
		/// Builds an FST from the hashed entries, visited in the order returned by the hash's sort,
		/// and wraps it in a <seealso cref="StemmerOverrideMap"/> to be used with the <seealso cref="StemmerOverrideFilter"/> </summary>
		/// <returns> an <seealso cref="StemmerOverrideMap"/> to be used with the <seealso cref="StemmerOverrideFilter"/> </returns>
		/// <exception cref="IOException"> if an <seealso cref="IOException"/> occurs; </exception>
		public virtual StemmerOverrideMap build()
		{
		  ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
		  org.apache.lucene.util.fst.Builder<BytesRef> fstBuilder = new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

		  int[] sortedIds = hash.sort(BytesRef.UTF8SortedAsUnicodeComparator);
		  IntsRef codePoints = new IntsRef();
		  int total = hash.size();

		  int position = 0;
		  while (position < total)
		  {
			int id = sortedIds[position];

			// decode the stored UTF-8 bytes into the reusable UTF-32 buffer
			UnicodeUtil.UTF8toUTF32(hash.get(id, spare), codePoints);
			fstBuilder.add(codePoints, new BytesRef(outputValues[id]));

			position++;
		  }

		  return new StemmerOverrideMap(fstBuilder.finish(), ignoreCase);
		}
 /// <summary>
 /// Builds an FST from the hashed entries, visited in the order returned by the hash's sort,
 /// and wraps it in a <seealso cref="StemmerOverrideMap"/> to be used with the <seealso cref="StemmerOverrideFilter"/> </summary>
 /// <returns> an <seealso cref="StemmerOverrideMap"/> to be used with the <seealso cref="StemmerOverrideFilter"/> </returns>
 /// <exception cref="IOException"> if an <seealso cref="IOException"/> occurs; </exception>
 public virtual StemmerOverrideMap Build()
 {
     ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
     var fstBuilder = new Lucene.Net.Util.Fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

     int[] sortedIds = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
     IntsRef codePoints = new IntsRef();

     for (int rank = 0, total = hash.Size(); rank < total; rank++)
     {
         int id = sortedIds[rank];

         // decode the stored UTF-8 bytes into the reusable UTF-32 buffer
         UnicodeUtil.UTF8toUTF32(hash.Get(id, spare), codePoints);
         fstBuilder.Add(codePoints, new BytesRef(outputValues[id]));
     }

     return new StemmerOverrideMap(fstBuilder.Finish(), ignoreCase);
 }
 /// <summary>
 /// Converts <paramref name="s"/> into an <c>IntsRef</c> holding one int per code point.
 /// </summary>
 /// <param name="s"> UTF-16 string to convert </param>
 /// <returns> a new <c>IntsRef</c> whose <c>Length</c> is the number of code points written </returns>
 private static IntsRef ToIntsRef(string s)
 {
     int charCount = s.Length;

     // worst case: one code point for every UTF-16 char
     var codePoints = new IntsRef(charCount);

     int index = 0;
     while (index < charCount)
     {
         int codePoint = Character.CodePointAt(s, index);
         codePoints.Ints[codePoints.Length++] = codePoint;

         // advance by the number of chars this code point occupies
         index += Character.CharCount(codePoint);
     }

     return codePoints;
 }
 /// <summary>
 /// Creates the helper by storing each supplied argument in the matching member.
 /// </summary>
 public SortedDocValuesAnonymousInnerClassHelper(
     Lucene42DocValuesProducer outerInstance,
     FSTEntry entry,
     NumericDocValues docToOrd,
     FST<long> fst,
     FST<long>.BytesReader @in,
     FST<long>.Arc<long> firstArc,
     FST<long>.Arc<long> scratchArc,
     IntsRef scratchInts,
     BytesRefFSTEnum<long> fstEnum)
 {
     OuterInstance = outerInstance;
     Entry = entry;
     DocToOrd = docToOrd;

     // FST and the objects used to read it
     Fst = fst;
     this.@in = @in;
     FstEnum = fstEnum;

     // reusable arcs and scratch buffer
     FirstArc = firstArc;
     ScratchArc = scratchArc;
     ScratchInts = scratchInts;
 }
Beispiel #49
0
		/// <summary>
		/// Builds an <seealso cref="SynonymMap"/> and returns it.
		/// </summary>
		/// <returns> a new <seealso cref="SynonymMap"/> created from the built FST, <c>words</c> and <c>maxHorizontalContext</c> </returns>
		/// <remarks> The Java original declared <c>throws java.io.IOException</c>. </remarks>
		public virtual SynonymMap build()
		{
		  ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
		  // TODO: are we using the best sharing options?
		  org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

		  BytesRef scratch = new BytesRef(64);
		  ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

		  // Only allocated when de-duplication is enabled; cleared after every key.
		  HashSet<int?> dedupSet;

		  if (dedup)
		  {
			// C# has no diamond operator: the type argument must be spelled out.
			dedupSet = new HashSet<int?>();
		  }
		  else
		  {
			dedupSet = null;
		  }

		  // Holds the header vInt while the ords are shifted behind it.
		  sbyte[] spare = new sbyte[5];

		  Dictionary<CharsRef, MapEntry>.KeyCollection keys = workingSet.Keys;
		  // KeyCollection has no toArray/size members; copy the keys into a correctly sized array instead.
		  CharsRef[] sortedKeys = new CharsRef[keys.Count];
		  keys.CopyTo(sortedKeys, 0);
		  Arrays.sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparator);

		  IntsRef scratchIntsRef = new IntsRef();

		  //System.out.println("fmap.build");
		  for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++)
		  {
			CharsRef input = sortedKeys[keyIdx];
			MapEntry output = workingSet[input];

			int numEntries = output.ords.Count;
			// output size, assume the worst case
			int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

			scratch.grow(estimatedSize);
			scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
			Debug.Assert(scratch.offset == 0);

			// now write our output data:
			int count = 0;
			for (int i = 0; i < numEntries; i++)
			{
			  if (dedupSet != null)
			  {
				// box once
				int? ent = output.ords[i];
				if (dedupSet.Contains(ent))
				{
				  continue;
				}
				dedupSet.Add(ent);
			  }
			  scratchOutput.writeVInt(output.ords[i]);
			  count++;
			}

			// header: count shifted left by one; low bit is 1 when includeOrig is false
			int pos = scratchOutput.Position;
			scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
			int pos2 = scratchOutput.Position;
			int vIntLen = pos2 - pos;

			// Move the count + includeOrig to the front of the byte[]:
			Array.Copy(scratch.bytes, pos, spare, 0, vIntLen);
			Array.Copy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
			Array.Copy(spare, 0, scratch.bytes, 0, vIntLen);

			if (dedupSet != null)
			{
			  dedupSet.Clear();
			}

			scratch.length = scratchOutput.Position - scratch.offset;
			//System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
			builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch));
		  }

		  FST<BytesRef> fst = builder.finish();
		  return new SynonymMap(fst, words, maxHorizontalContext);
		}
        /// <summary>
        /// Round-trips random strings through UTF16toUTF8 and UTF8toUTF32 and checks that the
        /// resulting ints match the code points read directly from each string. On a mismatch
        /// the chars and both code point sequences are printed before the test fails.
        /// </summary>
        public virtual void TestUTF8toUTF32()
        {
            BytesRef utf8 = new BytesRef(20);
            IntsRef utf32 = new IntsRef(20);
            int[] expected = new int[20];

            int iterations = AtLeast(50000);
            for (int iter = 0; iter < iterations; iter++)
            {
                string s = TestUtil.RandomUnicodeString(Random());
                UnicodeUtil.UTF16toUTF8(s.ToCharArray(), 0, s.Length, utf8);
                UnicodeUtil.UTF8toUTF32(utf8, utf32);

                // collect the expected code points straight from the string
                int intUpto = 0;
                for (int charUpto = 0; charUpto < s.Length; )
                {
                    int cp = Character.CodePointAt(s, charUpto);
                    expected[intUpto] = cp;
                    intUpto++;
                    charUpto += Character.CharCount(cp);
                }

                if (ArrayUtil.Equals(expected, 0, utf32.Ints, utf32.Offset, intUpto))
                {
                    continue;
                }

                // mismatch: dump the input and both decodings, then fail
                Console.WriteLine("FAILED");
                for (int j = 0; j < s.Length; j++)
                {
                    int unit = (int)s[j];
                    Console.WriteLine("  char[" + j + "]=" + unit.ToString("x"));
                }
                Console.WriteLine();
                Assert.AreEqual(intUpto, utf32.Length);
                for (int j = 0; j < intUpto; j++)
                {
                    Console.WriteLine("  " + utf32.Ints[j].ToString("x") + " vs " + expected[j].ToString("x"));
                }
                Assert.Fail("mismatch");
            }
        }
            /// <summary>
            /// Builds the NormalizeCharMap from the pending pairs; call this once you
            ///  are done calling <c>add</c>. The pending pairs are cleared once the FST
            ///  has been built.
            /// </summary>
            /// <returns> a new NormalizeCharMap wrapping the built FST </returns>
            public virtual NormalizeCharMap Build()
            {
                FST<CharsRef> fst;
                try
                {
                    Outputs<CharsRef> charOutputs = CharSequenceOutputs.Singleton;
                    var fstBuilder = new Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, charOutputs);
                    var inputScratch = new IntsRef();

                    foreach (var pair in pendingPairs)
                    {
                        var input = Lucene.Net.Util.Fst.Util.ToUTF16(pair.Key, inputScratch);
                        fstBuilder.Add(input, new CharsRef(pair.Value));
                    }

                    fst = fstBuilder.Finish();
                    pendingPairs.Clear();
                }
                catch (IOException ioe)
                {
                    // Bogus FST IOExceptions!!  (will never happen)
                    throw new Exception("Should never happen", ioe);
                }

                return new NormalizeCharMap(fst);
            }
 /// <summary>
 /// Creates the helper by storing each supplied argument in the matching field.
 /// </summary>
 public SortedSetDocValuesAnonymousInnerClassHelper(
     FSTEntry fstEntry,
     BinaryDocValues binaryDocValues,
     FST<long?> fst1,
     FST.BytesReader @in,
     FST.Arc<long?> arc,
     FST.Arc<long?> scratchArc1,
     IntsRef intsRef,
     BytesRefFSTEnum<long?> bytesRefFstEnum,
     BytesRef @ref,
     ByteArrayDataInput byteArrayDataInput)
 {
     // FST and the objects used to read it
     fst = fst1;
     this.@in = @in;
     fstEnum = bytesRefFstEnum;

     // reusable arcs and scratch objects
     firstArc = arc;
     scratchArc = scratchArc1;
     scratchInts = intsRef;
     this.@ref = @ref;
     input = byteArrayDataInput;

     // entry metadata and the doc -> ords source
     entry = fstEntry;
     docToOrds = binaryDocValues;
 }
Beispiel #53
0
        public virtual void Test()
        {
            int[] ints = new int[7];
            IntsRef input = new IntsRef(ints, 0, ints.Length);
            int seed = Random().Next();

            Directory dir = new MMapDirectory(CreateTempDir("2BFST"));

            for (int doPackIter = 0; doPackIter < 2; doPackIter++)
            {
                bool doPack = doPackIter == 1;

                // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
                if (!doPack)
                {
                    Console.WriteLine("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
                    Outputs<object> outputs = NoOutputs.Singleton;
                    object NO_OUTPUT = outputs.NoOutput;
                    Builder<object> b = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

                    int count = 0;
                    Random r = new Random(seed);
                    int[] ints2 = new int[200];
                    IntsRef input2 = new IntsRef(ints2, 0, ints2.Length);
                    while (true)
                    {
                        //System.out.println("add: " + input + " -> " + output);
                        for (int i = 10; i < ints2.Length; i++)
                        {
                            ints2[i] = r.Next(256);
                        }
                        b.Add(input2, NO_OUTPUT);
                        count++;
                        if (count % 100000 == 0)
                        {
                            Console.WriteLine(count + ": " + b.FstSizeInBytes() + " bytes; " + b.TotStateCount + " nodes");
                        }
                        if (b.TotStateCount > int.MaxValue + 100L * 1024 * 1024)
                        {
                            break;
                        }
                        NextInput(r, ints2);
                    }

                    FST<object> fst = b.Finish();

                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        Arrays.Fill(ints2, 0);
                        r = new Random(seed);

                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }
                            for (int j = 10; j < ints2.Length; j++)
                            {
                                ints2[j] = r.Next(256);
                            }
                            Assert.AreEqual(NO_OUTPUT, Util.Get(fst, input2));
                            NextInput(r, ints2);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        IntsRefFSTEnum<object> fstEnum = new IntsRefFSTEnum<object>(fst);

                        Arrays.Fill(ints2, 0);
                        r = new Random(seed);
                        int upto = 0;
                        while (true)
                        {
                            IntsRefFSTEnum<object>.InputOutput<object> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            for (int j = 10; j < ints2.Length; j++)
                            {
                                ints2[j] = r.Next(256);
                            }
                            Assert.AreEqual(input2, pair.Input);
                            Assert.AreEqual(NO_OUTPUT, pair.Output);
                            upto++;
                            NextInput(r, ints2);
                        }
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST<object>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }

                // Build FST w/ ByteSequenceOutputs and stop when FST
                // size = 3GB
                {
                    Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes");
                    Outputs<BytesRef> outputs = ByteSequenceOutputs.Singleton;
                    Builder<BytesRef> b = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

                    var outputBytes = new byte[20];
                    BytesRef output = new BytesRef(outputBytes);
                    Arrays.Fill(ints, 0);
                    int count = 0;
                    Random r = new Random(seed);
                    while (true)
                    {
                        r.NextBytes(outputBytes);
                        //System.out.println("add: " + input + " -> " + output);
                        b.Add(input, BytesRef.DeepCopyOf(output));
                        count++;
                        if (count % 1000000 == 0)
                        {
                            Console.WriteLine(count + "...: " + b.FstSizeInBytes() + " bytes");
                        }
                        if (b.FstSizeInBytes() > LIMIT)
                        {
                            break;
                        }
                        NextInput(r, ints);
                    }

                    FST<BytesRef> fst = b.Finish();
                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        r = new Random(seed);
                        Arrays.Fill(ints, 0);

                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }
                            r.NextBytes((byte[])(Array)outputBytes);
                            Assert.AreEqual(output, Util.Get(fst, input));
                            NextInput(r, ints);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst);

                        Arrays.Fill(ints, 0);
                        r = new Random(seed);
                        int upto = 0;
                        while (true)
                        {
                            IntsRefFSTEnum<BytesRef>.InputOutput<BytesRef> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            Assert.AreEqual(input, pair.Input);
                            r.NextBytes((byte[])(Array)outputBytes);
                            Assert.AreEqual(output, pair.Output);
                            upto++;
                            NextInput(r, ints);
                        }
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST<BytesRef>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }

                // Build FST w/ PositiveIntOutputs and stop when FST
                // size = 3GB
                {
                    Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long");
                    Outputs<long?> outputs = PositiveIntOutputs.Singleton;
                    Builder<long?> b = new Builder<long?>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

                    long output = 1;

                    Arrays.Fill(ints, 0);
                    int count = 0;
                    Random r = new Random(seed);
                    while (true)
                    {
                        //System.out.println("add: " + input + " -> " + output);
                        b.Add(input, output);
                        output += 1 + r.Next(10);
                        count++;
                        if (count % 1000000 == 0)
                        {
                            Console.WriteLine(count + "...: " + b.FstSizeInBytes() + " bytes");
                        }
                        if (b.FstSizeInBytes() > LIMIT)
                        {
                            break;
                        }
                        NextInput(r, ints);
                    }

                    FST<long?> fst = b.Finish();

                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        Arrays.Fill(ints, 0);

                        output = 1;
                        r = new Random(seed);
                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }

                            // forward lookup:
                            Assert.AreEqual(output, (long)Util.Get(fst, input));
                            // reverse lookup:
                            Assert.AreEqual(input, Util.GetByOutput(fst, output));
                            output += 1 + r.Next(10);
                            NextInput(r, ints);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        IntsRefFSTEnum<long?> fstEnum = new IntsRefFSTEnum<long?>(fst);

                        Arrays.Fill(ints, 0);
                        r = new Random(seed);
                        int upto = 0;
                        output = 1;
                        while (true)
                        {
                            IntsRefFSTEnum<long?>.InputOutput<long?> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            Assert.AreEqual(input, pair.Input);
                            Assert.AreEqual(output, pair.Output.Value);
                            output += 1 + r.Next(10);
                            upto++;
                            NextInput(r, ints);
                        }
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST<long?>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }
            }
            dir.Dispose();
        }
Beispiel #54
0
        /// <summary>
        /// Decodes the UTF-8 bytes described by <paramref name="utf8"/> into UTF-32 code points
        /// written to <paramref name="utf32"/>.
        /// <para>
        /// This method assumes valid UTF-8 input. It <strong>does not perform</strong> full UTF-8
        /// validation: only the first (header) byte of each code point is checked; for multi-byte
        /// sequences the bytes after the header are consumed without being inspected.
        /// </para>
        /// </summary>
        /// <param name="utf8"> Source bytes; the range [Offset, Offset + Length) is decoded. </param>
        /// <param name="utf32"> Destination. Its <c>Ints</c> array is replaced when it is null or shorter
        ///    than <c>utf8.Length</c>; on return <c>Offset</c> is 0 and <c>Length</c> is the number of
        ///    code points written. </param>
        /// <exception cref="System.ArgumentException"> If an invalid code point header byte occurs or the
        ///    content is prematurely truncated. </exception>
        public static void UTF8toUTF32(BytesRef utf8, IntsRef utf32)
        {
            // TODO: broken if incoming result.offset != 0
            // pre-alloc for worst case (every code point consumes at least one input byte)
            // TODO: ints cannot be null, should be an assert
            if (utf32.Ints == null || utf32.Ints.Length < utf8.Length)
            {
                utf32.Ints = new int[utf8.Length];
            }
            int utf32Count = 0;
            int utf8Upto = utf8.Offset;
            int[] ints = utf32.Ints;
            var bytes = utf8.Bytes;
            int utf8Limit = utf8.Offset + utf8.Length;
            while (utf8Upto < utf8Limit)
            {
                // The header byte selects the total length of the encoded sequence.
                int numBytes = Utf8CodeLength[bytes[utf8Upto] & 0xFF];
                int v = 0;
                switch (numBytes)
                {
                    case 1:
                        // single-byte sequence: the byte is the code point
                        ints[utf32Count++] = bytes[utf8Upto++];
                        continue;
                    case 2:
                        // 5 useful bits
                        v = bytes[utf8Upto++] & 31;
                        break;

                    case 3:
                        // 4 useful bits
                        v = bytes[utf8Upto++] & 15;
                        break;

                    case 4:
                        // 3 useful bits
                        v = bytes[utf8Upto++] & 7;
                        break;

                    default:
                        throw new System.ArgumentException("invalid utf8");
                }

                // The header has been consumed; numBytes - 1 continuation bytes must follow.
                int limit = utf8Upto + numBytes - 1;
                if (limit > utf8Limit)
                {
                    // The sequence runs past the end of the declared range: fail as documented
                    // instead of decoding bytes that do not belong to the input.
                    throw new System.ArgumentException("invalid utf8: truncated multi-byte sequence");
                }
                while (utf8Upto < limit)
                {
                    // each continuation byte contributes its low 6 bits
                    v = v << 6 | bytes[utf8Upto++] & 63;
                }
                ints[utf32Count++] = v;
            }

            utf32.Offset = 0;
            utf32.Length = utf32Count;
        }
            /// <summary>
            /// Creates a new <see cref="CachedOrds"/> by reading the ordinals of every document in
            /// [0, <paramref name="maxDoc"/>) from <paramref name="source"/> and packing them
            /// back-to-back into one array, recording each document's start position in <c>offsets</c>.
            /// </summary>
            /// <param name="source"> Reader queried once per document; assumed to be non-null (not checked). </param>
            /// <param name="maxDoc"> Number of documents to read. </param>
            public CachedOrds(OrdinalsSegmentReader source, int maxDoc)
            {
                offsets = new int[maxDoc + 1];

                // Initial guess: one ordinal per document. Grown on demand inside the loop.
                int[] buffer = new int[maxDoc];

                // Tracked as a long so that exceeding the maximum array length can be detected
                // before the value is narrowed to int.
                long used = 0;
                IntsRef docOrds = new IntsRef(32);

                int docID = 0;
                while (docID < maxDoc)
                {
                    offsets[docID] = (int)used;
                    source.Get(docID, docOrds);

                    long needed = used + docOrds.Length;
                    if (needed > buffer.Length)
                    {
                        if (needed > ArrayUtil.MAX_ARRAY_LENGTH)
                        {
                            // NOTE(review): ThreadStateException looks like a mis-port of Java's
                            // IllegalStateException; kept as-is so callers see the same type.
                            throw new ThreadStateException("too many ordinals (>= " + needed + ") to cache");
                        }
                        buffer = ArrayUtil.Grow(buffer, (int)needed);
                    }

                    Array.Copy(docOrds.Ints, 0, buffer, (int)used, docOrds.Length);
                    used = needed;
                    docID++;
                }

                // Sentinel entry so the range of the last document can be computed like any other.
                offsets[maxDoc] = (int)used;

                // Keep the working buffer unless less than 90% of it is in use;
                // otherwise copy into an exactly-sized array.
                bool mostlyFull = !((double)used / buffer.Length < 0.9);
                if (mostlyFull)
                {
                    this.ordinals = buffer;
                    return;
                }

                int exactSize = (int)used;
                int[] trimmed = new int[exactSize];
                Array.Copy(buffer, 0, trimmed, 0, exactSize);
                this.ordinals = trimmed;
            }
        /// <summary>
        /// Returns sorted doc values for <paramref name="field"/>. The field's FST is loaded from
        /// <c>Data</c> the first time it is requested and cached in <c>FstInstances</c>; its size is
        /// added to <c>RamBytesUsed_Renamed</c> at load time.
        /// </summary>
        /// <param name="field"> Field whose <c>Number</c> selects the FST entry and cache slot. </param>
        /// <returns> A new helper instance wrapping the FST and the field's numeric doc-to-ord values. </returns>
        public override SortedDocValues GetSorted(FieldInfo field)
        {
            int fieldNumber = field.Number;
            FSTEntry entry = Fsts[fieldNumber];

            // Load-or-reuse the FST; the lock keeps the seek on the shared Data input and the
            // cache update together.
            FST<long> fst;
            lock (this)
            {
                bool alreadyLoaded = FstInstances.TryGetValue(fieldNumber, out fst);
                if (!alreadyLoaded)
                {
                    Data.Seek(entry.Offset);
                    fst = new FST<long>(Data, PositiveIntOutputs.Singleton);
                    RamBytesUsed_Renamed.AddAndGet(fst.SizeInBytes());
                    FstInstances[fieldNumber] = fst;
                }
            }

            NumericDocValues docToOrd = GetNumeric(field);

            // Everything below is created fresh for this call (per-thread resources),
            // so the returned instance does not share scratch state with other callers.
            FST<long>.BytesReader reader = fst.BytesReader;

            return new SortedDocValuesAnonymousInnerClassHelper(
                this,
                entry,
                docToOrd,
                fst,
                reader,
                new FST<long>.Arc<long>(),
                new FST<long>.Arc<long>(),
                new IntsRef(),
                new BytesRefFSTEnum<long>(fst));
        }
        /// <summary>
        /// For every matching document, evaluates <paramref name="valueSource"/> and adds the result
        /// to <c>values</c> at each ordinal read for that document, then calls <c>Rollup()</c>.
        /// </summary>
        /// <param name="matchingDocs"> Per-segment hits to aggregate over. </param>
        /// <param name="keepScores"> When true, a fake scorer carrying each hit's doc id and score is
        ///    exposed to the value source under the context key "scorer". </param>
        /// <param name="valueSource"> Source of the per-document value to accumulate. </param>
        private void SumValues(IList<MatchingDocs> matchingDocs, bool keepScores, ValueSource valueSource)
        {
            FakeScorer fakeScorer = new FakeScorer();
            IDictionary sourceContext = new Dictionary<string, Scorer>();
            if (keepScores)
            {
                sourceContext["scorer"] = fakeScorer;
            }

            // Reused for every document to avoid re-allocating the ordinals holder.
            IntsRef docOrdinals = new IntsRef();

            foreach (MatchingDocs segmentHits in matchingDocs)
            {
                OrdinalsReader.OrdinalsSegmentReader segmentOrds = ordinalsReader.GetReader(segmentHits.Context);
                float[] segmentScores = segmentHits.Scores;
                FunctionValues perDocValues = valueSource.GetValues(sourceContext, segmentHits.Context);
                DocIdSetIterator hitIterator = segmentHits.Bits.GetIterator();

                // Scores are consumed sequentially, one per iterated document.
                int nextScore = 0;
                for (int docId = hitIterator.NextDoc();
                     docId != DocIdSetIterator.NO_MORE_DOCS;
                     docId = hitIterator.NextDoc())
                {
                    segmentOrds.Get(docId, docOrdinals);

                    if (keepScores)
                    {
                        // Position the fake scorer before the value source is evaluated.
                        fakeScorer.docID_Renamed = docId;
                        fakeScorer.score_Renamed = segmentScores[nextScore];
                        nextScore++;
                    }

                    float addend = (float)perDocValues.DoubleVal(docId);
                    int ordCount = docOrdinals.Length;
                    for (int k = 0; k < ordCount; k++)
                    {
                        values[docOrdinals.Ints[k]] += addend;
                    }
                }
            }

            Rollup();
        }
 /// <summary>
 /// Stores the supplied FST entry, doc-to-ord values, FST and scratch objects on this instance.
 /// No other work is performed here.
 /// </summary>
 public SortedDocValuesAnonymousInnerClassHelper(
     FSTEntry entry,
     NumericDocValues docToOrd,
     FST<long?> fst,
     FST<long?>.BytesReader @in,
     FST<long?>.Arc<long?> firstArc,
     FST<long?>.Arc<long?> scratchArc,
     IntsRef scratchInts,
     BytesRefFSTEnum<long?> fstEnum)
 {
     // Members differ from the parameters by case, so only @in needs the explicit qualifier.
     Entry = entry;
     DocToOrd = docToOrd;
     Fst = fst;
     this.@in = @in;
     FirstArc = firstArc;
     ScratchArc = scratchArc;
     ScratchInts = scratchInts;
     FstEnum = fstEnum;
 }
        /// <summary>
        /// Builds an FST of replacement rules and checks that <c>Dictionary.ApplyMappings</c>
        /// rewrites a series of inputs to the expected strings.
        /// </summary>
        public virtual void TestReplacements()
        {
            // { input, replacement } rules, added to the builder in exactly this order.
            string[][] rules =
            {
                new[] { "a", "b" },
                new[] { "ab", "c" },
                new[] { "c", "de" },
                new[] { "def", "gh" },
            };

            // { text before mapping, expected text after mapping }, checked in this order.
            string[][] cases =
            {
                new[] { "atestanother", "btestbnother" },
                new[] { "abtestanother", "ctestbnother" },
                new[] { "atestabnother", "btestcnother" },
                new[] { "abtestabnother", "ctestcnother" },
                new[] { "abtestabcnother", "ctestcdenother" },
                new[] { "defdefdefc", "ghghghde" },
            };

            Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
            Builder<CharsRef> builder = new Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);

            // One scratch buffer is reused for every rule's input.
            IntsRef scratchInts = new IntsRef();
            foreach (string[] rule in rules)
            {
                Lucene.Net.Util.Fst.Util.ToUTF16(rule[0], scratchInts);
                builder.Add(scratchInts, new CharsRef(rule[1]));
            }

            FST<CharsRef> fst = builder.Finish();

            foreach (string[] testCase in cases)
            {
                // Each case gets its own builder because ApplyMappings edits it in place.
                StringBuilder text = new StringBuilder(testCase[0]);
                Dictionary.ApplyMappings(fst, text);
                assertEquals(testCase[1], text.ToString());
            }
        }
            /// <summary>
            /// Loads the terms index FST into <c>Fst</c> if it has not been loaded yet. When
            /// <c>indexDivisor</c> is greater than 1 the loaded FST is replaced by a subsampled copy
            /// containing the first entry and every <c>indexDivisor</c>-th entry after it.
            /// </summary>
            private void loadTermsIndex()
            {
                if (Fst != null)
                {
                    // already loaded
                    return;
                }

                // Read from a private clone so the position of the shared input is left alone;
                // the clone is closed even if seeking or FST construction throws.
                IndexInput clone = input.Clone();
                try
                {
                    clone.Seek(indexStart);
                    Fst = new FST<long>(clone, fstOutputs);
                }
                finally
                {
                    clone.Close();
                }

                if (indexDivisor > 1)
                {
                    // subsample
                    IntsRef scratchIntsRef = new IntsRef();
                    PositiveIntOutputs outputs = PositiveIntOutputs.GetSingleton();
                    Builder<long> builder = new Builder<long>(FST.INPUT_TYPE.BYTE1, outputs);
                    BytesRefFSTEnum<long> fstEnum = new BytesRefFSTEnum<long>(Fst);
                    BytesRefFSTEnum.InputOutput<long> result;

                    // Starting at indexDivisor makes the very first entry qualify.
                    int count = indexDivisor;
                    while ((result = fstEnum.Next()) != null)
                    {
                        if (count == indexDivisor)
                        {
                            builder.Add(Util.ToIntsRef(result.Input, scratchIntsRef), result.Output);
                            count = 0;
                        }
                        count++;
                    }
                    Fst = builder.Finish();
                }
            }