/// <summary>
/// Collects the stem(s) of the provided word.
/// </summary>
/// <param name="word"> Buffer holding the word to find the stems for </param>
/// <param name="length"> Number of chars of <paramref name="word"/> that make up the word </param>
/// <returns> List of stems for the word </returns>
public IList<CharsRef> Stem(char[] word, int length)
{
    if (dictionary.needsInputCleaning)
    {
        // Clean the input through the dictionary and continue with the
        // cleaned chars, which are copied into the reusable scratch buffer.
        scratchSegment.Length = 0;
        scratchSegment.Append(word, 0, length);
        string cleaned = dictionary.CleanInput(scratchSegment.ToString(), segment);
        scratchBuffer = ArrayUtil.Grow(scratchBuffer, cleaned.Length);
        length = segment.Length;
        segment.CopyTo(0, scratchBuffer, 0, length);
        word = scratchBuffer;
    }

    var result = new List<CharsRef>();

    IntsRef entries = dictionary.LookupWord(word, 0, length);
    if (entries != null)
    {
        // TODO: some forms should not be added, e.g. ONLYINCOMPOUND
        // just because it exists, does not make it valid...
        int added = 0;
        while (added < entries.Length)
        {
            // One stem is emitted per form found for the word.
            result.Add(NewStem(word, length));
            added++;
        }
    }

    result.AddRange(Stem(word, length, -1, -1, -1, 0, true, true, false, false));
    return result;
}
/// <summary>
/// Decides whether a completed path is kept as a result.
/// </summary>
/// <returns> false for an already-seen surface form and, in exactFirst mode,
/// for a surface form matching the key; true otherwise </returns>
protected override bool AcceptResult(IntsRef input, PairOutputs<long?, BytesRef>.Pair output)
{
    // Dedup: when the input analyzes to a graph we
    // can get duplicate surface forms:
    if (seen.Contains(output.Output2))
    {
        return false;
    }
    seen.Add(output.Output2);

    if (!outerInstance.exactFirst)
    {
        return true;
    }

    // In exactFirst mode, don't accept any paths
    // matching the surface form since that will
    // create duplicate results:
    if (!outerInstance.SameSurfaceForm(utf8Key, output.Output2))
    {
        return true;
    }

    // We found exact match, which means we should
    // have already found it in the first search:
    Debug.Assert(results.Count == 1);
    return false;
}
/// <summary>
/// Builds the NormalizeCharMap; call this once you
/// are done adding mappings.
/// </summary>
/// <returns> A new NormalizeCharMap wrapping the FST built from the pending pairs </returns>
/// <exception cref="Exception"> Wraps an <see cref="IOException"/> raised while building the FST </exception>
public virtual NormalizeCharMap build()
{
    FST<CharsRef> map;
    try
    {
        Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
        Builder<CharsRef> builder = new Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
        IntsRef scratch = new IntsRef();
        foreach (KeyValuePair<string, string> ent in pendingPairs.SetOfKeyValuePairs())
        {
            builder.add(Util.toUTF16(ent.Key, scratch), new CharsRef(ent.Value));
        }
        map = builder.finish();
        // The pending pairs are consumed by a successful build.
        pendingPairs.Clear();
    }
    catch (IOException ioe)
    {
        // Bogus FST IOExceptions!! (will never happen)
        // System.Exception has no constructor taking only an exception, so
        // pass a message and keep the original as the inner exception.
        throw new Exception(ioe.ToString(), ioe);
    }
    return new NormalizeCharMap(map);
}
/// <summary>
/// Adds the same input several times through a ListOfOutputs and checks
/// that every output is kept, in insertion order.
/// </summary>
public void TestListOfOutputs()
{
    PositiveIntOutputs _outputs = PositiveIntOutputs.Singleton;
    ListOfOutputs<long?> outputs = new ListOfOutputs<long?>(_outputs);
    Builder<object> builder = new Builder<object>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE1, outputs);
    IntsRef scratch = new IntsRef();

    // Add the same input more than once and the outputs
    // are merged:
    builder.Add(Util.ToIntsRef(new BytesRef("a"), scratch), 1L);
    builder.Add(Util.ToIntsRef(new BytesRef("a"), scratch), 3L);
    builder.Add(Util.ToIntsRef(new BytesRef("a"), scratch), 0L);
    builder.Add(Util.ToIntsRef(new BytesRef("b"), scratch), 17L);
    FST<object> fst = builder.Finish();

    // "a" carries all three outputs
    object outputForA = Util.Get(fst, new BytesRef("a"));
    assertNotNull(outputForA);
    IList<long?> listForA = outputs.AsList(outputForA);
    assertEquals(3, listForA.size());
    assertEquals(1L, listForA[0]);
    assertEquals(3L, listForA[1]);
    assertEquals(0L, listForA[2]);

    // "b" carries its single output
    object outputForB = Util.Get(fst, new BytesRef("b"));
    assertNotNull(outputForB);
    IList<long?> listForB = outputs.AsList(outputForB);
    assertEquals(1, listForB.size());
    assertEquals(17L, listForB[0]);
}
/// <summary>
/// Runs <paramref name="numIter"/> rounds; each round builds, for input modes 0 and 1,
/// a set of up to <paramref name="maxNumWords"/> distinct random terms and hands it to DoTest.
/// </summary>
/// <param name="maxNumWords"> Inclusive upper bound on the number of terms per round </param>
/// <param name="numIter"> Number of rounds to run </param>
private void TestRandomWords(int maxNumWords, int numIter)
{
    Random random = new Random(Random().Next());
    for (int iter = 0; iter < numIter; iter++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: iter " + iter);
        }
        for (int inputMode = 0; inputMode < 2; inputMode++)
        {
            // System.Random.Next(max) draws from [0, max), so +1 makes the bound inclusive.
            int numWords = random.Next(maxNumWords + 1);
            ISet<IntsRef> termsSet = new HashSet<IntsRef>();
            // Keep drawing until enough distinct terms exist; the set drops duplicates.
            while (termsSet.Count < numWords)
            {
                string term = FSTTester<object>.GetRandomString(random);
                termsSet.Add(FSTTester<object>.ToIntsRef(term, inputMode));
            }
            DoTest(inputMode, termsSet.ToArray());
        }
    }
}
/// <summary>
/// Seeks to biggest term that's &lt;= target.
/// </summary>
/// <param name="target"> Term to seek to; it and its length are recorded before seeking </param>
/// <returns> Whatever SetResult() produces after the floor seek </returns>
public IntsRefFSTEnum.InputOutput<T> SeekFloor(IntsRef target)
{
    this.target = target;
    this.targetLength = target.Length;

    base.DoSeekFloor();

    return SetResult();
}
/// <summary>
/// Find the stem(s) of the provided word
/// </summary>
/// <param name="word"> Word to find the stems for </param>
/// <param name="length"> Number of chars of <paramref name="word"/> that make up the word </param>
/// <returns> List of stems for the word </returns>
public IList<CharsRef> stem(char[] word, int length)
{
    if (dictionary.needsInputCleaning)
    {
        // Clean the input and continue with the cleaned chars copied
        // into the reusable scratch buffer.
        scratchSegment.Length = 0;
        scratchSegment.Append(word, 0, length);
        CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
        scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
        length = segment.Length;
        segment.getChars(0, length, scratchBuffer, 0);
        word = scratchBuffer;
    }

    // Declared as List<T> (not IList<T>) so that AddRange below binds to
    // List<T>.AddRange; the interface itself does not declare AddRange.
    List<CharsRef> stems = new List<CharsRef>();
    IntsRef forms = dictionary.lookupWord(word, 0, length);
    if (forms != null)
    {
        // TODO: some forms should not be added, e.g. ONLYINCOMPOUND
        // just because it exists, does not make it valid...
        for (int i = 0; i < forms.length; i++)
        {
            stems.Add(newStem(word, length));
        }
    }
    stems.AddRange(stem(word, length, -1, -1, -1, 0, true, true, false, false));
    return stems;
}
/// <summary>
/// Returns sorted doc values for <paramref name="field"/>, loading and caching
/// the field's FST the first time it is requested.
/// </summary>
/// <param name="field"> Field whose values are requested; its number keys the caches </param>
/// <returns> <c>DocValues.EMPTY_SORTED</c> when the entry has no ords, otherwise a new FST-backed instance </returns>
public override SortedDocValues GetSorted(FieldInfo field)
{
    FSTEntry entry = fsts[field.Number];
    if (entry.numOrds == 0)
    {
        return DocValues.EMPTY_SORTED;
    }

    FST<long?> instance;
    lock (this)
    {
        // TryGetValue instead of the indexer: a dictionary indexer throws
        // KeyNotFoundException on the first request for a field.
        if (!fstInstances.TryGetValue(field.Number, out instance) || instance == null)
        {
            data.Seek(entry.offset);
            instance = new FST<long?>(data, PositiveIntOutputs.Singleton);
            ramBytesUsed.AddAndGet(instance.SizeInBytes());
            fstInstances[field.Number] = instance;
        }
    }

    var docToOrd = GetNumeric(field);
    var fst = instance;

    // per-thread resources
    var @in = fst.BytesReader;
    var firstArc = new FST.Arc<long?>();
    var scratchArc = new FST.Arc<long?>();
    var scratchInts = new IntsRef();
    var fstEnum = new BytesRefFSTEnum<long?>(fst);

    return new SortedDocValuesAnonymousInnerClassHelper(entry, docToOrd, fst, @in, firstArc, scratchArc, scratchInts, fstEnum);
}
/// <summary>
/// Collects into <paramref name="strings"/> the strings that can be produced from the
/// given state. Returns false if more than <c>limit</c> strings are found, or if a
/// transition leads back to a state already on the current path.
/// <c>limit</c> &lt; 0 means "infinite".
/// </summary>
private static bool GetFiniteStrings(State s, HashSet<State> pathstates, HashSet<IntsRef> strings, IntsRef path, int limit)
{
    pathstates.Add(s);
    foreach (Transition t in s.Transitions)
    {
        State next = t.To;
        if (pathstates.Contains(next))
        {
            return false;
        }

        int label = t.Min_Renamed;
        while (label <= t.Max_Renamed)
        {
            // Push this label onto the path.
            path.Grow(path.Length + 1);
            path.Ints[path.Length] = label;
            path.Length++;

            if (next.accept)
            {
                strings.Add(IntsRef.DeepCopyOf(path));
                bool overLimit = limit >= 0 && strings.Count > limit;
                if (overLimit)
                {
                    return false;
                }
            }

            bool finite = GetFiniteStrings(next, pathstates, strings, path, limit);
            if (!finite)
            {
                return false;
            }

            // Pop the label again before trying the next one.
            path.Length--;
            label++;
        }
    }
    pathstates.Remove(s);
    return true;
}
/// <summary>
/// Returns sorted doc values for <paramref name="field"/>; the field's FST is
/// read from <c>Data</c> on first use and cached by field number.
/// </summary>
public override SortedDocValues GetSorted(FieldInfo field)
{
    FSTEntry entry = Fsts[field.Number];

    FST<long?> fst;
    lock (this)
    {
        bool cached = FstInstances.TryGetValue(field.Number, out fst);
        if (!cached)
        {
            Data.Seek(entry.Offset);
            fst = new FST<long?>(Data, PositiveIntOutputs.Singleton);
            RamBytesUsed_Renamed.AddAndGet(fst.SizeInBytes());
            FstInstances[field.Number] = fst;
        }
    }

    var docToOrd = GetNumeric(field);

    // per-thread resources are created fresh for every returned instance
    return new SortedDocValuesAnonymousInnerClassHelper(
        entry,
        docToOrd,
        fst,
        fst.BytesReader,
        new FST.Arc<long?>(),
        new FST.Arc<long?>(),
        new IntsRef(),
        new BytesRefFSTEnum<long?>(fst));
}
/// <summary>
/// Seeks to smallest term that's &gt;= target.
/// </summary>
/// <param name="target"> Term to seek to; it and its length are recorded before seeking </param>
/// <returns> Whatever SetResult() produces after the ceiling seek </returns>
public InputOutput<T> SeekCeil(IntsRef target)
{
    Target = target;
    TargetLength = target.Length;

    base.DoSeekCeil();

    return SetResult();
}
/// <summary>
/// Returns sorted-set doc values for <paramref name="field"/>; the field's FST is
/// read from <c>Data</c> on first use and cached by field number.
/// </summary>
/// <returns> <c>DocValues.EMPTY_SORTED_SET</c> when the entry has no ords, otherwise a new FST-backed instance </returns>
public override SortedSetDocValues GetSortedSet(FieldInfo field)
{
    FSTEntry entry = Fsts[field.Number];
    if (entry.NumOrds == 0)
    {
        // empty FST!
        return DocValues.EMPTY_SORTED_SET;
    }

    FST<long?> fst;
    lock (this)
    {
        bool cached = FstInstances.TryGetValue(field.Number, out fst);
        if (!cached)
        {
            Data.Seek(entry.Offset);
            fst = new FST<long?>((DataInput)Data, Lucene.Net.Util.Fst.PositiveIntOutputs.Singleton);
            RamBytesUsed_Renamed.AddAndGet(fst.SizeInBytes());
            FstInstances[field.Number] = fst;
        }
    }

    BinaryDocValues docToOrds = GetBinary(field);

    // per-thread resources are created fresh for every returned instance
    return new SortedSetDocValuesAnonymousInnerClassHelper(
        entry,
        docToOrds,
        fst,
        fst.BytesReader,
        new FST.Arc<long?>(),
        new FST.Arc<long?>(),
        new IntsRef(),
        new BytesRefFSTEnum<long?>(fst),
        new BytesRef(),
        new ByteArrayDataInput());
}
/// <summary>
/// Loads the "simple.aff" / "simple.dic" resources into a Dictionary and checks
/// suffix, prefix and word lookups together with the decoded flags.
/// </summary>
public virtual void testSimpleDictionary()
{
    // 'using' releases both streams even when an assertion or the Dictionary
    // constructor throws; explicit Close() calls at the end would be skipped.
    using (System.IO.Stream affixStream = this.GetType().getResourceAsStream("simple.aff"))
    using (System.IO.Stream dictStream = this.GetType().getResourceAsStream("simple.dic"))
    {
        Dictionary dictionary = new Dictionary(affixStream, dictStream);
        assertEquals(3, dictionary.lookupSuffix(new char[] { 'e' }, 0, 1).length);
        assertEquals(1, dictionary.lookupPrefix(new char[] { 's' }, 0, 1).length);

        IntsRef ordList = dictionary.lookupWord(new char[] { 'o', 'l', 'r' }, 0, 3);
        assertNotNull(ordList);
        assertEquals(1, ordList.length);

        BytesRef @ref = new BytesRef();
        dictionary.flagLookup.get(ordList.ints[0], @ref);
        char[] flags = Dictionary.decodeFlags(@ref);
        assertEquals(1, flags.Length);

        ordList = dictionary.lookupWord(new char[] { 'l', 'u', 'c', 'e', 'n' }, 0, 5);
        assertNotNull(ordList);
        assertEquals(1, ordList.length);
        dictionary.flagLookup.get(ordList.ints[0], @ref);
        flags = Dictionary.decodeFlags(@ref);
        assertEquals(1, flags.Length);
    }
}
/// <summary>
/// Converts two fixed word lists to sorted IntsRef terms and runs an
/// FSTTester without outputs over the first list.
/// </summary>
public void testBasicFSA()
{
    string[] strings = { "station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat" };
    string[] strings2 = { "station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation" };
    var terms = new IntsRef[strings.Length];
    var terms2 = new IntsRef[strings2.Length];

    for (int inputMode = 0; inputMode < 1; inputMode++) //TODO: inputMode=2
    {
        log.Debug("> inputMode={inputMode}", inputMode);

        int idx = 0;
        while (idx < strings.Length)
        {
            terms[idx] = toIntsRef(strings[idx], inputMode);
            idx++;
        }
        // terms2 is filled and sorted here but not consumed further in this method.
        idx = 0;
        while (idx < strings2.Length)
        {
            terms2[idx] = toIntsRef(strings2[idx], inputMode);
            idx++;
        }
        Array.Sort(terms);
        Array.Sort(terms2);

        Outputs<object> outputs = NoOutputs.getSingleton();
        object NO_OUTPUT = outputs.getNoOutput();
        List<InputOutput<object>> pairs = new List<InputOutput<object>>();
        foreach (IntsRef term in terms)
        {
            pairs.Add(new InputOutput<object>(term, NO_OUTPUT));
        }

        var tester = new FSTTester<object>(r, inputMode, pairs, outputs, false);
        tester.doTest();
    }
}
/// <summary>
/// A default-constructed IntsRef uses the shared empty array with
/// zero offset and zero length.
/// </summary>
public virtual void TestEmpty()
{
    IntsRef empty = new IntsRef();

    Assert.AreEqual(IntsRef.EMPTY_INTS, empty.Ints);
    Assert.AreEqual(0, empty.Offset);
    Assert.AreEqual(0, empty.Length);
}
/// <summary>
/// Looks up the output for this input, or <c>default(T)</c> (null for
/// reference types) if the input is not accepted.
/// </summary>
/// <param name="fst"> FST to walk </param>
/// <param name="input"> Labels to follow, read from <c>input.Offset</c> for <c>input.Length</c> entries </param>
public static T Get<T>(FST<T> fst, IntsRef input)
{
    // TODO: would be nice not to alloc this on every lookup
    var arc = fst.GetFirstArc(new FST<T>.Arc<T>());
    var fstReader = fst.BytesReader;

    // Accumulate output as we go
    T accumulated = fst.Outputs.NoOutput;
    int end = input.Offset + input.Length;
    for (int pos = input.Offset; pos < end; pos++)
    {
        // The arc is advanced in place; a missing arc means the input is rejected.
        bool found = fst.FindTargetArc(input.Ints[pos], arc, arc, fstReader) != null;
        if (!found)
        {
            return default(T);
        }
        accumulated = fst.Outputs.Add(accumulated, arc.Output);
    }

    return arc.Final
        ? fst.Outputs.Add(accumulated, arc.NextFinalOutput)
        : default(T);
}
/// <summary>
/// Reads <paramref name="num"/> conversion lines from <paramref name="reader"/> and
/// builds an FST mapping each line's second field to its third field.
/// </summary>
/// <exception cref="Exception"> A line does not split into exactly three fields </exception>
/// <exception cref="InvalidOperationException"> The same source is mapped twice </exception>
private FST<CharsRef> ParseConversions(TextReader reader, int num)
{
    // Sorted, because the builder below is fed in key order.
    IDictionary<string, string> mappings = new SortedDictionary<string, string>();

    for (int remaining = num; remaining > 0; remaining--)
    {
        string line = reader.ReadLine();
        string[] parts = whitespacePattern.Split(line);
        if (parts.Length != 3)
        {
            throw new Exception("invalid syntax: " + line /*, reader.LineNumber */); // LUCENENET TODO: LineNumberReader
        }

        string source = parts[1];
        if (mappings.Put(source, parts[2]) != null)
        {
            throw new System.InvalidOperationException("duplicate mapping specified for: " + source);
        }
    }

    Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
    var builder = new Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
    var scratchInts = new IntsRef();
    foreach (KeyValuePair<string, string> mapping in mappings)
    {
        Lucene.Net.Util.Fst.Util.ToUTF16(mapping.Key, scratchInts);
        builder.Add(scratchInts, new CharsRef(mapping.Value));
    }
    return builder.Finish();
}
/// <summary>
/// Builds the final automaton from a list of entries.
/// </summary>
/// <param name="sorter"> Source of the entries, consumed through its iterator </param>
/// <returns> The finished FST, or null when the sorter yielded no entries </returns>
private FST<object> BuildAutomaton(BytesRefSorter sorter)
{
    // Build the automaton.
    Outputs<object> outputs = NoOutputs.Singleton;
    object noOutput = outputs.NoOutput;
    var builder = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, outputs, null, false, PackedInts.DEFAULT, true, 15);

    var previous = new BytesRef();
    var scratchIntsRef = new IntsRef();
    int seen = 0;

    BytesRefIterator iter = sorter.GetEnumerator();
    for (BytesRef entry = iter.Next(); entry != null; entry = iter.Next())
    {
        seen++;
        // Skip an entry equal to the last one added (initially an empty BytesRef).
        if (previous.CompareTo(entry) == 0)
        {
            continue;
        }
        builder.Add(Util.Fst.Util.ToIntsRef(entry, scratchIntsRef), noOutput);
        previous.CopyBytes(entry);
    }

    if (seen == 0)
    {
        return null;
    }
    return builder.Finish();
}
/// <summary>
/// Sole constructor; stores each argument in the field of the same name.
/// </summary>
public Path(State state, FST.Arc<T> fstNode, T output, IntsRef input)
{
    this.input = input;
    this.output = output;
    this.fstNode = fstNode;
    this.state = state;
}
/// <summary>
/// If back plus this arc is competitive then add to queue.
/// </summary>
/// <param name="path"> Path whose current arc is considered for extension; its input is
/// temporarily extended and restored during the tie-break </param>
protected virtual void AddIfCompetitive(FSTPath<T> path)
{
    Debug.Assert(Queue != null);

    T cost = Fst.Outputs.Add(path.Cost, path.Arc.Output);
    //System.out.println(" addIfCompetitive queue.size()=" + queue.size() + " path=" + path + " + label=" + path.arc.label);

    if (Queue.Count == MaxQueueDepth)
    {
        FSTPath<T> bottom = Queue.Max;
        int comp = Comparator.Compare(cost, bottom.Cost);
        if (comp > 0)
        {
            // Doesn't compete
            return;
        }
        else if (comp == 0)
        {
            // Tie break by alpha sort on the input:
            path.Input.Grow(path.Input.Length + 1);
            path.Input.Ints[path.Input.Length++] = path.Arc.Label;
            int cmp = bottom.Input.CompareTo(path.Input);
            path.Input.Length--;

            // We should never see dups:
            Debug.Assert(cmp != 0);

            if (cmp < 0)
            {
                // Doesn't compete
                return;
            }
        }
        // Competes
    }
    else
    {
        // Queue isn't full yet, so any path we hit competes:
    }

    // copy over the current input to the new input
    // and add the arc.label to the end
    IntsRef newInput = new IntsRef(path.Input.Length + 1);
    Array.Copy(path.Input.Ints, 0, newInput.Ints, 0, path.Input.Length);
    newInput.Ints[path.Input.Length] = path.Arc.Label;
    newInput.Length = path.Input.Length + 1;
    FSTPath<T> newPath = new FSTPath<T>(cost, path.Arc, newInput);

    Queue.Add(newPath);

    if (Queue.Count == MaxQueueDepth + 1)
    {
        // Evict the worst path so the queue stays bounded. Enumerable.Last()
        // only returns the last element without removing it, which would let
        // the queue grow past MaxQueueDepth.
        Queue.Remove(Queue.Max);
    }
}
/// <summary>
/// Just takes unsigned byte values from the BytesRef and
/// converts into an IntsRef.
/// </summary>
/// <param name="input"> Bytes to convert, read starting at <c>input.Offset</c> </param>
/// <param name="scratch"> Reused target; filled from index 0 and returned </param>
public static IntsRef ToIntsRef(BytesRef input, IntsRef scratch)
{
    scratch.Grow(input.Length);

    int i = 0;
    while (i < input.Length)
    {
        // Mask so the value is the unsigned byte value 0..255.
        int unsignedValue = input.Bytes[i + input.Offset] & 0xFF;
        scratch.Ints[i] = unsignedValue;
        i++;
    }

    scratch.Length = input.Length;
    return scratch;
}
/// <summary>
/// Stores the FST entry, the doc-to-ord mapping, the FST and its per-instance
/// scratch objects in the corresponding members.
/// </summary>
public SortedDocValuesAnonymousInnerClassHelper(FSTEntry entry, NumericDocValues docToOrd, FST<long?> fst, FST.BytesReader @in, FST.Arc<long?> firstArc, FST.Arc<long?> scratchArc, IntsRef scratchInts, BytesRefFSTEnum<long?> fstEnum)
{
    // data sources
    Entry = entry;
    DocToOrd = docToOrd;
    Fst = fst;

    // scratch state handed in by the caller
    this.@in = @in;
    FirstArc = firstArc;
    ScratchArc = scratchArc;
    ScratchInts = scratchInts;
    FstEnum = fstEnum;
}
/// <summary>
/// Converts <paramref name="s"/> into an IntsRef holding one int per code point.
/// </summary>
private static IntsRef ToIntsRef(string s)
{
    var codePoints = new IntsRef(s.Length); // worst case
    int utf16Len = s.Length;

    int index = 0;
    while (index < utf16Len)
    {
        int cp = Character.CodePointAt(s, index);
        codePoints.Ints[codePoints.Length++] = cp;
        // Advance by the number of UTF-16 units this code point occupies.
        index += Character.CharCount(cp);
    }

    return codePoints;
}
/// <summary>
/// Converts <paramref name="s"/> into an IntsRef holding one int per code point.
/// A valid surrogate pair becomes a single supplementary code point; an unpaired
/// surrogate is kept as its own UTF-16 value.
/// </summary>
private static IntsRef toIntsRef(string s)
{
    IntsRef @ref = new IntsRef(s.Length); // worst case: one code point per UTF-16 unit
    int utf16Len = s.Length;
    int i = 0;
    while (i < utf16Len)
    {
        int cp;
        // System.Char has no codePointAt/charCount members, so decode with the
        // BCL surrogate helpers instead.
        if (char.IsHighSurrogate(s[i]) && i + 1 < utf16Len && char.IsLowSurrogate(s[i + 1]))
        {
            cp = char.ConvertToUtf32(s[i], s[i + 1]);
            i += 2;
        }
        else
        {
            cp = s[i];
            i++;
        }
        @ref.ints[@ref.length++] = cp;
    }
    return @ref;
}
/// <summary>
/// Stores the owning producer, the FST entry, the doc-to-ord mapping, the FST
/// and its scratch objects in the corresponding members.
/// </summary>
public SortedDocValuesAnonymousInnerClassHelper(Lucene42DocValuesProducer outerInstance, FSTEntry entry, NumericDocValues docToOrd, FST<long> fst, FST<long>.BytesReader @in, FST<long>.Arc<long> firstArc, FST<long>.Arc<long> scratchArc, IntsRef scratchInts, BytesRefFSTEnum<long> fstEnum)
{
    OuterInstance = outerInstance;

    // data sources
    Entry = entry;
    DocToOrd = docToOrd;
    Fst = fst;

    // scratch state handed in by the caller
    this.@in = @in;
    FirstArc = firstArc;
    ScratchArc = scratchArc;
    ScratchInts = scratchInts;
    FstEnum = fstEnum;
}
/// <summary>
/// Checks an IntsRef built over a whole array and one built over a slice
/// of the same array.
/// </summary>
public virtual void TestFromInts()
{
    int[] ints = { 1, 2, 3, 4 };

    // Reference over the whole array
    IntsRef whole = new IntsRef(ints, 0, 4);
    Assert.AreEqual(ints, whole.Ints);
    Assert.AreEqual(0, whole.Offset);
    Assert.AreEqual(4, whole.Length);

    // Reference over the last three ints of the same array
    IntsRef tail = new IntsRef(ints, 1, 3);
    IntsRef expectedTail = new IntsRef(new int[] { 2, 3, 4 }, 0, 3);
    Assert.AreEqual(expectedTail, tail);
    Assert.IsFalse(whole.Equals(tail));
}
/// <summary>
/// Copies the ints of <paramref name="ir"/> into a new BytesRef, one byte per int.
/// Each value is asserted to lie in 0..255.
/// </summary>
private static BytesRef ToBytesRef(IntsRef ir)
{
    var bytes = new BytesRef(ir.Length);

    int i = 0;
    while (i < ir.Length)
    {
        int value = ir.Ints[ir.Offset + i];
        Debug.Assert(value >= 0 && value <= 255);
        bytes.Bytes[i] = (sbyte)value;
        i++;
    }

    bytes.Length = ir.Length;
    return bytes;
}
/// <summary>
/// The minimized union of "dog" and "duck" must yield exactly those two
/// finite strings.
/// </summary>
public virtual void TestFiniteStrings()
{
    Automaton union = BasicOperations.Union(BasicAutomata.MakeString("dog"), BasicAutomata.MakeString("duck"));
    MinimizationOperations.Minimize(union);

    ISet<IntsRef> strings = SpecialOperations.GetFiniteStrings(union, -1);
    Assert.AreEqual(2, strings.Count);

    IntsRef expectedDog = new IntsRef();
    Util.ToIntsRef(new BytesRef("dog"), expectedDog);
    Assert.IsTrue(strings.Contains(expectedDog));

    IntsRef expectedDuck = new IntsRef();
    Util.ToIntsRef(new BytesRef("duck"), expectedDuck);
    Assert.IsTrue(strings.Contains(expectedDuck));
}
/// <summary>
/// Converts <paramref name="s"/> into <paramref name="ir"/>: through a BytesRef
/// (utf8) when <paramref name="inputMode"/> is 0, through ToIntsRefUTF32 otherwise.
/// </summary>
internal static IntsRef ToIntsRef(string s, int inputMode, IntsRef ir)
{
    bool utf8 = inputMode == 0;
    return utf8
        ? ToIntsRef(new BytesRef(s), ir)  // utf8
        : ToIntsRefUTF32(s, ir);          // utf32
}
/// <summary>
/// Reverse lookup (lookup by output instead of by input),
/// in the special case when your FST's outputs are
/// strictly ascending. This locates the input/output
/// pair where the output is equal to the target, and will
/// return null if that output does not exist.
///
/// <para>NOTE: this only works with <c>FST&lt;long?&gt;</c>, and only
/// when the outputs are ascending in order with
/// the inputs.
/// For example, simple ordinals (0, 1,
/// 2, ...), or file offsets (when appending to a file)
/// fit this.</para>
/// </summary>
public static IntsRef GetByOutput(FST<long?> fst, long targetOutput)
{
    var reader = fst.BytesReader;

    // TODO: would be nice not to alloc this on every lookup
    var firstArc = fst.GetFirstArc(new FST<long?>.Arc<long?>());
    var scratchArc = new FST<long?>.Arc<long?>();
    var result = new IntsRef();

    // Delegate to the overload that takes the caller-supplied scratch objects.
    return GetByOutput(fst, targetOutput, reader, firstArc, scratchArc, result);
}
/// <summary>
/// Just converts IntsRef to BytesRef; you must ensure the
/// int values fit into a byte.
/// </summary>
/// <param name="input"> Ints to convert, read starting at <c>input.Offset</c> </param>
/// <param name="scratch"> Reused target; filled from index 0 and returned </param>
public static BytesRef ToBytesRef(IntsRef input, BytesRef scratch)
{
    scratch.Grow(input.Length);

    int i = 0;
    while (i < input.Length)
    {
        int value = input.Ints[i + input.Offset];
        // NOTE: we allow -128 to 255
        Debug.Assert(value >= sbyte.MinValue && value <= 255, "value " + value + " doesn't fit into byte");
        scratch.Bytes[i] = (byte)value;
        i++;
    }

    scratch.Length = input.Length;
    return scratch;
}
/// <summary>
/// Just maps each UTF16 unit (char) to the ints in an
/// IntsRef.
/// </summary>
/// <param name="s"> String whose chars are copied </param>
/// <param name="scratch"> Reused target; its offset is reset to 0 and it is returned </param>
public static IntsRef ToUTF16(string s, IntsRef scratch)
{
    int charLimit = s.Length;

    scratch.Offset = 0;
    scratch.Length = charLimit;
    scratch.Grow(charLimit);

    int idx = 0;
    foreach (char unit in s)
    {
        scratch.Ints[idx] = (int)unit;
        idx++;
    }

    return scratch;
}
/// <summary>
/// Copies the bytes of <paramref name="br"/> into <paramref name="ir"/> as unsigned
/// values, growing <paramref name="ir"/> first when its array is too small.
/// </summary>
/// <returns> <paramref name="ir"/>, with its length set to the number of bytes copied </returns>
internal static IntsRef ToIntsRef(BytesRef br, IntsRef ir)
{
    bool tooSmall = br.Length > ir.Ints.Length;
    if (tooSmall)
    {
        ir.Grow(br.Length);
    }

    int i = 0;
    while (i < br.Length)
    {
        // Mask so the value is the unsigned byte value 0..255.
        ir.Ints[i] = br.Bytes[br.Offset + i] & 0xFF;
        i++;
    }

    ir.Length = br.Length;
    return ir;
}
/// <summary>
/// Increments <c>values</c> once for every ordinal of every matching document,
/// then calls Rollup().
/// </summary>
/// <param name="matchingDocs"> Per-segment hits whose documents are counted </param>
private void Count(IList<FacetsCollector.MatchingDocs> matchingDocs)
{
    // Reused for every document; filled by the segment reader.
    IntsRef ordinals = new IntsRef();

    foreach (FacetsCollector.MatchingDocs segmentHits in matchingDocs)
    {
        OrdinalsReader.OrdinalsSegmentReader segmentOrds = ordinalsReader.GetReader(segmentHits.context);
        DocIdSetIterator docs = segmentHits.bits.GetIterator();

        for (int doc = docs.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docs.NextDoc())
        {
            segmentOrds.Get(doc, ordinals);

            int end = ordinals.Offset + ordinals.Length;
            for (int pos = ordinals.Offset; pos < end; pos++)
            {
                values[ordinals.Ints[pos]]++;
            }
        }
    }

    Rollup();
}
/// <summary>
/// Stores the FST entry, the doc-to-ord mapping, the FST and its scratch
/// objects in the corresponding fields.
/// </summary>
public SortedDocValuesAnonymousInnerClassHelper(FSTEntry fstEntry, NumericDocValues numericDocValues, FST<long?> fst1, FST.BytesReader @in, FST.Arc<long?> arc, FST.Arc<long?> scratchArc1, IntsRef intsRef, BytesRefFSTEnum<long?> bytesRefFstEnum)
{
    // data sources
    this.entry = fstEntry;
    this.docToOrd = numericDocValues;
    this.fst = fst1;

    // scratch state handed in by the caller
    this.@in = @in;
    this.firstArc = arc;
    this.scratchArc = scratchArc1;
    this.scratchInts = intsRef;
    this.fstEnum = bytesRefFstEnum;
}
/// <summary>
/// Builds the final automaton from a list of entries.
/// </summary>
/// <param name="sorter"> Source of the entries, consumed through its iterator </param>
/// <returns> The finished FST, or null when the sorter yielded no entries </returns>
private FST<object> BuildAutomaton(BytesRefSorter sorter)
{
    // Build the automaton.
    Outputs<object> outputs = NoOutputs.Singleton;
    object empty = outputs.NoOutput;
    var builder = new Builder<object>(
        FST.INPUT_TYPE.BYTE1,
        0, 0, true, true,
        shareMaxTailLength,
        outputs,
        null,
        false,
        PackedInts.DEFAULT,
        true,
        15);

    var lastAdded = new BytesRef();
    var scratchIntsRef = new IntsRef();
    int count = 0;

    BytesRefIterator iter = sorter.GetEnumerator();
    while (true)
    {
        BytesRef entry = iter.Next();
        if (entry == null)
        {
            break;
        }

        count++;
        // Only add an entry that differs from the last one added
        // (which starts out as an empty BytesRef).
        bool isNew = lastAdded.CompareTo(entry) != 0;
        if (isNew)
        {
            builder.Add(Util.Fst.Util.ToIntsRef(entry, scratchIntsRef), empty);
            lastAdded.CopyBytes(entry);
        }
    }

    if (count == 0)
    {
        return null;
    }
    return builder.Finish();
}
/// <summary>
/// Returns sorted-set doc values for <paramref name="field"/>, loading and caching
/// the field's FST the first time it is requested.
/// </summary>
/// <param name="field"> Field whose values are requested; its number keys the caches </param>
/// <returns> <c>DocValues.EMPTY_SORTED_SET</c> when the entry has no ords, otherwise a new FST-backed instance </returns>
public override SortedSetDocValues GetSortedSet(FieldInfo field)
{
    var entry = fsts[field.Number];
    if (entry.numOrds == 0)
    {
        return DocValues.EMPTY_SORTED_SET; // empty FST!
    }

    FST<long?> instance;
    lock (this)
    {
        // TryGetValue instead of the indexer: a dictionary indexer throws
        // KeyNotFoundException on the first request for a field.
        if (!fstInstances.TryGetValue(field.Number, out instance) || instance == null)
        {
            data.Seek(entry.offset);
            instance = new FST<long?>(data, PositiveIntOutputs.Singleton);
            ramBytesUsed.AddAndGet(instance.SizeInBytes());
            fstInstances[field.Number] = instance;
        }
    }

    var docToOrds = GetBinary(field);
    var fst = instance;

    // per-thread resources
    var @in = fst.BytesReader;
    var firstArc = new FST.Arc<long?>();
    var scratchArc = new FST.Arc<long?>();
    var scratchInts = new IntsRef();
    var fstEnum = new BytesRefFSTEnum<long?>(fst);
    var @ref = new BytesRef();
    var input = new ByteArrayDataInput();

    return new SortedSetDocValuesAnonymousInnerClassHelper(entry, docToOrds, fst, @in, firstArc, scratchArc, scratchInts, fstEnum, @ref, input);
}
/// <summary>
/// Builds a <see cref="SynonymMap"/> and returns it.
/// </summary>
/// <returns> A new SynonymMap over the FST built from the working set </returns>
public virtual SynonymMap Build()
{
    ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
    // TODO: are we using the best sharing options?
    var builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

    BytesRef scratch = new BytesRef(64);
    ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

    // Only allocated when de-duplication is requested.
    HashSet<int?> dedupSet = dedup ? new HashSet<int?>() : null;

    var spare = new sbyte[5];

    Dictionary<CharsRef, MapEntry>.KeyCollection keys = workingSet.Keys;
    CharsRef[] sortedKeys = keys.ToArray();
    Arrays.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparator);

    IntsRef scratchIntsRef = new IntsRef();

    //System.out.println("fmap.build");
    foreach (CharsRef input in sortedKeys)
    {
        MapEntry output = workingSet[input];

        int numEntries = output.ords.Count;
        // output size, assume the worst case
        int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

        scratch.Grow(estimatedSize);
        scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
        Debug.Assert(scratch.Offset == 0);

        // now write our output data:
        int count = 0;
        for (int i = 0; i < numEntries; i++)
        {
            if (dedupSet != null)
            {
                // box once
                int? ent = output.ords[i];
                // Add returns false when the ord was already written for this key.
                if (!dedupSet.Add(ent))
                {
                    continue;
                }
            }
            scratchOutput.WriteVInt(output.ords[i]);
            count++;
        }

        int pos = scratchOutput.Position;
        int header = (count << 1) | (output.includeOrig ? 0 : 1);
        scratchOutput.WriteVInt(header);
        int vIntLen = scratchOutput.Position - pos;

        // Move the count + includeOrig to the front of the byte[]:
        Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen);
        Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos);
        Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen);

        if (dedupSet != null)
        {
            dedupSet.Clear();
        }

        scratch.Length = scratchOutput.Position - scratch.Offset;
        //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
        builder.Add(Util.ToUTF32(input, scratchIntsRef), BytesRef.DeepCopyOf(scratch));
    }

    FST<BytesRef> fst = builder.Finish();
    return new SynonymMap(fst, words, maxHorizontalContext);
}
/// <summary>
/// Loads the terms-index FST once; later calls return immediately. When the
/// index divisor is greater than 1 the loaded FST is replaced by a subsampled copy.
/// </summary>
private void LoadTermsIndex()
{
    if (Fst != null)
    {
        return;
    }

    var clone = (IndexInput)_vgtir._input.Clone();
    try
    {
        clone.Seek(_indexStart);
        Fst = new FST<long?>(clone, _vgtir._fstOutputs);
    }
    finally
    {
        // Release the clone even when Seek or the FST constructor throws.
        clone.Dispose();
    }

    /*
    final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
    Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
    Util.toDot(fst, w, false, false);
    System.out.println("FST INDEX: SAVED to " + dotFileName);
    w.close();
    */

    if (_vgtir._indexDivisor > 1)
    {
        // subsample
        var scratchIntsRef = new IntsRef();
        var outputs = PositiveIntOutputs.Singleton;
        var builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
        var fstEnum = new BytesRefFSTEnum<long?>(Fst);

        // Starts at the divisor so the first entry is always kept; after that
        // one entry is kept every _indexDivisor entries.
        var count = _vgtir._indexDivisor;
        BytesRefFSTEnum<long?>.InputOutput<long?> result;
        while ((result = fstEnum.Next()) != null)
        {
            if (count == _vgtir._indexDivisor)
            {
                builder.Add(Util.ToIntsRef(result.Input, scratchIntsRef), result.Output);
                count = 0;
            }
            count++;
        }
        Fst = builder.Finish();
    }
}
/// <summary>
/// Gathers into <paramref name="strings"/> the strings that can be produced from the
/// given state. Returns false if more than <c>limit</c> strings are found, or if a
/// transition targets a state that is already on the current path.
/// <c>limit</c> &lt; 0 means "infinite".
/// </summary>
private static bool GetFiniteStrings(State s, HashSet<State> pathstates, HashSet<IntsRef> strings, IntsRef path, int limit)
{
    pathstates.Add(s);

    foreach (Transition transition in s.Transitions)
    {
        if (pathstates.Contains(transition.To))
        {
            return false;
        }

        for (int label = transition.Min_Renamed; label <= transition.Max_Renamed; label++)
        {
            // Append the label to the path.
            path.Grow(path.Length + 1);
            path.Ints[path.Length] = label;
            path.Length++;

            if (transition.To.accept)
            {
                strings.Add(IntsRef.DeepCopyOf(path));
                if (limit >= 0 && strings.Count > limit)
                {
                    return false;
                }
            }

            bool finite = GetFiniteStrings(transition.To, pathstates, strings, path, limit);
            if (!finite)
            {
                return false;
            }

            // Drop the label again before the next iteration.
            path.Length--;
        }
    }

    pathstates.Remove(s);
    return true;
}
/// <summary>
/// Points <paramref name="ordinals"/> at the slice of the cached ordinals array
/// between <c>offsets[docID]</c> and <c>offsets[docID + 1]</c>; no values are copied.
/// </summary>
public override void Get(int docID, IntsRef ordinals)
{
    ordinals.Ints = cachedOrds.ordinals;

    int start = cachedOrds.offsets[docID];
    ordinals.Offset = start;

    // The next document's offset marks the end of this document's slice.
    int end = cachedOrds.offsets[docID + 1];
    ordinals.Length = end - ordinals.Offset;
}
/// <summary>
/// Builds the NormalizeCharMap; call this once you
/// are done adding mappings.
/// </summary>
/// <returns> A new NormalizeCharMap wrapping the FST built from the pending pairs </returns>
/// <exception cref="Exception"> Wraps an <see cref="IOException"/> raised while building the FST </exception>
public virtual NormalizeCharMap build()
{
    FST<CharsRef> map;
    try
    {
        Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
        Builder<CharsRef> builder = new Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
        IntsRef scratch = new IntsRef();
        foreach (KeyValuePair<string, string> ent in pendingPairs.SetOfKeyValuePairs())
        {
            builder.add(Util.toUTF16(ent.Key, scratch), new CharsRef(ent.Value));
        }
        map = builder.finish();
        // The pending pairs are consumed by a successful build.
        pendingPairs.Clear();
    }
    catch (IOException ioe)
    {
        // Bogus FST IOExceptions!! (will never happen)
        // System.Exception cannot be constructed from an exception alone;
        // supply a message and preserve the cause as InnerException.
        throw new Exception(ioe.ToString(), ioe);
    }
    return new NormalizeCharMap(map);
}
/// <summary>
/// Stores the owning producer, the FST entry, the doc-to-ords mapping, the FST
/// and its scratch objects in the corresponding members.
/// </summary>
public SortedSetDocValuesAnonymousInnerClassHelper(Lucene42DocValuesProducer outerInstance, FSTEntry entry, BinaryDocValues docToOrds, FST<long> fst, FST<long>.BytesReader @in, FST<long>.Arc<long> firstArc, FST<long>.Arc<long> scratchArc, IntsRef scratchInts, BytesRefFSTEnum<long> fstEnum, BytesRef @ref, ByteArrayDataInput input)
{
    OuterInstance = outerInstance;

    // data sources
    Entry = entry;
    DocToOrds = docToOrds;
    Fst = fst;

    // scratch state handed in by the caller
    this.@in = @in;
    FirstArc = firstArc;
    ScratchArc = scratchArc;
    ScratchInts = scratchInts;
    FstEnum = fstEnum;
    this.@ref = @ref;
    Input = input;
}
/// <summary>
/// Returns sorted-set doc values for <paramref name="field"/>; the field's FST is
/// read from <c>Data</c> on first use and cached by field number.
/// </summary>
/// <returns> <c>DocValues.EMPTY_SORTED_SET</c> when the entry has no ords, otherwise a new FST-backed instance </returns>
public override SortedSetDocValues GetSortedSet(FieldInfo field)
{
    FSTEntry entry = Fsts[field.Number];
    if (entry.NumOrds == 0)
    {
        // empty FST!
        return DocValues.EMPTY_SORTED_SET;
    }

    FST<long> fst;
    lock (this)
    {
        bool cached = FstInstances.TryGetValue(field.Number, out fst);
        if (!cached)
        {
            Data.Seek(entry.Offset);
            fst = new FST<long>((DataInput)Data, Lucene.Net.Util.Fst.PositiveIntOutputs.Singleton);
            RamBytesUsed_Renamed.AddAndGet(fst.SizeInBytes());
            FstInstances[field.Number] = fst;
        }
    }

    BinaryDocValues docToOrds = GetBinary(field);

    // per-thread resources are created fresh for every returned instance
    return new SortedSetDocValuesAnonymousInnerClassHelper(
        this,
        entry,
        docToOrds,
        fst,
        fst.BytesReader,
        new FST<long>.Arc<long>(),
        new FST<long>.Arc<long>(),
        new IntsRef(),
        new BytesRefFSTEnum<long>(fst),
        new BytesRef(),
        new ByteArrayDataInput());
}
/// <summary>
/// Returns a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/>.
/// </summary>
/// <returns> a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </returns>
/// <exception cref="IOException"> if an <see cref="IOException"/> occurs; </exception>
public virtual StemmerOverrideMap build()
{
    ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
    var builder = new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

    // Ids of the hashed keys, in sorted key order.
    int[] sortedIds = hash.sort(BytesRef.UTF8SortedAsUnicodeComparator);
    IntsRef intsSpare = new IntsRef();
    int size = hash.size();

    int rank = 0;
    while (rank < size)
    {
        int id = sortedIds[rank];
        BytesRef key = hash.get(id, spare);
        UnicodeUtil.UTF8toUTF32(key, intsSpare);
        builder.add(intsSpare, new BytesRef(outputValues[id]));
        rank++;
    }

    return new StemmerOverrideMap(builder.finish(), ignoreCase);
}
/// <summary>
/// Builds an FST from the collected entries and wraps it in a
/// <seealso cref="StemmerOverrideMap"/> for use with the <seealso cref="StemmerOverrideFilter"/>.
/// </summary>
/// <returns> a <seealso cref="StemmerOverrideMap"/> to be used with the <seealso cref="StemmerOverrideFilter"/> </returns>
/// <exception cref="IOException"> if an <seealso cref="IOException"/> occurs; </exception>
public virtual StemmerOverrideMap Build()
{
    ByteSequenceOutputs fstOutputs = ByteSequenceOutputs.Singleton;
    var fstBuilder = new Lucene.Net.Util.Fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, fstOutputs);

    // Entry ids in the order given by the comparer; entries are added in this order.
    int[] orderedIds = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    var codePoints = new IntsRef();
    int entryCount = hash.Size();

    for (int slot = 0; slot < entryCount; slot++)
    {
        int entryId = orderedIds[slot];
        // The key is stored as UTF-8; the builder is fed its UTF-32 form.
        UnicodeUtil.UTF8toUTF32(hash.Get(entryId, spare), codePoints);
        fstBuilder.Add(codePoints, new BytesRef(outputValues[entryId]));
    }

    return new StemmerOverrideMap(fstBuilder.Finish(), ignoreCase);
}
/// <summary>
/// Converts a string into an <see cref="IntsRef"/> holding one int per Unicode code point.
/// </summary>
/// <param name="s"> the string to convert </param>
/// <returns> a new <see cref="IntsRef"/> whose length is the number of code points in <paramref name="s"/> </returns>
private static IntsRef ToIntsRef(string s)
{
    int charCount = s.Length;
    // worst case: every UTF-16 char is its own code point
    var result = new IntsRef(charCount);

    int charIndex = 0;
    while (charIndex < charCount)
    {
        int codePoint = Character.CodePointAt(s, charIndex);
        result.Ints[result.Length++] = codePoint;
        // advance by the number of chars this code point occupies
        charIndex += Character.CharCount(codePoint);
    }

    return result;
}
/// <summary>
/// Stores the owning producer, the FST entry, the doc-to-ord values, the FST and the
/// scratch objects handed in by the caller.
/// </summary>
public SortedDocValuesAnonymousInnerClassHelper(Lucene42DocValuesProducer outerInstance, FSTEntry entry, NumericDocValues docToOrd, FST<long> fst, FST<long>.BytesReader @in, FST<long>.Arc<long> firstArc, FST<long>.Arc<long> scratchArc, IntsRef scratchInts, BytesRefFSTEnum<long> fstEnum)
{
    // owner and per-field data
    OuterInstance = outerInstance;
    Entry = entry;
    DocToOrd = docToOrd;

    // FST and its reader
    Fst = fst;
    this.@in = @in;

    // scratch objects
    FirstArc = firstArc;
    ScratchArc = scratchArc;
    ScratchInts = scratchInts;
    FstEnum = fstEnum;
}
/// <summary>
/// Builds an <seealso cref="SynonymMap"/> and returns it.
/// <para>
/// Each key of <c>workingSet</c> is added to an FST (keys are sorted first). The
/// output for a key is a byte sequence laid out as: a VInt header holding
/// <c>count &lt;&lt; 1 | (includeOrig ? 0 : 1)</c>, followed by one VInt per ord written.
/// When <c>dedup</c> is set, repeated ords within one entry are written only once.
/// </para>
/// </summary>
/// <returns> a new <seealso cref="SynonymMap"/> over the built FST, <c>words</c> and <c>maxHorizontalContext</c> </returns>
// NOTE(review): this method still contains calls that look like unconverted Java
// (new HashSet<>(), keys.toArray(...), keys.size(), Arrays.sort(...), lower-case
// member names) - confirm it compiles against the ported types.
public virtual SynonymMap build()
{
    ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
    // TODO: are we using the best sharing options?
    org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

    // Reused buffer for the serialized output of the current key.
    BytesRef scratch = new BytesRef(64);
    ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

    // Only allocated when de-duplication was requested; null otherwise.
    HashSet<int?> dedupSet;

    if (dedup)
    {
        dedupSet = new HashSet<>();
    }
    else
    {
        dedupSet = null;
    }

    // Temporary holder for the header VInt while it is moved to the front.
    sbyte[] spare = new sbyte[5];

    Dictionary<CharsRef, MapEntry>.KeyCollection keys = workingSet.Keys;
    CharsRef[] sortedKeys = keys.toArray(new CharsRef[keys.size()]);
    Arrays.sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparator);

    IntsRef scratchIntsRef = new IntsRef();

    //System.out.println("fmap.build");
    for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++)
    {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet[input];

        int numEntries = output.ords.Count;
        // output size, assume the worst case
        int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

        scratch.grow(estimatedSize);
        scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
        Debug.Assert(scratch.offset == 0);

        // now write our output data:
        int count = 0;
        for (int i = 0; i < numEntries; i++)
        {
            if (dedupSet != null)
            {
                // box once
                int? ent = output.ords[i];
                if (dedupSet.Contains(ent))
                {
                    // already written for this key; skip the duplicate
                    continue;
                }
                dedupSet.Add(ent);
            }
            scratchOutput.writeVInt(output.ords[i]);
            count++;
        }

        // The header is written AFTER the ords (count is only known now) and is
        // then rotated to the front of the buffer below.
        int pos = scratchOutput.Position;
        scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
        int pos2 = scratchOutput.Position;
        int vIntLen = pos2 - pos;

        // Move the count + includeOrig to the front of the byte[]:
        // 1) save the header bytes, 2) shift the ords right by the header length,
        // 3) place the header at offset 0.
        Array.Copy(scratch.bytes, pos, spare, 0, vIntLen);
        Array.Copy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
        Array.Copy(spare, 0, scratch.bytes, 0, vIntLen);

        // The dedup set is per key; reset it before the next one.
        if (dedupSet != null)
        {
            dedupSet.Clear();
        }

        scratch.length = scratchOutput.Position - scratch.offset;
        //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
        // scratch is reused on the next iteration, so the builder gets a deep copy.
        builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch));
    }

    FST<BytesRef> fst = builder.finish();
    return new SynonymMap(fst, words, maxHorizontalContext);
}
/// <summary>
/// Round-trips random Unicode strings through UTF16toUTF8 and UTF8toUTF32 and checks
/// that the resulting ints equal the string's code points. On a mismatch, diagnostic
/// output is printed before the test fails.
/// </summary>
public virtual void TestUTF8toUTF32()
{
    BytesRef utf8Bytes = new BytesRef(20);
    IntsRef decoded = new IntsRef(20);
    int[] expected = new int[20];
    int iterations = AtLeast(50000);

    for (int iter = 0; iter < iterations; iter++)
    {
        string s = TestUtil.RandomUnicodeString(Random());
        UnicodeUtil.UTF16toUTF8(s.ToCharArray(), 0, s.Length, utf8Bytes);
        UnicodeUtil.UTF8toUTF32(utf8Bytes, decoded);

        // Compute the expected code points directly from the string.
        int expectedCount = 0;
        for (int charPos = 0; charPos < s.Length; )
        {
            int codePoint = Character.CodePointAt(s, charPos);
            expected[expectedCount++] = codePoint;
            charPos += Character.CharCount(codePoint);
        }

        if (ArrayUtil.Equals(expected, 0, decoded.Ints, decoded.Offset, expectedCount))
        {
            continue;
        }

        // Mismatch: dump the input chars and both code point sequences, then fail.
        Console.WriteLine("FAILED");
        for (int k = 0; k < s.Length; k++)
        {
            Console.WriteLine(" char[" + k + "]=" + ((int)s[k]).ToString("x"));
        }
        Console.WriteLine();
        Assert.AreEqual(expectedCount, decoded.Length);
        for (int k = 0; k < expectedCount; k++)
        {
            Console.WriteLine(" " + decoded.Ints[k].ToString("x") + " vs " + expected[k].ToString("x"));
        }
        Assert.Fail("mismatch");
    }
}
/// <summary>
/// Builds the NormalizeCharMap from the pending pairs; call this once you
/// are done adding mappings. The pending pairs are cleared after the FST is built.
/// </summary>
/// <returns> a new NormalizeCharMap wrapping the built FST </returns>
public virtual NormalizeCharMap Build()
{
    FST<CharsRef> charMapFst;
    try
    {
        var fstBuilder = new Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, CharSequenceOutputs.Singleton);
        var keyInts = new IntsRef();

        foreach (var pair in pendingPairs)
        {
            var replacement = new CharsRef(pair.Value);
            fstBuilder.Add(Lucene.Net.Util.Fst.Util.ToUTF16(pair.Key, keyInts), replacement);
        }

        charMapFst = fstBuilder.Finish();
        pendingPairs.Clear();
    }
    catch (IOException ioe)
    {
        // Bogus FST IOExceptions!! (will never happen)
        throw new Exception("Should never happen", ioe);
    }

    return new NormalizeCharMap(charMapFst);
}
/// <summary>
/// Stores the FST entry, the per-document ord values, the FST and the scratch
/// objects handed in by the caller.
/// </summary>
public SortedSetDocValuesAnonymousInnerClassHelper(FSTEntry fstEntry, BinaryDocValues binaryDocValues, FST<long?> fst1, FST.BytesReader @in, FST.Arc<long?> arc, FST.Arc<long?> scratchArc1, IntsRef intsRef, BytesRefFSTEnum<long?> bytesRefFstEnum, BytesRef @ref, ByteArrayDataInput byteArrayDataInput)
{
    // per-field data
    this.entry = fstEntry;
    this.docToOrds = binaryDocValues;

    // FST and its reader
    this.fst = fst1;
    this.@in = @in;

    // scratch objects
    this.firstArc = arc;
    this.scratchArc = scratchArc1;
    this.scratchInts = intsRef;
    this.fstEnum = bytesRefFstEnum;
    this.@ref = @ref;
    this.input = byteArrayDataInput;
}
/// <summary>
/// Builds very large FSTs with three different output types and verifies each one.
/// <para>
/// For each of two iterations (doPack = false, then true) it builds: an FST with
/// NoOutputs (only when doPack is false) until the state count exceeds
/// int.MaxValue + 100 MB-worth of nodes; an FST with byte-sequence outputs until
/// its size exceeds LIMIT; and an FST with long outputs until its size exceeds LIMIT.
/// Every FST is verified twice - by per-input lookup and by full enumeration - with
/// a save/reload through the directory between the two verification rounds.
/// </para>
/// <para>
/// Verification re-creates <c>Random</c> with the same seed and replays the same
/// sequence of draws used while building, so the order of random calls matters.
/// </para>
/// </summary>
public virtual void Test()
{
    int[] ints = new int[7];
    IntsRef input = new IntsRef(ints, 0, ints.Length);
    // One seed shared by the build and verify phases so inputs can be replayed.
    int seed = Random().Next();

    Directory dir = new MMapDirectory(CreateTempDir("2BFST"));

    for (int doPackIter = 0; doPackIter < 2; doPackIter++)
    {
        bool doPack = doPackIter == 1;

        // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
        if (!doPack)
        {
            Console.WriteLine("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
            Outputs<object> outputs = NoOutputs.Singleton;
            object NO_OUTPUT = outputs.NoOutput;
            Builder<object> b = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

            int count = 0;
            Random r = new Random(seed);
            int[] ints2 = new int[200];
            IntsRef input2 = new IntsRef(ints2, 0, ints2.Length);
            while (true)
            {
                //System.out.println("add: " + input + " -> " + output);
                // Positions 10.. are filled with random values; the first 10 are
                // left to NextInput.
                for (int i = 10; i < ints2.Length; i++)
                {
                    ints2[i] = r.Next(256);
                }
                b.Add(input2, NO_OUTPUT);
                count++;
                if (count % 100000 == 0)
                {
                    Console.WriteLine(count + ": " + b.FstSizeInBytes() + " bytes; " + b.TotStateCount + " nodes");
                }
                if (b.TotStateCount > int.MaxValue + 100L * 1024 * 1024)
                {
                    break;
                }
                NextInput(r, ints2);
            }

            FST<object> fst = b.Finish();

            // Round 0 verifies the in-memory FST; round 1 verifies the reloaded one.
            for (int verify = 0; verify < 2; verify++)
            {
                Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                Arrays.Fill(ints2, 0);
                r = new Random(seed);

                for (int i = 0; i < count; i++)
                {
                    if (i % 1000000 == 0)
                    {
                        Console.WriteLine(i + "...: ");
                    }
                    for (int j = 10; j < ints2.Length; j++)
                    {
                        ints2[j] = r.Next(256);
                    }
                    Assert.AreEqual(NO_OUTPUT, Util.Get(fst, input2));
                    NextInput(r, ints2);
                }

                Console.WriteLine("\nTEST: enum all input/outputs");
                IntsRefFSTEnum<object> fstEnum = new IntsRefFSTEnum<object>(fst);

                Arrays.Fill(ints2, 0);
                r = new Random(seed);
                int upto = 0;
                while (true)
                {
                    IntsRefFSTEnum<object>.InputOutput<object> pair = fstEnum.Next();
                    if (pair == null)
                    {
                        break;
                    }
                    for (int j = 10; j < ints2.Length; j++)
                    {
                        ints2[j] = r.Next(256);
                    }
                    Assert.AreEqual(input2, pair.Input);
                    Assert.AreEqual(NO_OUTPUT, pair.Output);
                    upto++;
                    NextInput(r, ints2);
                }
                // The enumeration must have produced exactly as many entries as were added.
                Assert.AreEqual(count, upto);

                if (verify == 0)
                {
                    Console.WriteLine("\nTEST: save/load FST and re-verify");
                    IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                    fst.Save(@out);
                    @out.Dispose();
                    IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                    fst = new FST<object>(@in, outputs);
                    @in.Dispose();
                }
                else
                {
                    dir.DeleteFile("fst");
                }
            }
        }

        // Build FST w/ ByteSequenceOutputs and stop when FST
        // size = 3GB
        // NOTE(review): the actual threshold is the LIMIT constant declared
        // elsewhere; "3GB" comes from the original comment - confirm.
        {
            Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes");
            Outputs<BytesRef> outputs = ByteSequenceOutputs.Singleton;
            Builder<BytesRef> b = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

            // output wraps outputBytes, so refilling the array changes output's content.
            var outputBytes = new byte[20];
            BytesRef output = new BytesRef(outputBytes);
            Arrays.Fill(ints, 0);
            int count = 0;
            Random r = new Random(seed);
            while (true)
            {
                r.NextBytes(outputBytes);
                //System.out.println("add: " + input + " -> " + output);
                b.Add(input, BytesRef.DeepCopyOf(output));
                count++;
                if (count % 1000000 == 0)
                {
                    Console.WriteLine(count + "...: " + b.FstSizeInBytes() + " bytes");
                }
                if (b.FstSizeInBytes() > LIMIT)
                {
                    break;
                }
                NextInput(r, ints);
            }

            FST<BytesRef> fst = b.Finish();
            for (int verify = 0; verify < 2; verify++)
            {
                Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                r = new Random(seed);
                Arrays.Fill(ints, 0);

                for (int i = 0; i < count; i++)
                {
                    if (i % 1000000 == 0)
                    {
                        Console.WriteLine(i + "...: ");
                    }
                    r.NextBytes((byte[])(Array)outputBytes);
                    Assert.AreEqual(output, Util.Get(fst, input));
                    NextInput(r, ints);
                }

                Console.WriteLine("\nTEST: enum all input/outputs");
                IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst);

                Arrays.Fill(ints, 0);
                r = new Random(seed);
                int upto = 0;
                while (true)
                {
                    IntsRefFSTEnum<BytesRef>.InputOutput<BytesRef> pair = fstEnum.Next();
                    if (pair == null)
                    {
                        break;
                    }
                    Assert.AreEqual(input, pair.Input);
                    r.NextBytes((byte[])(Array)outputBytes);
                    Assert.AreEqual(output, pair.Output);
                    upto++;
                    NextInput(r, ints);
                }
                Assert.AreEqual(count, upto);

                if (verify == 0)
                {
                    Console.WriteLine("\nTEST: save/load FST and re-verify");
                    IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                    fst.Save(@out);
                    @out.Dispose();
                    IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                    fst = new FST<BytesRef>(@in, outputs);
                    @in.Dispose();
                }
                else
                {
                    dir.DeleteFile("fst");
                }
            }
        }

        // Build FST w/ PositiveIntOutputs and stop when FST
        // size = 3GB
        {
            Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long");
            Outputs<long?> outputs = PositiveIntOutputs.Singleton;
            Builder<long?> b = new Builder<long?>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

            // Outputs strictly increase: each step adds 1 plus a random value in [0, 10).
            long output = 1;

            Arrays.Fill(ints, 0);
            int count = 0;
            Random r = new Random(seed);
            while (true)
            {
                //System.out.println("add: " + input + " -> " + output);
                b.Add(input, output);
                output += 1 + r.Next(10);
                count++;
                if (count % 1000000 == 0)
                {
                    Console.WriteLine(count + "...: " + b.FstSizeInBytes() + " bytes");
                }
                if (b.FstSizeInBytes() > LIMIT)
                {
                    break;
                }
                NextInput(r, ints);
            }

            FST<long?> fst = b.Finish();

            for (int verify = 0; verify < 2; verify++)
            {
                Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                Arrays.Fill(ints, 0);

                output = 1;
                r = new Random(seed);
                for (int i = 0; i < count; i++)
                {
                    if (i % 1000000 == 0)
                    {
                        Console.WriteLine(i + "...: ");
                    }

                    // forward lookup:
                    Assert.AreEqual(output, (long)Util.Get(fst, input));
                    // reverse lookup:
                    Assert.AreEqual(input, Util.GetByOutput(fst, output));
                    output += 1 + r.Next(10);
                    NextInput(r, ints);
                }

                Console.WriteLine("\nTEST: enum all input/outputs");
                IntsRefFSTEnum<long?> fstEnum = new IntsRefFSTEnum<long?>(fst);

                Arrays.Fill(ints, 0);
                r = new Random(seed);
                int upto = 0;
                output = 1;
                while (true)
                {
                    IntsRefFSTEnum<long?>.InputOutput<long?> pair = fstEnum.Next();
                    if (pair == null)
                    {
                        break;
                    }
                    Assert.AreEqual(input, pair.Input);
                    Assert.AreEqual(output, pair.Output.Value);
                    output += 1 + r.Next(10);
                    upto++;
                    NextInput(r, ints);
                }
                Assert.AreEqual(count, upto);

                if (verify == 0)
                {
                    Console.WriteLine("\nTEST: save/load FST and re-verify");
                    IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                    fst.Save(@out);
                    @out.Dispose();
                    IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                    fst = new FST<long?>(@in, outputs);
                    @in.Dispose();
                }
                else
                {
                    dir.DeleteFile("fst");
                }
            }
        }
    }
    dir.Dispose();
}
/// <summary>
/// Decodes the UTF-8 bytes in <paramref name="utf8"/> into Unicode code points stored
/// in <paramref name="utf32"/>.
/// <para>
/// This method assumes valid UTF-8 input. It <strong>does not perform</strong> full
/// UTF-8 validation: only the first byte of each code point is checked (for
/// multi-byte sequences the bytes after the head are consumed without validation).
/// </para>
/// <para>
/// The result is always written starting at index 0 of <c>utf32.Ints</c>; on return
/// <c>utf32.Offset</c> is 0 and <c>utf32.Length</c> is the number of code points. The
/// backing array is replaced when it is null or shorter than <c>utf8.Length</c>.
/// </para>
/// </summary>
/// <param name="utf8"> the UTF-8 encoded bytes to decode (offset and length are honored) </param>
/// <param name="utf32"> receives the decoded code points </param>
/// <exception cref="System.ArgumentException"> If an invalid code point header byte occurs or the
/// content is prematurely truncated. </exception>
public static void UTF8toUTF32(BytesRef utf8, IntsRef utf32)
{
    // pre-alloc for worst case (every input byte is its own code point)
    // TODO: ints cannot be null, should be an assert
    if (utf32.Ints == null || utf32.Ints.Length < utf8.Length)
    {
        utf32.Ints = new int[utf8.Length];
    }

    int utf32Count = 0;
    int utf8Upto = utf8.Offset;
    int[] ints = utf32.Ints;
    var bytes = utf8.Bytes;
    int utf8Limit = utf8.Offset + utf8.Length;

    while (utf8Upto < utf8Limit)
    {
        // Sequence length is looked up from the lead byte.
        int numBytes = Utf8CodeLength[bytes[utf8Upto] & 0xFF];
        int v = 0;
        switch (numBytes)
        {
            case 1:
                // single-byte sequence: the byte is the code point
                ints[utf32Count++] = bytes[utf8Upto++];
                continue;
            case 2:
                // 5 useful bits
                v = bytes[utf8Upto++] & 31;
                break;
            case 3:
                // 4 useful bits
                v = bytes[utf8Upto++] & 15;
                break;
            case 4:
                // 3 useful bits
                v = bytes[utf8Upto++] & 7;
                break;
            default:
                throw new System.ArgumentException("invalid utf8");
        }

        // End (exclusive) of the continuation bytes for this sequence.
        int limit = utf8Upto + numBytes - 1;

        // A sequence whose continuation bytes extend beyond the logical end of the
        // input is truncated. Without this check the loop below would read bytes
        // that are not part of utf8 (or run off the end of the backing array).
        if (limit > utf8Limit)
        {
            throw new System.ArgumentException("invalid utf8");
        }

        while (utf8Upto < limit)
        {
            // each continuation byte contributes its low 6 bits
            v = v << 6 | bytes[utf8Upto++] & 63;
        }
        ints[utf32Count++] = v;
    }

    utf32.Offset = 0;
    utf32.Length = utf32Count;
}
/// <summary>
/// Creates a new <seealso cref="CachedOrds"/> by reading the ordinals of every document
/// in <c>[0, maxDoc)</c> from <paramref name="source"/> into one packed array.
/// <c>offsets[doc]</c> is the start of a document's ordinals and <c>offsets[doc + 1]</c> the end.
/// </summary>
/// <param name="source"> reader supplying the ordinals of each document </param>
/// <param name="maxDoc"> number of documents to read </param>
/// <exception cref="ThreadStateException"> if the total number of ordinals exceeds <c>ArrayUtil.MAX_ARRAY_LENGTH</c> </exception>
public CachedOrds(OrdinalsSegmentReader source, int maxDoc)
{
    offsets = new int[maxDoc + 1];
    // let's assume one ordinal per-document as an initial size
    int[] buffer = new int[maxDoc];

    // this aggregator is limited to Integer.MAX_VALUE total ordinals.
    long total = 0;
    IntsRef docOrds = new IntsRef(32);

    for (int doc = 0; doc < maxDoc; doc++)
    {
        offsets[doc] = (int)total;
        source.Get(doc, docOrds);

        long required = total + docOrds.Length;
        if (required > buffer.Length)
        {
            bool tooLarge = required > ArrayUtil.MAX_ARRAY_LENGTH;
            if (tooLarge)
            {
                throw new ThreadStateException("too many ordinals (>= " + required + ") to cache");
            }
            buffer = ArrayUtil.Grow(buffer, (int)required);
        }

        Array.Copy(docOrds.Ints, 0, buffer, (int)total, docOrds.Length);
        total = required;
    }
    offsets[maxDoc] = (int)total;

    // if ords array is bigger by more than 10% of what we really need, shrink it
    double fillRatio = (double)total / buffer.Length;
    if (fillRatio < 0.9)
    {
        int[] trimmed = new int[(int)total];
        Array.Copy(buffer, 0, trimmed, 0, (int)total);
        this.ordinals = trimmed;
    }
    else
    {
        this.ordinals = buffer;
    }
}
/// <summary>
/// Returns the sorted doc values for <paramref name="field"/>, backed by the field's
/// FST. The FST is read from <c>Data</c> on first use and then kept in <c>FstInstances</c>.
/// </summary>
/// <param name="field"> field whose values are requested </param>
/// <returns> a new view over the FST and the field's numeric doc-to-ord values </returns>
public override SortedDocValues GetSorted(FieldInfo field)
{
    FSTEntry fstEntry = Fsts[field.Number];

    FST<long> cachedFst;
    lock (this)
    {
        bool alreadyLoaded = FstInstances.TryGetValue(field.Number, out cachedFst);
        if (!alreadyLoaded)
        {
            // First request for this field: read the FST and account for its size.
            Data.Seek(fstEntry.Offset);
            cachedFst = new FST<long>(Data, PositiveIntOutputs.Singleton);
            RamBytesUsed_Renamed.AddAndGet(cachedFst.SizeInBytes());
            FstInstances[field.Number] = cachedFst;
        }
    }

    NumericDocValues ordPerDoc = GetNumeric(field);

    // per-thread resources: created fresh for every returned instance
    return new SortedDocValuesAnonymousInnerClassHelper(
        this,
        fstEntry,
        ordPerDoc,
        cachedFst,
        cachedFst.BytesReader,
        new FST<long>.Arc<long>(),
        new FST<long>.Arc<long>(),
        new IntsRef(),
        new BytesRefFSTEnum<long>(cachedFst));
}
/// <summary>
/// For every matching document, evaluates <paramref name="valueSource"/> and adds the
/// result to <c>values</c> at each of the document's ordinals, then calls <c>Rollup()</c>.
/// </summary>
/// <param name="matchingDocs"> per-segment hits to aggregate </param>
/// <param name="keepScores"> when true, a scorer carrying each hit's doc id and score is placed in the value source context under "scorer" </param>
/// <param name="valueSource"> source of the per-document value to sum </param>
private void SumValues(IList<MatchingDocs> matchingDocs, bool keepScores, ValueSource valueSource)
{
    FakeScorer fakeScorer = new FakeScorer();
    IDictionary context = new Dictionary<string, Scorer>();
    if (keepScores)
    {
        context["scorer"] = fakeScorer;
    }

    IntsRef docOrds = new IntsRef();
    foreach (MatchingDocs hits in matchingDocs)
    {
        OrdinalsReader.OrdinalsSegmentReader segmentOrds = ordinalsReader.GetReader(hits.Context);

        int nextScore = 0;
        float[] hitScores = hits.Scores;

        FunctionValues docValues = valueSource.GetValues(context, hits.Context);
        DocIdSetIterator iterator = hits.Bits.GetIterator();

        for (int doc = iterator.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.NextDoc())
        {
            segmentOrds.Get(doc, docOrds);

            if (keepScores)
            {
                // Expose the current hit to the value source before evaluating it.
                fakeScorer.docID_Renamed = doc;
                fakeScorer.score_Renamed = hitScores[nextScore++];
            }

            float docValue = (float)docValues.DoubleVal(doc);
            int ordCount = docOrds.Length;
            for (int k = 0; k < ordCount; k++)
            {
                values[docOrds.Ints[k]] += docValue;
            }
        }
    }

    Rollup();
}
/// <summary>
/// Stores the FST entry, the doc-to-ord values, the FST and the scratch objects
/// handed in by the caller.
/// </summary>
public SortedDocValuesAnonymousInnerClassHelper(FSTEntry entry, NumericDocValues docToOrd, FST<long?> fst, FST<long?>.BytesReader @in, FST<long?>.Arc<long?> firstArc, FST<long?>.Arc<long?> scratchArc, IntsRef scratchInts, BytesRefFSTEnum<long?> fstEnum)
{
    // per-field data
    Entry = entry;
    DocToOrd = docToOrd;

    // FST and its reader
    Fst = fst;
    this.@in = @in;

    // scratch objects
    FirstArc = firstArc;
    ScratchArc = scratchArc;
    ScratchInts = scratchInts;
    FstEnum = fstEnum;
}
/// <summary>
/// Builds a small replacement FST (a -> b, ab -> c, c -> de, def -> gh) and checks
/// the result of <c>Dictionary.ApplyMappings</c> on several inputs.
/// </summary>
public virtual void TestReplacements()
{
    Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
    Builder<CharsRef> builder = new Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
    IntsRef scratchInts = new IntsRef();

    // { from, to } pairs, added in this order
    string[][] mappings =
    {
        new[] { "a", "b" },
        new[] { "ab", "c" },
        new[] { "c", "de" },
        new[] { "def", "gh" },
    };
    foreach (string[] mapping in mappings)
    {
        Lucene.Net.Util.Fst.Util.ToUTF16(mapping[0], scratchInts);
        builder.Add(scratchInts, new CharsRef(mapping[1]));
    }

    FST<CharsRef> fst = builder.Finish();

    // { input, expected output } pairs
    string[][] cases =
    {
        new[] { "atestanother", "btestbnother" },
        new[] { "abtestanother", "ctestbnother" },
        new[] { "atestabnother", "btestcnother" },
        new[] { "abtestabnother", "ctestcnother" },
        new[] { "abtestabcnother", "ctestcdenother" },
        new[] { "defdefdefc", "ghghghde" },
    };
    foreach (string[] testCase in cases)
    {
        StringBuilder sb = new StringBuilder(testCase[0]);
        Dictionary.ApplyMappings(fst, sb);
        assertEquals(testCase[1], sb.ToString());
    }
}
/// <summary>
/// Loads the terms index FST on first call; does nothing when <c>Fst</c> is already set.
/// When <c>indexDivisor</c> is greater than 1 the loaded index is replaced by a
/// subsampled FST containing every <c>indexDivisor</c>-th entry.
/// </summary>
private void loadTermsIndex()
{
    if (Fst != null)
    {
        // already loaded
        return;
    }

    // Read the FST through a clone positioned at the index start.
    IndexInput indexIn = input.Clone();
    indexIn.Seek(indexStart);
    Fst = new FST<>(indexIn, fstOutputs);
    indexIn.Close();

    if (indexDivisor <= 1)
    {
        // no subsampling requested
        return;
    }

    // subsample
    IntsRef scratchIntsRef = new IntsRef();
    PositiveIntOutputs outputs = PositiveIntOutputs.GetSingleton();
    Builder<long> subsampled = new Builder<long>(FST.INPUT_TYPE.BYTE1, outputs);
    BytesRefFSTEnum<long> fstEnum = new BytesRefFSTEnum<long>(fst);
    BytesRefFSTEnum.InputOutput<long> entry;

    // Starts at indexDivisor so that the very first entry is kept.
    int sinceLastKept = indexDivisor;
    while ((entry = fstEnum.Next()) != null)
    {
        if (sinceLastKept == indexDivisor)
        {
            subsampled.Add(Util.ToIntsRef(entry.Input, scratchIntsRef), entry.Output);
            sinceLastKept = 0;
        }
        sinceLastKept++;
    }

    Fst = subsampled.Finish();
}