Ejemplo n.º 1
0
 /// <summary>
 /// Depth-first walk that collects, into <paramref name="strings"/>, every
 /// label sequence leading from <paramref name="s"/> to an accepting state.
 /// Reports false as soon as a transition leads back to a state already on
 /// the current path, or once more than <code>limit</code> strings have been
 /// collected. A negative <code>limit</code> disables the cap.
 /// </summary>
 /// <param name="s">State to expand.</param>
 /// <param name="pathstates">States on the current path; <paramref name="s"/> is added on entry and removed on successful exit.</param>
 /// <param name="strings">Receives a deep copy of the path for every accepting state reached.</param>
 /// <param name="path">Shared label buffer used as a stack during the walk.</param>
 /// <param name="limit">Maximum number of strings to collect, or a negative value for no maximum.</param>
 /// <returns>true if the walk below <paramref name="s"/> completed; false otherwise.</returns>
 private static bool GetFiniteStrings(State s, HashSet<State> pathstates, HashSet<IntsRef> strings, IntsRef path, int limit)
 {
     pathstates.Add(s);
     foreach (Transition edge in s.Transitions)
     {
         var target = edge.To;
         if (pathstates.Contains(target))
         {
             // The target is already on the current path.
             return false;
         }

         int label = edge.Min_Renamed;
         while (label <= edge.Max_Renamed)
         {
             // Push this label onto the shared path buffer.
             path.Grow(path.Length + 1);
             path.Ints[path.Length++] = label;

             if (target.accept)
             {
                 strings.Add(IntsRef.DeepCopyOf(path));
                 bool capExceeded = limit >= 0 && strings.Count > limit;
                 if (capExceeded)
                 {
                     return false;
                 }
             }

             bool completed = GetFiniteStrings(target, pathstates, strings, path, limit);
             if (!completed)
             {
                 return false;
             }

             // Pop the label before trying the next one in the range.
             path.Length--;
             label++;
         }
     }
     pathstates.Remove(s);
     return true;
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Increments the byte buffer to the next String in binary order after s that will not put
        /// the machine into a reject state. If such a string does not exist, returns
        /// false.
        ///
        /// The correctness of this method depends upon the automaton being deterministic,
        /// and having no transitions to dead states.
        ///
        /// Each pass of the outer loop (1) replays the bytes of SeekBytesRef through
        /// RunAutomaton, recording the state reached after every byte in SavedStates,
        /// (2) asks the NextString(state, pos) overload to extend the accepted prefix,
        /// and (3) if that fails, calls Backtrack(pos) and tries again.
        /// </summary>
        /// <returns> true if more possible solutions exist for the DFA </returns>
        private bool NextString()
        {
            int state;
            int pos = 0;

            // One slot per byte of the seek term plus one for the initial state.
            SavedStates.Grow(SeekBytesRef.Length + 1);
            int[] states = SavedStates.Ints;
            states[0] = RunAutomaton.InitialState;

            while (true)
            {
                // A fresh generation stamp makes every earlier Visited[] mark stale
                // without having to clear the array.
                CurGen++;
                Linear_Renamed = false;
                // walk the automaton until a character is rejected.
                // pos is deliberately not reset here: the walk resumes from wherever
                // the previous pass (or Backtrack) left it.
                for (state = states[pos]; pos < SeekBytesRef.Length; pos++)
                {
                    Visited[state] = CurGen;
                    int nextState = RunAutomaton.Step(state, SeekBytesRef.Bytes[pos] & 0xff);
                    if (nextState == -1)
                    {
                        // Byte rejected: state/pos now describe the longest usable prefix.
                        break;
                    }
                    states[pos + 1] = nextState;
                    // we found a loop, record it for faster enumeration
                    // NOTE(review): assigning Linear presumably switches on linear mode
                    // (and sets Linear_Renamed) via its setter; confirm against the member definition.
                    if ((Finite == false) && !Linear_Renamed && Visited[nextState] == CurGen)
                    {
                        Linear = pos;
                    }
                    state = nextState;
                }

                // take the useful portion, and the last non-reject state, and attempt to
                // append characters that will match.
                if (NextString(state, pos))
                {
                    return(true);
                } // no more solutions exist from this useful portion, backtrack
                else
                {
                    // Backtrack returns the position to resume from, or a negative value.
                    if ((pos = Backtrack(pos)) < 0) // no more solutions at all
                    {
                        return(false);
                    }
                    // Re-check the byte at the backtracked position from its saved state.
                    int newState = RunAutomaton.Step(states[pos], SeekBytesRef.Bytes[pos] & 0xff);
                    if (newState >= 0 && RunAutomaton.IsAccept(newState))
                    /* String is good to go as-is */
                    {
                        return(true);
                    }
                    /* else advance further */
                    // TODO: paranoia? if we backtrack thru an infinite DFA, the loop detection is important!
                    // for now, restart from scratch for all infinite DFAs
                    if (Finite == false)
                    {
                        pos = 0;
                    }
                }
            }
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Copies the bytes of <paramref name="input"/> into <paramref name="scratch"/>,
 /// widening each one to its unsigned value (0-255).
 /// </summary>
 /// <param name="input">Source bytes; the slice starting at its Offset and spanning its Length is read.</param>
 /// <param name="scratch">Reusable destination. Its previous contents are overwritten and its Offset is reset to 0.</param>
 /// <returns>The same <paramref name="scratch"/> instance, with Length equal to <c>input.Length</c>.</returns>
 public static IntsRef ToIntsRef(BytesRef input, IntsRef scratch)
 {
     // Values are written starting at index 0, so the view must start there too;
     // a reused scratch could otherwise still carry a non-zero offset.
     scratch.Offset = 0;
     scratch.Grow(input.Length);
     for (int i = 0; i < input.Length; i++)
     {
         // Mask to drop any sign extension and keep the unsigned byte value.
         scratch.Ints[i] = input.Bytes[i + input.Offset] & 0xFF;
     }
     scratch.Length = input.Length;
     return scratch;
 }
Ejemplo n.º 4
0
 /// <summary>
 /// Copies the bytes of <paramref name="br"/> into <paramref name="ir"/> as
 /// unsigned values (0-255).
 /// </summary>
 /// <param name="br">Source bytes; the slice starting at its Offset and spanning its Length is read.</param>
 /// <param name="ir">Reusable destination. Its previous contents are overwritten and its Offset is reset to 0.</param>
 /// <returns>The same <paramref name="ir"/> instance, with Length equal to <c>br.Length</c>.</returns>
 internal static IntsRef ToIntsRef(BytesRef br, IntsRef ir)
 {
     // Values are written starting at index 0, so the view must start there too;
     // a reused destination could otherwise still carry a non-zero offset.
     ir.Offset = 0;
     if (br.Length > ir.Ints.Length)
     {
         ir.Grow(br.Length);
     }
     for (int i = 0; i < br.Length; i++)
     {
         // Mask to drop any sign extension and keep the unsigned byte value.
         ir.Ints[i] = br.Bytes[br.Offset + i] & 0xFF;
     }
     ir.Length = br.Length;
     return ir;
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Stores every UTF-16 code unit of <paramref name="s"/> as one int in
        /// <paramref name="scratch"/>. Surrogate pairs are not combined.
        /// </summary>
        /// <param name="s">Text whose chars are copied.</param>
        /// <param name="scratch">Reusable destination; its Offset is set to 0 and its Length to <c>s.Length</c>.</param>
        /// <returns>The same <paramref name="scratch"/> instance.</returns>
        public static IntsRef ToUTF16(string s, IntsRef scratch)
        {
            int unitCount = s.Length;

            scratch.Offset = 0;
            scratch.Length = unitCount;
            scratch.Grow(unitCount);

            // Fetch the backing array only after Grow, which may have replaced it.
            int[] target = scratch.Ints;
            int writeAt = 0;
            foreach (char unit in s)
            {
                target[writeAt++] = unit;
            }
            return scratch;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Walks <paramref name="fst"/> from its first arc, choosing one outgoing arc
        /// at random at every node until an arc labelled END_LABEL is picked.
        /// The labels along the way are written into <paramref name="in"/> and the
        /// outputs along the way are combined with <c>fst.Outputs.Add</c>.
        /// </summary>
        /// <param name="fst">The FST to walk.</param>
        /// <param name="in">Receives the chosen labels; its Length and Offset are reset to 0 first.</param>
        /// <returns>The accumulated output of the chosen path, including the output of the final END_LABEL arc.</returns>
        private T RandomAcceptedWord(FST<T> fst, IntsRef @in)
        {
            FST.Arc<T> current = fst.GetFirstArc(new FST.Arc<T>());
            var candidates = new List<FST.Arc<T>>();

            @in.Length = 0;
            @in.Offset = 0;
            T accumulated = fst.Outputs.NoOutput;

            FST.BytesReader reader = fst.BytesReader;

            for (;;)
            {
                // Snapshot every arc leaving the current target node.
                fst.ReadFirstTargetArc(current, current, reader);
                for (;;)
                {
                    candidates.Add(new FST.Arc<T>().CopyFrom(current));
                    if (current.Last)
                    {
                        break;
                    }
                    fst.ReadNextArc(current, reader);
                }

                // Choose one of them at random and drop the rest.
                int choice = Random.Next(candidates.Count);
                current = candidates[choice];
                candidates.Clear();

                accumulated = fst.Outputs.Add(accumulated, current.Output);

                if (current.Label == FST<T>.END_LABEL)
                {
                    // The end marker is not part of the word itself.
                    return accumulated;
                }

                // Append the chosen label, enlarging the buffer when it is full.
                if (@in.Ints.Length == @in.Length)
                {
                    @in.Grow(1 + @in.Length);
                }
                @in.Ints[@in.Length] = current.Label;
                @in.Length++;
            }
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Decodes the Unicode code points of <paramref name="s"/> and stores them,
        /// one per int, in <paramref name="scratch"/>.
        /// </summary>
        /// <param name="s">Text to decode.</param>
        /// <param name="scratch">Reusable destination; must not be null. Its previous contents are overwritten and its Offset is reset to 0.</param>
        /// <returns>The same <paramref name="scratch"/> instance, with Length set to the number of code points written.</returns>
        public static IntsRef ToUTF32(string s, IntsRef scratch)
        {
            int charIdx   = 0;
            int intIdx    = 0;
            int charLimit = s.Length;

            // Code points are written starting at index 0, so the view must start
            // there too; a reused scratch could otherwise still carry a non-zero offset.
            scratch.Offset = 0;

            while (charIdx < charLimit)
            {
                scratch.Grow(intIdx + 1);
                int utf32 = Character.CodePointAt(s, charIdx);
                scratch.Ints[intIdx] = utf32;
                // Advance by the number of chars this code point occupies.
                charIdx += Character.CharCount(utf32);
                intIdx++;
            }
            scratch.Length = intIdx;
            return scratch;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Decodes the Unicode code points found in a slice of <paramref name="s"/>
        /// and stores them, one per int, in <paramref name="scratch"/>.
        /// </summary>
        /// <param name="s">Char buffer to decode from.</param>
        /// <param name="offset">Index of the first char to decode.</param>
        /// <param name="length">Number of chars to decode.</param>
        /// <param name="scratch">Reusable destination; must not be null. Its previous contents are overwritten and its Offset is reset to 0.</param>
        /// <returns>The same <paramref name="scratch"/> instance, with Length set to the number of code points written.</returns>
        public static IntsRef ToUTF32(char[] s, int offset, int length, IntsRef scratch)
        {
            int charIdx   = offset;
            int intIdx    = 0;
            int charLimit = offset + length;

            // Code points are written starting at index 0, so the view must start
            // there too; a reused scratch could otherwise still carry a non-zero offset.
            scratch.Offset = 0;

            while (charIdx < charLimit)
            {
                scratch.Grow(intIdx + 1);
                // charLimit is handed to the decoder as well; presumably it keeps a
                // trailing high surrogate from pairing with a char outside the slice.
                int utf32 = Character.CodePointAt(s, charIdx, charLimit);
                scratch.Ints[intIdx] = utf32;
                // Advance by the number of chars this code point occupies.
                charIdx += Character.CharCount(utf32);
                intIdx++;
            }
            scratch.Length = intIdx;
            return scratch;
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Decodes the Unicode code points of <paramref name="s"/> and stores them,
        /// one per int, in <paramref name="ir"/>.
        /// </summary>
        /// <param name="s">Text to decode.</param>
        /// <param name="ir">Reusable destination. Its previous contents are overwritten and its Offset is reset to 0.</param>
        /// <returns>The same <paramref name="ir"/> instance, with Length set to the number of code points written.</returns>
        internal static IntsRef ToIntsRefUTF32(string s, IntsRef ir)
        {
            int charLength = s.Length;
            int charIdx    = 0;
            int intIdx     = 0;

            // Code points are written starting at index 0, so the view must start
            // there too; a reused destination could otherwise still carry a non-zero offset.
            ir.Offset = 0;

            while (charIdx < charLength)
            {
                // Enlarge only when the backing array is full.
                if (intIdx == ir.Ints.Length)
                {
                    ir.Grow(intIdx + 1);
                }
                int utf32 = Character.CodePointAt(s, charIdx);
                ir.Ints[intIdx] = utf32;
                // Advance by the number of chars this code point occupies.
                charIdx += Character.CharCount(utf32);
                intIdx++;
            }
            ir.Length = intIdx;
            return ir;
        }
Ejemplo n.º 10
0
 /// <summary>
 /// Recursively enumerates the label sequences that lead from
 /// <paramref name="s"/> to an accepting state, adding each one to
 /// <paramref name="strings"/>. The result is false when a transition points
 /// at a state already on the current path, or when the number of collected
 /// strings exceeds <code>limit</code>. <code>limit</code>&lt;0 means "no limit".
 /// </summary>
 /// <param name="s">State whose outgoing transitions are explored.</param>
 /// <param name="pathstates">States currently on the recursion path.</param>
 /// <param name="strings">Output set of accepted label sequences.</param>
 /// <param name="path">Working buffer holding the labels of the current path.</param>
 /// <param name="limit">Upper bound on the number of strings, or negative for none.</param>
 /// <returns>true when exploration finished; false when it was cut short.</returns>
 private static bool GetFiniteStrings(State s, HashSet<State> pathstates, HashSet<IntsRef> strings, IntsRef path, int limit)
 {
     bool unbounded = limit < 0;

     pathstates.Add(s);
     foreach (Transition step in s.Transitions)
     {
         if (pathstates.Contains(step.To))
         {
             return false;
         }

         for (int symbol = step.Min_Renamed; symbol <= step.Max_Renamed; ++symbol)
         {
             // Append the symbol at the end of the path buffer.
             path.Grow(path.Length + 1);
             int slot = path.Length;
             path.Ints[slot] = symbol;
             path.Length = slot + 1;

             if (step.To.accept)
             {
                 strings.Add(IntsRef.DeepCopyOf(path));
                 if (!unbounded && strings.Count > limit)
                 {
                     return false;
                 }
             }

             if (GetFiniteStrings(step.To, pathstates, strings, path, limit))
             {
                 // Subtree done: drop the symbol and move on to the next one.
                 path.Length--;
                 continue;
             }
             return false;
         }
     }
     pathstates.Remove(s);
     return true;
 }
Ejemplo n.º 11
0
        /// <summary>
        /// Expert: like <seealso cref="Util#getByOutput(FST, long)"/> except reusing
        /// BytesReader, initial and scratch Arc, and result.
        ///
        /// Starting from <paramref name="arc"/>, repeatedly descends into the target node
        /// and picks one outgoing arc, accumulating arc outputs, until a final arc whose
        /// accumulated output equals <paramref name="targetOutput"/> is reached.
        /// NOTE(review): the arc selection (binary search / "previous arc" fallback)
        /// presumably relies on arc outputs being non-decreasing in arc order within a
        /// node; confirm against how the FST was built.
        /// </summary>
        /// <param name="fst">FST to search.</param>
        /// <param name="targetOutput">Output value to look up.</param>
        /// <param name="in">Reader used to decode arcs; its position is changed by the search.</param>
        /// <param name="arc">Arc to start from; overwritten as the search descends.</param>
        /// <param name="scratchArc">Scratch arc used to remember the previously read arc during the linear scan.</param>
        /// <param name="result">Receives the labels of the matching path; its Length is set on success.</param>
        /// <returns><paramref name="result"/> when a path with the target output is found; otherwise null.</returns>
        public static IntsRef GetByOutput(FST <long?> fst, long targetOutput, FST <long?> .BytesReader @in, FST <long?> .Arc <long?> arc, FST <long?> .Arc <long?> scratchArc, IntsRef result)
        {
            // Running sum of outputs along the path taken so far.
            long output = arc.Output.Value;
            // Number of labels written into result so far.
            int  upto   = 0;

            //System.out.println("reverseLookup output=" + targetOutput);

            while (true)
            {
                //System.out.println("loop: output=" + output + " upto=" + upto + " arc=" + arc);
                if (arc.Final)
                {
                    long finalOutput = output + arc.NextFinalOutput.Value;
                    //System.out.println("  isFinal finalOutput=" + finalOutput);
                    if (finalOutput == targetOutput)
                    {
                        result.Length = upto;
                        //System.out.println("    found!");
                        return(result);
                    }
                    else if (finalOutput > targetOutput)
                    {
                        // Already past the target at a final arc: give up.
                        //System.out.println("    not found!");
                        return(null);
                    }
                }

                if (FST <long?> .TargetHasArcs(arc))
                {
                    //System.out.println("  targetHasArcs");
                    // Make room for the one label appended in this iteration.
                    if (result.Ints.Length == upto)
                    {
                        result.Grow(1 + upto);
                    }

                    fst.ReadFirstRealTargetArc(arc.Target, arc, @in);

                    if (arc.BytesPerArc != 0)
                    {
                        // Non-zero BytesPerArc: arcs are addressed by index
                        // (PosArcsStart + BytesPerArc * index), so binary-search them.
                        int low  = 0;
                        int high = arc.NumArcs - 1;
                        int mid  = 0;
                        //System.out.println("bsearch: numArcs=" + arc.numArcs + " target=" + targetOutput + " output=" + output);
                        bool exact = false;
                        while (low <= high)
                        {
                            // Unsigned shift keeps the midpoint correct even if low + high overflows.
                            mid          = (int)((uint)(low + high) >> 1);
                            @in.Position = arc.PosArcsStart;
                            @in.SkipBytes(arc.BytesPerArc * mid);
                            var flags = (sbyte)@in.ReadByte();
                            // The label is read only to move the reader past it.
                            fst.ReadLabel(@in);
                            long minArcOutput;
                            if ((flags & FST <long> .BIT_ARC_HAS_OUTPUT) != 0)
                            {
                                long arcOutput = fst.Outputs.Read(@in).Value;
                                minArcOutput = output + arcOutput;
                            }
                            else
                            {
                                minArcOutput = output;
                            }
                            if (minArcOutput == targetOutput)
                            {
                                exact = true;
                                break;
                            }
                            else if (minArcOutput < targetOutput)
                            {
                                low = mid + 1;
                            }
                            else
                            {
                                high = mid - 1;
                            }
                        }

                        if (high == -1)
                        {
                            // Even the first arc overshoots the target.
                            return(null);
                        }
                        else if (exact)
                        {
                            // NOTE(review): the index is set one below the wanted arc, presumably
                            // because ReadNextRealArc advances ArcIdx before decoding; confirm.
                            arc.ArcIdx = mid - 1;
                        }
                        else
                        {
                            // No exact hit: aim at arc low - 1 (same one-below convention as above).
                            arc.ArcIdx = low - 2;
                        }

                        fst.ReadNextRealArc(arc, @in);
                        result.Ints[upto++] = arc.Label;
                        output += arc.Output.Value;
                    }
                    else
                    {
                        // Arcs are scanned one by one, remembering the previous arc in scratchArc.
                        FST <long?> .Arc <long?> prevArc = null;

                        while (true)
                        {
                            //System.out.println("    cycle label=" + arc.label + " output=" + arc.output);

                            // this is the min output we'd hit if we follow
                            // this arc:
                            long minArcOutput = output + arc.Output.Value;

                            if (minArcOutput == targetOutput)
                            {
                                // Recurse on this arc:
                                //System.out.println("  match!  break");
                                output = minArcOutput;
                                result.Ints[upto++] = arc.Label;
                                break;
                            }
                            else if (minArcOutput > targetOutput)
                            {
                                if (prevArc == null)
                                {
                                    // Output doesn't exist
                                    return(null);
                                }
                                else
                                {
                                    // Recurse on previous arc:
                                    arc.CopyFrom(prevArc);
                                    result.Ints[upto++] = arc.Label;
                                    output += arc.Output.Value;
                                    //System.out.println("    recurse prev label=" + (char) arc.label + " output=" + output);
                                    break;
                                }
                            }
                            else if (arc.Last)
                            {
                                // Recurse on this arc:
                                output = minArcOutput;
                                //System.out.println("    recurse last label=" + (char) arc.label + " output=" + output);
                                result.Ints[upto++] = arc.Label;
                                break;
                            }
                            else
                            {
                                // Read next arc in this node:
                                prevArc = scratchArc;
                                prevArc.CopyFrom(arc);
                                //System.out.println("      after copy label=" + (char) prevArc.label + " vs " + (char) arc.label);
                                fst.ReadNextRealArc(arc, @in);
                            }
                        }
                    }
                }
                else
                {
                    //System.out.println("  no target arcs; not found!");
                    return(null);
                }
            }
        }
Ejemplo n.º 12
0
        /// <summary>
        /// For every index field in <paramref name="byField"/>: registers each facet
        /// label with the taxonomy writer, collects the resulting ordinals (plus parent
        /// ordinals where the dimension configuration asks for them), adds one drill-down
        /// <c>StringField</c> per path prefix, and finally adds a single
        /// <c>BinaryDocValuesField</c> holding the encoded ordinals.
        /// </summary>
        /// <param name="taxoWriter">Taxonomy writer that assigns ordinals to categories.</param>
        /// <param name="byField">Facet fields grouped by the index field name they are written to.</param>
        /// <param name="doc">Document that receives the generated fields.</param>
        /// <exception cref="System.ArgumentException">A facet field has more than one path component but its dimension is not hierarchical.</exception>
        private void ProcessFacetFields(TaxonomyWriter taxoWriter, IDictionary<string, IList<FacetField>> byField, Document doc)
        {
            foreach (KeyValuePair<string, IList<FacetField>> entry in byField)
            {
                string fieldName = entry.Key;
                var ords = new IntsRef(32);

                foreach (FacetField field in entry.Value)
                {
                    FacetsConfig.DimConfig dimConfig = GetDimConfig(field.dim);
                    bool multiComponent = field.path.Length > 1;
                    if (multiComponent && !dimConfig.Hierarchical)
                    {
                        throw new System.ArgumentException("dimension \"" + field.dim + "\" is not hierarchical yet has " + field.path.Length + " components");
                    }

                    var label = new FacetLabel(field.dim, field.path);

                    checkTaxoWriter(taxoWriter);
                    int ordinal = taxoWriter.AddCategory(label);

                    // Append the category's own ordinal, enlarging the buffer when full.
                    if (ords.Ints.Length == ords.Length)
                    {
                        ords.Grow(ords.Length + 1);
                    }
                    ords.Ints[ords.Length] = ordinal;
                    ords.Length++;

                    if (dimConfig.MultiValued && (dimConfig.Hierarchical || dimConfig.RequireDimCount))
                    {
                        // Walk up the taxonomy, appending every ancestor with a positive ordinal.
                        for (int ancestor = taxoWriter.GetParent(ordinal); ancestor > 0; ancestor = taxoWriter.GetParent(ancestor))
                        {
                            if (ords.Length == ords.Ints.Length)
                            {
                                ords.Grow(ords.Length + 1);
                            }
                            ords.Ints[ords.Length] = ancestor;
                            ords.Length++;
                        }

                        if (!dimConfig.RequireDimCount)
                        {
                            // The last ordinal appended is the dimension itself; drop it.
                            ords.Length--;
                        }
                    }

                    // Drill-down terms: one per prefix of the label's path.
                    for (int prefixLen = 1; prefixLen <= label.Length; ++prefixLen)
                    {
                        string term = PathToString(label.Components, prefixLen);
                        doc.Add(new StringField(fieldName, term, Field.Store.NO));
                    }
                }

                // Facet counts are stored as doc values, one field per index field name.
                doc.Add(new BinaryDocValuesField(fieldName, DedupAndEncode(ords)));
            }
        }