Exemplo n.º 1
0
        /// <summary>
        /// Looks up the output for this input, or <c>null</c> if the
        /// input is not accepted.
        /// </summary>
        public static T Get <T>(FST <T> fst, Int32sRef input)
        {
            // TODO: would be nice not to alloc this on every lookup
            var arc = fst.GetFirstArc(new FST.Arc <T>());

            var fstReader = fst.GetBytesReader();

            // Accumulate output as we go
            T output = fst.Outputs.NoOutput;

            for (int i = 0; i < input.Length; i++)
            {
                if (fst.FindTargetArc(input.Int32s[input.Offset + i], arc, arc, fstReader) == null)
                {
                    return(default);
Exemplo n.º 2
0
        /// <summary>
        /// Looks up the output for this input, or <c>null</c> if the
        /// input is not accepted.
        /// </summary>
        public static T Get <T>(FST <T> fst, Int32sRef input) where T : class // LUCENENET specific - added class constraint, since we compare reference equality
        {
            // TODO: would be nice not to alloc this on every lookup
            var arc = fst.GetFirstArc(new FST.Arc <T>());

            var fstReader = fst.GetBytesReader();

            // Accumulate output as we go
            T output = fst.Outputs.NoOutput;

            for (int i = 0; i < input.Length; i++)
            {
                if (fst.FindTargetArc(input.Int32s[input.Offset + i], arc, arc, fstReader) == null)
                {
                    return(default);
Exemplo n.º 3
0
        // runs the term, returning the output, or null if term
        // isn't accepted.  if prefixLength is non-null it must be
        // length 1 int array; prefixLength[0] is set to the length
        // of the term prefix that matches
        private T Run(FST <T> fst, Int32sRef term, int[] prefixLength)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(prefixLength == null || prefixLength.Length == 1);
            }
            FST.Arc <T> arc       = fst.GetFirstArc(new FST.Arc <T>());
            T           NO_OUTPUT = fst.Outputs.NoOutput;
            T           output    = NO_OUTPUT;

            FST.BytesReader fstReader = fst.GetBytesReader();

            for (int i = 0; i <= term.Length; i++)
            {
                int label;
                if (i == term.Length)
                {
                    label = FST.END_LABEL;
                }
                else
                {
                    label = term.Int32s[term.Offset + i];
                }
                // System.out.println("   loop i=" + i + " label=" + label + " output=" + fst.Outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
                if (fst.FindTargetArc(label, arc, arc, fstReader) == null)
                {
                    // System.out.println("    not found");
                    if (prefixLength != null)
                    {
                        prefixLength[0] = i;
                        return(output);
                    }
                    else
                    {
                        return(default(T));
                    }
                }
                output = fst.Outputs.Add(output, arc.Output);
            }

            if (prefixLength != null)
            {
                prefixLength[0] = term.Length;
            }

            return(output);
        }
Exemplo n.º 4
0
        private T RandomAcceptedWord(FST <T> fst, Int32sRef @in)
        {
            FST.Arc <T> arc = fst.GetFirstArc(new FST.Arc <T>());

            IList <FST.Arc <T> > arcs = new List <FST.Arc <T> >();

            @in.Length = 0;
            @in.Offset = 0;
            T NO_OUTPUT = fst.Outputs.NoOutput;
            T output    = NO_OUTPUT;

            FST.BytesReader fstReader = fst.GetBytesReader();

            while (true)
            {
                // read all arcs:
                fst.ReadFirstTargetArc(arc, arc, fstReader);
                arcs.Add((new FST.Arc <T>()).CopyFrom(arc));
                while (!arc.IsLast)
                {
                    fst.ReadNextArc(arc, fstReader);
                    arcs.Add((new FST.Arc <T>()).CopyFrom(arc));
                }

                // pick one
                arc = arcs[random.Next(arcs.Count)];
                arcs.Clear();

                // accumulate output
                output = fst.Outputs.Add(output, arc.Output);

                // append label
                if (arc.Label == FST.END_LABEL)
                {
                    break;
                }

                if (@in.Int32s.Length == @in.Length)
                {
                    @in.Grow(1 + @in.Length);
                }
                @in.Int32s[@in.Length++] = arc.Label;
            }

            return(output);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Default constructor that takes a <see cref="TextReader"/>. </summary>
        public MappingCharFilter(NormalizeCharMap normMap, TextReader @in)
            : base(@in)
        {
            //LUCENENET support to reset the reader.
            _input = GetBufferedReader(@in);
            _input.Mark(BufferedCharFilter.DEFAULT_CHAR_BUFFER_SIZE);
            buffer.Reset(_input);
            //buffer.Reset(@in);

            map            = normMap.map;
            cachedRootArcs = normMap.cachedRootArcs;

            if (map != null)
            {
                fstReader = map.GetBytesReader();
            }
            else
            {
                fstReader = null;
            }
        }
Exemplo n.º 6
0
        public override SortedSetDocValues GetSortedSet(FieldInfo field)
        {
            FSTEntry entry = fsts[field.Number];

            if (entry.NumOrds == 0)
            {
                return(DocValues.EMPTY_SORTED_SET); // empty FST!
            }
            FST <long?> instance;

            UninterruptableMonitor.Enter(this);
            try
            {
                if (!fstInstances.TryGetValue(field.Number, out instance) || instance == null)
                {
                    data.Seek(entry.Offset);
                    instance = new FST <long?>(data, PositiveInt32Outputs.Singleton);
                    ramBytesUsed.AddAndGet(instance.GetSizeInBytes());
                    fstInstances[field.Number] = instance;
                }
            }
            finally
            {
                UninterruptableMonitor.Exit(this);
            }
            BinaryDocValues docToOrds = GetBinary(field);
            FST <long?>     fst       = instance;

            // per-thread resources
            var @in         = fst.GetBytesReader();
            var firstArc    = new FST.Arc <long?>();
            var scratchArc  = new FST.Arc <long?>();
            var scratchInts = new Int32sRef();
            var fstEnum     = new BytesRefFSTEnum <long?>(fst);
            var @ref        = new BytesRef();
            var input       = new ByteArrayDataInput();

            return(new SortedSetDocValuesAnonymousClass(entry, docToOrds, fst, @in, firstArc, scratchArc, scratchInts, fstEnum, @ref, input));
        }
Exemplo n.º 7
0
        private static void Walk <T>(FST <T> fst) // LUCENENET NOTE: Not referenced anywhere
        {
            var queue = new List <FST.Arc <T> >();

            // Java version was BitSet(), but in .NET we don't have a zero contructor BitSet.
            // Couldn't find the default size in BitSet, so went with zero here.
            var seen     = new BitSet();
            var reader   = fst.GetBytesReader();
            var startArc = fst.GetFirstArc(new FST.Arc <T>());

            queue.Add(startArc);
            while (queue.Count > 0)
            {
                //FST.Arc<T> arc = queue.Remove(0);
                var arc = queue[0];
                queue.RemoveAt(0);

                long node = arc.Target;
                //System.out.println(arc);
                if (FST <T> .TargetHasArcs(arc) && !seen.Get((int)node))
                {
                    seen.Set((int)node);
                    fst.ReadFirstRealTargetArc(node, arc, reader);
                    while (true)
                    {
                        queue.Add((new FST.Arc <T>()).CopyFrom(arc));
                        if (arc.IsLast)
                        {
                            break;
                        }
                        else
                        {
                            fst.ReadNextRealArc(arc, reader);
                        }
                    }
                }
            }
        }
Exemplo n.º 8
0
 // Use the builder to create:
 private NormalizeCharMap(FST <CharsRef> map)
 {
     this.map = map;
     if (map != null)
     {
         try
         {
             // Pre-cache root arcs:
             var             scratchArc = new FST.Arc <CharsRef>();
             FST.BytesReader fstReader  = map.GetBytesReader();
             map.GetFirstArc(scratchArc);
             if (FST <CharsRef> .TargetHasArcs(scratchArc))
             {
                 map.ReadFirstRealTargetArc(scratchArc.Target, scratchArc, fstReader);
                 while (true)
                 {
                     if (Debugging.AssertsEnabled)
                     {
                         Debugging.Assert(scratchArc.Label != FST.END_LABEL);
                     }
                     cachedRootArcs[Convert.ToChar((char)scratchArc.Label)] = (new FST.Arc <CharsRef>()).CopyFrom(scratchArc);
                     if (scratchArc.IsLast)
                     {
                         break;
                     }
                     map.ReadNextRealArc(scratchArc, fstReader);
                 }
             }
             //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
         }
         catch (IOException ioe)
         {
             // Bogus FST IOExceptions!!  (will never happen)
             throw new Exception("Should never happen", ioe);
         }
     }
 }
Exemplo n.º 9
0
                internal IntersectTermsEnum(FSTTermsReader.TermsReader outerInstance, CompiledAutomaton compiled, BytesRef startTerm) : base(outerInstance)
                {
                    this.outerInstance = outerInstance;
                    //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
                    this.fst        = outerInstance.dict;
                    this.fstReader  = fst.GetBytesReader();
                    this.fstOutputs = outerInstance.dict.Outputs;
                    this.fsa        = compiled.RunAutomaton;
                    this.level      = -1;
                    this.stack      = new Frame[16];
                    for (int i = 0; i < stack.Length; i++)
                    {
                        this.stack[i] = new Frame(this);
                    }

                    Frame frame;

                    frame = LoadVirtualFrame(NewFrame());
                    this.level++;
                    frame = LoadFirstFrame(NewFrame());
                    PushFrame(frame);

                    this.meta     = null;
                    this.metaUpto = 1;
                    this.decoded  = false;
                    this.pending  = false;

                    if (startTerm == null)
                    {
                        pending = IsAccept(TopFrame());
                    }
                    else
                    {
                        DoSeekCeil(startTerm);
                        pending = !startTerm.Equals(term) && IsValid(TopFrame()) && IsAccept(TopFrame());
                    }
                }
Exemplo n.º 10
0
        /// <summary>
        /// Dumps an <see cref="FST{T}"/> to a GraphViz's <c>dot</c> language description
        /// for visualization. Example of use:
        ///
        /// <code>
        /// using (TextWriter sw = new StreamWriter(&quot;out.dot&quot;))
        /// {
        ///     Util.ToDot(fst, sw, true, true);
        /// }
        /// </code>
        ///
        /// and then, from command line:
        ///
        /// <code>
        /// dot -Tpng -o out.png out.dot
        /// </code>
        ///
        /// <para/>
        /// Note: larger FSTs (a few thousand nodes) won't even
        /// render, don't bother.  If the FST is &gt; 2.1 GB in size
        /// then this method will throw strange exceptions.
        /// <para/>
        /// See also <a href="http://www.graphviz.org/">http://www.graphviz.org/</a>.
        /// </summary>
        /// <param name="sameRank">
        ///          If <c>true</c>, the resulting <c>dot</c> file will try
        ///          to order states in layers of breadth-first traversal. This may
        ///          mess up arcs, but makes the output FST's structure a bit clearer.
        /// </param>
        /// <param name="labelStates">
        ///          If <c>true</c> states will have labels equal to their offsets in their
        ///          binary format. Expands the graph considerably.
        /// </param>
        public static void ToDot <T>(FST <T> fst, TextWriter @out, bool sameRank, bool labelStates)
        {
            const string expandedNodeColor = "blue";

            // this is the start arc in the automaton (from the epsilon state to the first state
            // with outgoing transitions.
            FST.Arc <T> startArc = fst.GetFirstArc(new FST.Arc <T>());

            // A queue of transitions to consider for the next level.
            IList <FST.Arc <T> > thisLevelQueue = new List <FST.Arc <T> >();

            // A queue of transitions to consider when processing the next level.
            IList <FST.Arc <T> > nextLevelQueue = new List <FST.Arc <T> >();

            nextLevelQueue.Add(startArc);
            //System.out.println("toDot: startArc: " + startArc);

            // A list of states on the same level (for ranking).
            IList <int?> sameLevelStates = new List <int?>();

            // A bitset of already seen states (target offset).
            BitArray seen = new BitArray(32);

            seen.SafeSet((int)startArc.Target, true);

            // Shape for states.
            const string stateShape      = "circle";
            const string finalStateShape = "doublecircle";

            // Emit DOT prologue.
            @out.Write("digraph FST {\n");
            @out.Write("  rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");

            if (!labelStates)
            {
                @out.Write("  node [shape=circle, width=.2, height=.2, style=filled]\n");
            }

            EmitDotState(@out, "initial", "point", "white", "");

            T   NO_OUTPUT = fst.Outputs.NoOutput;
            var r         = fst.GetBytesReader();

            // final FST.Arc<T> scratchArc = new FST.Arc<>();

            {
                string stateColor;
                if (fst.IsExpandedTarget(startArc, r))
                {
                    stateColor = expandedNodeColor;
                }
                else
                {
                    stateColor = null;
                }

                bool isFinal;
                T    finalOutput;
                if (startArc.IsFinal)
                {
                    isFinal     = true;
                    finalOutput = startArc.NextFinalOutput.Equals(NO_OUTPUT) ? default(T) : startArc.NextFinalOutput;
                }
                else
                {
                    isFinal     = false;
                    finalOutput = default(T);
                }

                EmitDotState(@out, Convert.ToString(startArc.Target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.Outputs.OutputToString(finalOutput));
            }

            @out.Write("  initial -> " + startArc.Target + "\n");

            int level = 0;

            while (nextLevelQueue.Count > 0)
            {
                // we could double buffer here, but it doesn't matter probably.
                //System.out.println("next level=" + level);
                thisLevelQueue.AddRange(nextLevelQueue);
                nextLevelQueue.Clear();

                level++;
                @out.Write("\n  // Transitions and states at level: " + level + "\n");
                while (thisLevelQueue.Count > 0)
                {
                    FST.Arc <T> arc = thisLevelQueue[thisLevelQueue.Count - 1];
                    thisLevelQueue.RemoveAt(thisLevelQueue.Count - 1);
                    //System.out.println("  pop: " + arc);
                    if (FST <T> .TargetHasArcs(arc))
                    {
                        // scan all target arcs
                        //System.out.println("  readFirstTarget...");

                        long node = arc.Target;

                        fst.ReadFirstRealTargetArc(arc.Target, arc, r);

                        //System.out.println("    firstTarget: " + arc);

                        while (true)
                        {
                            //System.out.println("  cycle arc=" + arc);
                            // Emit the unseen state and add it to the queue for the next level.
                            if (arc.Target >= 0 && !seen.SafeGet((int)arc.Target))
                            {
                                /*
                                 * boolean isFinal = false;
                                 * T finalOutput = null;
                                 * fst.readFirstTargetArc(arc, scratchArc);
                                 * if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) {
                                 * // target is final
                                 * isFinal = true;
                                 * finalOutput = scratchArc.output == NO_OUTPUT ? null : scratchArc.output;
                                 * System.out.println("dot hit final label=" + (char) scratchArc.label);
                                 * }
                                 */
                                string stateColor;
                                if (fst.IsExpandedTarget(arc, r))
                                {
                                    stateColor = expandedNodeColor;
                                }
                                else
                                {
                                    stateColor = null;
                                }

                                string finalOutput;
                                if (arc.NextFinalOutput != null && !arc.NextFinalOutput.Equals(NO_OUTPUT))
                                {
                                    finalOutput = fst.Outputs.OutputToString(arc.NextFinalOutput);
                                }
                                else
                                {
                                    finalOutput = "";
                                }

                                EmitDotState(@out, Convert.ToString(arc.Target), stateShape, stateColor, finalOutput);
                                // To see the node address, use this instead:
                                //emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target));
                                seen.SafeSet((int)arc.Target, true);
                                nextLevelQueue.Add((new FST.Arc <T>()).CopyFrom(arc));
                                sameLevelStates.Add((int)arc.Target);
                            }

                            string outs;
                            if (!arc.Output.Equals(NO_OUTPUT))
                            {
                                outs = "/" + fst.Outputs.OutputToString(arc.Output);
                            }
                            else
                            {
                                outs = "";
                            }

                            if (!FST <T> .TargetHasArcs(arc) && arc.IsFinal && !arc.NextFinalOutput.Equals(NO_OUTPUT))
                            {
                                // Tricky special case: sometimes, due to
                                // pruning, the builder can [sillily] produce
                                // an FST with an arc into the final end state
                                // (-1) but also with a next final output; in
                                // this case we pull that output up onto this
                                // arc
                                outs = outs + "/[" + fst.Outputs.OutputToString(arc.NextFinalOutput) + "]";
                            }

                            string arcColor;
                            if (arc.Flag(FST.BIT_TARGET_NEXT))
                            {
                                arcColor = "red";
                            }
                            else
                            {
                                arcColor = "black";
                            }

                            Debug.Assert(arc.Label != FST.END_LABEL);
                            @out.Write("  " + node + " -> " + arc.Target + " [label=\"" + PrintableLabel(arc.Label) + outs + "\"" + (arc.IsFinal ? " style=\"bold\"" : "") + " color=\"" + arcColor + "\"]\n");

                            // Break the loop if we're on the last arc of this state.
                            if (arc.IsLast)
                            {
                                //System.out.println("    break");
                                break;
                            }
                            fst.ReadNextRealArc(arc, r);
                        }
                    }
                }

                // Emit state ranking information.
                if (sameRank && sameLevelStates.Count > 1)
                {
                    @out.Write("  {rank=same; ");
                    foreach (int state in sameLevelStates)
                    {
                        @out.Write(state + "; ");
                    }
                    @out.Write(" }\n");
                }
                sameLevelStates.Clear();
            }

            // Emit terminating state (always there anyway).
            @out.Write("  -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n");
            @out.Write("  {rank=sink; -1 }\n");

            @out.Write("}\n");
            @out.Flush();
        }
Exemplo n.º 11
0
            public virtual TopResults <T> Search()
            {
                IList <Result <T> > results = new List <Result <T> >();

                //System.out.println("search topN=" + topN);

                var fstReader = fst.GetBytesReader();
                T   NO_OUTPUT = fst.Outputs.NoOutput;

                // TODO: we could enable FST to sorting arcs by weight
                // as it freezes... can easily do this on first pass
                // (w/o requiring rewrite)

                // TODO: maybe we should make an FST.INPUT_TYPE.BYTE0.5!?
                // (nibbles)
                int rejectCount = 0;

                // For each top N path:
                while (results.Count < topN)
                {
                    //System.out.println("\nfind next path: queue.size=" + queue.size());

                    FSTPath <T> path;

                    if (queue == null)
                    {
                        // Ran out of paths
                        //System.out.println("  break queue=null");
                        break;
                    }

                    // Remove top path since we are now going to
                    // pursue it:

                    // LUCENENET NOTE: SortedSet doesn't have atomic operations,
                    // so we need to add some thread safety just in case.
                    // Perhaps it might make sense to wrap SortedSet into a type
                    // that provides thread safety.
                    lock (syncLock)
                    {
                        path = queue.Min;
                        if (path != null)
                        {
                            queue.Remove(path);
                        }
                    }

                    if (path == null)
                    {
                        // There were less than topN paths available:
                        //System.out.println("  break no more paths");
                        break;
                    }

                    if (path.Arc.Label == FST.END_LABEL)
                    {
                        //System.out.println("    empty string!  cost=" + path.cost);
                        // Empty string!
                        path.Input.Length--;
                        results.Add(new Result <T>(path.Input, path.Cost));
                        continue;
                    }

                    if (results.Count == topN - 1 && maxQueueDepth == topN)
                    {
                        // Last path -- don't bother w/ queue anymore:
                        queue = null;
                    }

                    //System.out.println("  path: " + path);

                    // We take path and find its "0 output completion",
                    // ie, just keep traversing the first arc with
                    // NO_OUTPUT that we can find, since this must lead
                    // to the minimum path that completes from
                    // path.arc.

                    // For each input letter:
                    while (true)
                    {
                        //System.out.println("\n    cycle path: " + path);
                        fst.ReadFirstTargetArc(path.Arc, path.Arc, fstReader);

                        // For each arc leaving this node:
                        bool foundZero = false;
                        while (true)
                        {
                            //System.out.println("      arc=" + (char) path.arc.label + " cost=" + path.arc.output);
                            // tricky: instead of comparing output == 0, we must
                            // express it via the comparer compare(output, 0) == 0
                            if (comparer.Compare(NO_OUTPUT, path.Arc.Output) == 0)
                            {
                                if (queue == null)
                                {
                                    foundZero = true;
                                    break;
                                }
                                else if (!foundZero)
                                {
                                    scratchArc.CopyFrom(path.Arc);
                                    foundZero = true;
                                }
                                else
                                {
                                    AddIfCompetitive(path);
                                }
                            }
                            else if (queue != null)
                            {
                                AddIfCompetitive(path);
                            }
                            if (path.Arc.IsLast)
                            {
                                break;
                            }
                            fst.ReadNextArc(path.Arc, fstReader);
                        }

                        Debug.Assert(foundZero);

                        if (queue != null)
                        {
                            // TODO: maybe we can save this copyFrom if we
                            // are more clever above... eg on finding the
                            // first NO_OUTPUT arc we'd switch to using
                            // scratchArc
                            path.Arc.CopyFrom(scratchArc);
                        }

                        if (path.Arc.Label == FST.END_LABEL)
                        {
                            // Add final output:
                            //Debug.WriteLine("    done!: " + path);
                            T finalOutput = fst.Outputs.Add(path.Cost, path.Arc.Output);
                            if (AcceptResult(path.Input, finalOutput))
                            {
                                //Debug.WriteLine("    add result: " + path);
                                results.Add(new Result <T>(path.Input, finalOutput));
                            }
                            else
                            {
                                rejectCount++;
                            }
                            break;
                        }
                        else
                        {
                            path.Input.Grow(1 + path.Input.Length);
                            path.Input.Int32s[path.Input.Length] = path.Arc.Label;
                            path.Input.Length++;
                            path.Cost = fst.Outputs.Add(path.Cost, path.Arc.Output);
                        }
                    }
                }
                return(new TopResults <T>(rejectCount + topN <= maxQueueDepth, results));
            }
Exemplo n.º 12
0
        public override IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, bool onlyMorePopular, int num)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(num > 0);
            }

            if (onlyMorePopular)
            {
                throw new ArgumentException("this suggester only works with onlyMorePopular=false");
            }
            if (contexts != null)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            if (fst == null)
            {
                return(Collections.EmptyList <LookupResult>());
            }

            //System.out.println("lookup key=" + key + " num=" + num);
            for (var i = 0; i < key.Length; i++)
            {
                if (key[i] == 0x1E)
                {
                    throw new ArgumentException(
                              "lookup key cannot contain HOLE character U+001E; this character is reserved");
                }
                if (key[i] == 0x1F)
                {
                    throw new ArgumentException(
                              "lookup key cannot contain unit separator character U+001F; this character is reserved");
                }
            }

            var utf8Key = new BytesRef(key);

            try
            {
                Automaton lookupAutomaton = ToLookupAutomaton(key);

                var spare = new CharsRef();

                //System.out.println("  now intersect exactFirst=" + exactFirst);

                // Intersect automaton w/ suggest wFST and get all
                // prefix starting nodes & their outputs:
                //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

                //System.out.println("  prefixPaths: " + prefixPaths.size());

                FST.BytesReader bytesReader = fst.GetBytesReader();

                var scratchArc = new FST.Arc <PairOutputs <long?, BytesRef> .Pair>();

                IList <LookupResult> results = new List <LookupResult>();

                IList <FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> > prefixPaths =
                    FSTUtil.IntersectPrefixPaths(ConvertAutomaton(lookupAutomaton), fst);

                if (exactFirst)
                {
                    int count = 0;
                    foreach (FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> path in prefixPaths)
                    {
                        if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                        {
                            // This node has END_BYTE arc leaving, meaning it's an
                            // "exact" match:
                            count++;
                        }
                    }

                    // Searcher just to find the single exact only
                    // match, if present:
                    Util.Fst.Util.TopNSearcher <PairOutputs <long?, BytesRef> .Pair> searcher;
                    searcher = new Util.Fst.Util.TopNSearcher <PairOutputs <long?, BytesRef> .Pair>(fst, count * maxSurfaceFormsPerAnalyzedForm,
                                                                                                    count * maxSurfaceFormsPerAnalyzedForm, weightComparer);

                    // NOTE: we could almost get away with only using
                    // the first start node.  The only catch is if
                    // maxSurfaceFormsPerAnalyzedForm had kicked in and
                    // pruned our exact match from one of these nodes
                    // ...:
                    foreach (var path in prefixPaths)
                    {
                        if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                        {
                            // This node has END_BYTE arc leaving, meaning it's an
                            // "exact" match:
                            searcher.AddStartPaths(scratchArc, fst.Outputs.Add(path.Output, scratchArc.Output), false,
                                                   path.Input);
                        }
                    }

                    var completions = searcher.Search();
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(completions.IsComplete);
                    }

                    // NOTE: this is rather inefficient: we enumerate
                    // every matching "exactly the same analyzed form"
                    // path, and then do linear scan to see if one of
                    // these exactly matches the input.  It should be
                    // possible (though hairy) to do something similar
                    // to getByOutput, since the surface form is encoded
                    // into the FST output, so we more efficiently hone
                    // in on the exact surface-form match.  Still, I
                    // suspect very little time is spent in this linear
                    // seach: it's bounded by how many prefix start
                    // nodes we have and the
                    // maxSurfaceFormsPerAnalyzedForm:
                    foreach (var completion in completions)
                    {
                        BytesRef output2 = completion.Output.Output2;
                        if (SameSurfaceForm(utf8Key, output2))
                        {
                            results.Add(GetLookupResult(completion.Output.Output1, output2, spare));
                            break;
                        }
                    }

                    if (results.Count == num)
                    {
                        // That was quick:
                        return(results);
                    }
                }

                Util.Fst.Util.TopNSearcher <PairOutputs <long?, BytesRef> .Pair> searcher2;
                searcher2 = new TopNSearcherAnonymousInnerClassHelper(this, fst, num - results.Count,
                                                                      num * maxAnalyzedPathsForOneInput, weightComparer, utf8Key, results);

                prefixPaths = GetFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

                foreach (FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> path in prefixPaths)
                {
                    searcher2.AddStartPaths(path.FstNode, path.Output, true, path.Input);
                }

                var completions2 = searcher2.Search();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(completions2.IsComplete);
                }

                foreach (Util.Fst.Util.Result <PairOutputs <long?, BytesRef> .Pair> completion in completions2)
                {
                    LookupResult result = GetLookupResult(completion.Output.Output1, completion.Output.Output2, spare);

                    // TODO: for fuzzy case would be nice to return
                    // how many edits were required

                    //System.out.println("    result=" + result);
                    results.Add(result);

                    if (results.Count == num)
                    {
                        // In the exactFirst=true case the search may
                        // produce one extra path
                        break;
                    }
                }

                return(results);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }
        }
 internal FSTTermsEnum(FST <long?> fst)
 {
     this.fst    = fst;
     @in         = new BytesRefFSTEnum <long?>(fst);
     bytesReader = fst.GetBytesReader();
 }
Exemplo n.º 14
0
        // TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND /
        // SEEK_END)?  saves the eq check above?

        /// <summary>
        /// Seeks to smallest term that's &gt;= target. </summary>
        protected virtual void DoSeekCeil()
        {
            //System.out.println("    advance len=" + target.length + " curlen=" + current.length);

            // TODO: possibly caller could/should provide common
            // prefix length?  ie this work may be redundant if
            // caller is in fact intersecting against its own
            // automaton

            //System.out.println("FE.seekCeil upto=" + upto);

            // Save time by starting at the end of the shared prefix
            // b/w our current term & the target:
            RewindPrefix();
            //System.out.println("  after rewind upto=" + upto);

            FST.Arc <T> arc         = GetArc(m_upto);
            int         targetLabel = TargetLabel;

            //System.out.println("  init targetLabel=" + targetLabel);

            // Now scan forward, matching the new suffix of the target
            while (true)
            {
                //System.out.println("  cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") vs targetLabel=" + targetLabel);

                if (arc.BytesPerArc != 0 && arc.Label != -1)
                {
                    // Arcs are fixed array -- use binary search to find
                    // the target.

                    FST.BytesReader @in  = m_fst.GetBytesReader();
                    int             low  = arc.ArcIdx;
                    int             high = arc.NumArcs - 1;
                    int             mid  = 0;
                    //System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" + targetLabel);
                    bool found = false;
                    while (low <= high)
                    {
                        mid          = (int)((uint)(low + high) >> 1);
                        @in.Position = arc.PosArcsStart;
                        @in.SkipBytes(arc.BytesPerArc * mid + 1);
                        int midLabel = m_fst.ReadLabel(@in);
                        int cmp      = midLabel - targetLabel;
                        //System.out.println("  cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
                        if (cmp < 0)
                        {
                            low = mid + 1;
                        }
                        else if (cmp > 0)
                        {
                            high = mid - 1;
                        }
                        else
                        {
                            found = true;
                            break;
                        }
                    }

                    // NOTE: this code is dup'd w/ the code below (in
                    // the outer else clause):
                    if (found)
                    {
                        // Match
                        arc.ArcIdx = mid - 1;
                        m_fst.ReadNextRealArc(arc, @in);
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(arc.ArcIdx == mid);
                            Debugging.Assert(arc.Label == targetLabel, "arc.label={0} vs targetLabel={1} mid={2}", arc.Label, targetLabel, mid);
                        }
                        m_output[m_upto] = m_fst.Outputs.Add(m_output[m_upto - 1], arc.Output);
                        if (targetLabel == FST.END_LABEL)
                        {
                            return;
                        }
                        CurrentLabel = arc.Label;
                        Incr();
                        arc         = m_fst.ReadFirstTargetArc(arc, GetArc(m_upto), m_fstReader);
                        targetLabel = TargetLabel;
                        continue;
                    }
                    else if (low == arc.NumArcs)
                    {
                        // Dead end
                        arc.ArcIdx = arc.NumArcs - 2;
                        m_fst.ReadNextRealArc(arc, @in);
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(arc.IsLast);
                        }
                        // Dead end (target is after the last arc);
                        // rollback to last fork then push
                        m_upto--;
                        while (true)
                        {
                            if (m_upto == 0)
                            {
                                return;
                            }
                            FST.Arc <T> prevArc = GetArc(m_upto);
                            //System.out.println("  rollback upto=" + upto + " arc.label=" + prevArc.label + " isLast?=" + prevArc.isLast());
                            if (!prevArc.IsLast)
                            {
                                m_fst.ReadNextArc(prevArc, m_fstReader);
                                PushFirst();
                                return;
                            }
                            m_upto--;
                        }
                    }
                    else
                    {
                        arc.ArcIdx = (low > high ? low : high) - 1;
                        m_fst.ReadNextRealArc(arc, @in);
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(arc.Label > targetLabel);
                        }
                        PushFirst();
                        return;
                    }
                }
                else
                {
                    // Arcs are not array'd -- must do linear scan:
                    if (arc.Label == targetLabel)
                    {
                        // recurse
                        m_output[m_upto] = m_fst.Outputs.Add(m_output[m_upto - 1], arc.Output);
                        if (targetLabel == FST.END_LABEL)
                        {
                            return;
                        }
                        CurrentLabel = arc.Label;
                        Incr();
                        arc         = m_fst.ReadFirstTargetArc(arc, GetArc(m_upto), m_fstReader);
                        targetLabel = TargetLabel;
                    }
                    else if (arc.Label > targetLabel)
                    {
                        PushFirst();
                        return;
                    }
                    else if (arc.IsLast)
                    {
                        // Dead end (target is after the last arc);
                        // rollback to last fork then push
                        m_upto--;
                        while (true)
                        {
                            if (m_upto == 0)
                            {
                                return;
                            }
                            FST.Arc <T> prevArc = GetArc(m_upto);
                            //System.out.println("  rollback upto=" + upto + " arc.label=" + prevArc.label + " isLast?=" + prevArc.isLast());
                            if (!prevArc.IsLast)
                            {
                                m_fst.ReadNextArc(prevArc, m_fstReader);
                                PushFirst();
                                return;
                            }
                            m_upto--;
                        }
                    }
                    else
                    {
                        // keep scanning
                        //System.out.println("    next scan");
                        m_fst.ReadNextArc(arc, m_fstReader);
                    }
                }
            }
        }
Exemplo n.º 15
0
 internal FSTTermsEnum(FST <Int64> fst)
 {
     this.fst    = fst;
     input       = new BytesRefFSTEnum <Int64>(fst);
     bytesReader = fst.GetBytesReader();
 }
Exemplo n.º 16
0
        /// <summary>
        /// Retrieve suggestions.
        /// </summary>
        public virtual IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, int num)
        {
            if (contexts != null)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString());

            try
            {
                ITermToBytesRefAttribute    termBytesAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                IOffsetAttribute            offsetAtt    = ts.AddAttribute <IOffsetAttribute>();
                IPositionLengthAttribute    posLenAtt    = ts.AddAttribute <IPositionLengthAttribute>();
                IPositionIncrementAttribute posIncAtt    = ts.AddAttribute <IPositionIncrementAttribute>();
                ts.Reset();

                var lastTokens = new BytesRef[grams];
                //System.out.println("lookup: key='" + key + "'");

                // Run full analysis, but save only the
                // last 1gram, last 2gram, etc.:
                BytesRef tokenBytes   = termBytesAtt.BytesRef;
                int      maxEndOffset = -1;
                bool     sawRealToken = false;
                while (ts.IncrementToken())
                {
                    termBytesAtt.FillBytesRef();
                    sawRealToken |= tokenBytes.Length > 0;
                    // TODO: this is somewhat iffy; today, ShingleFilter
                    // sets posLen to the gram count; maybe we should make
                    // a separate dedicated att for this?
                    int gramCount = posLenAtt.PositionLength;

                    Debug.Assert(gramCount <= grams);

                    // Safety: make sure the recalculated count "agrees":
                    if (CountGrams(tokenBytes) != gramCount)
                    {
                        throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
                    }
                    maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
                    lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
                }
                ts.End();

                if (!sawRealToken)
                {
                    throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
                }

                // Carefully fill last tokens with _ tokens;
                // ShingleFilter appraently won't emit "only hole"
                // tokens:
                int endPosInc = posIncAtt.PositionIncrement;

                // Note this will also be true if input is the empty
                // string (in which case we saw no tokens and
                // maxEndOffset is still -1), which in fact works out OK
                // because we fill the unigram with an empty BytesRef
                // below:
                bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0;
                //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset);

                if (lastTokenEnded)
                {
                    //System.out.println("  lastTokenEnded");
                    // If user hit space after the last token, then
                    // "upgrade" all tokens.  This way "foo " will suggest
                    // all bigrams starting w/ foo, and not any unigrams
                    // starting with "foo":
                    for (int i = grams - 1; i > 0; i--)
                    {
                        BytesRef token = lastTokens[i - 1];
                        if (token == null)
                        {
                            continue;
                        }
                        token.Grow(token.Length + 1);
                        token.Bytes[token.Length] = separator;
                        token.Length++;
                        lastTokens[i] = token;
                    }
                    lastTokens[0] = new BytesRef();
                }

                var arc = new FST.Arc <long?>();

                var bytesReader = fst.GetBytesReader();

                // Try highest order models first, and if they return
                // results, return that; else, fallback:
                double backoff = 1.0;

                List <LookupResult> results = new List <LookupResult>(num);

                // We only add a given suffix once, from the highest
                // order model that saw it; for subsequent lower order
                // models we skip it:
                var seen = new HashSet <BytesRef>();

                for (int gram = grams - 1; gram >= 0; gram--)
                {
                    BytesRef token = lastTokens[gram];
                    // Don't make unigram predictions from empty string:
                    if (token == null || (token.Length == 0 && key.Length > 0))
                    {
                        // Input didn't have enough tokens:
                        //System.out.println("  gram=" + gram + ": skip: not enough input");
                        continue;
                    }

                    if (endPosInc > 0 && gram <= endPosInc)
                    {
                        // Skip hole-only predictions; in theory we
                        // shouldn't have to do this, but we'd need to fix
                        // ShingleFilter to produce only-hole tokens:
                        //System.out.println("  break: only holes now");
                        break;
                    }

                    //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

                    // TODO: we could add fuzziness here
                    // match the prefix portion exactly
                    //Pair<Long,BytesRef> prefixOutput = null;
                    long?prefixOutput = null;
                    try
                    {
                        prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }
                    //System.out.println("  prefixOutput=" + prefixOutput);

                    if (prefixOutput == null)
                    {
                        // This model never saw this prefix, e.g. the
                        // trigram model never saw context "purple mushroom"
                        backoff *= ALPHA;
                        continue;
                    }

                    // TODO: we could do this division at build time, and
                    // bake it into the FST?

                    // Denominator for computing scores from current
                    // model's predictions:
                    long contextCount = totTokens;

                    BytesRef lastTokenFragment = null;

                    for (int i = token.Length - 1; i >= 0; i--)
                    {
                        if (token.Bytes[token.Offset + i] == separator)
                        {
                            BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                            long?    output  = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef()));
                            Debug.Assert(output != null);
                            contextCount      = DecodeWeight(output);
                            lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                            break;
                        }
                    }

                    BytesRef finalLastToken;

                    if (lastTokenFragment == null)
                    {
                        finalLastToken = BytesRef.DeepCopyOf(token);
                    }
                    else
                    {
                        finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
                    }
                    Debug.Assert(finalLastToken.Offset == 0);

                    CharsRef spare = new CharsRef();

                    // complete top-N
                    Util.Fst.Util.TopResults <long?> completions = null;
                    try
                    {
                        // Because we store multiple models in one FST
                        // (1gram, 2gram, 3gram), we must restrict the
                        // search so that it only considers the current
                        // model.  For highest order model, this is not
                        // necessary since all completions in the FST
                        // must be from this model, but for lower order
                        // models we have to filter out the higher order
                        // ones:

                        // Must do num+seen.size() for queue depth because we may
                        // reject up to seen.size() paths in acceptResult():
                        Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken);

                        // since this search is initialized with a single start node
                        // it is okay to start with an empty input path here
                        searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef());

                        completions = searcher.Search();
                        Debug.Assert(completions.IsComplete);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }

                    int prefixLength = token.Length;

                    BytesRef suffix = new BytesRef(8);
                    //System.out.println("    " + completions.length + " completions");

                    foreach (Util.Fst.Util.Result <long?> completion in completions)
                    {
                        token.Length = prefixLength;
                        // append suffix
                        Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                        token.Append(suffix);

                        //System.out.println("    completion " + token.utf8ToString());

                        // Skip this path if a higher-order model already
                        // saw/predicted its last token:
                        BytesRef lastToken = token;
                        for (int i = token.Length - 1; i >= 0; i--)
                        {
                            if (token.Bytes[token.Offset + i] == separator)
                            {
                                Debug.Assert(token.Length - i - 1 > 0);
                                lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                                break;
                            }
                        }
                        if (seen.Contains(lastToken))
                        {
                            //System.out.println("      skip dup " + lastToken.utf8ToString());
                            goto nextCompletionContinue;
                        }
                        seen.Add(BytesRef.DeepCopyOf(lastToken));
                        spare.Grow(token.Length);
                        UnicodeUtil.UTF8toUTF16(token, spare);
                        LookupResult result = new LookupResult(spare.ToString(),
                                                               // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                                                               // return numbers that are greater than long.MaxValue, which results in a negative long number.
                                                               (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                        results.Add(result);
                        Debug.Assert(results.Count == seen.Count);
                        //System.out.println("  add result=" + result);
                        nextCompletionContinue :;
                    }
                    backoff *= ALPHA;
                }

                results.Sort(new ComparerAnonymousInnerClassHelper(this));

                if (results.Count > num)
                {
                    results.SubList(num, results.Count).Clear();
                }

                return(results);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
Exemplo n.º 17
0
        /// <summary>
        /// Enumerates all minimal prefix paths in the automaton that also intersect the <see cref="FST"/>,
        /// accumulating the <see cref="FST"/> end node and output for each path.
        /// </summary>
        public static IList <Path <T> > IntersectPrefixPaths <T>(Automaton a, FST <T> fst)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(a.IsDeterministic);
            }
            IList <Path <T> > queue    = new List <Path <T> >();
            List <Path <T> >  endNodes = new List <Path <T> >();

            queue.Add(new Path <T>(a.GetInitialState(), fst.GetFirstArc(new FST.Arc <T>()), fst.Outputs.NoOutput, new Int32sRef()));

            FST.Arc <T>     scratchArc = new FST.Arc <T>();
            FST.BytesReader fstReader  = fst.GetBytesReader();

            while (queue.Count != 0)
            {
                Path <T> path = queue[queue.Count - 1];
                queue.Remove(path);
                if (path.State.Accept)
                {
                    endNodes.Add(path);
                    // we can stop here if we accept this path,
                    // we accept all further paths too
                    continue;
                }

                Int32sRef currentInput = path.Input;
                foreach (Transition t in path.State.GetTransitions())
                {
                    int min = t.Min;
                    int max = t.Max;
                    if (min == max)
                    {
                        FST.Arc <T> nextArc = fst.FindTargetArc(t.Min, path.FstNode, scratchArc, fstReader);
                        if (nextArc != null)
                        {
                            Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                            newInput.CopyInt32s(currentInput);
                            newInput.Int32s[currentInput.Length] = t.Min;
                            newInput.Length = currentInput.Length + 1;
                            queue.Add(new Path <T>(t.Dest, new FST.Arc <T>()
                                                   .CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                        }
                    }
                    else
                    {
                        // TODO: if this transition's TO state is accepting, and
                        // it accepts the entire range possible in the FST (ie. 0 to 255),
                        // we can simply use the prefix as the accepted state instead of
                        // looking up all the ranges and terminate early
                        // here.  This just shifts the work from one queue
                        // (this one) to another (the completion search
                        // done in AnalyzingSuggester).

                        FST.Arc <T> nextArc = Lucene.Net.Util.Fst.Util.ReadCeilArc(min, fst, path.FstNode, scratchArc, fstReader);
                        while (nextArc != null && nextArc.Label <= max)
                        {
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(nextArc.Label <= max);
                            }
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(nextArc.Label >= min, () => nextArc.Label + " " + min);
                            }
                            Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                            newInput.CopyInt32s(currentInput);
                            newInput.Int32s[currentInput.Length] = nextArc.Label;
                            newInput.Length = currentInput.Length + 1;
                            queue.Add(new Path <T>(t.Dest, new FST.Arc <T>()
                                                   .CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                            int label = nextArc.Label; // used in assert
                            nextArc = nextArc.IsLast ? null : fst.ReadNextRealArc(nextArc, fstReader);
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(nextArc == null || label < nextArc.Label, () => "last: " + label + " next: " + (nextArc == null ? "" : nextArc.Label.ToString()));
                            }
                        }
                    }
                }
            }
            return(endNodes);
        }