Example #1
0
        /// <summary>
        /// Follows the bytes of <paramref name="scratch"/> through the FST, starting at
        /// the first arc (loaded into <paramref name="arc"/>), and sums the output of
        /// every arc traversed.
        /// </summary>
        /// <param name="scratch"> The bytes to follow; only the range
        ///          [Offset, Offset + Length) is read. </param>
        /// <param name="arc"> Caller-supplied arc; overwritten in place during the walk. </param>
        /// <returns> The summed output, or <c>null</c> as soon as one byte has no
        ///         matching arc. </returns>
        private long?LookupPrefix(BytesRef scratch, FST.Arc <long?> arc) //Bogus
        {
            if (Debugging.AssertsEnabled)
            {
                // The plain "start at zero and add" accumulation below relies on this.
                Debugging.Assert(0 == (long)fst.Outputs.NoOutput);
            }

            long sum    = 0;
            var  reader = fst.GetBytesReader();

            fst.GetFirstArc(arc);

            byte[] data   = scratch.Bytes;
            int    cursor = scratch.Offset;
            int    limit  = cursor + scratch.Length;

            for (; cursor < limit; cursor++)
            {
                // Mask to treat the byte as an unsigned label.
                if (fst.FindTargetArc(data[cursor] & 0xff, arc, arc, reader) == null)
                {
                    return null;
                }

                sum += (long)arc.Output;
            }

            return sum;
        }
Example #2
0
        /// <summary>
        /// Walks <paramref name="fst"/> along the labels of <paramref name="input"/>,
        /// combining the output of each arc taken.
        /// </summary>
        /// <returns> The combined output (including the last arc's final output) when the
        ///         last arc reached is final; otherwise <c>default(T)</c>, which is also
        ///         returned as soon as a label has no matching arc. </returns>
        public static T Get <T>(FST <T> fst, IntsRef input)
        {
            // TODO: would be nice not to alloc this on every lookup
            var current = fst.GetFirstArc(new FST <T> .Arc <T>());
            var reader  = fst.BytesReader;

            // Running combination of the outputs seen so far
            T accumulated = fst.Outputs.NoOutput;

            int position = 0;
            while (position < input.Length)
            {
                int label = input.Ints[input.Offset + position];
                if (fst.FindTargetArc(label, current, current, reader) == null)
                {
                    return default(T);
                }

                accumulated = fst.Outputs.Add(accumulated, current.Output);
                position++;
            }

            if (!current.Final)
            {
                return default(T);
            }

            return fst.Outputs.Add(accumulated, current.NextFinalOutput);
        }
Example #3
0
        // TODO: maybe a CharsRef version for BYTE2

        /// <summary>
        /// Walks <paramref name="fst"/> along the bytes of <paramref name="input"/>
        /// (each byte masked to an unsigned label), combining the output of each arc taken.
        /// </summary>
        /// <returns> The combined output (including the last arc's final output) when the
        ///         last arc reached is final; otherwise <c>default(T)</c>, which is also
        ///         returned as soon as a byte has no matching arc. </returns>
        public static T Get <T>(FST <T> fst, BytesRef input)
        {
            Debug.Assert(fst.InputType == FST.INPUT_TYPE.BYTE1);

            var reader = fst.GetBytesReader();

            // TODO: would be nice not to alloc this on every lookup
            var current = fst.GetFirstArc(new FST.Arc <T>());

            // Running combination of the outputs seen so far
            T accumulated = fst.Outputs.NoOutput;

            int position = 0;
            while (position < input.Length)
            {
                int label = input.Bytes[position + input.Offset] & 0xFF;
                if (fst.FindTargetArc(label, current, current, reader) == null)
                {
                    return default(T);
                }

                accumulated = fst.Outputs.Add(accumulated, current.Output);
                position++;
            }

            return current.IsFinal
                ? fst.Outputs.Add(accumulated, current.NextFinalOutput)
                : default(T);
        }
Example #4
0
        // runs the term, returning the output, or null if term
        // isn't accepted.  if prefixLength is non-null it must be
        // length 1 int array; prefixLength[0] is set to the length
        // of the term prefix that matches
        private static T Run(FST <T> fst, Int32sRef term, int[] prefixLength) // LUCENENET: CA1822: Mark members as static
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(prefixLength == null || prefixLength.Length == 1);
            }
            FST.Arc <T> arc       = fst.GetFirstArc(new FST.Arc <T>());
            T           NO_OUTPUT = fst.Outputs.NoOutput;
            T           output    = NO_OUTPUT;

            FST.BytesReader fstReader = fst.GetBytesReader();

            for (int i = 0; i <= term.Length; i++)
            {
                int label;
                if (i == term.Length)
                {
                    label = FST.END_LABEL;
                }
                else
                {
                    label = term.Int32s[term.Offset + i];
                }
                // System.out.println("   loop i=" + i + " label=" + label + " output=" + fst.Outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
                if (fst.FindTargetArc(label, arc, arc, fstReader) == null)
                {
                    // System.out.println("    not found");
                    if (prefixLength != null)
                    {
                        prefixLength[0] = i;
                        return(output);
                    }
                    else
                    {
                        return(default);
Example #5
0
        // TODO: this is pretty stupid, considering how the stemming algorithm works
        // we can speed it up to be significantly faster!

        /// <summary>
        /// Looks up the code points of <paramref name="word"/> in the range
        /// [<paramref name="offset"/>, <paramref name="offset"/> + <paramref name="length"/>)
        /// in <paramref name="fst"/>, followed by the <c>FST.END_LABEL</c> arc.
        /// </summary>
        /// <returns> The combined output of the arcs taken, or <c>null</c> when
        ///         <paramref name="fst"/> is <c>null</c> or any label (including the end
        ///         label) has no matching arc. </returns>
        /// <exception cref="Exception"> Wraps any <see cref="IOException"/> raised during
        ///         the walk, reusing its message. </exception>
        internal virtual IntsRef Lookup(FST <IntsRef> fst, char[] word, int offset, int length)
        {
            if (fst == null)
            {
                return null;
            }

            FST.BytesReader   reader  = fst.BytesReader;
            FST.Arc <IntsRef> current = fst.GetFirstArc(new FST.Arc <IntsRef>());

            // Arcs carrying the shared "no output" instance are skipped when accumulating.
            IntsRef NO_OUTPUT   = fst.Outputs.NoOutput;
            IntsRef accumulated = NO_OUTPUT;

            int limit = offset + length;

            try
            {
                int index = offset;
                while (index < limit)
                {
                    int codePoint = Character.CodePointAt(word, index, limit);
                    if (fst.FindTargetArc(codePoint, current, current, reader) == null)
                    {
                        return null;
                    }

                    if (current.Output != NO_OUTPUT)
                    {
                        accumulated = fst.Outputs.Add(accumulated, current.Output);
                    }

                    // Step by one or two chars depending on the code point just read.
                    index += Character.CharCount(codePoint);
                }

                // The word only counts as found if the end-label arc exists as well.
                if (fst.FindTargetArc(FST.END_LABEL, current, current, reader) == null)
                {
                    return null;
                }

                if (current.Output != NO_OUTPUT)
                {
                    return fst.Outputs.Add(accumulated, current.Output);
                }

                return accumulated;
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.Message, bogus);
            }
        }
Example #6
0
 /// <summary>
 /// Builds a table of the arcs leaving the FST's first arc for the labels
 /// 0x3040 through <c>cacheCeiling</c> inclusive.
 /// </summary>
 /// <returns> An array where slot <c>i</c> holds a copy of the arc for label
 ///         <c>0x3040 + i</c>, or is left <c>null</c> when no such arc was found. </returns>
 private FST.Arc <long?>[] CacheRootArcs()
 {
     const int firstLabel = 0x3040;

     var cache = new FST.Arc <long?> [1 + (cacheCeiling - firstLabel)];
     var root  = new FST.Arc <long?>();
     fst.GetFirstArc(root);

     // Reused as the lookup target; each hit is copied out into its own arc below.
     var             scratch = new FST.Arc <long?>();
     FST.BytesReader reader  = fst.GetBytesReader();

     // TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs)
     int slot = 0;
     while (slot < cache.Length)
     {
         if (fst.FindTargetArc(firstLabel + slot, root, scratch, reader) != null)
         {
             cache[slot] = new FST.Arc <long?>().CopyFrom(scratch);
         }
         slot++;
     }

     return cache;
 }
Example #7
0
        /// <summary>
        /// Looks up the output for this input, or <c>null</c> if the
        /// input is not accepted.
        /// </summary>
        public static T Get <T>(FST <T> fst, Int32sRef input) where T : class // LUCENENET specific - added class constraint, since we compare reference equality
        {
            // TODO: would be nice not to alloc this on every lookup
            var arc = fst.GetFirstArc(new FST.Arc <T>());

            var fstReader = fst.GetBytesReader();

            // Accumulate output as we go
            T output = fst.Outputs.NoOutput;

            for (int i = 0; i < input.Length; i++)
            {
                if (fst.FindTargetArc(input.Int32s[input.Offset + i], arc, arc, fstReader) is null)
                {
                    return(default);
Example #8
0
        /// <summary>
        /// Advances <paramref name="arc"/> through the automaton, one byte of
        /// <paramref name="utf8"/> at a time.
        /// </summary>
        /// <param name="arc">
        ///          The arc to start from. It is overwritten in place on every step. </param>
        /// <param name="utf8">
        ///          The bytes to follow; only the range [Offset, Offset + Length) is read. </param>
        /// <returns> <c>true</c> if every byte had a matching arc, in which case
        ///         <paramref name="arc"/> is the arc matched for the last byte of
        ///         <paramref name="utf8"/>; <c>false</c> as soon as a byte has no
        ///         matching arc. </returns>
        private bool DescendWithPrefix(FST.Arc <object> arc, BytesRef utf8)
        {
            int limit = utf8.Offset + utf8.Length;

            // Fetched per call instead of being kept in an instance field, since
            // multiple threads can use FSTCompletion at once...
            FST.BytesReader reader = automaton.BytesReader;

            int cursor = utf8.Offset;
            while (cursor < limit)
            {
                int label = utf8.Bytes[cursor] & 0xff;
                if (automaton.FindTargetArc(label, arc, arc, reader) == null)
                {
                    // Nothing continues this prefix.
                    return false;
                }
                cursor++;
            }

            return true;
        }
Example #9
0
        /// <summary>
        /// Seeks to exactly target term.
        /// <para/>
        /// After rewinding to the shared prefix, repeatedly looks up the current
        /// <c>TargetLabel</c> from the arc at <c>m_upto - 1</c> into the arc slot at
        /// <c>m_upto</c>, accumulating outputs in <c>m_output</c>, until either a label
        /// is missing or the end label has been matched.
        /// </summary>
        /// <returns> <c>true</c> once the arc for <c>FST.END_LABEL</c> has been matched;
        ///         <c>false</c> as soon as a target label has no matching arc. </returns>
        protected virtual bool DoSeekExact()
        {
            // TODO: possibly caller could/should provide common
            // prefix length?  ie this work may be redundant if
            // caller is in fact intersecting against its own
            // automaton

            //System.out.println("FE: seek exact upto=" + upto);

            // Save time by starting at the end of the shared prefix
            // b/w our current term & the target:
            RewindPrefix();

            //System.out.println("FE: after rewind upto=" + upto);
            // Start from the arc just before the current position.
            FST.Arc <T> arc         = GetArc(m_upto - 1);
            int         targetLabel = TargetLabel;

            FST.BytesReader fstReader = m_fst.GetBytesReader();

            while (true)
            {
                //System.out.println("  cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel));
                // Look up targetLabel leaving 'arc'; the result is written into the slot for m_upto.
                FST.Arc <T> nextArc = m_fst.FindTargetArc(targetLabel, arc, GetArc(m_upto), fstReader);
                if (nextArc == null)
                {
                    // short circuit
                    //upto--;
                    //upto = 0;
                    // No match: load the first target arc of 'arc' into the slot for m_upto
                    // before reporting failure.
                    // NOTE(review): presumably this leaves the enum positioned for a following next() - confirm.
                    m_fst.ReadFirstTargetArc(arc, GetArc(m_upto), fstReader);
                    //System.out.println("  no match upto=" + upto);
                    return(false);
                }
                // Match -- recurse:
                // Output at this depth = output at the previous depth combined with this arc's output.
                m_output[m_upto] = m_fst.Outputs.Add(m_output[m_upto - 1], nextArc.Output);
                if (targetLabel == FST.END_LABEL)
                {
                    //System.out.println("  return found; upto=" + upto + " output=" + output[upto] + " nextArc=" + nextArc.isLast());
                    return(true);
                }
                // Record the matched label, advance via Incr(), then re-read the next target label.
                CurrentLabel = targetLabel;
                Incr();
                targetLabel = TargetLabel;
                arc         = nextArc;
            }
        }
Example #10
0
        // TODO: this could be more efficient!

        /// <summary>
        /// Rewrites <paramref name="sb"/> in place: at each position, the longest run of
        /// characters that reaches a final arc in <paramref name="fst"/> is replaced by
        /// the output accumulated along that run.
        /// </summary>
        /// <param name="fst"> The mapping FST. </param>
        /// <param name="sb"> The text to rewrite; modified in place. </param>
        internal static void ApplyMappings(FST <CharsRef> fst, StringBuilder sb)
        {
            FST.BytesReader    reader    = fst.BytesReader;
            FST.Arc <CharsRef> rootArc   = fst.GetFirstArc(new FST.Arc <CharsRef>());
            CharsRef           NO_OUTPUT = fst.Outputs.NoOutput;

            // Reused for every starting position; reset from rootArc each time.
            FST.Arc <CharsRef> walker = new FST.Arc <CharsRef>();

            for (int start = 0; start < sb.Length; start++)
            {
                walker.CopyFrom(rootArc);

                CharsRef pending     = NO_OUTPUT;
                int      matchEnd    = -1;
                CharsRef matchOutput = null;

                for (int cursor = start; cursor < sb.Length; cursor++)
                {
                    char ch = sb[cursor];
                    if (fst.FindTargetArc(ch, walker, walker, reader) == null)
                    {
                        break;
                    }

                    pending = fst.Outputs.Add(pending, walker.Output);

                    // Keep going after a hit: a later final arc means a longer match.
                    if (walker.IsFinal)
                    {
                        matchOutput = fst.Outputs.Add(pending, walker.NextFinalOutput);
                        matchEnd    = cursor;
                    }
                }

                if (matchEnd < 0)
                {
                    continue;
                }

                sb.Remove(start, matchEnd + 1 - start);
                sb.Insert(start, matchOutput);

                // Together with the loop's own increment, this resumes scanning
                // right after the inserted text.
                start += (matchOutput.Length - 1);
            }
        }
Example #11
0
        // Feeds the labels of term, followed by END_LABEL, through fst and
        // returns the combined output.
        //
        // When prefixLength is null: returns default(T) as soon as a label has
        // no matching arc.
        //
        // When prefixLength is non-null it must be a length 1 int array;
        // prefixLength[0] receives the number of labels matched, and the
        // output accumulated over that prefix is returned even on a miss.
        private T Run(FST <T> fst, IntsRef term, int[] prefixLength)
        {
            Debug.Assert(prefixLength == null || prefixLength.Length == 1);

            FST <T> .Arc <T> current     = fst.GetFirstArc(new FST.Arc <T>());
            T                accumulated = fst.Outputs.NoOutput;
            FST.BytesReader  reader      = fst.BytesReader;

            // One extra iteration (i == term.Length) consumes the end label.
            for (int i = 0; i <= term.Length; i++)
            {
                int label = (i == term.Length)
                    ? FST <T> .END_LABEL
                    : term.Ints[term.Offset + i];

                if (fst.FindTargetArc(label, current, current, reader) == null)
                {
                    if (prefixLength == null)
                    {
                        return default(T);
                    }

                    prefixLength[0] = i;
                    return accumulated;
                }

                accumulated = fst.Outputs.Add(accumulated, current.Output);
            }

            if (prefixLength != null)
            {
                prefixLength[0] = term.Length;
            }

            return accumulated;
        }
Example #12
0
            /// <summary>
            /// Looks up the first <paramref name="bufferLen"/> chars of
            /// <paramref name="buffer"/> in the FST dictionary, one code point at a time.
            /// When <c>ignoreCase</c> is set, each code point is lower-cased with the
            /// invariant culture before the lookup.
            /// </summary>
            /// <param name="buffer"> The key characters. </param>
            /// <param name="bufferLen"> Number of chars of <paramref name="buffer"/> to use. </param>
            /// <param name="scratchArc"> Caller-supplied arc; overwritten during the walk. </param>
            /// <param name="fstReader"> Reader passed through to the FST arc lookups. </param>
            /// <returns> The value mapped to the key, or <c>null</c> if a code point has no
            ///         matching arc or the last arc reached is not final. </returns>
            public BytesRef Get(char[] buffer, int bufferLen, FST.Arc <BytesRef> scratchArc, FST.BytesReader fstReader)
            {
                BytesRef accumulated = fst.Outputs.NoOutput;

                fst.GetFirstArc(scratchArc);

                for (int upto = 0; upto < bufferLen;)
                {
                    int codePoint = Character.CodePointAt(buffer, upto, bufferLen);
                    int label     = ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint;

                    if (fst.FindTargetArc(label, scratchArc, scratchArc, fstReader) == null)
                    {
                        return null;
                    }

                    accumulated = fst.Outputs.Add(accumulated, scratchArc.Output);

                    // Advance by the width of the original (not lower-cased) code point.
                    upto += Character.CharCount(codePoint);
                }

                return scratchArc.IsFinal
                    ? fst.Outputs.Add(accumulated, scratchArc.NextFinalOutput)
                    : null;
            }
Example #13
0
        // NOTE: copied from WFSTCompletionLookup & tweaked

        /// <summary>
        /// Follows the bytes of <paramref name="scratch"/> through <paramref name="fst"/>,
        /// starting at the first arc (loaded into <paramref name="arc"/>), and combines
        /// the output of every arc traversed.
        /// </summary>
        /// <returns> The combined output, or <c>null</c> as soon as one byte has no
        ///         matching arc. </returns>
        private long?LookupPrefix(FST <long?> fst, FST.BytesReader bytesReader, BytesRef scratch, FST.Arc <long?> arc)
        {
            long?accumulated = fst.Outputs.NoOutput;

            fst.GetFirstArc(arc);

            var data   = scratch.Bytes;
            var cursor = scratch.Offset;
            var limit  = cursor + scratch.Length;

            for (; cursor < limit; cursor++)
            {
                // Mask to treat the byte as an unsigned label.
                if (fst.FindTargetArc(data[cursor] & 0xff, arc, arc, bytesReader) == null)
                {
                    return null;
                }

                accumulated = fst.Outputs.Add(accumulated, arc.Output);
            }

            return accumulated;
        }
Example #14
0
        /// <summary>
        /// Looks up at most <paramref name="num"/> suggestions for <paramref name="key"/>.
        /// <para/>
        /// The key is turned into a lookup automaton which is intersected with the
        /// suggest FST to find prefix start nodes. When <c>exactFirst</c> is set, a first
        /// search looks for a completion whose surface form equals the key and adds it to
        /// the results before the general top-N search runs.
        /// </summary>
        /// <param name="key"> The lookup key. Must not contain U+001E or U+001F. </param>
        /// <param name="contexts"> Must be <c>null</c>; contexts are not supported. </param>
        /// <param name="onlyMorePopular"> Must be <c>false</c>. </param>
        /// <param name="num"> Maximum number of results; asserted to be greater than zero. </param>
        /// <returns> The collected results; an empty list when no FST is loaded. </returns>
        /// <exception cref="ArgumentException"> If <paramref name="onlyMorePopular"/> is
        ///         <c>true</c>, <paramref name="contexts"/> is non-null, or
        ///         <paramref name="key"/> contains a reserved character. </exception>
        /// <exception cref="Exception"> Wraps any <see cref="IOException"/> raised during
        ///         the lookup. </exception>
        public override IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, bool onlyMorePopular, int num)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(num > 0);
            }

            if (onlyMorePopular)
            {
                throw new ArgumentException("this suggester only works with onlyMorePopular=false");
            }
            if (contexts != null)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            // No FST available: nothing can match.
            if (fst == null)
            {
                return(Collections.EmptyList <LookupResult>());
            }

            //System.out.println("lookup key=" + key + " num=" + num);
            // Reject keys that contain either of the two reserved characters.
            for (var i = 0; i < key.Length; i++)
            {
                if (key[i] == 0x1E)
                {
                    throw new ArgumentException(
                              "lookup key cannot contain HOLE character U+001E; this character is reserved");
                }
                if (key[i] == 0x1F)
                {
                    throw new ArgumentException(
                              "lookup key cannot contain unit separator character U+001F; this character is reserved");
                }
            }

            // Used below to compare candidate surface forms against the key.
            var utf8Key = new BytesRef(key);

            try
            {
                Automaton lookupAutomaton = ToLookupAutomaton(key);

                // Scratch buffer handed to GetLookupResult for every result built.
                var spare = new CharsRef();

                //System.out.println("  now intersect exactFirst=" + exactFirst);

                // Intersect automaton w/ suggest wFST and get all
                // prefix starting nodes & their outputs:
                //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

                //System.out.println("  prefixPaths: " + prefixPaths.size());

                FST.BytesReader bytesReader = fst.GetBytesReader();

                var scratchArc = new FST.Arc <PairOutputs <long?, BytesRef> .Pair>();

                IList <LookupResult> results = new List <LookupResult>();

                IList <FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> > prefixPaths =
                    FSTUtil.IntersectPrefixPaths(ConvertAutomaton(lookupAutomaton), fst);

                if (exactFirst)
                {
                    // First pass: count the prefix paths that have an END_BYTE arc, so the
                    // exact-match searcher below can be sized accordingly.
                    int count = 0;
                    foreach (FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> path in prefixPaths)
                    {
                        if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                        {
                            // This node has END_BYTE arc leaving, meaning it's an
                            // "exact" match:
                            count++;
                        }
                    }

                    // Searcher just to find the single exact only
                    // match, if present:
                    Util.Fst.Util.TopNSearcher <PairOutputs <long?, BytesRef> .Pair> searcher;
                    searcher = new Util.Fst.Util.TopNSearcher <PairOutputs <long?, BytesRef> .Pair>(fst, count * maxSurfaceFormsPerAnalyzedForm,
                                                                                                    count * maxSurfaceFormsPerAnalyzedForm, weightComparer);

                    // NOTE: we could almost get away with only using
                    // the first start node.  The only catch is if
                    // maxSurfaceFormsPerAnalyzedForm had kicked in and
                    // pruned our exact match from one of these nodes
                    // ...:
                    foreach (var path in prefixPaths)
                    {
                        if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                        {
                            // This node has END_BYTE arc leaving, meaning it's an
                            // "exact" match:
                            searcher.AddStartPaths(scratchArc, fst.Outputs.Add(path.Output, scratchArc.Output), false,
                                                   path.Input);
                        }
                    }

                    var completions = searcher.Search();
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(completions.IsComplete);
                    }

                    // NOTE: this is rather inefficient: we enumerate
                    // every matching "exactly the same analyzed form"
                    // path, and then do linear scan to see if one of
                    // these exactly matches the input.  It should be
                    // possible (though hairy) to do something similar
                    // to getByOutput, since the surface form is encoded
                    // into the FST output, so we more efficiently hone
                    // in on the exact surface-form match.  Still, I
                    // suspect very little time is spent in this linear
                    // search: it's bounded by how many prefix start
                    // nodes we have and the
                    // maxSurfaceFormsPerAnalyzedForm:
                    foreach (var completion in completions)
                    {
                        BytesRef output2 = completion.Output.Output2;
                        if (SameSurfaceForm(utf8Key, output2))
                        {
                            // Only the first completion with the same surface form is kept.
                            results.Add(GetLookupResult(completion.Output.Output1, output2, spare));
                            break;
                        }
                    }

                    if (results.Count == num)
                    {
                        // That was quick:
                        return(results);
                    }
                }

                // General search: asks for the remaining (num - results.Count) results.
                // utf8Key and results are handed to the searcher subclass as well.
                Util.Fst.Util.TopNSearcher <PairOutputs <long?, BytesRef> .Pair> searcher2;
                searcher2 = new TopNSearcherAnonymousInnerClassHelper(this, fst, num - results.Count,
                                                                      num * maxAnalyzedPathsForOneInput, weightComparer, utf8Key, results);

                prefixPaths = GetFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

                foreach (FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> path in prefixPaths)
                {
                    searcher2.AddStartPaths(path.FstNode, path.Output, true, path.Input);
                }

                var completions2 = searcher2.Search();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(completions2.IsComplete);
                }

                foreach (Util.Fst.Util.Result <PairOutputs <long?, BytesRef> .Pair> completion in completions2)
                {
                    LookupResult result = GetLookupResult(completion.Output.Output1, completion.Output.Output2, spare);

                    // TODO: for fuzzy case would be nice to return
                    // how many edits were required

                    //System.out.println("    result=" + result);
                    results.Add(result);

                    if (results.Count == num)
                    {
                        // In the exactFirst=true case the search may
                        // produce one extra path
                        break;
                    }
                }

                return(results);
            }
            catch (IOException bogus)
            {
                // Rethrown as a general exception, keeping the original as the inner exception.
                throw new Exception(bogus.ToString(), bogus);
            }
        }
Example #15
0
        /// <summary>
        /// Returns the next output character.
        /// <para/>
        /// If a replacement is pending, its next char is returned. Otherwise the method
        /// tries to match the input at <c>inputOff</c> against the mapping FST, keeping the
        /// longest match. On a match it advances past the matched input, records offset
        /// corrections when the replacement length differs from the matched length, and
        /// loops to start emitting the replacement. With no match it returns the raw
        /// value from the buffer at <c>inputOff</c>.
        /// </summary>
        /// <returns> The next char as an int, or whatever <c>buffer.Get</c> returns when
        ///         nothing matches (<c>-1</c> is passed through without advancing;
        ///         presumably this signals end of input - confirm against the buffer type). </returns>
        public override int Read()
        {
            //System.out.println("\nread");
            while (true)
            {
                // Drain any pending replacement text first.
                if (replacement != null && replacementPointer < replacement.Length)
                {
                    //System.out.println("  return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]);
                    return(replacement.Chars[replacement.Offset + replacementPointer++]);
                }

                // TODO: a more efficient approach would be Aho/Corasick's
                // algorithm
                // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
                // or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
                //
                // I think this would be (almost?) equivalent to 1) adding
                // epsilon arcs from all final nodes back to the init
                // node in the FST, 2) adding a .* (skip any char)
                // loop on the initial node, and 3) determinizing
                // that.  Then we would not have to Restart matching
                // at each position.

                // Longest match found at the current input offset; lastMatch stays null when nothing matched.
                int      lastMatchLen = -1;
                CharsRef lastMatch    = null;

                int firstCH = buffer.Get(inputOff);
                if (firstCH != -1)
                {
                    // LUCENENET fix: Check the dictionary to ensure it contains a key before reading it.
                    char key = Convert.ToChar((char)firstCH);
                    if (cachedRootArcs.TryGetValue(key, out FST.Arc <CharsRef> arc) && arc != null)
                    {
                        if (!FST.TargetHasArcs(arc))
                        {
                            // Fast pass for single character match:
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(arc.IsFinal);
                            }
                            lastMatchLen = 1;
                            lastMatch    = arc.Output;
                        }
                        else
                        {
                            // lookahead counts how many input chars the current arc has consumed.
                            int      lookahead = 0;
                            CharsRef output    = arc.Output;
                            while (true)
                            {
                                lookahead++;

                                if (arc.IsFinal)
                                {
                                    // Match! (to node is final)
                                    lastMatchLen = lookahead;
                                    lastMatch    = outputs.Add(output, arc.NextFinalOutput);
                                    // Greedy: keep searching to see if there's a
                                    // longer match...
                                }

                                if (!FST.TargetHasArcs(arc))
                                {
                                    break;
                                }

                                int ch = buffer.Get(inputOff + lookahead);
                                if (ch == -1)
                                {
                                    break;
                                }
                                // Follow the arc for the next char; the result is written into scratchArc.
                                if ((arc = map.FindTargetArc(ch, arc, scratchArc, fstReader)) == null)
                                {
                                    // Dead end
                                    break;
                                }
                                output = outputs.Add(output, arc.Output);
                            }
                        }
                    }
                }

                if (lastMatch != null)
                {
                    inputOff += lastMatchLen;
                    //System.out.println("  match!  len=" + lastMatchLen + " repl=" + lastMatch);
                    // Positive diff: replacement is shorter than the match; negative: longer.
                    int diff = lastMatchLen - lastMatch.Length;

                    if (diff != 0)
                    {
                        int prevCumulativeDiff = LastCumulativeDiff;
                        if (diff > 0)
                        {
                            // Replacement is shorter than matched input:
                            AddOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
                        }
                        else
                        {
                            // Replacement is longer than matched input: remap
                            // the "extra" chars all back to the same input
                            // offset:
                            int outputStart = inputOff - prevCumulativeDiff;
                            for (int extraIDX = 0; extraIDX < -diff; extraIDX++)
                            {
                                AddOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
                            }
                        }
                    }

                    // Queue the replacement; the next loop iteration starts returning its chars.
                    replacement        = lastMatch;
                    replacementPointer = 0;
                }
                else
                {
                    // No mapping applies here: pass the buffered value through unchanged.
                    int ret = buffer.Get(inputOff);
                    if (ret != -1)
                    {
                        inputOff++;
                        buffer.FreeBefore(inputOff);
                    }
                    return(ret);
                }
            }
        }
Example #16
0
        /// <summary>
        /// Enumerates all minimal prefix paths in the automaton that also intersect the <see cref="FST"/>,
        /// accumulating the <see cref="FST"/> end node and output for each path.
        /// </summary>
        /// <typeparam name="T">Output type of <paramref name="fst"/>.</typeparam>
        /// <param name="a">Automaton to walk; asserted to be deterministic when assertions are enabled.</param>
        /// <param name="fst">FST that every returned path must also be able to follow, label by label.</param>
        /// <returns>
        /// One path per automaton accept state that was reached while the FST could still follow the
        /// same labels. Each path carries the automaton state, a copy of the FST arc reached, the
        /// accumulated FST output and the consumed input labels. Exploration of a path stops at its
        /// first accept state.
        /// </returns>
        public static IList<Path<T>> IntersectPrefixPaths<T>(Automaton a, FST<T> fst)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(a.IsDeterministic);
            }
            IList<Path<T>> queue = new List<Path<T>>();
            List<Path<T>> endNodes = new List<Path<T>>();

            queue.Add(new Path<T>(a.GetInitialState(), fst.GetFirstArc(new FST.Arc<T>()), fst.Outputs.NoOutput, new Int32sRef()));

            // Reused for every FST lookup; anything that must outlive the lookup is copied out of it.
            FST.Arc<T> scratchArc = new FST.Arc<T>();
            FST.BytesReader fstReader = fst.GetBytesReader();

            while (queue.Count != 0)
            {
                // Pop the most recently added path. Removing by index takes exactly the slot we
                // just read; Remove(item) would scan from the head and depend on element equality.
                int lastIndex = queue.Count - 1;
                Path<T> path = queue[lastIndex];
                queue.RemoveAt(lastIndex);

                if (path.State.Accept)
                {
                    endNodes.Add(path);
                    // we can stop here if we accept this path,
                    // we accept all further paths too
                    continue;
                }

                Int32sRef currentInput = path.Input;
                foreach (Transition t in path.State.GetTransitions())
                {
                    int min = t.Min;
                    int max = t.Max;
                    if (min == max)
                    {
                        // Single-label transition: one direct lookup in the FST.
                        FST.Arc<T> nextArc = fst.FindTargetArc(t.Min, path.FstNode, scratchArc, fstReader);
                        if (nextArc != null)
                        {
                            queue.Add(new Path<T>(t.Dest, new FST.Arc<T>().CopyFrom(nextArc),
                                                  fst.Outputs.Add(path.Output, nextArc.Output),
                                                  AppendLabel(currentInput, t.Min)));
                        }
                    }
                    else
                    {
                        // TODO: if this transition's TO state is accepting, and
                        // it accepts the entire range possible in the FST (ie. 0 to 255),
                        // we can simply use the prefix as the accepted state instead of
                        // looking up all the ranges and terminate early
                        // here.  This just shifts the work from one queue
                        // (this one) to another (the completion search
                        // done in AnalyzingSuggester).

                        // Range transition: start at the first FST arc with label >= min and walk
                        // the sibling arcs until the label leaves [min, max] or the arcs run out.
                        FST.Arc<T> nextArc = Lucene.Net.Util.Fst.Util.ReadCeilArc(min, fst, path.FstNode, scratchArc, fstReader);
                        while (nextArc != null && nextArc.Label <= max)
                        {
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(nextArc.Label <= max);
                                Debugging.Assert(nextArc.Label >= min, () => nextArc.Label + " " + min);
                            }
                            queue.Add(new Path<T>(t.Dest, new FST.Arc<T>().CopyFrom(nextArc),
                                                  fst.Outputs.Add(path.Output, nextArc.Output),
                                                  AppendLabel(currentInput, nextArc.Label)));
                            int label = nextArc.Label; // used in assert
                            nextArc = nextArc.IsLast ? null : fst.ReadNextRealArc(nextArc, fstReader);
                            if (Debugging.AssertsEnabled)
                            {
                                // Sibling arcs are expected to come back in strictly increasing label order.
                                Debugging.Assert(nextArc == null || label < nextArc.Label, () => "last: " + label + " next: " + (nextArc == null ? "" : nextArc.Label.ToString()));
                            }
                        }
                    }
                }
            }
            return endNodes;
        }

        /// <summary>
        /// Builds a new <see cref="Int32sRef"/> one element longer than <paramref name="prefix"/>:
        /// the prefix is copied in via <c>CopyInt32s</c> and <paramref name="label"/> is stored at index <c>prefix.Length</c>.
        /// </summary>
        private static Int32sRef AppendLabel(Int32sRef prefix, int label)
        {
            Int32sRef extended = new Int32sRef(prefix.Length + 1);
            extended.CopyInt32s(prefix);
            extended.Int32s[prefix.Length] = label;
            extended.Length = prefix.Length + 1;
            return extended;
        }
Example #17
0
        /// <summary>
        /// Looks for the longest synonym rule that matches starting at the current read position.
        /// Tokens are taken from the look-ahead buffer (<c>futureInputs</c>) or, once that is used up,
        /// pulled from <c>m_input</c>; each token's code points are run through the FST, with
        /// <see cref="SynonymMap.WORD_SEPARATOR"/> fed between tokens.
        /// On exit, <c>inputSkipCount</c> is the matched token count (and <c>AddOutput</c> has been
        /// called) when a rule matched, or 1 when nothing matched but a token is pending; it is left
        /// unchanged when input is finished and nothing is buffered.
        /// </summary>
        private void Parse()
        {
            //System.out.println("\nS: parse");

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(inputSkipCount == 0);
            }

            // Local cursor into the look-ahead ring; nextRead itself is not advanced here.
            int curNextRead = nextRead;

            // Holds the longest match we've seen so far:
            BytesRef matchOutput      = null;
            int      matchInputLength = 0;
            int      matchEndOffset   = -1;

            // FST output accumulated along the arcs followed so far.
            BytesRef pendingOutput = fst.Outputs.NoOutput;

            fst.GetFirstArc(scratchArc);

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(scratchArc.Output == fst.Outputs.NoOutput);
            }

            // Number of tokens consumed by this scan.
            int tokenCount = 0;

            while (true)
            {
                // Pull next token's chars:
                char[] buffer;
                int    bufferLen;
                //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

                // End offset of the token examined in this iteration; recorded as the match end offset if it completes a rule.
                int inputEndOffset = 0;

                if (curNextRead == nextWrite)
                {
                    // We used up our lookahead buffer of input tokens
                    // -- pull next real input token:

                    if (finished)
                    {
                        break;
                    }
                    else
                    {
                        //System.out.println("  input.incrToken");
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(futureInputs[nextWrite].consumed);
                        }
                        // Not correct: a syn match whose output is longer
                        // than its input can set future inputs keepOrig
                        // to true:
                        //assert !futureInputs[nextWrite].keepOrig;
                        if (m_input.IncrementToken())
                        {
                            buffer    = termAtt.Buffer;
                            bufferLen = termAtt.Length;
                            PendingInput pendingInput = futureInputs[nextWrite];
                            lastStartOffset = pendingInput.startOffset = offsetAtt.StartOffset;
                            lastEndOffset   = pendingInput.endOffset = offsetAtt.EndOffset;
                            inputEndOffset  = pendingInput.endOffset;
                            //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
                            // Capture() is called only when this token is beyond the first one of
                            // the scan; otherwise the slot is just marked as not yet consumed.
                            if (nextRead != nextWrite)
                            {
                                Capture();
                            }
                            else
                            {
                                pendingInput.consumed = false;
                            }
                        }
                        else
                        {
                            // No more input tokens
                            //System.out.println("      set end");
                            finished = true;
                            break;
                        }
                    }
                }
                else
                {
                    // Still in our lookahead
                    buffer         = futureInputs[curNextRead].term.Chars;
                    bufferLen      = futureInputs[curNextRead].term.Length;
                    inputEndOffset = futureInputs[curNextRead].endOffset;
                    //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
                }

                tokenCount++;

                // Run each char in this token through the FST:
                int bufUpto = 0;
                while (bufUpto < bufferLen)
                {
                    // Step by code point so a surrogate pair is fed to the FST as a single label.
                    int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
                    if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint, scratchArc, scratchArc, fstReader) == null)
                    {
                        //System.out.println("    stop");
                        // No arc for this code point: leave the outer token loop as well,
                        // skipping the final-state and WORD_SEPARATOR checks below.
                        goto byTokenBreak;
                    }

                    // Accum the output
                    pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
                    //System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
                    bufUpto += Character.CharCount(codePoint);
                }

                // OK, entire token matched; now see if this is a final
                // state:
                if (scratchArc.IsFinal)
                {
                    // Later (longer) matches overwrite earlier ones, so the longest match wins.
                    matchOutput      = fst.Outputs.Add(pendingOutput, scratchArc.NextFinalOutput);
                    matchInputLength = tokenCount;
                    matchEndOffset   = inputEndOffset;
                    //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
                }

                // See if the FST wants to continue matching (ie, needs to
                // see the next input token):
                if (fst.FindTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null)
                {
                    // No further rules can match here; we're done
                    // searching for matching rules starting at the
                    // current input position.
                    break;
                }
                else
                {
                    // More matching is possible -- accum the output (if
                    // any) of the WORD_SEP arc:
                    pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
                    if (nextRead == nextWrite)
                    {
                        Capture();
                    }
                }

                // Advance the scan cursor to the next slot of the look-ahead ring.
                curNextRead = RollIncr(curNextRead);
            }
byTokenBreak:

            // Target of the goto above; the normal loop exits (break) also fall through to here.
            if (nextRead == nextWrite && !finished)
            {
                //System.out.println("  skip write slot=" + nextWrite);
                nextWrite = RollIncr(nextWrite);
            }

            if (matchOutput != null)
            {
                //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
                inputSkipCount = matchInputLength;
                AddOutput(matchOutput, matchInputLength, matchEndOffset);
            }
            else if (nextRead != nextWrite)
            {
                // Even though we had no match here, we set to 1
                // because we need to skip current input token before
                // trying to match again:
                inputSkipCount = 1;
            }
            else
            {
                // Nothing matched and nothing is buffered: only expected once input is exhausted.
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(finished);
                }
            }

            //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
        }
Example #18
0
        /// <summary>
        /// Generates a list of stems for the provided word by stripping every applicable prefix
        /// (walking the word left to right through the prefix FST) and every applicable suffix
        /// (walking the word right to left through the suffix FST), and delegating each stripped
        /// candidate to <c>ApplyAffix</c>.
        /// </summary>
        /// <param name="word"> Word to generate the stems for </param>
        /// <param name="length"> length </param>
        /// <param name="previous"> previous affix that was removed (so we dont remove same one twice) </param>
        /// <param name="prevFlag"> Flag from a previous stemming step that need to be cross-checked with any affixes in this recursive step </param>
        /// <param name="prefixFlag"> flag of the most inner removed prefix, so that when removing a suffix, its also checked against the word </param>
        /// <param name="recursionDepth"> current recursiondepth </param>
        /// <param name="doPrefix"> true if we should remove prefixes </param>
        /// <param name="doSuffix"> true if we should remove suffixes </param>
        /// <param name="previousWasPrefix"> true if the previous removal was a prefix:
        ///        if we are removing a suffix, and it has no continuation requirements, its ok.
        ///        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. </param>
        /// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix
        ///        this means inner most suffix must also contain circumfix flag. </param>
        /// <param name="caseVariant"> true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed. </param>
        /// <returns> <see cref="IList{CharsRef}"/> of stems, or empty list if no stems are found </returns>
        private IList<CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix, bool caseVariant)
        {
            // TODO: allow this stuff to be reused by tokenfilter
            JCG.List<CharsRef> stems = new JCG.List<CharsRef>();

            if (doPrefix && dictionary.prefixes != null)
            {
                FST<Int32sRef> fst = dictionary.prefixes;
                Outputs<Int32sRef> outputs = fst.Outputs;
                // Reader and arc are per recursion depth so nested calls do not clobber each other.
                FST.BytesReader bytesReader = prefixReaders[recursionDepth];
                FST.Arc<Int32sRef> arc = prefixArcs[recursionDepth];
                fst.GetFirstArc(arc);
                Int32sRef NO_OUTPUT = outputs.NoOutput;
                Int32sRef output = NO_OUTPUT;
                // Unless fullStrip is set, at least one character of the word is left un-stripped.
                int limit = dictionary.fullStrip ? length : length - 1;
                for (int i = 0; i < limit; i++)
                {
                    // i is the length of the candidate prefix; i == 0 tests the empty prefix
                    // (root arc) without consuming a character.
                    if (i > 0)
                    {
                        int ch = word[i - 1];
                        if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null)
                        {
                            break;
                        }
                        else if (arc.Output != NO_OUTPUT)
                        {
                            output = fst.Outputs.Add(output, arc.Output);
                        }
                    }

                    if (!arc.IsFinal)
                    {
                        continue;
                    }
                    Int32sRef prefixes = fst.Outputs.Add(output, arc.NextFinalOutput);

                    for (int j = 0; j < prefixes.Length; j++)
                    {
                        int prefix = prefixes.Int32s[prefixes.Offset + j];
                        if (prefix == previous)
                        {
                            continue;
                        }
                        // Each affix record is 8 bytes, read as four consecutive 16-bit fields:
                        // flag, strip ordinal, condition (lowest bit = cross-product), append.
                        affixReader.Position = 8 * prefix;
                        affixReader.ReadInt16(); // flag: not needed here, read only to advance the reader
                        char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
                        int condition = (char)(affixReader.ReadInt16() & 0xffff);
                        bool crossProduct = (condition & 1) == 1;
                        condition = condition.TripleShift(1);
                        char append = (char)(affixReader.ReadInt16() & 0xffff);

                        bool compatible;
                        if (recursionDepth == 0)
                        {
                            if (dictionary.onlyincompound == -1)
                            {
                                compatible = true;
                            }
                            else
                            {
                                // check if affix is allowed in a non-compound word
                                dictionary.flagLookup.Get(append, scratch);
                                char[] appendFlags = Dictionary.DecodeFlags(scratch);
                                compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                            }
                        }
                        else if (crossProduct)
                        {
                            // cross check incoming continuation class (flag of previous affix) against list.
                            dictionary.flagLookup.Get(append, scratch);
                            char[] appendFlags = Dictionary.DecodeFlags(scratch);
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(prevFlag >= 0);
                            }
                            bool allowed = dictionary.onlyincompound == -1 ||
                                           !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                            compatible = allowed && HasCrossCheckedFlag((char)prevFlag, appendFlags, false);
                        }
                        else
                        {
                            compatible = false;
                        }

                        if (compatible)
                        {
                            int deAffixedStart = i;
                            int deAffixedLength = length - deAffixedStart;

                            int stripStart = dictionary.stripOffsets[stripOrd];
                            int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                            int stripLength = stripEnd - stripStart;

                            if (!CheckCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength))
                            {
                                continue;
                            }

                            // Candidate = strip characters + remainder of the word after the prefix.
                            char[] strippedWord = new char[stripLength + deAffixedLength];
                            Array.Copy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                            Array.Copy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

                            IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, prefix, -1, recursionDepth, true, circumfix, caseVariant);

                            stems.AddRange(stemList);
                        }
                    }
                }
            }

            if (doSuffix && dictionary.suffixes != null)
            {
                FST<Int32sRef> fst = dictionary.suffixes;
                Outputs<Int32sRef> outputs = fst.Outputs;
                FST.BytesReader bytesReader = suffixReaders[recursionDepth];
                FST.Arc<Int32sRef> arc = suffixArcs[recursionDepth];
                fst.GetFirstArc(arc);
                Int32sRef NO_OUTPUT = outputs.NoOutput;
                Int32sRef output = NO_OUTPUT;
                // Unless fullStrip is set, at least one character of the word is left un-stripped.
                int limit = dictionary.fullStrip ? 0 : 1;
                for (int i = length; i >= limit; i--)
                {
                    // i is the start of the candidate suffix; i == length tests the empty suffix
                    // (root arc). Characters are fed to the FST from the end of the word backwards.
                    if (i < length)
                    {
                        int ch = word[i];
                        if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null)
                        {
                            break;
                        }
                        else if (arc.Output != NO_OUTPUT)
                        {
                            output = fst.Outputs.Add(output, arc.Output);
                        }
                    }

                    if (!arc.IsFinal)
                    {
                        continue;
                    }
                    Int32sRef suffixes = fst.Outputs.Add(output, arc.NextFinalOutput);

                    for (int j = 0; j < suffixes.Length; j++)
                    {
                        int suffix = suffixes.Int32s[suffixes.Offset + j];
                        if (suffix == previous)
                        {
                            continue;
                        }
                        // Same 8-byte record layout as for prefixes:
                        // flag, strip ordinal, condition (lowest bit = cross-product), append.
                        affixReader.Position = 8 * suffix;
                        affixReader.ReadInt16(); // flag: not needed here, read only to advance the reader
                        char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
                        int condition = (char)(affixReader.ReadInt16() & 0xffff);
                        bool crossProduct = (condition & 1) == 1;
                        condition = condition.TripleShift(1);
                        char append = (char)(affixReader.ReadInt16() & 0xffff);

                        bool compatible;
                        if (recursionDepth == 0)
                        {
                            if (dictionary.onlyincompound == -1)
                            {
                                compatible = true;
                            }
                            else
                            {
                                // check if affix is allowed in a non-compound word
                                dictionary.flagLookup.Get(append, scratch);
                                char[] appendFlags = Dictionary.DecodeFlags(scratch);
                                compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                            }
                        }
                        else if (crossProduct)
                        {
                            // cross check incoming continuation class (flag of previous affix) against list.
                            dictionary.flagLookup.Get(append, scratch);
                            char[] appendFlags = Dictionary.DecodeFlags(scratch);
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(prevFlag >= 0);
                            }
                            bool allowed = dictionary.onlyincompound == -1 ||
                                           !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                            // An ONLYINCOMPOUND suffix must be rejected here exactly as in the
                            // prefix branch; previously `allowed` was computed but never applied.
                            compatible = allowed && HasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
                        }
                        else
                        {
                            compatible = false;
                        }

                        if (compatible)
                        {
                            int appendLength = length - i;
                            int deAffixedLength = length - appendLength;

                            int stripStart = dictionary.stripOffsets[stripOrd];
                            int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                            int stripLength = stripEnd - stripStart;

                            if (!CheckCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength))
                            {
                                continue;
                            }

                            // Candidate = word without the suffix + strip characters.
                            char[] strippedWord = new char[stripLength + deAffixedLength];
                            Array.Copy(word, 0, strippedWord, 0, deAffixedLength);
                            Array.Copy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);

                            IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);

                            stems.AddRange(stemList);
                        }
                    }
                }
            }

            return stems;
        }