// Use the builder to create:
 /// <summary>
 /// Stores the given FST and, when it is non-null, copies each arc leaving the
 /// FST's first arc target into <c>cachedRootArcs</c>, keyed by the arc label cast to a char.
 /// </summary>
 /// <param name="map"> the FST to wrap; may be <c>null</c>, in which case nothing is cached </param>
 private NormalizeCharMap(FST<CharsRef> map)
 {
     this.map = map;
     if (map == null)
     {
         // Nothing to pre-cache.
         return;
     }

     try
     {
         // Pre-cache root arcs:
         var rootArc = new FST.Arc<CharsRef>();
         FST.BytesReader reader = map.BytesReader;
         map.GetFirstArc(rootArc);
         if (!FST<CharsRef>.TargetHasArcs(rootArc))
         {
             return;
         }

         map.ReadFirstRealTargetArc(rootArc.Target, rootArc, reader);
         bool reachedLast;
         do
         {
             Debug.Assert(rootArc.Label != FST.END_LABEL);
             // Store a copy: rootArc itself is overwritten by the next read.
             cachedRootArcs[Convert.ToChar((char)rootArc.Label)] = (new FST.Arc<CharsRef>()).CopyFrom(rootArc);
             reachedLast = rootArc.IsLast;
             if (!reachedLast)
             {
                 map.ReadNextRealArc(rootArc, reader);
             }
         }
         while (!reachedLast);
     }
     catch (IOException ioe)
     {
         // Bogus FST IOExceptions!!  (will never happen)
         throw new Exception("Should never happen", ioe);
     }
 }
Esempio n. 2
0
	  // Use the builder to create:
	  /// <summary>
	  /// Stores the given FST and, when it is non-null, copies each arc leaving the
	  /// FST's first arc target into <c>cachedRootArcs</c>, keyed by the arc label cast to a char.
	  /// </summary>
	  /// <param name="map"> the FST to wrap; may be <c>null</c>, in which case nothing is cached </param>
	  private NormalizeCharMap(FST<CharsRef> map)
	  {
		this.map = map;
		if (map != null)
		{
		  try
		  {
			// Pre-cache root arcs:
			FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>();
			FST.BytesReader fstReader = map.BytesReader;
			map.getFirstArc(scratchArc);
			if (FST.targetHasArcs(scratchArc))
			{
			  map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader);
			  while (true)
			  {
				Debug.Assert(scratchArc.label != FST.END_LABEL);
				// Store a copy: scratchArc itself is overwritten by the next read.
				cachedRootArcs[Convert.ToChar((char) scratchArc.label)] = (new FST.Arc<CharsRef>()).copyFrom(scratchArc);
				if (scratchArc.Last)
				{
				  break;
				}
				map.readNextRealArc(scratchArc, fstReader);
			  }
			}
			//System.out.println("cached " + cachedRootArcs.size() + " root arcs");
		  }
		  catch (IOException ioe)
		  {
			// Bogus FST IOExceptions!!  (will never happen)
			// System.Exception has no constructor that takes only an inner exception,
			// so pass a message and keep the IOException as InnerException.
			throw new Exception(ioe.ToString(), ioe);
		  }
		}
	  }
Esempio n. 3
0
 /// <summary>
 /// Sole constructor. Keeps <paramref name="cost"/> and <paramref name="input"/> as given
 /// and stores a fresh arc populated via <c>CopyFrom</c> from <paramref name="arc"/>.
 /// </summary>
 /// <param name="cost"> value stored in <c>Cost</c> </param>
 /// <param name="arc"> arc whose contents are copied into a newly allocated arc </param>
 /// <param name="input"> value stored in <c>Input</c> </param>
 public FSTPath(T cost, FST.Arc <T> arc, Int32sRef input)
 {
     // Copy rather than alias the caller's arc instance.
     FST.Arc <T> arcCopy = new FST.Arc <T>();
     this.Arc = arcCopy.CopyFrom(arc);

     this.Cost = cost;
     this.Input = input;
 }
Esempio n. 4
0
        // Uncomment for debugging:

        /*
         * public static <T> void dotToFile(FST<T> fst, String filePath) throws IOException {
         * Writer w = new OutputStreamWriter(new FileOutputStream(filePath));
         * toDot(fst, w, true, true);
         * w.Dispose();
         * }
         */

        /// <summary>
        /// Reads the first arc whose label is greater than or equal to the given label
        /// into the provided arc in place and returns it iff found, otherwise returns <c>null</c>.
        /// </summary>
        /// <param name="label"> the label to ceil on </param>
        /// <param name="fst"> the fst to operate on </param>
        /// <param name="follow"> the arc to follow reading the label from </param>
        /// <param name="arc"> the arc to read into in place </param>
        /// <param name="in"> the fst's <see cref="FST.BytesReader"/> </param>
        /// <returns> <paramref name="arc"/> (or the result of <c>ReadNextRealArc</c> on it) when a
        /// matching arc exists, otherwise <c>null</c> </returns>
        public static FST.Arc <T> ReadCeilArc <T>(int label, FST <T> fst, FST.Arc <T> follow, FST.Arc <T> arc, FST.BytesReader @in)
        {
            // TODO maybe this would be useful in the FST class - we could simplify some other code like FSTEnum?
            // Special case: the caller asked for the END_LABEL arc. It is synthesized from
            // 'follow' here rather than read from the FST, and only exists if 'follow' is final.
            if (label == FST.END_LABEL)
            {
                if (follow.IsFinal)
                {
                    if (follow.Target <= 0)
                    {
                        arc.Flags = (sbyte)FST.BIT_LAST_ARC;
                    }
                    else
                    {
                        arc.Flags = 0;
                        // NOTE: nextArc is a node (not an address!) in this case:
                        arc.NextArc = follow.Target;
                        arc.Node    = follow.Target;
                    }
                    arc.Output = follow.NextFinalOutput;
                    arc.Label  = FST.END_LABEL;
                    return(arc);
                }
                else
                {
                    return(null);
                }
            }

            // No outgoing arcs from the target of 'follow': nothing can match.
            if (!FST <T> .TargetHasArcs(follow))
            {
                return(null);
            }
            fst.ReadFirstTargetArc(follow, arc, @in);
            if (arc.BytesPerArc != 0 && arc.Label != FST.END_LABEL)
            {
                // Arcs are fixed array -- use binary search to find
                // the target.

                int low  = arc.ArcIdx;
                int high = arc.NumArcs - 1;
                int mid  = 0;
                // System.out.println("do arc array low=" + low + " high=" + high +
                // " targetLabel=" + targetLabel);
                while (low <= high)
                {
                    // Unsigned shift so the midpoint stays correct even if low + high overflows.
                    mid          = (int)((uint)(low + high) >> 1);
                    // Seek to the mid-th fixed-size arc entry.
                    // NOTE(review): the extra +1 presumably skips the arc's flags byte
                    // so that ReadLabel starts at the label — confirm against the FST format.
                    @in.Position = arc.PosArcsStart;
                    @in.SkipBytes(arc.BytesPerArc * mid + 1);
                    int midLabel = fst.ReadLabel(@in);
                    int cmp      = midLabel - label;
                    // System.out.println("  cycle low=" + low + " high=" + high + " mid=" +
                    // mid + " midLabel=" + midLabel + " cmp=" + cmp);
                    if (cmp < 0)
                    {
                        low = mid + 1;
                    }
                    else if (cmp > 0)
                    {
                        high = mid - 1;
                    }
                    else
                    {
                        // Exact match.
                        // NOTE(review): ArcIdx is set one before the match, presumably because
                        // ReadNextRealArc advances ArcIdx before reading — confirm.
                        arc.ArcIdx = mid - 1;
                        return(fst.ReadNextRealArc(arc, @in));
                    }
                }
                // low only reaches NumArcs when every compared label was smaller than the target.
                if (low == arc.NumArcs)
                {
                    // DEAD END!
                    return(null);
                }

                // The loop above exits without a match only when low > high, so this selects 'high'.
                // NOTE(review): presumably ReadNextRealArc then reads index high + 1 (== low),
                // i.e. the first arc whose label is greater than the target — confirm.
                arc.ArcIdx = (low > high ? high : low);
                return(fst.ReadNextRealArc(arc, @in));
            }

            // Linear scan
            fst.ReadFirstRealTargetArc(follow.Target, arc, @in);

            while (true)
            {
                // System.out.println("  non-bs cycle");
                // TODO: we should fix this code to not have to create
                // object for the output of every arc we scan... only
                // for the matching arc, if found
                if (arc.Label >= label)
                {
                    // System.out.println("    found!");
                    return(arc);
                }
                else if (arc.IsLast)
                {
                    // Ran out of arcs without reaching a label >= the target.
                    return(null);
                }
                else
                {
                    fst.ReadNextRealArc(arc, @in);
                }
            }
        }
Esempio n. 5
0
 /// <summary>
 /// Captures the state this instance works with; every argument is stored
 /// as-is in the corresponding field, nothing is copied or validated.
 /// </summary>
 public SortedDocValuesAnonymousClass(FSTEntry fstEntry,
                                      NumericDocValues numericDocValues, FST <long?> fst1, FST.BytesReader @in, FST.Arc <long?> arc, FST.Arc <long?> scratchArc1,
                                      Int32sRef intsRef, BytesRefFSTEnum <long?> bytesRefFstEnum)
 {
     // Entry metadata and the per-document ordinal source.
     entry = fstEntry;
     docToOrd = numericDocValues;

     // The FST itself plus the reader/enum used to walk it.
     fst = fst1;
     this.@in = @in;
     fstEnum = bytesRefFstEnum;

     // Reusable scratch objects.
     firstArc = arc;
     scratchArc = scratchArc1;
     scratchInts = intsRef;
 }
Esempio n. 6
0
        /// <summary>
        /// Retrieve suggestions.
        /// <para/>
        /// The key is analyzed with the query analyzer and the last 1-gram, 2-gram, ...
        /// tokens are kept. Models are then tried from the highest order down; a lower
        /// order model's score is scaled by the accumulated backoff factor, and a last
        /// token already produced by a higher order model is skipped.
        /// </summary>
        /// <param name="key"> the text to complete </param>
        /// <param name="contexts"> must be <c>null</c>; this suggester does not support contexts </param>
        /// <param name="num"> maximum number of results to return </param>
        /// <returns> at most <paramref name="num"/> results, sorted with
        /// <c>ComparatorAnonymousInnerClassHelper</c> </returns>
        /// <exception cref="System.ArgumentException"> if <paramref name="contexts"/> is non-null,
        /// if a token's recalculated gram count disagrees with its position length, or if the
        /// analyzer produced no non-empty token </exception>
        public virtual IList <LookupResult> Lookup(string key, HashSet <BytesRef> contexts, int num)
        {
            if (contexts != null)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            TokenStream ts = queryAnalyzer.TokenStream("", key.ToString());

            try
            {
                TermToBytesRefAttribute    termBytesAtt = ts.AddAttribute <TermToBytesRefAttribute>();
                OffsetAttribute            offsetAtt    = ts.AddAttribute <OffsetAttribute>();
                PositionLengthAttribute    posLenAtt    = ts.AddAttribute <PositionLengthAttribute>();
                PositionIncrementAttribute posIncAtt    = ts.AddAttribute <PositionIncrementAttribute>();
                ts.Reset();

                var lastTokens = new BytesRef[grams];
                //System.out.println("lookup: key='" + key + "'");

                // Run full analysis, but save only the
                // last 1gram, last 2gram, etc.:
                BytesRef tokenBytes   = termBytesAtt.BytesRef;
                int      maxEndOffset = -1;
                bool     sawRealToken = false;
                while (ts.IncrementToken())
                {
                    termBytesAtt.FillBytesRef();
                    sawRealToken |= tokenBytes.Length > 0;
                    // TODO: this is somewhat iffy; today, ShingleFilter
                    // sets posLen to the gram count; maybe we should make
                    // a separate dedicated att for this?
                    int gramCount = posLenAtt.PositionLength;

                    Debug.Assert(gramCount <= grams);

                    // Safety: make sure the recalculated count "agrees":
                    int recalculatedCount = CountGrams(tokenBytes);
                    if (recalculatedCount != gramCount)
                    {
                        throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + recalculatedCount);
                    }
                    maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset());
                    lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
                }
                ts.End();

                if (!sawRealToken)
                {
                    throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
                }

                // Carefully fill last tokens with _ tokens;
                // ShingleFilter apparently won't emit "only hole"
                // tokens:
                int endPosInc = posIncAtt.PositionIncrement;

                // Note this will also be true if input is the empty
                // string (in which case we saw no tokens and
                // maxEndOffset is still -1), which in fact works out OK
                // because we fill the unigram with an empty BytesRef
                // below:
                bool lastTokenEnded = offsetAtt.EndOffset() > maxEndOffset || endPosInc > 0;
                //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());

                if (lastTokenEnded)
                {
                    //System.out.println("  lastTokenEnded");
                    // If user hit space after the last token, then
                    // "upgrade" all tokens.  This way "foo " will suggest
                    // all bigrams starting w/ foo, and not any unigrams
                    // starting with "foo":
                    for (int i = grams - 1; i > 0; i--)
                    {
                        BytesRef token = lastTokens[i - 1];
                        if (token == null)
                        {
                            continue;
                        }
                        token.Grow(token.Length + 1);
                        token.Bytes[token.Length] = separator;
                        token.Length++;
                        lastTokens[i] = token;
                    }
                    lastTokens[0] = new BytesRef();
                }

                var arc = new FST.Arc <long?>();

                var bytesReader = fst.BytesReader;

                // Try highest order models first, and if they return
                // results, return that; else, fallback:
                double backoff = 1.0;

                IList <LookupResult> results = new List <LookupResult>(num);

                // We only add a given suffix once, from the highest
                // order model that saw it; for subsequent lower order
                // models we skip it:
                var seen = new HashSet <BytesRef>();

                for (int gram = grams - 1; gram >= 0; gram--)
                {
                    BytesRef token = lastTokens[gram];
                    // Don't make unigram predictions from empty string:
                    if (token == null || (token.Length == 0 && key.Length > 0))
                    {
                        // Input didn't have enough tokens:
                        //System.out.println("  gram=" + gram + ": skip: not enough input");
                        continue;
                    }

                    if (endPosInc > 0 && gram <= endPosInc)
                    {
                        // Skip hole-only predictions; in theory we
                        // shouldn't have to do this, but we'd need to fix
                        // ShingleFilter to produce only-hole tokens:
                        //System.out.println("  break: only holes now");
                        break;
                    }

                    //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

                    // TODO: we could add fuzziness here
                    // match the prefix portion exactly
                    //Pair<Long,BytesRef> prefixOutput = null;
                    long? prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
                    //System.out.println("  prefixOutput=" + prefixOutput);

                    if (prefixOutput == null)
                    {
                        // This model never saw this prefix, e.g. the
                        // trigram model never saw context "purple mushroom"
                        backoff *= ALPHA;
                        continue;
                    }

                    // TODO: we could do this division at build time, and
                    // bake it into the FST?

                    // Denominator for computing scores from current
                    // model's predictions:
                    long contextCount = totTokens;

                    BytesRef lastTokenFragment = null;

                    // Scan backwards for the last separator: everything before it is the
                    // context, everything after it is the fragment being completed.
                    for (int i = token.Length - 1; i >= 0; i--)
                    {
                        if (token.Bytes[token.Offset + i] == separator)
                        {
                            BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                            long?    output  = Util.Get(fst, Lucene.Net.Util.Fst.Util.ToIntsRef(context, new IntsRef()));
                            Debug.Assert(output != null);
                            contextCount      = DecodeWeight(output);
                            lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                            break;
                        }
                    }

                    BytesRef finalLastToken;

                    if (lastTokenFragment == null)
                    {
                        finalLastToken = BytesRef.DeepCopyOf(token);
                    }
                    else
                    {
                        finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
                    }
                    Debug.Assert(finalLastToken.Offset == 0);

                    CharsRef spare = new CharsRef();

                    // complete top-N
                    Util.Fst.Util.TopResults <long?> completions = null;
                    try
                    {
                        // Because we store multiple models in one FST
                        // (1gram, 2gram, 3gram), we must restrict the
                        // search so that it only considers the current
                        // model.  For highest order model, this is not
                        // necessary since all completions in the FST
                        // must be from this model, but for lower order
                        // models we have to filter out the higher order
                        // ones:

                        // Must do num+seen.size() for queue depth because we may
                        // reject up to seen.size() paths in acceptResult():
                        Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparator, seen, finalLastToken);

                        // since this search is initialized with a single start node
                        // it is okay to start with an empty input path here
                        searcher.AddStartPaths(arc, prefixOutput, true, new IntsRef());

                        completions = searcher.Search();
                        Debug.Assert(completions.IsComplete);
                    }
                    catch (IOException bogus)
                    {
                        // System.Exception has no constructor that takes only an inner
                        // exception, so pass a message and keep the cause as InnerException.
                        throw new Exception(bogus.ToString(), bogus);
                    }

                    int prefixLength = token.Length;

                    BytesRef suffix = new BytesRef(8);
                    //System.out.println("    " + completions.length + " completions");

                    foreach (Util.Fst.Util.Result <long?> completion in completions)
                    {
                        // Reset to the bare prefix before appending this completion's suffix.
                        token.Length = prefixLength;
                        // append suffix
                        Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                        token.Append(suffix);

                        //System.out.println("    completion " + token.utf8ToString());

                        // Skip this path if a higher-order model already
                        // saw/predicted its last token:
                        BytesRef lastToken = token;
                        for (int i = token.Length - 1; i >= 0; i--)
                        {
                            if (token.Bytes[token.Offset + i] == separator)
                            {
                                Debug.Assert(token.Length - i - 1 > 0);
                                lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                                break;
                            }
                        }
                        if (seen.Contains(lastToken))
                        {
                            //System.out.println("      skip dup " + lastToken.utf8ToString());
                            continue;
                        }
                        seen.Add(BytesRef.DeepCopyOf(lastToken));
                        spare.Grow(token.Length);
                        UnicodeUtil.UTF8toUTF16(token, spare);
                        LookupResult result = new LookupResult(spare.ToString(), (long)(long.MaxValue * backoff * ((double)DecodeWeight(completion.Output)) / contextCount));
                        results.Add(result);
                        Debug.Assert(results.Count == seen.Count);
                        //System.out.println("  add result=" + result);
                    }
                    backoff *= ALPHA;
                }

                results.Sort(new ComparatorAnonymousInnerClassHelper(this));

                if (results.Count > num)
                {
                    results.SubList(num, results.Count).Clear();
                }

                return(results);
            }
            finally
            {
                IOUtils.CloseWhileHandlingException(ts);
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Looks up completions for <paramref name="key"/> by matching it exactly as a prefix
        /// in the FST and then collecting the shortest paths below that prefix.
        /// </summary>
        /// <param name="key"> the prefix to complete </param>
        /// <param name="contexts"> must be <c>null</c>; contexts are not supported </param>
        /// <param name="onlyMorePopular"> must be <c>false</c> </param>
        /// <param name="num"> maximum number of results; asserted to be positive </param>
        /// <returns> the results, or an empty list when there is no FST or the prefix is not found </returns>
        /// <exception cref="ArgumentException"> if <paramref name="contexts"/> is non-null or
        /// <paramref name="onlyMorePopular"/> is <c>true</c> </exception>
        public override IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, bool onlyMorePopular, int num)
        {
            if (contexts != null)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            Debug.Assert(num > 0);

            if (onlyMorePopular)
            {
                throw new ArgumentException("this suggester only works with onlyMorePopular=false");
            }

            if (fst == null)
            {
                return Collections.EmptyList <LookupResult>();
            }

            var keyBytes = new BytesRef(key);
            int keyByteCount = keyBytes.Length;
            var prefixArc = new FST.Arc <long?>();

            // match the prefix portion exactly
            long? prefixWeight;
            try
            {
                prefixWeight = LookupPrefix(keyBytes, prefixArc);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }

            if (!prefixWeight.HasValue)
            {
                return Collections.EmptyList <LookupResult>();
            }

            var found = new List <LookupResult>(num);
            var utf16 = new CharsRef();

            // When exact matches come first and the key itself is a complete entry,
            // emit it ahead of the completions.
            if (exactFirst && prefixArc.IsFinal)
            {
                utf16.Grow(keyBytes.Length);
                UnicodeUtil.UTF8toUTF16(keyBytes, utf16);
                found.Add(new LookupResult(utf16.ToString(), DecodeWeight(prefixWeight.GetValueOrDefault() + prefixArc.NextFinalOutput.GetValueOrDefault())));
                num--;
                if (num == 0)
                {
                    return found; // that was quick
                }
            }

            // complete top-N
            Util.Fst.Util.TopResults <long?> topPaths;
            try
            {
                topPaths = Lucene.Net.Util.Fst.Util.ShortestPaths(fst, prefixArc, prefixWeight, weightComparer, num, !exactFirst);
                Debug.Assert(topPaths.IsComplete);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }

            var tail = new BytesRef(8);

            foreach (Util.Fst.Util.Result <long?> path in topPaths)
            {
                // Truncate back to the key, then append this path's suffix.
                keyBytes.Length = keyByteCount;
                Lucene.Net.Util.Fst.Util.ToBytesRef(path.Input, tail);
                keyBytes.Append(tail);

                utf16.Grow(keyBytes.Length);
                UnicodeUtil.UTF8toUTF16(keyBytes, utf16);
                found.Add(new LookupResult(utf16.ToString(), DecodeWeight(path.Output.GetValueOrDefault())));
            }
            return found;
        }
Esempio n. 8
0
 /// <summary>
 /// Captures the state this instance works with; every argument is stored
 /// as-is in the field of the same name, nothing is copied or validated.
 /// </summary>
 public SortedSetDocValuesAnonymousInnerClassHelper(FSTEntry entry, BinaryDocValues docToOrds, FST <long?> fst, FST.BytesReader @in, FST.Arc <long?> firstArc, FST.Arc <long?> scratchArc, Int32sRef scratchInts, BytesRefFSTEnum <long?> fstEnum, BytesRef @ref, ByteArrayDataInput input)
 {
     // Entry metadata and the per-document ordinals source.
     this.entry = entry;
     this.docToOrds = docToOrds;

     // The FST itself plus the reader/enum used to walk it.
     this.fst = fst;
     this.@in = @in;
     this.fstEnum = fstEnum;

     // Reusable scratch objects.
     this.firstArc = firstArc;
     this.scratchArc = scratchArc;
     this.scratchInts = scratchInts;
     this.@ref = @ref;
     this.input = input;
 }
Esempio n. 9
0
        /// <summary>
        /// Creates a filter over <paramref name="input"/> that uses the FST of
        /// <paramref name="synonyms"/> and pre-allocates the rolling lookahead buffers.
        /// </summary>
        /// <param name="input"> input tokenstream </param>
        /// <param name="synonyms"> synonym map </param>
        /// <param name="ignoreCase"> case-folds input for matching (the Java original referenced
        ///                   <c>Character.toLowerCase(int)</c>).
        ///                   Note, if you set this to true, it's your responsibility to lowercase
        ///                   the input entries when you create the <see cref="SynonymMap"/> </param>
        /// <exception cref="System.ArgumentException"> if the synonym map's FST is <c>null</c> </exception>
        public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase)
            : base(input)
        {
            termAtt = AddAttribute<ICharTermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = AddAttribute<IPositionLengthAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();

            this.synonyms = synonyms;
            this.ignoreCase = ignoreCase;

            this.fst = synonyms.fst;
            if (this.fst == null)
            {
                throw new System.ArgumentException("fst must be non-null");
            }
            this.fstReader = this.fst.BytesReader;

            // Must be 1+ so that when roll buffer is at full
            // lookahead we can distinguish this full buffer from
            // the empty buffer:
            rollBufferSize = synonyms.maxHorizontalContext + 1;

            // Pre-populate every slot of both rolling buffers.
            futureInputs = new PendingInput[rollBufferSize];
            futureOutputs = new PendingOutputs[rollBufferSize];
            int slot = 0;
            while (slot < rollBufferSize)
            {
                futureInputs[slot] = new PendingInput();
                futureOutputs[slot] = new PendingOutputs();
                slot++;
            }

            //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

            scratchArc = new FST.Arc<BytesRef>();
        }
 /// <summary>
 /// Captures the state this instance works with; every argument is stored
 /// as-is in the corresponding member, nothing is copied or validated.
 /// </summary>
 public SortedDocValuesAnonymousInnerClassHelper(FSTEntry entry, NumericDocValues docToOrd, FST<long?> fst, FST.BytesReader @in, FST.Arc<long?> firstArc, FST.Arc<long?> scratchArc, IntsRef scratchInts, BytesRefFSTEnum<long?> fstEnum)
 {
     // Entry metadata and the per-document ordinal source.
     Entry = entry;
     DocToOrd = docToOrd;

     // The FST itself plus the reader/enum used to walk it.
     Fst = fst;
     this.@in = @in;
     FstEnum = fstEnum;

     // Reusable scratch objects.
     FirstArc = firstArc;
     ScratchArc = scratchArc;
     ScratchInts = scratchInts;
 }
        /// <summary>
        /// Returns sorted-set doc values for <paramref name="field"/>. The field's FST is
        /// loaded on first use and kept in <c>FstInstances</c>; the remaining helper
        /// objects are allocated fresh on every call.
        /// </summary>
        /// <param name="field"> the field whose values are requested </param>
        /// <returns> <c>DocValues.EMPTY_SORTED_SET</c> when the entry has no ordinals,
        /// otherwise a new FST-backed instance </returns>
        public override SortedSetDocValues GetSortedSet(FieldInfo field)
        {
            FSTEntry fstEntry = Fsts[field.Number];
            if (entry_HasNoOrds(fstEntry))
            {
                return DocValues.EMPTY_SORTED_SET; // empty FST!
            }

            FST<long?> loaded;
            lock (this)
            {
                bool alreadyLoaded = FstInstances.TryGetValue(field.Number, out loaded);
                if (!alreadyLoaded)
                {
                    // First request for this field: read the FST and account for its size.
                    Data.Seek(fstEntry.Offset);
                    loaded = new FST<long?>((DataInput)Data, Lucene.Net.Util.Fst.PositiveIntOutputs.Singleton);
                    RamBytesUsed_Renamed.AddAndGet(loaded.SizeInBytes());
                    FstInstances[field.Number] = loaded;
                }
            }

            BinaryDocValues ordsPerDoc = GetBinary(field);

            // per-thread resources
            FST.BytesReader reader = loaded.BytesReader;
            FST.Arc<long?> first = new FST.Arc<long?>();
            FST.Arc<long?> scratch = new FST.Arc<long?>();
            IntsRef ints = new IntsRef();
            BytesRefFSTEnum<long?> termEnum = new BytesRefFSTEnum<long?>(loaded);
            BytesRef bytes = new BytesRef();
            ByteArrayDataInput ordsInput = new ByteArrayDataInput();

            return new SortedSetDocValuesAnonymousInnerClassHelper(entry: fstEntry, docToOrds: ordsPerDoc, fst: loaded, @in: reader, firstArc: first, scratchArc: scratch, scratchInts: ints, fstEnum: termEnum, @ref: bytes, input: ordsInput);

            // Local predicate: an entry with zero ordinals has an empty FST.
            bool entry_HasNoOrds(FSTEntry e)
            {
                return e.NumOrds == 0;
            }
        }
        /// <summary>
        /// Returns sorted doc values for <paramref name="field"/>. The field's FST is
        /// loaded on first use and kept in <c>FstInstances</c>; the remaining helper
        /// objects are allocated fresh on every call.
        /// </summary>
        /// <param name="field"> the field whose values are requested </param>
        /// <returns> a new FST-backed instance </returns>
        public override SortedDocValues GetSorted(FieldInfo field)
        {
            FSTEntry fstEntry = Fsts[field.Number];

            FST<long?> loaded;
            lock (this)
            {
                bool alreadyLoaded = FstInstances.TryGetValue(field.Number, out loaded);
                if (!alreadyLoaded)
                {
                    // First request for this field: read the FST and account for its size.
                    Data.Seek(fstEntry.Offset);
                    loaded = new FST<long?>(Data, PositiveIntOutputs.Singleton);
                    RamBytesUsed_Renamed.AddAndGet(loaded.SizeInBytes());
                    FstInstances[field.Number] = loaded;
                }
            }

            // Evaluated left to right: the numeric values are fetched before the
            // per-thread resources (reader, arcs, scratch ints, enum) are created.
            return new SortedDocValuesAnonymousInnerClassHelper(
                fstEntry,
                GetNumeric(field),
                loaded,
                loaded.BytesReader,
                new FST.Arc<long?>(),
                new FST.Arc<long?>(),
                new IntsRef(),
                new BytesRefFSTEnum<long?>(loaded));
        }
Esempio n. 13
0
 /// <summary>
 /// Creates a frame with a freshly allocated arc and <c>state</c> set to -1.
 /// </summary>
 internal Frame()
 {
     state = -1;
     arc = new FST.Arc <Int64>();
 }
 /// <summary>
 /// Returns the arc stored at <paramref name="ord"/>, first growing <c>Arcs</c>
 /// (and filling the new slots with fresh arcs) when the index is past the end.
 /// </summary>
 /// <param name="ord"> index of the requested arc </param>
 /// <returns> the arc instance held at that index </returns>
 internal FST.Arc<BytesRef> GetArc(int ord)
 {
     if (ord < Arcs.Length)
     {
         return Arcs[ord];
     }

     // Grow to an oversized array so repeated requests don't reallocate every time.
     int grownLength = ArrayUtil.Oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
     var grown = new FST.Arc<BytesRef>[grownLength];
     Array.Copy(Arcs, 0, grown, 0, Arcs.Length);

     // Existing arcs were carried over; only the new tail needs instances.
     int slot = Arcs.Length;
     while (slot < grown.Length)
     {
         grown[slot] = new FST.Arc<BytesRef>();
         slot++;
     }

     Arcs = grown;
     return Arcs[ord];
 }
Esempio n. 15
0
 /// <summary>
 /// Creates a frame tied to <paramref name="outerInstance"/>, with a freshly
 /// allocated FST arc and <c>fsaState</c> set to -1.
 /// </summary>
 /// <param name="outerInstance"> the enclosing enumerator this frame belongs to </param>
 internal Frame(FSTTermsReader.TermsReader.IntersectTermsEnum outerInstance)
 {
     this.outerInstance = outerInstance;
     fsaState = -1;
     fstArc = new FST.Arc <FSTTermOutputs.TermData>();
 }
Esempio n. 16
0
        // TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND /
        // SEEK_END)?  saves the eq check above?
        /// <summary>
        /// Seeks to largest term that's &lt;= target. </summary>
        protected virtual void DoSeekFloor()
        {
            // TODO: possibly caller could/should provide common
            // prefix length?  ie this work may be redundant if
            // caller is in fact intersecting against its own
            // automaton
            //System.out.println("FE: seek floor upto=" + upto);

            // Save CPU by starting at the end of the shared prefix
            // b/w our current term & the target:
            RewindPrefix();

            //System.out.println("FE: after rewind upto=" + upto);

            FST.Arc <T> arc         = GetArc(m_upto);
            int         targetLabel = TargetLabel;

            //System.out.println("FE: init targetLabel=" + targetLabel);

            // Now scan forward, matching the new suffix of the target
            while (true)
            {
                //System.out.println("  cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast() + " bba=" + arc.bytesPerArc);

                if (arc.BytesPerArc != 0 && arc.Label != FST.END_LABEL)
                {
                    // Arcs are fixed array -- use binary search to find
                    // the target.

                    FST.BytesReader @in  = m_fst.GetBytesReader();
                    int             low  = arc.ArcIdx;
                    int             high = arc.NumArcs - 1;
                    int             mid  = 0;
                    //System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" + targetLabel);
                    bool found = false;
                    while (low <= high)
                    {
                        mid          = (low + high).TripleShift(1);
                        @in.Position = arc.PosArcsStart;
                        @in.SkipBytes(arc.BytesPerArc * mid + 1);
                        int midLabel = m_fst.ReadLabel(@in);
                        int cmp      = midLabel - targetLabel;
                        //System.out.println("  cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
                        if (cmp < 0)
                        {
                            low = mid + 1;
                        }
                        else if (cmp > 0)
                        {
                            high = mid - 1;
                        }
                        else
                        {
                            found = true;
                            break;
                        }
                    }

                    // NOTE: this code is dup'd w/ the code below (in
                    // the outer else clause):
                    if (found)
                    {
                        // Match -- recurse
                        //System.out.println("  match!  arcIdx=" + mid);
                        arc.ArcIdx = mid - 1;
                        m_fst.ReadNextRealArc(arc, @in);
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(arc.ArcIdx == mid);
                            Debugging.Assert(arc.Label == targetLabel, "arc.label={0} vs targetLabel={1} mid={2}", arc.Label, targetLabel, mid);
                        }
                        m_output[m_upto] = m_fst.Outputs.Add(m_output[m_upto - 1], arc.Output);
                        if (targetLabel == FST.END_LABEL)
                        {
                            return;
                        }
                        CurrentLabel = arc.Label;
                        Incr();
                        arc         = m_fst.ReadFirstTargetArc(arc, GetArc(m_upto), m_fstReader);
                        targetLabel = TargetLabel;
                        continue;
                    }
                    else if (high == -1)
                    {
                        //System.out.println("  before first");
                        // Very first arc is after our target
                        // TODO: if each arc could somehow read the arc just
                        // before, we can save this re-scan.  The ceil case
                        // doesn't need this because it reads the next arc
                        // instead:
                        while (true)
                        {
                            // First, walk backwards until we find a first arc
                            // that's before our target label:
                            m_fst.ReadFirstTargetArc(GetArc(m_upto - 1), arc, m_fstReader);
                            if (arc.Label < targetLabel)
                            {
                                // Then, scan forwards to the arc just before
                                // the targetLabel:
                                while (!arc.IsLast && m_fst.ReadNextArcLabel(arc, @in) < targetLabel)
                                {
                                    m_fst.ReadNextArc(arc, m_fstReader);
                                }
                                PushLast();
                                return;
                            }
                            m_upto--;
                            if (m_upto == 0)
                            {
                                return;
                            }
                            targetLabel = TargetLabel;
                            arc         = GetArc(m_upto);
                        }
                    }
                    else
                    {
                        // There is a floor arc:
                        arc.ArcIdx = (low > high ? high : low) - 1;
                        //System.out.println(" hasFloor arcIdx=" + (arc.arcIdx+1));
                        m_fst.ReadNextRealArc(arc, @in);

                        // LUCNENET specific: We don't want the ReadNextArcLabel call to be
                        // excluded when Debug.Assert is stripped out by the compiler.
                        bool check = arc.IsLast || m_fst.ReadNextArcLabel(arc, @in) > targetLabel;
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(check);
                            Debugging.Assert(arc.Label < targetLabel, "arc.label={0} vs targetLabel={1}", arc.Label, targetLabel);
                        }
                        PushLast();
                        return;
                    }
                }
                else
                {
                    if (arc.Label == targetLabel)
                    {
                        // Match -- recurse
                        m_output[m_upto] = m_fst.Outputs.Add(m_output[m_upto - 1], arc.Output);
                        if (targetLabel == FST.END_LABEL)
                        {
                            return;
                        }
                        CurrentLabel = arc.Label;
                        Incr();
                        arc         = m_fst.ReadFirstTargetArc(arc, GetArc(m_upto), m_fstReader);
                        targetLabel = TargetLabel;
                    }
                    else if (arc.Label > targetLabel)
                    {
                        // TODO: if each arc could somehow read the arc just
                        // before, we can save this re-scan.  The ceil case
                        // doesn't need this because it reads the next arc
                        // instead:
                        while (true)
                        {
                            // First, walk backwards until we find a first arc
                            // that's before our target label:
                            m_fst.ReadFirstTargetArc(GetArc(m_upto - 1), arc, m_fstReader);
                            if (arc.Label < targetLabel)
                            {
                                // Then, scan forwards to the arc just before
                                // the targetLabel:
                                while (!arc.IsLast && m_fst.ReadNextArcLabel(arc, m_fstReader) < targetLabel)
                                {
                                    m_fst.ReadNextArc(arc, m_fstReader);
                                }
                                PushLast();
                                return;
                            }
                            m_upto--;
                            if (m_upto == 0)
                            {
                                return;
                            }
                            targetLabel = TargetLabel;
                            arc         = GetArc(m_upto);
                        }
                    }
                    else if (!arc.IsLast)
                    {
                        //System.out.println("  check next label=" + fst.readNextArcLabel(arc) + " (" + (char) fst.readNextArcLabel(arc) + ")");
                        if (m_fst.ReadNextArcLabel(arc, m_fstReader) > targetLabel)
                        {
                            PushLast();
                            return;
                        }
                        else
                        {
                            // keep scanning
                            m_fst.ReadNextArc(arc, m_fstReader);
                        }
                    }
                    else
                    {
                        PushLast();
                        return;
                    }
                }
            }
        }
 /// <summary>
 /// Stores every argument in the matching member of this helper; no other
 /// work is performed here.
 /// </summary>
 public SortedSetDocValuesAnonymousInnerClassHelper(
     FSTEntry entry,
     BinaryDocValues docToOrds,
     FST<long?> fst,
     FST.BytesReader @in,
     FST.Arc<long?> firstArc,
     FST.Arc<long?> scratchArc,
     IntsRef scratchInts,
     BytesRefFSTEnum<long?> fstEnum,
     BytesRef @ref,
     ByteArrayDataInput input)
 {
     // Members whose name differs from the parameter only by case need no qualifier.
     Entry = entry;
     DocToOrds = docToOrds;
     Fst = fst;
     FirstArc = firstArc;
     ScratchArc = scratchArc;
     ScratchInts = scratchInts;
     FstEnum = fstEnum;
     Input = input;

     // These two share their exact name with the parameter, so "this." is required.
     this.@in = @in;
     this.@ref = @ref;
 }
                /// <summary>
                /// Creates a terms enum bound to <paramref name="outerInstance"/>.
                /// The enum starts positioned on the static frame with an empty frame
                /// stack and a valid index prefix of zero.
                /// </summary>
                /// <param name="outerInstance">
                /// The field reader this enum iterates. Its <c>Index</c> may be
                /// <c>null</c>, in which case no FST reader is created.
                /// </param>
                public SegmentTermsEnum(BlockTreeTermsReader.FieldReader outerInstance)
                {
                    this.OuterInstance = outerInstance;
                    Stack = new Frame[0];

                    // Used to hold seek by TermState, or cached seek
                    StaticFrame = new Frame(this, -1);

                    // The terms index may not (and need not) have been loaded.
                    var index = outerInstance.Index;
                    FstReader = index == null ? null : index.BytesReader;

                    // Pre-allocate every reusable arc slot.
                    for (int arcIdx = 0; arcIdx < Arcs.Length; arcIdx++)
                    {
                        Arcs[arcIdx] = new FST.Arc<BytesRef>();
                    }

                    if (index != null)
                    {
                        // GetFirstArc is called with Arcs[0] as its argument, so the call
                        // must stay even though the result is only inspected by the assert.
                        FST.Arc<BytesRef> arc = index.GetFirstArc(Arcs[0]);
                        // Empty string prefix must have an output in the index!
                        Debug.Assert(arc.IsFinal);
                    }

                    // Start on the static frame; the root block is not pushed or loaded here.
                    CurrentFrame = StaticFrame;
                    ValidIndexPrefix = 0;
                }
Esempio n. 19
0
 /// <summary>
 /// Forwards to <c>GetFirstArc</c> on the wrapped <c>fst</c> and returns
 /// whatever that call returns.
 /// </summary>
 /// <param name="arc">Arc instance handed through to the wrapped FST.</param>
 public FST.Arc<Int64> GetFirstArc(FST.Arc<Int64> arc) => fst.GetFirstArc(arc);
Esempio n. 20
0
 /// <summary>
 /// Creates a frame holding a freshly allocated arc and a state of -1.
 /// </summary>
 internal Frame()
 {
     state = -1;
     arc = new FST.Arc<long?>();
 }
Esempio n. 21
0
 /// <summary>
 /// Forwards to <c>GetFirstArc</c> on the wrapped <c>fst</c> and returns
 /// whatever that call returns.
 /// </summary>
 /// <param name="arc">Arc instance handed through to the wrapped FST.</param>
 public FST.Arc<long?> GetFirstArc(FST.Arc<long?> arc) => fst.GetFirstArc(arc);
Esempio n. 22
0
        /// <summary>
        /// Looks up at most <paramref name="num"/> suggestions for <paramref name="key"/>
        /// by intersecting the key's lookup automaton with the suggester FST and
        /// running top-N searches from the matching prefix nodes.
        /// </summary>
        /// <param name="key">Text to complete; must not contain U+001E or U+001F.</param>
        /// <param name="contexts">Must be <c>null</c>; contexts are rejected.</param>
        /// <param name="onlyMorePopular">Must be <c>false</c>.</param>
        /// <param name="num">Maximum number of results to return (asserted to be &gt; 0).</param>
        /// <returns>
        /// The collected results; an empty list when no FST has been built/loaded
        /// (<c>fst == null</c>).
        /// </returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="onlyMorePopular"/> is <c>true</c>, <paramref name="contexts"/>
        /// is non-null, or <paramref name="key"/> contains a reserved character.
        /// </exception>
        /// <exception cref="Exception">Wraps any <see cref="IOException"/> raised during the lookup.</exception>
        public override IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, bool onlyMorePopular, int num)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(num > 0);
            }

            if (onlyMorePopular)
            {
                throw new ArgumentException("this suggester only works with onlyMorePopular=false");
            }
            if (contexts != null)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            if (fst == null)
            {
                return(Collections.EmptyList <LookupResult>());
            }

            //System.out.println("lookup key=" + key + " num=" + num);
            // Reject the two reserved control characters before doing any work.
            for (var i = 0; i < key.Length; i++)
            {
                if (key[i] == 0x1E)
                {
                    throw new ArgumentException(
                              "lookup key cannot contain HOLE character U+001E; this character is reserved");
                }
                if (key[i] == 0x1F)
                {
                    throw new ArgumentException(
                              "lookup key cannot contain unit separator character U+001F; this character is reserved");
                }
            }

            // Byte form of the key; used below to compare against stored surface forms.
            var utf8Key = new BytesRef(key);

            try
            {
                Automaton lookupAutomaton = ToLookupAutomaton(key);

                // Scratch buffer reused for every GetLookupResult call in this method.
                var spare = new CharsRef();

                //System.out.println("  now intersect exactFirst=" + exactFirst);

                // Intersect automaton w/ suggest wFST and get all
                // prefix starting nodes & their outputs:
                //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

                //System.out.println("  prefixPaths: " + prefixPaths.size());

                FST.BytesReader bytesReader = fst.GetBytesReader();

                // Reused by every FindTargetArc call below.
                var scratchArc = new FST.Arc <PairOutputs <long?, BytesRef> .Pair>();

                IList <LookupResult> results = new List <LookupResult>();

                IList <FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> > prefixPaths =
                    FSTUtil.IntersectPrefixPaths(ConvertAutomaton(lookupAutomaton), fst);

                if (exactFirst)
                {
                    // First pass: count the prefix nodes that have an END_BYTE arc,
                    // which sizes the exact-match searcher below.
                    int count = 0;
                    foreach (FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> path in prefixPaths)
                    {
                        if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                        {
                            // This node has END_BYTE arc leaving, meaning it's an
                            // "exact" match:
                            count++;
                        }
                    }

                    // Searcher just to find the single exact only
                    // match, if present:
                    Util.Fst.Util.TopNSearcher <PairOutputs <long?, BytesRef> .Pair> searcher;
                    searcher = new Util.Fst.Util.TopNSearcher <PairOutputs <long?, BytesRef> .Pair>(fst, count * maxSurfaceFormsPerAnalyzedForm,
                                                                                                    count * maxSurfaceFormsPerAnalyzedForm, weightComparer);

                    // NOTE: we could almost get away with only using
                    // the first start node.  The only catch is if
                    // maxSurfaceFormsPerAnalyzedForm had kicked in and
                    // pruned our exact match from one of these nodes
                    // ...:
                    foreach (var path in prefixPaths)
                    {
                        if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                        {
                            // This node has END_BYTE arc leaving, meaning it's an
                            // "exact" match:
                            searcher.AddStartPaths(scratchArc, fst.Outputs.Add(path.Output, scratchArc.Output), false,
                                                   path.Input);
                        }
                    }

                    var completions = searcher.Search();
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(completions.IsComplete);
                    }

                    // NOTE: this is rather inefficient: we enumerate
                    // every matching "exactly the same analyzed form"
                    // path, and then do linear scan to see if one of
                    // these exactly matches the input.  It should be
                    // possible (though hairy) to do something similar
                    // to getByOutput, since the surface form is encoded
                    // into the FST output, so we more efficiently hone
                    // in on the exact surface-form match.  Still, I
                    // suspect very little time is spent in this linear
                    // search: it's bounded by how many prefix start
                    // nodes we have and the
                    // maxSurfaceFormsPerAnalyzedForm:
                    foreach (var completion in completions)
                    {
                        BytesRef output2 = completion.Output.Output2;
                        if (SameSurfaceForm(utf8Key, output2))
                        {
                            // At most one exact match is added; it ends up first in the results.
                            results.Add(GetLookupResult(completion.Output.Output1, output2, spare));
                            break;
                        }
                    }

                    if (results.Count == num)
                    {
                        // That was quick:
                        return(results);
                    }
                }

                // Main search: asks for the remaining (num - results.Count) completions.
                // The helper also receives utf8Key and results.
                // NOTE(review): presumably so it can skip the exact match already added -- confirm in the helper.
                Util.Fst.Util.TopNSearcher <PairOutputs <long?, BytesRef> .Pair> searcher2;
                searcher2 = new TopNSearcherAnonymousInnerClassHelper(this, fst, num - results.Count,
                                                                      num * maxAnalyzedPathsForOneInput, weightComparer, utf8Key, results);

                prefixPaths = GetFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

                foreach (FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> path in prefixPaths)
                {
                    searcher2.AddStartPaths(path.FstNode, path.Output, true, path.Input);
                }

                var completions2 = searcher2.Search();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(completions2.IsComplete);
                }

                foreach (Util.Fst.Util.Result <PairOutputs <long?, BytesRef> .Pair> completion in completions2)
                {
                    LookupResult result = GetLookupResult(completion.Output.Output1, completion.Output.Output2, spare);

                    // TODO: for fuzzy case would be nice to return
                    // how many edits were required

                    //System.out.println("    result=" + result);
                    results.Add(result);

                    if (results.Count == num)
                    {
                        // In the exactFirst=true case the search may
                        // produce one extra path
                        break;
                    }
                }

                return(results);
            }
            catch (IOException bogus)
            {
                // Rewrap as an unchecked-style failure, keeping the original as InnerException.
                throw new Exception(bogus.ToString(), bogus);
            }
        }
 /// <summary>
 /// Stores every argument in the matching member of this helper; no other
 /// work is performed here.
 /// </summary>
 public SortedDocValuesAnonymousInnerClassHelper(
     FSTEntry fstEntry,
     NumericDocValues numericDocValues,
     FST<long?> fst1,
     FST.BytesReader @in,
     FST.Arc<long?> arc,
     FST.Arc<long?> scratchArc1,
     IntsRef intsRef,
     BytesRefFSTEnum<long?> bytesRefFstEnum)
 {
     this.entry = fstEntry;
     this.docToOrd = numericDocValues;
     this.fst = fst1;
     this.@in = @in;
     this.firstArc = arc;
     this.scratchArc = scratchArc1;
     this.scratchInts = intsRef;
     this.fstEnum = bytesRefFstEnum;
 }
Esempio n. 24
0
        // NOTE: copied from WFSTCompletionLookup & tweaked
        /// <summary>
        /// Walks <paramref name="fst"/> along the bytes of <paramref name="scratch"/>
        /// (from <c>Offset</c> for <c>Length</c> bytes), summing the arc outputs.
        /// </summary>
        /// <param name="fst">FST to traverse.</param>
        /// <param name="bytesReader">Reader passed to each <c>FindTargetArc</c> call.</param>
        /// <param name="scratch">Bytes of the prefix to follow.</param>
        /// <param name="arc">Reusable arc; overwritten as the walk advances.</param>
        /// <returns>
        /// The accumulated output, or <c>null</c> as soon as a byte has no matching arc.
        /// </returns>
        private long? LookupPrefix(FST<long?> fst, FST.BytesReader bytesReader, BytesRef scratch, FST.Arc<long?> arc)
        {
            long? accumulated = fst.Outputs.NoOutput;

            fst.GetFirstArc(arc);

            var buffer = scratch.Bytes;
            var start = scratch.Offset;
            var limit = start + scratch.Length;

            for (var index = start; index < limit; index++)
            {
                // Mask to an unsigned label in case the buffer element type is signed.
                int label = buffer[index] & 0xff;

                // The same arc is used as both source and destination of the step.
                if (fst.FindTargetArc(label, arc, arc, bytesReader) == null)
                {
                    return null;
                }

                accumulated = fst.Outputs.Add(accumulated, arc.Output);
            }

            return accumulated;
        }
        /// <summary>
        /// Returns the sorted-set doc values for <paramref name="field"/>. The
        /// field's FST is loaded on first use and kept in <c>fstInstances</c>.
        /// </summary>
        /// <param name="field">Field whose values are requested; looked up by <c>field.Number</c>.</param>
        /// <returns>
        /// <c>DocValues.EMPTY_SORTED_SET</c> when the entry has no ords, otherwise a
        /// new helper instance wrapping the FST and freshly allocated scratch objects.
        /// </returns>
        public override SortedSetDocValues GetSortedSet(FieldInfo field)
        {
            var fstEntry = fsts[field.Number];
            if (fstEntry.numOrds == 0)
            {
                // empty FST!
                return DocValues.EMPTY_SORTED_SET;
            }

            FST<long?> loadedFst;
            lock (this)
            {
                loadedFst = fstInstances[field.Number];
                if (loadedFst == null)
                {
                    // First request for this field: read the FST from the data input
                    // at the entry's offset, account for its size, and remember it.
                    data.Seek(fstEntry.offset);
                    loadedFst = new FST<long?>(data, PositiveIntOutputs.Singleton);
                    ramBytesUsed.AddAndGet(loadedFst.SizeInBytes());
                    fstInstances[field.Number] = loadedFst;
                }
            }

            BinaryDocValues ordsPerDoc = GetBinary(field);

            // Everything below is allocated per call and handed to the returned helper.
            FST.BytesReader reader = loadedFst.BytesReader;
            FST.Arc<long?> rootArc = new FST.Arc<long?>();
            FST.Arc<long?> workArc = new FST.Arc<long?>();
            IntsRef intsBuffer = new IntsRef();
            BytesRefFSTEnum<long?> enumerator = new BytesRefFSTEnum<long?>(loadedFst);
            BytesRef bytesBuffer = new BytesRef();
            ByteArrayDataInput ordsInput = new ByteArrayDataInput();

            return new SortedSetDocValuesAnonymousInnerClassHelper(
                fstEntry,
                ordsPerDoc,
                loadedFst,
                reader,
                rootArc,
                workArc,
                intsBuffer,
                enumerator,
                bytesBuffer,
                ordsInput);
        }
Esempio n. 26
0
 /// <summary>
 /// Stores every argument in the matching member of this helper; no other
 /// work is performed here.
 /// </summary>
 public SortedSetDocValuesAnonymousClass(
     FSTEntry fstEntry,
     BinaryDocValues binaryDocValues,
     FST<long?> fst1,
     FST.BytesReader @in,
     FST.Arc<long?> arc,
     FST.Arc<long?> scratchArc1,
     Int32sRef intsRef,
     BytesRefFSTEnum<long?> bytesRefFstEnum,
     BytesRef @ref,
     ByteArrayDataInput byteArrayDataInput)
 {
     this.entry = fstEntry;
     this.docToOrds = binaryDocValues;
     this.fst = fst1;
     this.@in = @in;
     this.firstArc = arc;
     this.scratchArc = scratchArc1;
     this.scratchInts = intsRef;
     this.fstEnum = bytesRefFstEnum;
     this.@ref = @ref;
     this.input = byteArrayDataInput;
 }
 /// <summary>
 /// Stores every argument in the matching member of this helper; no other
 /// work is performed here.
 /// </summary>
 public SortedSetDocValuesAnonymousInnerClassHelper(
     FSTEntry fstEntry,
     BinaryDocValues binaryDocValues,
     FST<long?> fst1,
     FST.BytesReader @in,
     FST.Arc<long?> arc,
     FST.Arc<long?> scratchArc1,
     IntsRef intsRef,
     BytesRefFSTEnum<long?> bytesRefFstEnum,
     BytesRef @ref,
     ByteArrayDataInput byteArrayDataInput)
 {
     this.entry = fstEntry;
     this.docToOrds = binaryDocValues;
     this.fst = fst1;
     this.@in = @in;
     this.firstArc = arc;
     this.scratchArc = scratchArc1;
     this.scratchInts = intsRef;
     this.fstEnum = bytesRefFstEnum;
     this.@ref = @ref;
     this.input = byteArrayDataInput;
 }
Esempio n. 28
0
        /// <summary>
        /// Expert: like <see cref="Util.GetByOutput(FST{long?}, long)"/> except reusing
        /// <see cref="FST.BytesReader"/>, initial and scratch Arc, and result.
        /// <para/>
        /// Starting from <paramref name="arc"/>, repeatedly descends into the arc whose
        /// accumulated output is the largest one not exceeding
        /// <paramref name="targetOutput"/>, appending each followed label to
        /// <paramref name="result"/>.
        /// </summary>
        /// <param name="fst">FST to search.</param>
        /// <param name="targetOutput">Summed output to look for.</param>
        /// <param name="in">Reader used for all arc reads; its position is changed.</param>
        /// <param name="arc">Starting arc; its output must be non-null. Overwritten during the walk.</param>
        /// <param name="scratchArc">Work arc used to remember the previous arc in the linear-scan case.</param>
        /// <param name="result">Receives the labels of the matching path; grown as needed.</param>
        /// <returns>
        /// <paramref name="result"/> with <c>Length</c> set to the path length when a final
        /// arc's total output equals <paramref name="targetOutput"/>; otherwise <c>null</c>.
        /// </returns>
        public static Int32sRef GetByOutput(FST <long?> fst, long targetOutput, FST.BytesReader @in, FST.Arc <long?> arc, FST.Arc <long?> scratchArc, Int32sRef result)
        {
            // Running sum of outputs along the path taken so far.
            long output = arc.Output.Value;
            // Number of labels written into result so far.
            int  upto   = 0;

            //System.out.println("reverseLookup output=" + targetOutput);

            while (true)
            {
                //System.out.println("loop: output=" + output + " upto=" + upto + " arc=" + arc);
                if (arc.IsFinal)
                {
                    long finalOutput = output + arc.NextFinalOutput.Value;
                    //System.out.println("  isFinal finalOutput=" + finalOutput);
                    if (finalOutput == targetOutput)
                    {
                        result.Length = upto;
                        //System.out.println("    found!");
                        return(result);
                    }
                    else if (finalOutput > targetOutput)
                    {
                        // NOTE(review): giving up here appears to rely on outputs never
                        // decreasing further along the path -- confirm.
                        //System.out.println("    not found!");
                        return(null);
                    }
                }

                if (FST <long?> .TargetHasArcs(arc))
                {
                    //System.out.println("  targetHasArcs");
                    // Make sure there is room for the label appended in this iteration.
                    if (result.Int32s.Length == upto)
                    {
                        result.Grow(1 + upto);
                    }

                    fst.ReadFirstRealTargetArc(arc.Target, arc, @in);

                    if (arc.BytesPerArc != 0)
                    {
                        // Non-zero BytesPerArc: arcs are addressed by index
                        // (PosArcsStart + BytesPerArc * idx), so binary-search them by output.
                        int low  = 0;
                        int high = arc.NumArcs - 1;
                        int mid  = 0;
                        //System.out.println("bsearch: numArcs=" + arc.numArcs + " target=" + targetOutput + " output=" + output);
                        bool exact = false;
                        while (low <= high)
                        {
                            mid          = (int)((uint)(low + high) >> 1);
                            @in.Position = arc.PosArcsStart;
                            @in.SkipBytes(arc.BytesPerArc * mid);
                            // Peek at arc #mid: flags byte, then label (read only to
                            // advance the reader), then the optional output.
                            var flags = (sbyte)@in.ReadByte();
                            fst.ReadLabel(@in);
                            long minArcOutput;
                            if ((flags & FST.BIT_ARC_HAS_OUTPUT) != 0)
                            {
                                long arcOutput = fst.Outputs.Read(@in).Value;
                                minArcOutput = output + arcOutput;
                            }
                            else
                            {
                                minArcOutput = output;
                            }
                            if (minArcOutput == targetOutput)
                            {
                                exact = true;
                                break;
                            }
                            else if (minArcOutput < targetOutput)
                            {
                                low = mid + 1;
                            }
                            else
                            {
                                high = mid - 1;
                            }
                        }

                        if (high == -1)
                        {
                            // Even the first arc's output is above the target.
                            return(null);
                        }
                        else if (exact)
                        {
                            // Position one before mid; presumably ReadNextRealArc below
                            // advances ArcIdx by one and so loads arc #mid -- TODO confirm.
                            arc.ArcIdx = mid - 1;
                        }
                        else
                        {
                            // No exact hit: low is the first arc above the target, so
                            // low - 1 is the floor arc; position one before it.
                            arc.ArcIdx = low - 2;
                        }

                        fst.ReadNextRealArc(arc, @in);
                        result.Int32s[upto++] = arc.Label;
                        output += arc.Output.Value;
                    }
                    else
                    {
                        // Arcs must be scanned one by one; remember the previous arc so
                        // we can step back once we overshoot the target.
                        FST.Arc <long?> prevArc = null;

                        while (true)
                        {
                            //System.out.println("    cycle label=" + arc.label + " output=" + arc.output);

                            // this is the min output we'd hit if we follow
                            // this arc:
                            long minArcOutput = output + arc.Output.Value;

                            if (minArcOutput == targetOutput)
                            {
                                // Recurse on this arc:
                                //System.out.println("  match!  break");
                                output = minArcOutput;
                                result.Int32s[upto++] = arc.Label;
                                break;
                            }
                            else if (minArcOutput > targetOutput)
                            {
                                if (prevArc == null)
                                {
                                    // Output doesn't exist
                                    return(null);
                                }
                                else
                                {
                                    // Recurse on previous arc:
                                    arc.CopyFrom(prevArc);
                                    result.Int32s[upto++] = arc.Label;
                                    output += arc.Output.Value;
                                    //System.out.println("    recurse prev label=" + (char) arc.label + " output=" + output);
                                    break;
                                }
                            }
                            else if (arc.IsLast)
                            {
                                // Recurse on this arc:
                                output = minArcOutput;
                                //System.out.println("    recurse last label=" + (char) arc.label + " output=" + output);
                                result.Int32s[upto++] = arc.Label;
                                break;
                            }
                            else
                            {
                                // Read next arc in this node:
                                // (prevArc aliases the caller-supplied scratchArc; no allocation.)
                                prevArc = scratchArc;
                                prevArc.CopyFrom(arc);
                                //System.out.println("      after copy label=" + (char) prevArc.label + " vs " + (char) arc.label);
                                fst.ReadNextRealArc(arc, @in);
                            }
                        }
                    }
                }
                else
                {
                    //System.out.println("  no target arcs; not found!");
                    return(null);
                }
            }
        }
Esempio n. 29
0
        /// <summary>
        /// Depth-first collection of completions from the automaton subgraph that
        /// begins at <paramref name="arc"/>. The arc's label is appended to
        /// <paramref name="output"/> before its target's arcs are visited.
        /// </summary>
        /// <param name="res">List receiving a <c>Completion</c> for each end-label arc reached.</param>
        /// <param name="num">
        ///          Maximum number of results needed (early termination). </param>
        /// <param name="bucket">Bucket value passed to every created <c>Completion</c>.</param>
        /// <param name="output">Shared byte buffer holding the path so far; its offset must be 0.</param>
        /// <param name="arc">Arc to descend from; overwritten while iterating its target's arcs.</param>
        /// <returns><c>true</c> once <paramref name="res"/> holds at least <paramref name="num"/> entries.</returns>
        private bool Collect(IList<Completion> res, int num, int bucket, BytesRef output, FST.Arc<object> arc)
        {
            // Ensure there is room for the one byte appended below.
            if (output.Bytes.Length == output.Length)
            {
                output.Bytes = ArrayUtil.Grow(output.Bytes);
            }
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(output.Offset == 0);
            }
            output.Bytes[output.Length++] = (byte)arc.Label;

            FST.BytesReader reader = automaton.GetBytesReader();
            automaton.ReadFirstTargetArc(arc, arc, reader);

            bool moreArcs = true;
            while (moreArcs)
            {
                if (arc.Label != Lucene.Net.Util.Fst.FST.END_LABEL)
                {
                    // Descend on a copy so this level's arc stays intact, then
                    // truncate the buffer back to where it was before the descent.
                    int lengthBeforeDescent = output.Length;
                    FST.Arc<object> child = new FST.Arc<object>().CopyFrom(arc);
                    if (Collect(res, num, bucket, output, child))
                    {
                        return true;
                    }
                    output.Length = lengthBeforeDescent;
                }
                else
                {
                    res.Add(new Completion(output, bucket));
                    if (res.Count >= num)
                    {
                        return true;
                    }
                }

                moreArcs = !arc.IsLast;
                if (moreArcs)
                {
                    automaton.ReadNextArc(arc, reader);
                }
            }

            return false;
        }
Esempio n. 30
0
        /// <summary>
        /// Dumps an <see cref="FST{T}"/> to a GraphViz's <c>dot</c> language description
        /// for visualization. Example of use:
        ///
        /// <code>
        /// using (TextWriter sw = new StreamWriter(&quot;out.dot&quot;))
        /// {
        ///     Util.ToDot(fst, sw, true, true);
        /// }
        /// </code>
        ///
        /// and then, from command line:
        ///
        /// <code>
        /// dot -Tpng -o out.png out.dot
        /// </code>
        ///
        /// <para/>
        /// Note: larger FSTs (a few thousand nodes) won't even
        /// render, don't bother.  If the FST is &gt; 2.1 GB in size
        /// then this method will throw strange exceptions.
        /// <para/>
        /// All node addresses and counters are written using the invariant culture, so the
        /// generated text does not depend on the current thread culture.
        /// <para/>
        /// See also <a href="http://www.graphviz.org/">http://www.graphviz.org/</a>.
        /// </summary>
        /// <typeparam name="T">Output type of the FST.</typeparam>
        /// <param name="fst">The FST to dump.</param>
        /// <param name="out">
        ///          Writer receiving the <c>dot</c> text. It is flushed before this method
        ///          returns, but it is not closed.
        /// </param>
        /// <param name="sameRank">
        ///          If <c>true</c>, the resulting <c>dot</c> file will try
        ///          to order states in layers of breadth-first traversal. This may
        ///          mess up arcs, but makes the output FST's structure a bit clearer.
        /// </param>
        /// <param name="labelStates">
        ///          If <c>true</c> states will have labels equal to their offsets in their
        ///          binary format. Expands the graph considerably.
        /// </param>
        public static void ToDot<T>(FST<T> fst, TextWriter @out, bool sameRank, bool labelStates)
        {
            const string expandedNodeColor = "blue";

            // Node ids are machine-readable dot identifiers. Formatting them with the current
            // culture could emit a non-ASCII negative sign for the final end node (-1), which
            // would then no longer match the literal "-1" node written at the end of the graph.
            var invariant = global::System.Globalization.CultureInfo.InvariantCulture;

            // This is the start arc in the automaton (from the epsilon state to the first state
            // with outgoing transitions).
            FST.Arc<T> startArc = fst.GetFirstArc(new FST.Arc<T>());

            // A queue of transitions to consider for the level currently being processed.
            IList<FST.Arc<T>> thisLevelQueue = new List<FST.Arc<T>>();

            // A queue of transitions to consider when processing the next level.
            IList<FST.Arc<T>> nextLevelQueue = new List<FST.Arc<T>>();

            nextLevelQueue.Add(startArc);

            // A list of states on the same level (for ranking).
            IList<int> sameLevelStates = new List<int>();

            // A bitset of already seen states (target offset).
            BitArray seen = new BitArray(32);

            seen.SafeSet((int)startArc.Target, true);

            // Shape for states.
            const string stateShape = "circle";
            const string finalStateShape = "doublecircle";

            // Emit DOT prologue.
            @out.Write("digraph FST {\n");
            @out.Write("  rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");

            if (!labelStates)
            {
                @out.Write("  node [shape=circle, width=.2, height=.2, style=filled]\n");
            }

            EmitDotState(@out, "initial", "point", "white", "");

            T NO_OUTPUT = fst.Outputs.NoOutput;
            var r = fst.GetBytesReader();

            // Emit the state the start arc points to.
            {
                string stateColor = fst.IsExpandedTarget(startArc, r) ? expandedNodeColor : null;

                bool isFinal;
                T finalOutput;
                if (startArc.IsFinal)
                {
                    isFinal = true;
                    finalOutput = startArc.NextFinalOutput.Equals(NO_OUTPUT) ? default(T) : startArc.NextFinalOutput;
                }
                else
                {
                    isFinal = false;
                    finalOutput = default(T);
                }

                EmitDotState(@out, startArc.Target.ToString(invariant), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.Outputs.OutputToString(finalOutput));
            }

            @out.Write("  initial -> " + startArc.Target.ToString(invariant) + "\n");

            int level = 0;

            while (nextLevelQueue.Count > 0)
            {
                // We could double buffer here, but it doesn't matter probably.
                thisLevelQueue.AddRange(nextLevelQueue);
                nextLevelQueue.Clear();

                level++;
                @out.Write("\n  // Transitions and states at level: " + level.ToString(invariant) + "\n");
                while (thisLevelQueue.Count > 0)
                {
                    // Pop from the end of the list.
                    FST.Arc<T> arc = thisLevelQueue[thisLevelQueue.Count - 1];
                    thisLevelQueue.RemoveAt(thisLevelQueue.Count - 1);

                    if (FST<T>.TargetHasArcs(arc))
                    {
                        // Scan all target arcs. The source node must be remembered first because
                        // 'arc' is reused in place for every outgoing arc of that node.
                        long node = arc.Target;

                        fst.ReadFirstRealTargetArc(arc.Target, arc, r);

                        while (true)
                        {
                            // Emit the unseen state and add it to the queue for the next level.
                            if (arc.Target >= 0 && !seen.SafeGet((int)arc.Target))
                            {
                                string stateColor = fst.IsExpandedTarget(arc, r) ? expandedNodeColor : null;

                                string finalOutput;
                                if (arc.NextFinalOutput != null && !arc.NextFinalOutput.Equals(NO_OUTPUT))
                                {
                                    finalOutput = fst.Outputs.OutputToString(arc.NextFinalOutput);
                                }
                                else
                                {
                                    finalOutput = "";
                                }

                                EmitDotState(@out, arc.Target.ToString(invariant), stateShape, stateColor, finalOutput);
                                seen.SafeSet((int)arc.Target, true);
                                // Queue a copy: 'arc' itself keeps being overwritten by this scan.
                                nextLevelQueue.Add((new FST.Arc<T>()).CopyFrom(arc));
                                sameLevelStates.Add((int)arc.Target);
                            }

                            string outs;
                            if (!arc.Output.Equals(NO_OUTPUT))
                            {
                                outs = "/" + fst.Outputs.OutputToString(arc.Output);
                            }
                            else
                            {
                                outs = "";
                            }

                            if (!FST<T>.TargetHasArcs(arc) && arc.IsFinal && !arc.NextFinalOutput.Equals(NO_OUTPUT))
                            {
                                // Tricky special case: sometimes, due to
                                // pruning, the builder can [sillily] produce
                                // an FST with an arc into the final end state
                                // (-1) but also with a next final output; in
                                // this case we pull that output up onto this
                                // arc
                                outs = outs + "/[" + fst.Outputs.OutputToString(arc.NextFinalOutput) + "]";
                            }

                            string arcColor = arc.Flag(FST.BIT_TARGET_NEXT) ? "red" : "black";

                            Debug.Assert(arc.Label != FST.END_LABEL);
                            @out.Write("  " + node.ToString(invariant) + " -> " + arc.Target.ToString(invariant) + " [label=\"" + PrintableLabel(arc.Label) + outs + "\"" + (arc.IsFinal ? " style=\"bold\"" : "") + " color=\"" + arcColor + "\"]\n");

                            // Break the loop if we're on the last arc of this state.
                            if (arc.IsLast)
                            {
                                break;
                            }
                            fst.ReadNextRealArc(arc, r);
                        }
                    }
                }

                // Emit state ranking information.
                if (sameRank && sameLevelStates.Count > 1)
                {
                    @out.Write("  {rank=same; ");
                    foreach (int state in sameLevelStates)
                    {
                        @out.Write(state.ToString(invariant) + "; ");
                    }
                    @out.Write(" }\n");
                }
                sameLevelStates.Clear();
            }

            // Emit terminating state (always there anyway).
            @out.Write("  -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n");
            @out.Write("  {rank=sink; -1 }\n");

            @out.Write("}\n");
            @out.Flush();
        }
Esempio n. 31
0
 /// <summary>
 /// Creates a frame whose <c>fsaState</c> is <c>-1</c> and whose <c>fstArc</c>
 /// is a newly allocated arc.
 /// </summary>
 internal Frame()
 {
     fsaState = -1;
     fstArc = new FST.Arc<FSTTermOutputs.TermData>();
 }
Esempio n. 32
0
        // TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND /
        // SEEK_END)?  saves the eq check above?

        /// <summary>
        /// Positions the enumerator on the smallest term that is &gt;= the target.
        /// </summary>
        /// <remarks>
        /// Matching resumes after the prefix shared with the current term, then
        /// follows the target label by label. Nodes whose arcs are stored as a
        /// fixed-size array are binary searched; all other nodes are scanned
        /// linearly. When the target sorts after every arc of a node, the walk
        /// rolls back to the closest frame that still has an unvisited sibling
        /// arc and pushes the first path below it.
        /// </remarks>
        protected virtual void DoSeekCeil()
        {
            // TODO: possibly the caller could/should provide the common prefix
            // length? i.e. this work may be redundant if the caller is in fact
            // intersecting against its own automaton.

            // Save time by starting at the end of the prefix shared between
            // the current term and the target.
            RewindPrefix();

            FST.Arc<T> current = GetArc(m_upto);
            int wanted = TargetLabel;

            // Scan forward, matching the new suffix of the target.
            for (;;)
            {
                if (current.BytesPerArc == 0 || current.Label == -1)
                {
                    // Arcs are not array'd -- linear scan over the siblings.
                    if (current.Label == wanted)
                    {
                        // Exact match on this label: accumulate the output and descend.
                        m_output[m_upto] = m_fst.Outputs.Add(m_output[m_upto - 1], current.Output);
                        if (wanted == FST.END_LABEL)
                        {
                            return;
                        }
                        CurrentLabel = current.Label;
                        Incr();
                        current = m_fst.ReadFirstTargetArc(current, GetArc(m_upto), m_fstReader);
                        wanted = TargetLabel;
                        continue;
                    }

                    if (current.Label > wanted)
                    {
                        // First arc past the target: the ceiling lies below it.
                        PushFirst();
                        return;
                    }

                    if (!current.IsLast)
                    {
                        // Still smaller than the target; try the next sibling.
                        m_fst.ReadNextArc(current, m_fstReader);
                        continue;
                    }

                    // Dead end (target is after the last arc): roll back to the
                    // last fork, then push.
                    for (m_upto--; m_upto != 0; m_upto--)
                    {
                        FST.Arc<T> earlier = GetArc(m_upto);
                        if (!earlier.IsLast)
                        {
                            m_fst.ReadNextArc(earlier, m_fstReader);
                            PushFirst();
                            return;
                        }
                    }
                    return;
                }

                // Arcs are a fixed array -- binary search for the target label.
                FST.BytesReader reader = m_fst.GetBytesReader();
                int lo = current.ArcIdx;
                int hi = current.NumArcs - 1;
                int probe = 0;
                bool hit = false;
                while (!hit && lo <= hi)
                {
                    probe = (lo + hi).TripleShift(1);
                    // Jump to entry 'probe' of the arc array and read its label
                    // (the extra byte skipped is presumably the arc's flags -- TODO confirm).
                    reader.Position = current.PosArcsStart;
                    reader.SkipBytes(current.BytesPerArc * probe + 1);
                    int probeLabel = m_fst.ReadLabel(reader);
                    int delta = probeLabel - wanted;
                    if (delta == 0)
                    {
                        hit = true;
                    }
                    else if (delta < 0)
                    {
                        lo = probe + 1;
                    }
                    else
                    {
                        hi = probe - 1;
                    }
                }

                // NOTE: the handling below mirrors the linear-scan branch above.
                if (hit)
                {
                    // Match: step back one slot so that reading the "next" arc loads entry 'probe'.
                    current.ArcIdx = probe - 1;
                    m_fst.ReadNextRealArc(current, reader);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(current.ArcIdx == probe);
                        Debugging.Assert(current.Label == wanted, "arc.label={0} vs targetLabel={1} mid={2}", current.Label, wanted, probe);
                    }
                    m_output[m_upto] = m_fst.Outputs.Add(m_output[m_upto - 1], current.Output);
                    if (wanted == FST.END_LABEL)
                    {
                        return;
                    }
                    CurrentLabel = current.Label;
                    Incr();
                    current = m_fst.ReadFirstTargetArc(current, GetArc(m_upto), m_fstReader);
                    wanted = TargetLabel;
                    continue;
                }

                if (lo != current.NumArcs)
                {
                    // No exact match, but a larger label exists in this node:
                    // load it and take the first path beneath it.
                    current.ArcIdx = (hi < lo ? lo : hi) - 1;
                    m_fst.ReadNextRealArc(current, reader);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(current.Label > wanted);
                    }
                    PushFirst();
                    return;
                }

                // Dead end: load the last arc of the array, then roll back.
                current.ArcIdx = current.NumArcs - 2;
                m_fst.ReadNextRealArc(current, reader);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(current.IsLast);
                }

                // Dead end (target is after the last arc): roll back to the
                // last fork, then push.
                for (m_upto--; m_upto != 0; m_upto--)
                {
                    FST.Arc<T> earlier = GetArc(m_upto);
                    if (!earlier.IsLast)
                    {
                        m_fst.ReadNextArc(earlier, m_fstReader);
                        PushFirst();
                        return;
                    }
                }
                return;
            }
        }
Esempio n. 33
0
        /// <summary>
        /// Returns the next character of the mapped stream: a pending replacement
        /// character if one is queued, otherwise the result of attempting the longest
        /// mapping match at the current input offset, otherwise the unmodified
        /// buffered character (which is <c>-1</c> when the buffer reports no more input).
        /// </summary>
        public override int Read()
        {
            for (;;)
            {
                // Hand out any replacement text still queued from the previous match.
                if (replacement != null && replacementPointer < replacement.Length)
                {
                    return replacement.chars[replacement.offset + replacementPointer++];
                }

                // TODO: a more efficient approach would be Aho/Corasick's
                // algorithm
                // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
                // or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
                //
                // I think this would be (almost?) equivalent to 1) adding
                // epsilon arcs from all final nodes back to the init
                // node in the FST, 2) adding a .* (skip any char)
                // loop on the initial node, and 3) determinizing
                // that.  Then we would not have to restart matching
                // at each position.

                int bestLength = -1;
                CharsRef best = null;

                int head = buffer.Get(inputOff);
                if (head != -1)
                {
                    // NOTE(review): relies on the lookup yielding null for an unmapped
                    // character -- confirm against the declared type of cachedRootArcs.
                    FST.Arc<CharsRef> arc = cachedRootArcs[(char)head];
                    if (arc != null)
                    {
                        if (FST.TargetHasArcs(arc))
                        {
                            CharsRef accumulated = arc.Output;
                            for (int lookahead = 1; ; lookahead++)
                            {
                                if (arc.Final)
                                {
                                    // Match (the node reached is final). Greedy: remember it
                                    // and keep going in case a longer match exists.
                                    bestLength = lookahead;
                                    best = outputs.Add(accumulated, arc.NextFinalOutput);
                                }

                                if (!FST.TargetHasArcs(arc))
                                {
                                    break;
                                }

                                int next = buffer.Get(inputOff + lookahead);
                                if (next == -1)
                                {
                                    break;
                                }

                                arc = map.FindTargetArc(next, arc, scratchArc, fstReader);
                                if (arc == null)
                                {
                                    // Dead end
                                    break;
                                }
                                accumulated = outputs.Add(accumulated, arc.Output);
                            }
                        }
                        else
                        {
                            // Fast path for a single character match:
                            Debug.Assert(arc.Final);
                            bestLength = 1;
                            best = arc.Output;
                        }
                    }
                }

                if (best == null)
                {
                    // Nothing matched here: pass the buffered character through unchanged.
                    int passThrough = buffer.Get(inputOff);
                    if (passThrough != -1)
                    {
                        inputOff++;
                        buffer.FreeBefore(inputOff);
                    }
                    return passThrough;
                }

                inputOff += bestLength;
                int diff = bestLength - best.Length;

                if (diff != 0)
                {
                    int priorDiff = LastCumulativeDiff;
                    if (diff < 0)
                    {
                        // Replacement is longer than matched input: remap
                        // the "extra" chars all back to the same input
                        // offset:
                        int outputStart = inputOff - priorDiff;
                        for (int extra = 0; extra < -diff; extra++)
                        {
                            AddOffCorrectMap(outputStart + extra, priorDiff - extra - 1);
                        }
                    }
                    else
                    {
                        // Replacement is shorter than matched input:
                        AddOffCorrectMap(inputOff - diff - priorDiff, priorDiff + diff);
                    }
                }

                // Queue the replacement; the next loop iteration starts handing it out.
                replacement = best;
                replacementPointer = 0;
            }
        }
Esempio n. 34
0
        /// <summary>
        /// Enumerates all minimal prefix paths in the automaton that also intersect the <see cref="FST"/>,
        /// accumulating the <see cref="FST"/> end node and output for each path.
        /// </summary>
        /// <typeparam name="T">Output type of the FST.</typeparam>
        /// <param name="a">Automaton to intersect with; asserted to be deterministic.</param>
        /// <param name="fst">FST that is walked in lock-step with the automaton.</param>
        /// <returns>
        /// The paths whose automaton state is accepting. Exploration of a path stops at
        /// its first accepting state, so each returned path is a minimal accepted prefix.
        /// The list is empty when nothing intersects.
        /// </returns>
        public static IList<Path<T>> IntersectPrefixPaths<T>(Automaton a, FST<T> fst)
        {
            Debug.Assert(a.IsDeterministic);
            IList<Path<T>> queue = new List<Path<T>>();
            List<Path<T>> endNodes = new List<Path<T>>();

            queue.Add(new Path<T>(a.GetInitialState(), fst.GetFirstArc(new FST.Arc<T>()), fst.Outputs.NoOutput, new Int32sRef()));

            FST.Arc<T> scratchArc = new FST.Arc<T>();
            FST.BytesReader fstReader = fst.GetBytesReader();

            while (queue.Count != 0)
            {
                // Pop the most recently added path by index. Removing by value would scan the
                // whole list on every pop and would remove the first element that compares
                // equal, which need not be the one just read.
                int last = queue.Count - 1;
                Path<T> path = queue[last];
                queue.RemoveAt(last);

                if (path.State.Accept)
                {
                    endNodes.Add(path);
                    // we can stop here if we accept this path,
                    // we accept all further paths too
                    continue;
                }

                Int32sRef currentInput = path.Input;
                foreach (Transition t in path.State.GetTransitions())
                {
                    int min = t.Min;
                    int max = t.Max;
                    if (min == max)
                    {
                        // Single-label transition: a direct arc lookup is enough.
                        FST.Arc<T> nextArc = fst.FindTargetArc(t.Min, path.FstNode, scratchArc, fstReader);
                        if (nextArc != null)
                        {
                            Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                            newInput.CopyInt32s(currentInput);
                            newInput.Int32s[currentInput.Length] = t.Min;
                            newInput.Length = currentInput.Length + 1;
                            // scratchArc is reused, so the queued path gets its own copy.
                            queue.Add(new Path<T>(t.Dest, new FST.Arc<T>()
                                                  .CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                        }
                    }
                    else
                    {
                        // TODO: if this transition's TO state is accepting, and
                        // it accepts the entire range possible in the FST (ie. 0 to 255),
                        // we can simply use the prefix as the accepted state instead of
                        // looking up all the ranges and terminate early
                        // here.  This just shifts the work from one queue
                        // (this one) to another (the completion search
                        // done in AnalyzingSuggester).

                        // Range transition: start at the first arc with label >= min and
                        // walk the sibling arcs while their labels stay within [min, max].
                        FST.Arc<T> nextArc = Lucene.Net.Util.Fst.Util.ReadCeilArc(min, fst, path.FstNode, scratchArc, fstReader);
                        while (nextArc != null && nextArc.Label <= max)
                        {
                            Debug.Assert(nextArc.Label <= max);
                            Debug.Assert(nextArc.Label >= min, nextArc.Label + " " + min);
                            Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                            newInput.CopyInt32s(currentInput);
                            newInput.Int32s[currentInput.Length] = nextArc.Label;
                            newInput.Length = currentInput.Length + 1;
                            queue.Add(new Path<T>(t.Dest, new FST.Arc<T>()
                                                  .CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                            int label = nextArc.Label; // used in assert
                            nextArc = nextArc.IsLast ? null : fst.ReadNextRealArc(nextArc, fstReader);
                            Debug.Assert(nextArc == null || label < nextArc.Label, "last: " + label + " next: " + (nextArc == null ? "" : nextArc.Label.ToString()));
                        }
                    }
                }
            }
            return endNodes;
        }
Esempio n. 35
0
        /// <summary>
        /// Generates a list of stems for the provided word
        /// </summary>
        /// <param name="word"> Word to generate the stems for </param>
        /// <param name="length"> length </param>
        /// <param name="previous"> previous affix that was removed (so we dont remove same one twice) </param>
        /// <param name="prevFlag"> Flag from a previous stemming step that need to be cross-checked with any affixes in this recursive step </param>
        /// <param name="prefixFlag"> flag of the most inner removed prefix, so that when removing a suffix, its also checked against the word </param>
        /// <param name="recursionDepth"> current recursiondepth </param>
        /// <param name="doPrefix"> true if we should remove prefixes </param>
        /// <param name="doSuffix"> true if we should remove suffixes </param>
        /// <param name="previousWasPrefix"> true if the previous removal was a prefix:
        ///        if we are removing a suffix, and it has no continuation requirements, its ok.
        ///        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. </param>
        /// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix
        ///        this means inner most suffix must also contain circumfix flag. </param>
        /// <param name="caseVariant"> true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed. </param>
        /// <returns> <see cref="IList{CharsRef}"/> of stems, or empty list if no stems are found </returns>
        private IList <CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix, bool caseVariant)
        {
            // TODO: allow this stuff to be reused by tokenfilter
            JCG.List <CharsRef> stems = new JCG.List <CharsRef>();

            if (doPrefix && dictionary.prefixes != null)
            {
                FST <Int32sRef>     fst         = dictionary.prefixes;
                Outputs <Int32sRef> outputs     = fst.Outputs;
                FST.BytesReader     bytesReader = prefixReaders[recursionDepth];
                FST.Arc <Int32sRef> arc         = prefixArcs[recursionDepth];
                fst.GetFirstArc(arc);
                Int32sRef NO_OUTPUT = outputs.NoOutput;
                Int32sRef output    = NO_OUTPUT;
                int       limit     = dictionary.fullStrip ? length : length - 1;
                for (int i = 0; i < limit; i++)
                {
                    if (i > 0)
                    {
                        int ch = word[i - 1];
                        if (fst.FindTargetArc(ch, arc, arc, bytesReader) is null)
                        {
                            break;
                        }
                        else if (arc.Output != NO_OUTPUT)
                        {
                            output = fst.Outputs.Add(output, arc.Output);
                        }
                    }
                    Int32sRef prefixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment
                    if (!arc.IsFinal)
                    {
                        continue;
                    }
                    else
                    {
                        prefixes = fst.Outputs.Add(output, arc.NextFinalOutput);
                    }

                    for (int j = 0; j < prefixes.Length; j++)
                    {
                        int prefix = prefixes.Int32s[prefixes.Offset + j];
                        if (prefix == previous)
                        {
                            continue;
                        }
                        affixReader.Position = 8 * prefix;
                        char flag         = (char)(affixReader.ReadInt16() & 0xffff);
                        char stripOrd     = (char)(affixReader.ReadInt16() & 0xffff);
                        int  condition    = (char)(affixReader.ReadInt16() & 0xffff);
                        bool crossProduct = (condition & 1) == 1;
                        condition = condition.TripleShift(1);
                        char append = (char)(affixReader.ReadInt16() & 0xffff);

                        bool compatible;
                        if (recursionDepth == 0)
                        {
                            if (dictionary.onlyincompound == -1)
                            {
                                compatible = true;
                            }
                            else
                            {
                                // check if affix is allowed in a non-compound word
                                dictionary.flagLookup.Get(append, scratch);
                                char[] appendFlags = Dictionary.DecodeFlags(scratch);
                                compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                            }
                        }
                        else if (crossProduct)
                        {
                            // cross check incoming continuation class (flag of previous affix) against list.
                            dictionary.flagLookup.Get(append, scratch);
                            char[] appendFlags = Dictionary.DecodeFlags(scratch);
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(prevFlag >= 0);
                            }
                            bool allowed = dictionary.onlyincompound == -1 ||
                                           !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                            compatible = allowed && HasCrossCheckedFlag((char)prevFlag, appendFlags, false);
                        }
                        else
                        {
                            compatible = false;
                        }

                        if (compatible)
                        {
                            int deAffixedStart  = i;
                            int deAffixedLength = length - deAffixedStart;

                            int stripStart  = dictionary.stripOffsets[stripOrd];
                            int stripEnd    = dictionary.stripOffsets[stripOrd + 1];
                            int stripLength = stripEnd - stripStart;

                            if (!CheckCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength))
                            {
                                continue;
                            }

                            char[] strippedWord = new char[stripLength + deAffixedLength];
                            Array.Copy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                            Array.Copy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

                            IList <CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, prefix, -1, recursionDepth, true, circumfix, caseVariant);

                            stems.AddRange(stemList);
                        }
                    }
                }
            }

            // Suffix pass: walk the suffix FST over the characters of the word from its end
            // towards its start and, for every matching suffix rule that is compatible with
            // the current stemming state, strip it and hand the remainder to ApplyAffix.
            if (doSuffix && dictionary.suffixes != null)
            {
                FST<Int32sRef> fst = dictionary.suffixes;
                Outputs<Int32sRef> outputs = fst.Outputs;
                FST.BytesReader bytesReader = suffixReaders[recursionDepth];
                FST.Arc<Int32sRef> arc = suffixArcs[recursionDepth];
                fst.GetFirstArc(arc);
                Int32sRef NO_OUTPUT = outputs.NoOutput;
                Int32sRef output = NO_OUTPUT;
                // With fullStrip the loop runs down to i == 0, i.e. the de-affixed part may be empty;
                // otherwise at least one character of the word is always kept.
                int limit = dictionary.fullStrip ? 0 : 1;
                for (int i = length; i >= limit; i--)
                {
                    // i == length is the empty suffix: nothing is consumed from the word yet.
                    if (i < length)
                    {
                        int ch = word[i];
                        if (fst.FindTargetArc(ch, arc, arc, bytesReader) is null)
                        {
                            // No suffix in the FST continues with this character.
                            break;
                        }
                        else if (arc.Output != NO_OUTPUT)
                        {
                            output = fst.Outputs.Add(output, arc.Output);
                        }
                    }

                    if (!arc.IsFinal)
                    {
                        continue;
                    }
                    Int32sRef suffixes = fst.Outputs.Add(output, arc.NextFinalOutput);

                    for (int j = 0; j < suffixes.Length; j++)
                    {
                        int suffix = suffixes.Int32s[suffixes.Offset + j];
                        if (suffix == previous)
                        {
                            // Skip the affix identified by 'previous'.
                            continue;
                        }

                        // Each affix record occupies 8 bytes: four consecutive 16-bit values.
                        affixReader.Position = 8 * suffix;
                        char flag = (char)(affixReader.ReadInt16() & 0xffff); // read to reach the following fields; not used below
                        char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
                        int condition = (char)(affixReader.ReadInt16() & 0xffff);
                        // The lowest bit carries the cross-product marker, the remaining bits the condition.
                        bool crossProduct = (condition & 1) == 1;
                        condition = condition.TripleShift(1);
                        char append = (char)(affixReader.ReadInt16() & 0xffff);

                        bool compatible;
                        if (recursionDepth == 0)
                        {
                            if (dictionary.onlyincompound == -1)
                            {
                                compatible = true;
                            }
                            else
                            {
                                // check if affix is allowed in a non-compound word
                                dictionary.flagLookup.Get(append, scratch);
                                char[] appendFlags = Dictionary.DecodeFlags(scratch);
                                compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                            }
                        }
                        else if (crossProduct)
                        {
                            // cross check incoming continuation class (flag of previous affix) against list.
                            dictionary.flagLookup.Get(append, scratch);
                            char[] appendFlags = Dictionary.DecodeFlags(scratch);
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(prevFlag >= 0);
                            }
                            bool allowed = dictionary.onlyincompound == -1 ||
                                           !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                            // Both conditions must hold, exactly as in the prefix pass: the affix must not be
                            // restricted to compounds AND its continuation flags must accept the previous affix.
                            compatible = allowed && HasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
                        }
                        else
                        {
                            compatible = false;
                        }

                        if (compatible)
                        {
                            int appendLength = length - i;
                            int deAffixedLength = length - appendLength;

                            int stripStart = dictionary.stripOffsets[stripOrd];
                            int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                            int stripLength = stripEnd - stripStart;

                            if (!CheckCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength))
                            {
                                continue;
                            }

                            // Candidate stem = word without the suffix, followed by the rule's strip characters.
                            char[] strippedWord = new char[stripLength + deAffixedLength];
                            Array.Copy(word, 0, strippedWord, 0, deAffixedLength);
                            Array.Copy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);

                            IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);

                            stems.AddRange(stemList);
                        }
                    }
                }
            }

            return(stems);
        }
Esempio n. 36
0
 /// <summary>
 /// Stores every supplied argument in the instance field of the same name;
 /// no other work is performed here.
 /// </summary>
 public SortedDocValuesAnonymousInnerClassHelper(FSTEntry entry, NumericDocValues docToOrd, FST <long?> fst, FST.BytesReader @in, FST.Arc <long?> firstArc, FST.Arc <long?> scratchArc, Int32sRef scratchInts, BytesRefFSTEnum <long?> fstEnum)
 {
     // FST and the objects used to read it.
     this.fst = fst;
     this.@in = @in;
     this.fstEnum = fstEnum;

     // Reusable arcs and scratch buffer.
     this.firstArc = firstArc;
     this.scratchArc = scratchArc;
     this.scratchInts = scratchInts;

     // Entry and per-document ordinals.
     this.entry = entry;
     this.docToOrd = docToOrd;
 }
                // TODO: in some cases we can filter by length?  eg
                // regexp foo*bar must be at least length 6 bytes
                /// <summary>
                /// Initializes the enumerator: clones the terms input, allocates the frame stack and
                /// arc array, loads the root block into the first frame and, when
                /// <paramref name="startTerm"/> is non-null, seeks to it.
                /// </summary>
                /// <param name="outerInstance">Field reader supplying the index, root block pointer and root code.</param>
                /// <param name="compiled">Compiled automaton; its <c>RunAutomaton</c> provides the initial state.</param>
                /// <param name="startTerm">Term to seek to after initialization, or <c>null</c> to skip the seek.</param>
                public IntersectEnum(BlockTreeTermsReader.FieldReader outerInstance, CompiledAutomaton compiled, BytesRef startTerm)
                {
                    this.OuterInstance = outerInstance;
                    // if (DEBUG) {
                    //   System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef));
                    // }
                    runAutomaton = compiled.RunAutomaton;
                    CompiledAutomaton = compiled;
                    // Work on a private clone of the reader's input.
                    // NOTE(review): this expression was restored from a garbled "[email protected]" token
                    // that was not valid C# - confirm the member path against the enclosing reader.
                    @in = (IndexInput)outerInstance.OuterInstance.@in.Clone();
                    Stack = new Frame[5];
                    for (int idx = 0; idx < Stack.Length; idx++)
                    {
                        Stack[idx] = new Frame(this, idx);
                    }
                    for (int arcIdx = 0; arcIdx < Arcs.Length; arcIdx++)
                    {
                        Arcs[arcIdx] = new FST.Arc<BytesRef>();
                    }

                    if (outerInstance.Index == null)
                    {
                        FstReader = null;
                    }
                    else
                    {
                        FstReader = outerInstance.Index.BytesReader;
                    }

                    // TODO: if the automaton is "smallish" we really
                    // should use the terms index to seek at least to
                    // the initial term and likely to subsequent terms
                    // (or, maybe just fallback to ATE for such cases).
                    // Else the seek cost of loading the frames will be
                    // too costly.

                    // NOTE(review): Index is dereferenced unconditionally here although it is
                    // null-checked above; a null Index fails on this line. Behaviour left unchanged.
                    FST.Arc<BytesRef> arc = outerInstance.Index.GetFirstArc(Arcs[0]);
                    // Empty string prefix must have an output in the index!
                    Debug.Assert(arc.IsFinal);

                    // Special pushFrame since it's the first one:
                    Frame f = Stack[0];
                    f.Fp = f.FpOrig = outerInstance.RootBlockFP;
                    f.Prefix = 0;
                    f.State = runAutomaton.InitialState;
                    f.Arc = arc;
                    f.OutputPrefix = arc.Output;
                    f.Load(outerInstance.RootCode);

                    // for assert:
                    Debug.Assert(SetSavedStartTerm(startTerm));

                    CurrentFrame = f;
                    if (startTerm != null)
                    {
                        SeekToStartTerm(startTerm);
                    }
                }