C# (CSharp) Lucene.Net.Util.Fst Util примеры использования

Язык программирования: C# (CSharp)
Пространство имен/Пакет: Lucene.Net.Util.Fst
Класс/Тип: Util
Примеров на hotexamples.com: 1
C# (CSharp) Lucene.Net.Util.Fst Util - 1 пример найден. Это лучшие примеры C# (CSharp) кода для Lucene.Net.Util.Fst.Util, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.
Основные методы
Показать Скрыть
Get(4)
GetByOutput(2)
ToInt32sRef(2)
ToIntsRef(2)
ToDot(1)
toDot(1)
Пример #1
Показать файл
        /// <summary>
        /// Retrieve suggestions.
        /// </summary>
        public virtual IList <LookupResult> Lookup(string key, HashSet <BytesRef> contexts, int num)
        {
            if (contexts != null)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            TokenStream ts = queryAnalyzer.TokenStream("", key.ToString());

            try
            {
                TermToBytesRefAttribute    termBytesAtt = ts.AddAttribute <TermToBytesRefAttribute>();
                OffsetAttribute            offsetAtt    = ts.AddAttribute <OffsetAttribute>();
                PositionLengthAttribute    posLenAtt    = ts.AddAttribute <PositionLengthAttribute>();
                PositionIncrementAttribute posIncAtt    = ts.AddAttribute <PositionIncrementAttribute>();
                ts.Reset();

                var lastTokens = new BytesRef[grams];
                //System.out.println("lookup: key='" + key + "'");

                // Run full analysis, but save only the
                // last 1gram, last 2gram, etc.:
                BytesRef tokenBytes   = termBytesAtt.BytesRef;
                int      maxEndOffset = -1;
                bool     sawRealToken = false;
                while (ts.IncrementToken())
                {
                    termBytesAtt.FillBytesRef();
                    sawRealToken |= tokenBytes.Length > 0;
                    // TODO: this is somewhat iffy; today, ShingleFilter
                    // sets posLen to the gram count; maybe we should make
                    // a separate dedicated att for this?
                    int gramCount = posLenAtt.PositionLength;

                    Debug.Assert(gramCount <= grams);

                    // Safety: make sure the recalculated count "agrees":
                    if (CountGrams(tokenBytes) != gramCount)
                    {
                        throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
                    }
                    maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset());
                    lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
                }
                ts.End();

                if (!sawRealToken)
                {
                    throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
                }

                // Carefully fill last tokens with _ tokens;
                // ShingleFilter appraently won't emit "only hole"
                // tokens:
                int endPosInc = posIncAtt.PositionIncrement;

                // Note this will also be true if input is the empty
                // string (in which case we saw no tokens and
                // maxEndOffset is still -1), which in fact works out OK
                // because we fill the unigram with an empty BytesRef
                // below:
                bool lastTokenEnded = offsetAtt.EndOffset() > maxEndOffset || endPosInc > 0;
                //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());

                if (lastTokenEnded)
                {
                    //System.out.println("  lastTokenEnded");
                    // If user hit space after the last token, then
                    // "upgrade" all tokens.  This way "foo " will suggest
                    // all bigrams starting w/ foo, and not any unigrams
                    // starting with "foo":
                    for (int i = grams - 1; i > 0; i--)
                    {
                        BytesRef token = lastTokens[i - 1];
                        if (token == null)
                        {
                            continue;
                        }
                        token.Grow(token.Length + 1);
                        token.Bytes[token.Length] = separator;
                        token.Length++;
                        lastTokens[i] = token;
                    }
                    lastTokens[0] = new BytesRef();
                }

                var arc = new FST.Arc <long?>();

                var bytesReader = fst.BytesReader;

                // Try highest order models first, and if they return
                // results, return that; else, fallback:
                double backoff = 1.0;

                IList <LookupResult> results = new List <LookupResult>(num);

                // We only add a given suffix once, from the highest
                // order model that saw it; for subsequent lower order
                // models we skip it:
                var seen = new HashSet <BytesRef>();

                for (int gram = grams - 1; gram >= 0; gram--)
                {
                    BytesRef token = lastTokens[gram];
                    // Don't make unigram predictions from empty string:
                    if (token == null || (token.Length == 0 && key.Length > 0))
                    {
                        // Input didn't have enough tokens:
                        //System.out.println("  gram=" + gram + ": skip: not enough input");
                        continue;
                    }

                    if (endPosInc > 0 && gram <= endPosInc)
                    {
                        // Skip hole-only predictions; in theory we
                        // shouldn't have to do this, but we'd need to fix
                        // ShingleFilter to produce only-hole tokens:
                        //System.out.println("  break: only holes now");
                        break;
                    }

                    //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

                    // TODO: we could add fuzziness here
                    // match the prefix portion exactly
                    //Pair<Long,BytesRef> prefixOutput = null;
                    long?prefixOutput = null;
                    prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
                    //System.out.println("  prefixOutput=" + prefixOutput);

                    if (prefixOutput == null)
                    {
                        // This model never saw this prefix, e.g. the
                        // trigram model never saw context "purple mushroom"
                        backoff *= ALPHA;
                        continue;
                    }

                    // TODO: we could do this division at build time, and
                    // bake it into the FST?

                    // Denominator for computing scores from current
                    // model's predictions:
                    long contextCount = totTokens;

                    BytesRef lastTokenFragment = null;

                    for (int i = token.Length - 1; i >= 0; i--)
                    {
                        if (token.Bytes[token.Offset + i] == separator)
                        {
                            BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                            long?    output  = Util.Get(fst, Lucene.Net.Util.Fst.Util.ToIntsRef(context, new IntsRef()));
                            Debug.Assert(output != null);
                            contextCount      = DecodeWeight(output);
                            lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                            break;
                        }
                    }

                    BytesRef finalLastToken;

                    if (lastTokenFragment == null)
                    {
                        finalLastToken = BytesRef.DeepCopyOf(token);
                    }
                    else
                    {
                        finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
                    }
                    Debug.Assert(finalLastToken.Offset == 0);

                    CharsRef spare = new CharsRef();

                    // complete top-N
                    Util.Fst.Util.TopResults <long?> completions = null;
                    try
                    {
                        // Because we store multiple models in one FST
                        // (1gram, 2gram, 3gram), we must restrict the
                        // search so that it only considers the current
                        // model.  For highest order model, this is not
                        // necessary since all completions in the FST
                        // must be from this model, but for lower order
                        // models we have to filter out the higher order
                        // ones:

                        // Must do num+seen.size() for queue depth because we may
                        // reject up to seen.size() paths in acceptResult():
                        Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparator, seen, finalLastToken);

                        // since this search is initialized with a single start node
                        // it is okay to start with an empty input path here
                        searcher.AddStartPaths(arc, prefixOutput, true, new IntsRef());

                        completions = searcher.Search();
                        Debug.Assert(completions.IsComplete);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus);
                    }

                    int prefixLength = token.Length;

                    BytesRef suffix = new BytesRef(8);
                    //System.out.println("    " + completions.length + " completions");

                    foreach (Util.Fst.Util.Result <long?> completion in completions)
                    {
                        token.Length = prefixLength;
                        // append suffix
                        Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                        token.Append(suffix);

                        //System.out.println("    completion " + token.utf8ToString());

                        // Skip this path if a higher-order model already
                        // saw/predicted its last token:
                        BytesRef lastToken = token;
                        for (int i = token.Length - 1; i >= 0; i--)
                        {
                            if (token.Bytes[token.Offset + i] == separator)
                            {
                                Debug.Assert(token.Length - i - 1 > 0);
                                lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                                break;
                            }
                        }
                        if (seen.Contains(lastToken))
                        {
                            //System.out.println("      skip dup " + lastToken.utf8ToString());
                            goto nextCompletionContinue;
                        }
                        seen.Add(BytesRef.DeepCopyOf(lastToken));
                        spare.Grow(token.Length);
                        UnicodeUtil.UTF8toUTF16(token, spare);
                        LookupResult result = new LookupResult(spare.ToString(), (long)(long.MaxValue * backoff * ((double)DecodeWeight(completion.Output)) / contextCount));
                        results.Add(result);
                        Debug.Assert(results.Count == seen.Count);
                        //System.out.println("  add result=" + result);
                        nextCompletionContinue :;
                    }
                    nextCompletionBreak :
                    backoff *= ALPHA;
                }

                results.Sort(new ComparatorAnonymousInnerClassHelper(this));

                if (results.Count > num)
                {
                    results.SubList(num, results.Count).Clear();
                }

                return(results);
            }
            finally
            {
                IOUtils.CloseWhileHandlingException(ts);
            }
        }