Пример #1
0
        private LookupResult GetLookupResult(long?output1, BytesRef output2, CharsRef spare)
        {
            LookupResult result;

            if (hasPayloads)
            {
                int sepIndex = -1;
                for (int i = 0; i < output2.Length; i++)
                {
                    if (output2.Bytes[output2.Offset + i] == PAYLOAD_SEP)
                    {
                        sepIndex = i;
                        break;
                    }
                }
                Debug.Assert(sepIndex != -1);
                spare.Grow(sepIndex);

                int payloadLen = output2.Length - sepIndex - 1;
                UnicodeUtil.UTF8toUTF16(output2.Bytes, output2.Offset, sepIndex, spare);
                BytesRef payload = new BytesRef(payloadLen);
                Array.Copy(output2.Bytes, sepIndex + 1, payload.Bytes, 0, payloadLen);
                payload.Length = payloadLen;
                result         = new LookupResult(spare.ToString(), DecodeWeight(output1.GetValueOrDefault()), payload);
            }
            else
            {
                spare.Grow(output2.Length);
                UnicodeUtil.UTF8toUTF16(output2, spare);
                result = new LookupResult(spare.ToString(), DecodeWeight(output1.GetValueOrDefault()));
            }

            return(result);
        }
Пример #2
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: private void readField(util.BytesRef type, index.FieldInfo fieldInfo, index.StoredFieldVisitor visitor) throws java.io.IOException
        private void readField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
        {
            readLine();
            Debug.Assert(StringHelper.StartsWith(scratch, VALUE));
            if (type == TYPE_STRING)
            {
                visitor.stringField(fieldInfo, new string(scratch.bytes, scratch.offset + VALUE.length, scratch.length - VALUE.length, StandardCharsets.UTF_8));
            }
            else if (type == TYPE_BINARY)
            {
                sbyte[] copy = new sbyte[scratch.length - VALUE.length];
                Array.Copy(scratch.bytes, scratch.offset + VALUE.length, copy, 0, copy.Length);
                visitor.binaryField(fieldInfo, copy);
            }
            else if (type == TYPE_INT)
            {
                UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset + VALUE.length, scratch.length - VALUE.length, scratchUTF16);
                visitor.intField(fieldInfo, Convert.ToInt32(scratchUTF16.ToString()));
            }
            else if (type == TYPE_LONG)
            {
                UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset + VALUE.length, scratch.length - VALUE.length, scratchUTF16);
                visitor.longField(fieldInfo, Convert.ToInt64(scratchUTF16.ToString()));
            }
            else if (type == TYPE_FLOAT)
            {
                UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset + VALUE.length, scratch.length - VALUE.length, scratchUTF16);
                visitor.floatField(fieldInfo, Convert.ToSingle(scratchUTF16.ToString()));
            }
            else if (type == TYPE_DOUBLE)
            {
                UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset + VALUE.length, scratch.length - VALUE.length, scratchUTF16);
                visitor.doubleField(fieldInfo, Convert.ToDouble(scratchUTF16.ToString()));
            }
        }
Пример #3
0
        /// <summary>
        /// Adds terms and frequencies found in vector into the <see cref="T:IDictionary{string, Int}"/> <paramref name="termFreqMap"/>
        /// </summary>
        /// <param name="termFreqMap"> a <see cref="T:IDictionary{string, Int}"/> of terms and their frequencies </param>
        /// <param name="vector"> List of terms and their frequencies for a doc/field </param>
        private void AddTermFrequencies(IDictionary <string, Int32> termFreqMap, Terms vector)
        {
            var      termsEnum = vector.GetIterator(null);
            var      spare     = new CharsRef();
            BytesRef text;

            while ((text = termsEnum.Next()) != null)
            {
                UnicodeUtil.UTF8toUTF16(text, spare);
                var term = spare.ToString();
                if (IsNoiseWord(term))
                {
                    continue;
                }
                var freq = (int)termsEnum.TotalTermFreq;

                // increment frequency
                Int32 cnt;
                if (!termFreqMap.TryGetValue(term, out cnt))
                {
                    cnt = new Int32();
                    termFreqMap[term] = cnt;
                    cnt.x             = freq;
                }
                else
                {
                    cnt.x += freq;
                }
            }
        }
Пример #4
0
        public override void Build(InputIterator tfit)
        {
            if (tfit.HasPayloads)
            {
                throw new ArgumentException("this suggester doesn't support payloads");
            }
            if (tfit.Comparator != null)
            {
                // make sure it's unsorted
                // WTF - this could result in yet another sorted iteration....
                tfit = new UnsortedInputIterator(tfit);
            }
            if (tfit.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }
            count = 0;
            trie  = new JaspellTernarySearchTrie();
            trie.MatchAlmostDiff = editDistance;
            BytesRef spare;

            CharsRef charsSpare = new CharsRef();

            while ((spare = tfit.Next()) != null)
            {
                long weight = tfit.Weight;
                if (spare.Length == 0)
                {
                    continue;
                }
                charsSpare.Grow(spare.Length);
                UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
                trie.Put(charsSpare.ToString(), Convert.ToInt64(weight));
            }
        }
Пример #5
0
        public override IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, bool higherWeightsFirst, int num)
        {
            if (contexts != null)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            IList <FSTCompletion.Completion> completions;

            if (higherWeightsFirst)
            {
                completions = higherWeightsCompletion.DoLookup(key, num);
            }
            else
            {
                completions = normalCompletion.DoLookup(key, num);
            }

            List <LookupResult> results = new List <LookupResult>(completions.Count);
            CharsRef            spare   = new CharsRef();

            foreach (FSTCompletion.Completion c in completions)
            {
                spare.Grow(c.Utf8.Length);
                UnicodeUtil.UTF8toUTF16(c.Utf8, spare);
                results.Add(new LookupResult(spare.ToString(), c.Bucket));
            }
            return(results);
        }
Пример #6
0
 public virtual void TestAppendChars()
 {
     char[] chars = new char[] { 'a', 'b', 'c', 'd' };
     CharsRef c = new CharsRef(chars, 1, 3); // bcd
     c.Append(new char[] { 'e' }, 0, 1);
     Assert.AreEqual("bcde", c.ToString());
 }
Пример #7
0
        /// <summary>
        /// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for the
        /// given selection of fields from terms with a document frequency greater than
        /// the given <paramref name="maxDocFreq"/>
        /// </summary>
        /// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
        /// <param name="delegate"> Analyzer whose TokenStream will be filtered </param>
        /// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
        /// <param name="fields"> Selection of fields to calculate stopwords for </param>
        /// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param>
        /// <exception cref="IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
        public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, ICollection <string> fields, int maxDocFreq)
            : base(@delegate.Strategy)
        {
            this.matchVersion = matchVersion;
            this.@delegate    = @delegate;

            foreach (string field in fields)
            {
                var      stopWords = new JCG.HashSet <string>();
                Terms    terms     = MultiFields.GetTerms(indexReader, field);
                CharsRef spare     = new CharsRef();
                if (terms != null)
                {
                    TermsEnum te = terms.GetEnumerator();
                    while (te.MoveNext())
                    {
                        if (te.DocFreq > maxDocFreq)
                        {
                            UnicodeUtil.UTF8toUTF16(te.Term, spare);
                            stopWords.Add(spare.ToString());
                        }
                    }
                }
                stopWordsPerField[field] = stopWords;
            }
        }
Пример #8
0
        public override void Build(IInputIterator tfit)
        {
            if (tfit.HasPayloads)
            {
                throw new System.ArgumentException("this suggester doesn't support payloads");
            }
            if (tfit.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }
            root = new TernaryTreeNode();
            // buffer first
#pragma warning disable 612, 618
            if (tfit.Comparer != BytesRef.UTF8SortedAsUTF16Comparer)
            {
                // make sure it's sorted and the comparer uses UTF16 sort order
                tfit = new SortedInputIterator(tfit, BytesRef.UTF8SortedAsUTF16Comparer);
            }
#pragma warning restore 612, 618

            List <string> tokens = new List <string>();
            List <object> vals   = new List <object>();
            BytesRef      spare;
            CharsRef      charsSpare = new CharsRef();
            while ((spare = tfit.Next()) != null)
            {
                charsSpare.Grow(spare.Length);
                UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
                tokens.Add(charsSpare.ToString());
                vals.Add(Convert.ToInt64(tfit.Weight));
            }
            autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count - 1, root);
        }
Пример #9
0
 private void ReadField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
 {
     ReadLine();
     if (Debugging.AssertsEnabled)
     {
         Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextStoredFieldsWriter.VALUE));
     }
     if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_STRING))
     {
         visitor.StringField(fieldInfo,
                             Encoding.UTF8.GetString(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length,
                                                     _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length));
     }
     else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_BINARY))
     {
         var copy = new byte[_scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length];
         Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length, copy, 0, copy.Length);
         visitor.BinaryField(fieldInfo, copy);
     }
     else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_INT))
     {
         UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length, _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length,
                                 _scratchUtf16);
         visitor.Int32Field(fieldInfo, J2N.Numerics.Int32.Parse(_scratchUtf16.ToString(), NumberFormatInfo.InvariantInfo));
     }
     else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_LONG))
     {
         UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length, _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length,
                                 _scratchUtf16);
         visitor.Int64Field(fieldInfo, J2N.Numerics.Int64.Parse(_scratchUtf16.ToString(), NumberFormatInfo.InvariantInfo));
     }
     else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_FLOAT))
     {
         UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length, _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length,
                                 _scratchUtf16);
         visitor.SingleField(fieldInfo, J2N.Numerics.Single.Parse(_scratchUtf16.ToString(), NumberStyle.Float, NumberFormatInfo.InvariantInfo));
     }
     else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_DOUBLE))
     {
         UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length, _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length,
                                 _scratchUtf16);
         visitor.DoubleField(fieldInfo, J2N.Numerics.Double.Parse(_scratchUtf16.ToString(), NumberStyle.Float, NumberFormatInfo.InvariantInfo));
     }
 }
Пример #10
0
 private void ReadField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
 {
     ReadLine();
     Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextStoredFieldsWriter.VALUE));
     if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_STRING))
     {
         visitor.StringField(fieldInfo,
                             Encoding.UTF8.GetString(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length,
                                                     _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length));
     }
     else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_BINARY))
     {
         var copy = new byte[_scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length];
         Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length, copy, 0, copy.Length);
         visitor.BinaryField(fieldInfo, copy);
     }
     else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_INT))
     {
         UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length, _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length,
                                 _scratchUtf16);
         visitor.Int32Field(fieldInfo, Convert.ToInt32(_scratchUtf16.ToString(), CultureInfo.InvariantCulture));
     }
     else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_LONG))
     {
         UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length, _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length,
                                 _scratchUtf16);
         visitor.Int64Field(fieldInfo, Convert.ToInt64(_scratchUtf16.ToString(), CultureInfo.InvariantCulture));
     }
     else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_FLOAT))
     {
         UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length, _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length,
                                 _scratchUtf16);
         visitor.SingleField(fieldInfo, Convert.ToSingle(_scratchUtf16.ToString(), CultureInfo.InvariantCulture));
     }
     else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_DOUBLE))
     {
         UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length, _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length,
                                 _scratchUtf16);
         visitor.DoubleField(fieldInfo, Convert.ToDouble(_scratchUtf16.ToString(), CultureInfo.InvariantCulture));
     }
 }
Пример #11
0
        public virtual void TestAppend()
        {
            CharsRef @ref = new CharsRef();
            StringBuilder builder = new StringBuilder();
            int numStrings = AtLeast(10);
            for (int i = 0; i < numStrings; i++)
            {
                char[] charArray = TestUtil.RandomRealisticUnicodeString(Random(), 1, 100).ToCharArray();
                int offset = Random().Next(charArray.Length);
                int length = charArray.Length - offset;
                builder.Append(charArray, offset, length);
                @ref.Append(charArray, offset, length);
            }

            Assert.AreEqual(builder.ToString(), @ref.ToString());
        }
Пример #12
0
        /// <summary>Convert to lowercase in-place.</summary>
        private string ToLowercase(string chs)
        {
            int length = chs.Length;

            scratch.Length = length;
            scratch.Grow(length);

            char[] buffer = scratch.Chars;
            for (int i = 0; i < length;)
            {
                i += Character.ToChars(
                    Character.ToLower(Character.CodePointAt(chs, i), culture), buffer, i); // LUCENENET specific - need to use explicit culture to override current thread
            }

            return(scratch.ToString());
        }
Пример #13
0
        /// <summary>Convert to lowercase in-place.</summary>
        private string ToLowercase(string chs)
        {
            int length = chs.Length;

            scratch.Length = length;
            scratch.Grow(length);

            char[] buffer = scratch.Chars;
            for (int i = 0; i < length;)
            {
                i += Character.ToChars(
                    Character.ToLower(Character.CodePointAt(chs, i)), buffer, i);
            }

            return(scratch.ToString());
        }
Пример #14
0
        public override void Build(IInputEnumerator enumerator)
        {
            // LUCENENET: Added guard clause for null
            if (enumerator is null)
            {
                throw new ArgumentNullException(nameof(enumerator));
            }

            if (enumerator.HasPayloads)
            {
                throw new ArgumentException("this suggester doesn't support payloads");
            }
            if (enumerator.Comparer != null)
            {
                // make sure it's unsorted
                // WTF - this could result in yet another sorted iteration....
                enumerator = new UnsortedInputEnumerator(enumerator);
            }
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            count = 0;
            trie  = new JaspellTernarySearchTrie {
                MatchAlmostDiff = editDistance
            };
            BytesRef spare;

            var charsSpare = new CharsRef();

            while (enumerator.MoveNext())
            {
                spare = enumerator.Current;
                long weight = enumerator.Weight;
                if (spare.Length == 0)
                {
                    continue;
                }
                charsSpare.Grow(spare.Length);
                UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
                trie.Put(charsSpare.ToString(), weight);
            }
        }
Пример #15
0
        public override void Build(IInputEnumerator enumerator)
        {
            // LUCENENT: Added guard clause for null
            if (enumerator is null)
            {
                throw new ArgumentNullException(nameof(enumerator));
            }

            if (enumerator.HasPayloads)
            {
                throw new ArgumentException("this suggester doesn't support payloads");
            }
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            root = new TernaryTreeNode();
            // buffer first
#pragma warning disable 612, 618
            if (enumerator.Comparer != BytesRef.UTF8SortedAsUTF16Comparer)
            {
                // make sure it's sorted and the comparer uses UTF16 sort order
                enumerator = new SortedInputEnumerator(enumerator, BytesRef.UTF8SortedAsUTF16Comparer);
            }
#pragma warning restore 612, 618

            JCG.List <string> tokens = new JCG.List <string>();
            JCG.List <object> vals   = new JCG.List <object>();
            BytesRef          spare;
            CharsRef          charsSpare = new CharsRef();
            while (enumerator.MoveNext())
            {
                spare = enumerator.Current;
                charsSpare.Grow(spare.Length);
                UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
                tokens.Add(charsSpare.ToString());
                vals.Add(enumerator.Weight);
            }
            autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count - 1, root);
        }
Пример #16
0
        /// <summary>
        /// Adds terms and frequencies found in vector into the Map termFreqMap
        /// </summary>
        /// <param name="termFreqMap"> a Map of terms and their frequencies </param>
        /// <param name="vector"> List of terms and their frequencies for a doc/field </param>
        private void AddTermFrequencies(IDictionary <string, Int> termFreqMap, Terms vector)
        {
            //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
            //ORIGINAL LINE: final org.apache.lucene.index.TermsEnum termsEnum = vector.iterator(null);
            TermsEnum termsEnum = vector.Iterator(null);
            //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
            //ORIGINAL LINE: final org.apache.lucene.util.CharsRef spare = new org.apache.lucene.util.CharsRef();
            CharsRef spare = new CharsRef();
            BytesRef text;

            while ((text = termsEnum.Next()) != null)
            {
                UnicodeUtil.UTF8toUTF16(text, spare);
                //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
                //ORIGINAL LINE: final String term = spare.toString();
                string term = spare.ToString();
                if (IsNoiseWord(term))
                {
                    continue;
                }
                //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
                //ORIGINAL LINE: final int freq = (int) termsEnum.totalTermFreq();
                int freq = (int)termsEnum.TotalTermFreq();

                // increment frequency
                Int cnt = termFreqMap[term];
                if (cnt == null)
                {
                    cnt = new Int();
                    termFreqMap[term] = cnt;
                    cnt.x             = freq;
                }
                else
                {
                    cnt.x += freq;
                }
            }
        }
Пример #17
0
        //public static void main( string[] args ) throws Exception {
        //  Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT);
        //  QueryParser parser = new QueryParser(Version.LUCENE_CURRENT,  "f", analyzer );
        //  Query query = parser.parse( "a x:b" );
        //  FieldQuery fieldQuery = new FieldQuery( query, true, false );

        //  Directory dir = new RAMDirectory();
        //  IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer));
        //  Document doc = new Document();
        //  IndexableFieldType ft = new IndexableFieldType(TextField.TYPE_STORED);
        //  ft.setStoreTermVectors(true);
        //  ft.setStoreTermVectorOffsets(true);
        //  ft.setStoreTermVectorPositions(true);
        //  doc.add( new Field( "f", ft, "a a a b b c a b b c d e f" ) );
        //  doc.add( new Field( "f", ft, "b a b a f" ) );
        //  writer.addDocument( doc );
        //  writer.close();

        //  IndexReader reader = IndexReader.open(dir1);
        //  new FieldTermStack( reader, 0, "f", fieldQuery );
        //  reader.close();
        //}

        /// <summary>
        /// a constructor.
        /// </summary>
        /// <param name="reader"><see cref="IndexReader"/> of the index</param>
        /// <param name="docId">document id to be highlighted</param>
        /// <param name="fieldName">field of the document to be highlighted</param>
        /// <param name="fieldQuery"><see cref="FieldQuery"/> object</param>
        /// <exception cref="IOException">If there is a low-level I/O error</exception>
        public FieldTermStack(IndexReader reader, int docId, string fieldName, FieldQuery fieldQuery)
        {
            this.fieldName = fieldName;

            ISet <string> termSet = fieldQuery.GetTermSet(fieldName);

            // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
            if (termSet == null)
            {
                return;
            }

            Fields vectors = reader.GetTermVectors(docId);

            if (vectors == null)
            {
                // null snippet
                return;
            }

            Terms vector = vectors.GetTerms(fieldName);

            if (vector == null)
            {
                // null snippet
                return;
            }

            CharsRef             spare     = new CharsRef();
            TermsEnum            termsEnum = vector.GetEnumerator();
            DocsAndPositionsEnum dpEnum    = null;
            BytesRef             text;

            int numDocs = reader.MaxDoc;

            while (termsEnum.MoveNext())
            {
                text = termsEnum.Term;
                UnicodeUtil.UTF8toUTF16(text, spare);
                string term = spare.ToString();
                if (!termSet.Contains(term))
                {
                    continue;
                }
                dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
                if (dpEnum == null)
                {
                    // null snippet
                    return;
                }

                dpEnum.NextDoc();

                // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
                float weight = (float)(Math.Log(numDocs / (double)(reader.DocFreq(new Term(fieldName, text)) + 1)) + 1.0);

                int freq = dpEnum.Freq;

                for (int i = 0; i < freq; i++)
                {
                    int pos = dpEnum.NextPosition();
                    if (dpEnum.StartOffset < 0)
                    {
                        return; // no offsets, null snippet
                    }
                    termList.Add(new TermInfo(term, dpEnum.StartOffset, dpEnum.EndOffset, pos, weight));
                }
            }

            // sort by position
            CollectionUtil.TimSort(termList);

            // now look for dups at the same position, linking them together
            int      currentPos = -1;
            TermInfo previous   = null;
            TermInfo first      = null;

            for (int i = 0; i < termList.Count;)
            {
                TermInfo current = termList[i];
                if (current.Position == currentPos)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(previous != null);
                    }
                    previous.SetNext(current);
                    previous = current;
                    //iterator.Remove();

                    // LUCENENET NOTE: Remove, but don't advance the i position (since removing will advance to the next item)
                    termList.RemoveAt(i);
                }
                else
                {
                    if (previous != null)
                    {
                        previous.SetNext(first);
                    }
                    previous   = first = current;
                    currentPos = current.Position;

                    // LUCENENET NOTE: Only increment the position if we don't do a delete.
                    i++;
                }
            }

            if (previous != null)
            {
                previous.SetNext(first);
            }
        }
Пример #18
0
 public virtual void TestCopy()
 {
     int numIters = AtLeast(10);
     for (int i = 0; i < numIters; i++)
     {
         CharsRef @ref = new CharsRef();
         char[] charArray = TestUtil.RandomRealisticUnicodeString(Random(), 1, 100).ToCharArray();
         int offset = Random().Next(charArray.Length);
         int length = charArray.Length - offset;
         string str = new string(charArray, offset, length);
         @ref.CopyChars(charArray, offset, length);
         Assert.AreEqual(str, @ref.ToString());
     }
 }
Пример #19
0
        public override void Build(InputIterator tfit)
        {
            if (tfit.HasPayloads)
            {
                throw new System.ArgumentException("this suggester doesn't support payloads");
            }
            if (tfit.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }
            root = new TernaryTreeNode();
            // buffer first
            if (tfit.Comparator != BytesRef.UTF8SortedAsUTF16Comparator)
            {
                // make sure it's sorted and the comparator uses UTF16 sort order
                tfit = new SortedInputIterator(tfit, BytesRef.UTF8SortedAsUTF16Comparator);
            }

            List<string> tokens = new List<string>();
            List<Number> vals = new List<Number>();
            BytesRef spare;
            CharsRef charsSpare = new CharsRef();
            while ((spare = tfit.Next()) != null)
            {
                charsSpare.Grow(spare.Length);
                UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
                tokens.Add(charsSpare.ToString());
                vals.Add(Convert.ToInt64(tfit.Weight));
            }
            autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count - 1, root);
        }
 private string ReadString(int offset, BytesRef scratch)
 {
     UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + offset, scratch.Length - offset, _scratchUtf16);
     return(_scratchUtf16.ToString());
 }
Пример #21
0
 public virtual void TestCopyCharsRef()
 {
     char[] chars = new char[] { 'a', 'b', 'c', 'd' };
     CharsRef c = new CharsRef(chars, 1, 3); // bcd
     char[] otherchars = new char[] { 'b', 'c', 'd', 'e' };
     c.CopyChars(new CharsRef(otherchars, 0, 4));
     Assert.AreEqual("bcde", c.ToString());
 }
Пример #22
0
        public override void Build(IInputIterator tfit)
        {
            if (tfit.HasPayloads)
            {
                throw new ArgumentException("this suggester doesn't support payloads");
            }
            if (tfit.Comparator != null)
            {
                // make sure it's unsorted
                // WTF - this could result in yet another sorted iteration....
                tfit = new UnsortedInputIterator(tfit);
            }
            if (tfit.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }
            count = 0;
            trie = new JaspellTernarySearchTrie { MatchAlmostDiff = editDistance };
            BytesRef spare;

            var charsSpare = new CharsRef();

            while ((spare = tfit.Next()) != null)
            {

                long weight = tfit.Weight;
                if (spare.Length == 0)
                {
                    continue;
                }
                charsSpare.Grow(spare.Length);
                UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
                trie.Put(charsSpare.ToString(), Convert.ToInt64(weight));
            }
        }
Пример #23
0
        public override void Build(IInputIterator tfit)
        {
            if (tfit.HasPayloads)
            {
                throw new System.ArgumentException("this suggester doesn't support payloads");
            }
            if (tfit.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }
            root = new TernaryTreeNode();
            // buffer first
            #pragma warning disable 612, 618
            if (tfit.Comparator != BytesRef.UTF8SortedAsUTF16Comparer)
            {
                // make sure it's sorted and the comparator uses UTF16 sort order
                tfit = new SortedInputIterator(tfit, BytesRef.UTF8SortedAsUTF16Comparer);
            }
            #pragma warning restore 612, 618

            List<string> tokens = new List<string>();
            List<object> vals = new List<object>(); // LUCENENET TODO: Should this be long? in Java it was Number, but we can probably do better than object
            BytesRef spare;
            CharsRef charsSpare = new CharsRef();
            while ((spare = tfit.Next()) != null)
            {
                charsSpare.Grow(spare.Length);
                UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
                tokens.Add(charsSpare.ToString());
                vals.Add(Convert.ToInt64(tfit.Weight));
            }
            autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count - 1, root);
        }
Пример #24
0
        /// <summary>
        /// Retrieve suggestions.
        /// </summary>
        public virtual IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, int num)
        {
            if (contexts != null)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString());

            try
            {
                ITermToBytesRefAttribute    termBytesAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                IOffsetAttribute            offsetAtt    = ts.AddAttribute <IOffsetAttribute>();
                IPositionLengthAttribute    posLenAtt    = ts.AddAttribute <IPositionLengthAttribute>();
                IPositionIncrementAttribute posIncAtt    = ts.AddAttribute <IPositionIncrementAttribute>();
                ts.Reset();

                var lastTokens = new BytesRef[grams];
                //System.out.println("lookup: key='" + key + "'");

                // Run full analysis, but save only the
                // last 1gram, last 2gram, etc.:
                BytesRef tokenBytes   = termBytesAtt.BytesRef;
                int      maxEndOffset = -1;
                bool     sawRealToken = false;
                while (ts.IncrementToken())
                {
                    termBytesAtt.FillBytesRef();
                    sawRealToken |= tokenBytes.Length > 0;
                    // TODO: this is somewhat iffy; today, ShingleFilter
                    // sets posLen to the gram count; maybe we should make
                    // a separate dedicated att for this?
                    int gramCount = posLenAtt.PositionLength;

                    Debug.Assert(gramCount <= grams);

                    // Safety: make sure the recalculated count "agrees":
                    if (CountGrams(tokenBytes) != gramCount)
                    {
                        throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
                    }
                    maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
                    lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
                }
                ts.End();

                if (!sawRealToken)
                {
                    throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
                }

                // Carefully fill last tokens with _ tokens;
                // ShingleFilter appraently won't emit "only hole"
                // tokens:
                int endPosInc = posIncAtt.PositionIncrement;

                // Note this will also be true if input is the empty
                // string (in which case we saw no tokens and
                // maxEndOffset is still -1), which in fact works out OK
                // because we fill the unigram with an empty BytesRef
                // below:
                bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0;
                //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset);

                if (lastTokenEnded)
                {
                    //System.out.println("  lastTokenEnded");
                    // If user hit space after the last token, then
                    // "upgrade" all tokens.  This way "foo " will suggest
                    // all bigrams starting w/ foo, and not any unigrams
                    // starting with "foo":
                    for (int i = grams - 1; i > 0; i--)
                    {
                        BytesRef token = lastTokens[i - 1];
                        if (token == null)
                        {
                            continue;
                        }
                        token.Grow(token.Length + 1);
                        token.Bytes[token.Length] = separator;
                        token.Length++;
                        lastTokens[i] = token;
                    }
                    lastTokens[0] = new BytesRef();
                }

                var arc = new FST.Arc <long?>();

                var bytesReader = fst.GetBytesReader();

                // Try highest order models first, and if they return
                // results, return that; else, fallback:
                double backoff = 1.0;

                List <LookupResult> results = new List <LookupResult>(num);

                // We only add a given suffix once, from the highest
                // order model that saw it; for subsequent lower order
                // models we skip it:
                var seen = new HashSet <BytesRef>();

                for (int gram = grams - 1; gram >= 0; gram--)
                {
                    BytesRef token = lastTokens[gram];
                    // Don't make unigram predictions from empty string:
                    if (token == null || (token.Length == 0 && key.Length > 0))
                    {
                        // Input didn't have enough tokens:
                        //System.out.println("  gram=" + gram + ": skip: not enough input");
                        continue;
                    }

                    if (endPosInc > 0 && gram <= endPosInc)
                    {
                        // Skip hole-only predictions; in theory we
                        // shouldn't have to do this, but we'd need to fix
                        // ShingleFilter to produce only-hole tokens:
                        //System.out.println("  break: only holes now");
                        break;
                    }

                    //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

                    // TODO: we could add fuzziness here
                    // match the prefix portion exactly
                    //Pair<Long,BytesRef> prefixOutput = null;
                    long?prefixOutput = null;
                    try
                    {
                        prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }
                    //System.out.println("  prefixOutput=" + prefixOutput);

                    if (prefixOutput == null)
                    {
                        // This model never saw this prefix, e.g. the
                        // trigram model never saw context "purple mushroom"
                        backoff *= ALPHA;
                        continue;
                    }

                    // TODO: we could do this division at build time, and
                    // bake it into the FST?

                    // Denominator for computing scores from current
                    // model's predictions:
                    long contextCount = totTokens;

                    BytesRef lastTokenFragment = null;

                    for (int i = token.Length - 1; i >= 0; i--)
                    {
                        if (token.Bytes[token.Offset + i] == separator)
                        {
                            BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                            long?    output  = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef()));
                            Debug.Assert(output != null);
                            contextCount      = DecodeWeight(output);
                            lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                            break;
                        }
                    }

                    BytesRef finalLastToken;

                    if (lastTokenFragment == null)
                    {
                        finalLastToken = BytesRef.DeepCopyOf(token);
                    }
                    else
                    {
                        finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
                    }
                    Debug.Assert(finalLastToken.Offset == 0);

                    CharsRef spare = new CharsRef();

                    // complete top-N
                    Util.Fst.Util.TopResults <long?> completions = null;
                    try
                    {
                        // Because we store multiple models in one FST
                        // (1gram, 2gram, 3gram), we must restrict the
                        // search so that it only considers the current
                        // model.  For highest order model, this is not
                        // necessary since all completions in the FST
                        // must be from this model, but for lower order
                        // models we have to filter out the higher order
                        // ones:

                        // Must do num+seen.size() for queue depth because we may
                        // reject up to seen.size() paths in acceptResult():
                        Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken);

                        // since this search is initialized with a single start node
                        // it is okay to start with an empty input path here
                        searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef());

                        completions = searcher.Search();
                        Debug.Assert(completions.IsComplete);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }

                    int prefixLength = token.Length;

                    BytesRef suffix = new BytesRef(8);
                    //System.out.println("    " + completions.length + " completions");

                    foreach (Util.Fst.Util.Result <long?> completion in completions)
                    {
                        token.Length = prefixLength;
                        // append suffix
                        Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                        token.Append(suffix);

                        //System.out.println("    completion " + token.utf8ToString());

                        // Skip this path if a higher-order model already
                        // saw/predicted its last token:
                        BytesRef lastToken = token;
                        for (int i = token.Length - 1; i >= 0; i--)
                        {
                            if (token.Bytes[token.Offset + i] == separator)
                            {
                                Debug.Assert(token.Length - i - 1 > 0);
                                lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                                break;
                            }
                        }
                        if (seen.Contains(lastToken))
                        {
                            //System.out.println("      skip dup " + lastToken.utf8ToString());
                            goto nextCompletionContinue;
                        }
                        seen.Add(BytesRef.DeepCopyOf(lastToken));
                        spare.Grow(token.Length);
                        UnicodeUtil.UTF8toUTF16(token, spare);
                        LookupResult result = new LookupResult(spare.ToString(),
                                                               // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                                                               // return numbers that are greater than long.MaxValue, which results in a negative long number.
                                                               (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                        results.Add(result);
                        Debug.Assert(results.Count == seen.Count);
                        //System.out.println("  add result=" + result);
                        nextCompletionContinue :;
                    }
                    backoff *= ALPHA;
                }

                results.Sort(new ComparerAnonymousInnerClassHelper(this));

                if (results.Count > num)
                {
                    results.SubList(num, results.Count).Clear();
                }

                return(results);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
Пример #25
0
 public virtual void TestUTF8UTF16CharsRef()
 {
     int num = AtLeast(3989);
     for (int i = 0; i < num; i++)
     {
         string unicode = TestUtil.RandomRealisticUnicodeString(Random());
         BytesRef @ref = new BytesRef(unicode);
         char[] arr = new char[1 + Random().Next(100)];
         int offset = Random().Next(arr.Length);
         int len = Random().Next(arr.Length - offset);
         CharsRef cRef = new CharsRef(arr, offset, len);
         UnicodeUtil.UTF8toUTF16(@ref, cRef);
         Assert.AreEqual(cRef.ToString(), unicode);
     }
 }
Пример #26
0
        /// <summary>
        /// Suggest similar words.
        ///
        /// <para>
        /// Unlike <see cref="SpellChecker"/>, the similarity used to fetch the most
        /// relevant terms is an edit distance, therefore typically a low value
        /// for numSug will work very well.
        /// </para>
        /// </summary>
        /// <param name="term"> Term you want to spell check on </param>
        /// <param name="numSug"> the maximum number of suggested words </param>
        /// <param name="ir"> IndexReader to find terms from </param>
        /// <param name="suggestMode"> specifies when to return suggested words </param>
        /// <param name="accuracy"> return only suggested words that match with this similarity </param>
        /// <returns> sorted list of the suggested words according to the comparer </returns>
        /// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
        public virtual SuggestWord[] SuggestSimilar(Term term, int numSug, IndexReader ir,
                                                    SuggestMode suggestMode, float accuracy)
        {
            CharsRef spare = new CharsRef();
            string   text  = term.Text();

            if (minQueryLength > 0 && text.CodePointCount(0, text.Length) < minQueryLength)
            {
                return(new SuggestWord[0]);
            }

            if (lowerCaseTerms)
            {
                term = new Term(term.Field, text.ToLower());
            }

            int docfreq = ir.DocFreq(term);

            if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0)
            {
                return(new SuggestWord[0]);
            }

            int maxDoc = ir.MaxDoc;

            if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency)
            {
                return(new SuggestWord[0]);
            }
            else if (docfreq > (int)Math.Ceiling(maxQueryFrequency * maxDoc))
            {
                return(new SuggestWord[0]);
            }

            if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR)
            {
                docfreq = 0;
            }

            if (thresholdFrequency >= 1f)
            {
                docfreq = Math.Max(docfreq, (int)thresholdFrequency);
            }
            else if (thresholdFrequency > 0f)
            {
                docfreq = Math.Max(docfreq, (int)(thresholdFrequency * maxDoc) - 1);
            }

            IEnumerable <ScoreTerm> terms = null;
            int inspections = numSug * maxInspections;

            // try ed=1 first, in case we get lucky
            terms = SuggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
            if (maxEdits > 1 && terms.Count() < inspections)
            {
                var moreTerms = new HashSet <ScoreTerm>();
                moreTerms.AddAll(terms);
                moreTerms.AddAll(SuggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
                terms = moreTerms;
            }

            // create the suggestword response, sort it, and trim it to size.

            var suggestions = new SuggestWord[terms.Count()];
            int index       = suggestions.Length - 1;

            foreach (ScoreTerm s in terms)
            {
                SuggestWord suggestion = new SuggestWord();
                if (s.TermAsString == null)
                {
                    UnicodeUtil.UTF8toUTF16(s.Term, spare);
                    s.TermAsString = spare.ToString();
                }
                suggestion.String    = s.TermAsString;
                suggestion.Score     = s.Score;
                suggestion.Freq      = s.Docfreq;
                suggestions[index--] = suggestion;
            }

            ArrayUtil.TimSort(suggestions, Collections.ReverseOrder(comparer));
            if (numSug < suggestions.Length)
            {
                SuggestWord[] trimmed = new SuggestWord[numSug];
                Array.Copy(suggestions, 0, trimmed, 0, numSug);
                suggestions = trimmed;
            }
            return(suggestions);
        }
Пример #27
0
        public override IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, bool onlyMorePopular, int num)
        {
            if (contexts != null)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(num > 0);
            }

            if (onlyMorePopular)
            {
                throw new ArgumentException("this suggester only works with onlyMorePopular=false");
            }

            if (fst == null)
            {
                return(Collections.EmptyList <LookupResult>());
            }

            BytesRef scratch      = new BytesRef(key);
            int      prefixLength = scratch.Length;

            FST.Arc <long?> arc = new FST.Arc <long?>();

            // match the prefix portion exactly
            long?prefixOutput = null;

            try
            {
                prefixOutput = LookupPrefix(scratch, arc);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }

            if (prefixOutput == null)
            {
                return(Collections.EmptyList <LookupResult>());
            }

            List <LookupResult> results = new List <LookupResult>(num);
            CharsRef            spare   = new CharsRef();

            if (exactFirst && arc.IsFinal)
            {
                spare.Grow(scratch.Length);
                UnicodeUtil.UTF8toUTF16(scratch, spare);
                results.Add(new LookupResult(spare.ToString(), DecodeWeight(prefixOutput.GetValueOrDefault() + arc.NextFinalOutput.GetValueOrDefault())));
                if (--num == 0)
                {
                    return(results); // that was quick
                }
            }

            // complete top-N
            Util.Fst.Util.TopResults <long?> completions = null;
            try
            {
                completions = Lucene.Net.Util.Fst.Util.ShortestPaths(fst, arc, prefixOutput, weightComparer, num, !exactFirst);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(completions.IsComplete);
                }
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }

            BytesRef suffix = new BytesRef(8);

            foreach (Util.Fst.Util.Result <long?> completion in completions)
            {
                scratch.Length = prefixLength;
                // append suffix
                Lucene.Net.Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                scratch.Append(suffix);
                spare.Grow(scratch.Length);
                UnicodeUtil.UTF8toUTF16(scratch, spare);
                results.Add(new LookupResult(spare.ToString(), DecodeWeight(completion.Output.GetValueOrDefault())));
            }
            return(results);
        }
        public override List<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool higherWeightsFirst, int num)
        {
            if (contexts != null)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            IList<FSTCompletion.Completion> completions;
            if (higherWeightsFirst)
            {
                completions = higherWeightsCompletion.DoLookup(key, num);
            }
            else
            {
                completions = normalCompletion.DoLookup(key, num);
            }

            List<LookupResult> results = new List<LookupResult>(completions.Count);
            CharsRef spare = new CharsRef();
            foreach (FSTCompletion.Completion c in completions)
            {
                spare.Grow(c.utf8.Length);
                UnicodeUtil.UTF8toUTF16(c.utf8, spare);
                results.Add(new LookupResult(spare.ToString(), c.bucket));
            }
            return results;
        }
Пример #29
0
            /// <summary>
            /// Builds an <see cref="SynonymMap"/> and returns it.
            /// </summary>
            public virtual SynonymMap Build()
            {
                ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
                // TODO: are we using the best sharing options?
                var builder = new Builder <BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

                BytesRef            scratch       = new BytesRef(64);
                ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

                HashSet <int?> dedupSet;

                if (dedup)
                {
                    dedupSet = new HashSet <int?>();
                }
                else
                {
                    dedupSet = null;
                }


                var spare = new byte[5];

                ICollection <CharsRef> keys = workingSet.Keys;

                CharsRef[] sortedKeys = keys.ToArray();
#pragma warning disable 612, 618
                System.Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparer);
#pragma warning restore 612, 618


                Int32sRef scratchIntsRef = new Int32sRef();

                //System.out.println("fmap.build");
                for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++)
                {
                    CharsRef input  = sortedKeys[keyIdx];
                    MapEntry output = workingSet[input];

                    int numEntries = output.ords.Count;
                    // output size, assume the worst case
                    int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

                    scratch.Grow(estimatedSize);
                    scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
                    Debug.Assert(scratch.Offset == 0);

                    // now write our output data:
                    int count = 0;
                    for (int i = 0; i < numEntries; i++)
                    {
                        if (dedupSet != null)
                        {
                            // box once
                            int?ent = output.ords[i];
                            if (dedupSet.Contains(ent))
                            {
                                continue;
                            }
                            dedupSet.Add(ent);
                        }
                        scratchOutput.WriteVInt32(output.ords[i]);
                        count++;
                    }

                    int pos = scratchOutput.Position;
                    scratchOutput.WriteVInt32(count << 1 | (output.includeOrig ? 0 : 1));
                    int pos2    = scratchOutput.Position;
                    int vIntLen = pos2 - pos;

                    // Move the count + includeOrig to the front of the byte[]:
                    Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen);
                    Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos);
                    Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen);

                    if (dedupSet != null)
                    {
                        dedupSet.Clear();
                    }

                    scratch.Length = scratchOutput.Position - scratch.Offset;
                    //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
                    builder.Add(Lucene.Net.Util.Fst.Util.ToUTF32(input.ToString(), scratchIntsRef), BytesRef.DeepCopyOf(scratch));
                }

                FST <BytesRef> fst = builder.Finish();
                return(new SynonymMap(fst, words, maxHorizontalContext));
            }
        /// <summary>
        /// Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
        /// given selection of fields from terms with a document frequency greater than
        /// the given maxDocFreq
        /// </summary>
        /// <param name="matchVersion"> Version to be used in <seealso cref="StopFilter"/> </param>
        /// <param name="delegate"> Analyzer whose TokenStream will be filtered </param>
        /// <param name="indexReader"> IndexReader to identify the stopwords from </param>
        /// <param name="fields"> Selection of fields to calculate stopwords for </param>
        /// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param>
        /// <exception cref="IOException"> Can be thrown while reading from the IndexReader </exception>
        public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, ICollection<string> fields, int maxDocFreq)
            : base(@delegate.Strategy)
        {
            this.matchVersion = matchVersion;
            this.@delegate = @delegate;

            foreach (string field in fields)
            {
                var stopWords = new HashSet<string>();
                Terms terms = MultiFields.GetTerms(indexReader, field);
                CharsRef spare = new CharsRef();
                if (terms != null)
                {
                    TermsEnum te = terms.Iterator(null);
                    BytesRef text;
                    while ((text = te.Next()) != null)
                    {
                        if (te.DocFreq() > maxDocFreq)
                        {
                            UnicodeUtil.UTF8toUTF16(text, spare);
                            stopWords.Add(spare.ToString());
                        }
                    }
                }
                stopWordsPerField[field] = stopWords;
            }
        }
Пример #31
0
        /// <summary>
        /// Provide spelling corrections based on several parameters.
        /// </summary>
        /// <param name="term"> The term to suggest spelling corrections for </param>
        /// <param name="numSug"> The maximum number of spelling corrections </param>
        /// <param name="ir"> The index reader to fetch the candidate spelling corrections from </param>
        /// <param name="docfreq"> The minimum document frequency a potential suggestion need to have in order to be included </param>
        /// <param name="editDistance"> The maximum edit distance candidates are allowed to have </param>
        /// <param name="accuracy"> The minimum accuracy a suggested spelling correction needs to have in order to be included </param>
        /// <param name="spare"> a chars scratch </param>
        /// <returns> a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. </returns>
        /// <exception cref="System.IO.IOException"> If I/O related errors occur </exception>
        protected internal virtual IEnumerable <ScoreTerm> SuggestSimilar(Term term, int numSug, IndexReader ir,
                                                                          int docfreq, int editDistance, float accuracy, CharsRef spare)
        {
            var atts = new AttributeSource();
            IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>();
            Terms terms = MultiFields.GetTerms(ir, term.Field);

            if (terms == null)
            {
                return(new List <ScoreTerm>());
            }
            FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.Max(minPrefix, editDistance - 1), true);

            var stQueue = new Support.PriorityQueue <ScoreTerm>();

            BytesRef        queryTerm = new BytesRef(term.Text());
            BytesRef        candidateTerm;
            ScoreTerm       st       = new ScoreTerm();
            IBoostAttribute boostAtt = e.Attributes.AddAttribute <IBoostAttribute>();

            while ((candidateTerm = e.Next()) != null)
            {
                float boost = boostAtt.Boost;
                // ignore uncompetitive hits
                if (stQueue.Count >= numSug && boost <= stQueue.Peek().Boost)
                {
                    continue;
                }

                // ignore exact match of the same term
                if (queryTerm.BytesEquals(candidateTerm))
                {
                    continue;
                }

                int df = e.DocFreq;

                // check docFreq if required
                if (df <= docfreq)
                {
                    continue;
                }

                float  score;
                string termAsString;
                if (distance == INTERNAL_LEVENSHTEIN)
                {
                    // delay creating strings until the end
                    termAsString = null;
                    // undo FuzzyTermsEnum's scale factor for a real scaled lev score
                    score = boost / e.ScaleFactor + e.MinSimilarity;
                }
                else
                {
                    UnicodeUtil.UTF8toUTF16(candidateTerm, spare);
                    termAsString = spare.ToString();
                    score        = distance.GetDistance(term.Text(), termAsString);
                }

                if (score < accuracy)
                {
                    continue;
                }

                // add new entry in PQ
                st.Term         = BytesRef.DeepCopyOf(candidateTerm);
                st.Boost        = boost;
                st.Docfreq      = df;
                st.TermAsString = termAsString;
                st.Score        = score;
                stQueue.Offer(st);
                // possibly drop entries from queue
                st = (stQueue.Count > numSug) ? stQueue.Poll() : new ScoreTerm();
                maxBoostAtt.MaxNonCompetitiveBoost = (stQueue.Count >= numSug) ? stQueue.Peek().Boost : float.NegativeInfinity;
            }

            return(stQueue);
        }
Пример #32
0
        public override IList <LookupResult> Lookup(string key, HashSet <BytesRef> contexts, bool onlyMorePopular, int num)
        {
            if (contexts != null)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }
            Debug.Assert(num > 0);

            if (onlyMorePopular)
            {
                throw new System.ArgumentException("this suggester only works with onlyMorePopular=false");
            }

            if (fst == null)
            {
                return(Collections.emptyList());
            }

            BytesRef    scratch      = new BytesRef(key);
            int         prefixLength = scratch.Length;
            Arc <long?> arc          = new Arc <long?>();

            // match the prefix portion exactly
            long?prefixOutput = null;

            try
            {
                prefixOutput = LookupPrefix(scratch, arc);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus);
            }

            if (prefixOutput == null)
            {
                return(Collections.emptyList());
            }

            IList <LookupResult> results = new List <LookupResult>(num);
            CharsRef             spare   = new CharsRef();

            if (exactFirst && arc.Final)
            {
                spare.grow(scratch.length);
                UnicodeUtil.UTF8toUTF16(scratch, spare);
                results.Add(new LookupResult(spare.ToString(), decodeWeight(prefixOutput + arc.nextFinalOutput)));
                if (--num == 0)
                {
                    return(results); // that was quick
                }
            }

            // complete top-N
            TopResults <long?> completions = null;

            try
            {
                completions = Util.ShortestPaths(fst, arc, prefixOutput, weightComparator, num, !exactFirst);
                Debug.Assert(completions.isComplete);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus);
            }

            BytesRef suffix = new BytesRef(8);

            foreach (Result <long?> completion in completions)
            {
                scratch.length = prefixLength;
                // append suffix
                Util.ToBytesRef(completion.input, suffix);
                scratch.Append(suffix);
                spare.Grow(scratch.Length);
                UnicodeUtil.UTF8toUTF16(scratch, spare);
                results.Add(new LookupResult(spare.ToString(), decodeWeight(completion.output)));
            }
            return(results);
        }