private LookupResult GetLookupResult(long? output1, BytesRef output2, CharsRef spare)
{
    LookupResult result;
    if (hasPayloads)
    {
        int sepIndex = -1;
        for (int i = 0; i < output2.Length; i++)
        {
            if (output2.Bytes[output2.Offset + i] == PAYLOAD_SEP)
            {
                sepIndex = i;
                break;
            }
        }
        Debug.Assert(sepIndex != -1);
        spare.Grow(sepIndex);

        int payloadLen = output2.Length - sepIndex - 1;
        UnicodeUtil.UTF8toUTF16(output2.Bytes, output2.Offset, sepIndex, spare);
        BytesRef payload = new BytesRef(payloadLen);
        Array.Copy(output2.Bytes, sepIndex + 1, payload.Bytes, 0, payloadLen);
        payload.Length = payloadLen;
        result = new LookupResult(spare.ToString(), DecodeWeight(output1.GetValueOrDefault()), payload);
    }
    else
    {
        spare.Grow(output2.Length);
        UnicodeUtil.UTF8toUTF16(output2, spare);
        result = new LookupResult(spare.ToString(), DecodeWeight(output1.GetValueOrDefault()));
    }
    return result;
}
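For context, here is a minimal sketch of the packing convention that GetLookupResult undoes: the suggester stores the surface form and the payload in a single BytesRef, joined by a reserved separator byte. The PayloadSep value and all names below are illustrative assumptions, not the suggester's private constants.

using System;
using Lucene.Net.Util;

internal static class PayloadPackingSketch
{
    // Hypothetical separator; the real PAYLOAD_SEP is a private constant of the suggester.
    private const byte PayloadSep = 0x1F;

    public static void Main()
    {
        var surface = new BytesRef("tokyo"); // BytesRef(string) encodes UTF-8
        var payload = new BytesRef("city");

        // Pack: surface bytes + separator + payload bytes into one BytesRef.
        var packed = new BytesRef(surface.Length + 1 + payload.Length);
        Array.Copy(surface.Bytes, surface.Offset, packed.Bytes, 0, surface.Length);
        packed.Bytes[surface.Length] = PayloadSep;
        Array.Copy(payload.Bytes, payload.Offset, packed.Bytes, surface.Length + 1, payload.Length);
        packed.Length = surface.Length + 1 + payload.Length;

        // Unpack: decode only the bytes before the separator to UTF-16,
        // exactly as GetLookupResult does above.
        var spare = new CharsRef();
        UnicodeUtil.UTF8toUTF16(packed.Bytes, packed.Offset, surface.Length, spare);
        Console.WriteLine(spare.ToString()); // tokyo
    }
}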
private void ReadField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
{
    ReadLine();
    Debug.Assert(StringHelper.StartsWith(scratch, VALUE));
    if (type == TYPE_STRING)
    {
        visitor.StringField(fieldInfo, Encoding.UTF8.GetString(scratch.Bytes, scratch.Offset + VALUE.Length, scratch.Length - VALUE.Length));
    }
    else if (type == TYPE_BINARY)
    {
        byte[] copy = new byte[scratch.Length - VALUE.Length];
        Array.Copy(scratch.Bytes, scratch.Offset + VALUE.Length, copy, 0, copy.Length);
        visitor.BinaryField(fieldInfo, copy);
    }
    else if (type == TYPE_INT)
    {
        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + VALUE.Length, scratch.Length - VALUE.Length, scratchUTF16);
        visitor.Int32Field(fieldInfo, Convert.ToInt32(scratchUTF16.ToString()));
    }
    else if (type == TYPE_LONG)
    {
        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + VALUE.Length, scratch.Length - VALUE.Length, scratchUTF16);
        visitor.Int64Field(fieldInfo, Convert.ToInt64(scratchUTF16.ToString()));
    }
    else if (type == TYPE_FLOAT)
    {
        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + VALUE.Length, scratch.Length - VALUE.Length, scratchUTF16);
        visitor.SingleField(fieldInfo, Convert.ToSingle(scratchUTF16.ToString()));
    }
    else if (type == TYPE_DOUBLE)
    {
        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + VALUE.Length, scratch.Length - VALUE.Length, scratchUTF16);
        visitor.DoubleField(fieldInfo, Convert.ToDouble(scratchUTF16.ToString()));
    }
}
/// <summary>
/// Adds terms and frequencies found in vector into the <see cref="T:IDictionary{string, Int32}"/> <paramref name="termFreqMap"/>
/// </summary>
/// <param name="termFreqMap"> a <see cref="T:IDictionary{string, Int32}"/> of terms and their frequencies </param>
/// <param name="vector"> List of terms and their frequencies for a doc/field </param>
private void AddTermFrequencies(IDictionary<string, Int32> termFreqMap, Terms vector)
{
    var termsEnum = vector.GetIterator(null);
    var spare = new CharsRef();
    BytesRef text;
    while ((text = termsEnum.Next()) != null)
    {
        UnicodeUtil.UTF8toUTF16(text, spare);
        var term = spare.ToString();
        if (IsNoiseWord(term))
        {
            continue;
        }
        var freq = (int)termsEnum.TotalTermFreq;

        // increment frequency
        Int32 cnt;
        if (!termFreqMap.TryGetValue(term, out cnt))
        {
            cnt = new Int32();
            termFreqMap[term] = cnt;
            cnt.x = freq;
        }
        else
        {
            cnt.x += freq;
        }
    }
}
public override void Build(InputIterator tfit)
{
    if (tfit.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }
    if (tfit.Comparator != null)
    {
        // make sure it's unsorted
        // WTF - this could result in yet another sorted iteration....
        tfit = new UnsortedInputIterator(tfit);
    }
    if (tfit.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    count = 0;
    trie = new JaspellTernarySearchTrie();
    trie.MatchAlmostDiff = editDistance;
    BytesRef spare;
    CharsRef charsSpare = new CharsRef();
    while ((spare = tfit.Next()) != null)
    {
        long weight = tfit.Weight;
        if (spare.Length == 0)
        {
            continue;
        }
        charsSpare.Grow(spare.Length);
        UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
        trie.Put(charsSpare.ToString(), Convert.ToInt64(weight));
    }
}
public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool higherWeightsFirst, int num)
{
    if (contexts != null)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    IList<FSTCompletion.Completion> completions;
    if (higherWeightsFirst)
    {
        completions = higherWeightsCompletion.DoLookup(key, num);
    }
    else
    {
        completions = normalCompletion.DoLookup(key, num);
    }

    List<LookupResult> results = new List<LookupResult>(completions.Count);
    CharsRef spare = new CharsRef();
    foreach (FSTCompletion.Completion c in completions)
    {
        spare.Grow(c.Utf8.Length);
        UnicodeUtil.UTF8toUTF16(c.Utf8, spare);
        results.Add(new LookupResult(spare.ToString(), c.Bucket));
    }
    return results;
}
public virtual void TestAppendChars()
{
    char[] chars = new char[] { 'a', 'b', 'c', 'd' };
    CharsRef c = new CharsRef(chars, 1, 3); // bcd
    c.Append(new char[] { 'e' }, 0, 1);
    Assert.AreEqual("bcde", c.ToString());
}
/// <summary>
/// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for the
/// given selection of fields from terms with a document frequency greater than
/// the given <paramref name="maxDocFreq"/>
/// </summary>
/// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
/// <param name="delegate"> Analyzer whose TokenStream will be filtered </param>
/// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
/// <param name="fields"> Selection of fields to calculate stopwords for </param>
/// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param>
/// <exception cref="IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, ICollection<string> fields, int maxDocFreq)
    : base(@delegate.Strategy)
{
    this.matchVersion = matchVersion;
    this.@delegate = @delegate;

    foreach (string field in fields)
    {
        var stopWords = new JCG.HashSet<string>();
        Terms terms = MultiFields.GetTerms(indexReader, field);
        CharsRef spare = new CharsRef();
        if (terms != null)
        {
            TermsEnum te = terms.GetEnumerator();
            while (te.MoveNext())
            {
                if (te.DocFreq > maxDocFreq)
                {
                    UnicodeUtil.UTF8toUTF16(te.Term, spare);
                    stopWords.Add(spare.ToString());
                }
            }
        }
        stopWordsPerField[field] = stopWords;
    }
}
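A hedged usage sketch for this constructor: wrap a base analyzer so that terms occurring in more than maxDocFreq documents of the scanned fields become query-time stopwords. The field name, threshold, index path, and helper class below are invented for illustration; the caller remains responsible for disposing the reader and directory.

using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Query;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Util;

internal static class StopWordAnalyzerSketch
{
    public static Analyzer Create()
    {
        // "path/to/index" stands in for an existing index directory.
        FSDirectory dir = FSDirectory.Open(new DirectoryInfo("path/to/index"));
        DirectoryReader reader = DirectoryReader.Open(dir);
        Analyzer baseAnalyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
        return new QueryAutoStopWordAnalyzer(
            LuceneVersion.LUCENE_48, baseAnalyzer, reader,
            new[] { "body" }, // fields to scan for high-frequency terms
            1000);            // terms in more than 1000 documents become query-time stopwords
    }
}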
public override void Build(IInputIterator tfit)
{
    if (tfit.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }
    if (tfit.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    root = new TernaryTreeNode();

    // buffer first
#pragma warning disable 612, 618
    if (tfit.Comparer != BytesRef.UTF8SortedAsUTF16Comparer)
    {
        // make sure it's sorted and the comparer uses UTF16 sort order
        tfit = new SortedInputIterator(tfit, BytesRef.UTF8SortedAsUTF16Comparer);
    }
#pragma warning restore 612, 618

    List<string> tokens = new List<string>();
    List<object> vals = new List<object>();
    BytesRef spare;
    CharsRef charsSpare = new CharsRef();
    while ((spare = tfit.Next()) != null)
    {
        charsSpare.Grow(spare.Length);
        UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
        tokens.Add(charsSpare.ToString());
        vals.Add(Convert.ToInt64(tfit.Weight));
    }
    autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count - 1, root);
}
private void ReadField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
{
    ReadLine();
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextStoredFieldsWriter.VALUE));
    }
    if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_STRING))
    {
        visitor.StringField(fieldInfo,
            Encoding.UTF8.GetString(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length,
                _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length));
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_BINARY))
    {
        var copy = new byte[_scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length];
        Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length, copy, 0, copy.Length);
        visitor.BinaryField(fieldInfo, copy);
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_INT))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length,
            _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length, _scratchUtf16);
        visitor.Int32Field(fieldInfo, J2N.Numerics.Int32.Parse(_scratchUtf16.ToString(), NumberFormatInfo.InvariantInfo));
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_LONG))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length,
            _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length, _scratchUtf16);
        visitor.Int64Field(fieldInfo, J2N.Numerics.Int64.Parse(_scratchUtf16.ToString(), NumberFormatInfo.InvariantInfo));
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_FLOAT))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length,
            _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length, _scratchUtf16);
        visitor.SingleField(fieldInfo, J2N.Numerics.Single.Parse(_scratchUtf16.ToString(), NumberStyle.Float, NumberFormatInfo.InvariantInfo));
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_DOUBLE))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length,
            _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length, _scratchUtf16);
        visitor.DoubleField(fieldInfo, J2N.Numerics.Double.Parse(_scratchUtf16.ToString(), NumberStyle.Float, NumberFormatInfo.InvariantInfo));
    }
}
private void ReadField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
{
    ReadLine();
    Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextStoredFieldsWriter.VALUE));
    if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_STRING))
    {
        visitor.StringField(fieldInfo,
            Encoding.UTF8.GetString(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length,
                _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length));
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_BINARY))
    {
        var copy = new byte[_scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length];
        Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length, copy, 0, copy.Length);
        visitor.BinaryField(fieldInfo, copy);
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_INT))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length,
            _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length, _scratchUtf16);
        visitor.Int32Field(fieldInfo, Convert.ToInt32(_scratchUtf16.ToString(), CultureInfo.InvariantCulture));
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_LONG))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length,
            _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length, _scratchUtf16);
        visitor.Int64Field(fieldInfo, Convert.ToInt64(_scratchUtf16.ToString(), CultureInfo.InvariantCulture));
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_FLOAT))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length,
            _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length, _scratchUtf16);
        visitor.SingleField(fieldInfo, Convert.ToSingle(_scratchUtf16.ToString(), CultureInfo.InvariantCulture));
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_DOUBLE))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length,
            _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length, _scratchUtf16);
        visitor.DoubleField(fieldInfo, Convert.ToDouble(_scratchUtf16.ToString(), CultureInfo.InvariantCulture));
    }
}
public virtual void TestAppend()
{
    CharsRef @ref = new CharsRef();
    StringBuilder builder = new StringBuilder();
    int numStrings = AtLeast(10);
    for (int i = 0; i < numStrings; i++)
    {
        char[] charArray = TestUtil.RandomRealisticUnicodeString(Random(), 1, 100).ToCharArray();
        int offset = Random().Next(charArray.Length);
        int length = charArray.Length - offset;
        builder.Append(charArray, offset, length);
        @ref.Append(charArray, offset, length);
    }
    Assert.AreEqual(builder.ToString(), @ref.ToString());
}
/// <summary>Convert to lowercase in-place.</summary>
private string ToLowercase(string chs)
{
    int length = chs.Length;
    scratch.Length = length;
    scratch.Grow(length);
    char[] buffer = scratch.Chars;
    for (int i = 0; i < length;)
    {
        i += Character.ToChars(
            Character.ToLower(Character.CodePointAt(chs, i), culture), buffer, i); // LUCENENET specific - need to use explicit culture to override current thread
    }
    return scratch.ToString();
}
/// <summary>Convert to lowercase in-place.</summary>
private string ToLowercase(string chs)
{
    int length = chs.Length;
    scratch.Length = length;
    scratch.Grow(length);
    char[] buffer = scratch.Chars;
    for (int i = 0; i < length;)
    {
        i += Character.ToChars(
            Character.ToLower(Character.CodePointAt(chs, i)), buffer, i);
    }
    return scratch.ToString();
}
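The two ToLowercase variants above differ only in whether an explicit culture is passed. As a BCL-only illustration of the same idea, the sketch below lowercases one code point at a time so surrogate pairs are treated as single characters rather than two independent chars; the helper name is hypothetical.

using System.Globalization;
using System.Text;

internal static class LowercaseSketch
{
    public static string ToLowerByCodePoint(string s, CultureInfo culture)
    {
        var sb = new StringBuilder(s.Length);
        for (int i = 0; i < s.Length;)
        {
            int codePoint = char.ConvertToUtf32(s, i);             // reads a surrogate pair as one code point
            sb.Append(char.ConvertFromUtf32(codePoint).ToLower(culture));
            i += char.IsSurrogatePair(s, i) ? 2 : 1;               // advance by the code point's UTF-16 width
        }
        return sb.ToString();
    }
}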
public override void Build(IInputEnumerator enumerator)
{
    // LUCENENET: Added guard clause for null
    if (enumerator is null)
    {
        throw new ArgumentNullException(nameof(enumerator));
    }
    if (enumerator.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }
    if (enumerator.Comparer != null)
    {
        // make sure it's unsorted
        // WTF - this could result in yet another sorted iteration....
        enumerator = new UnsortedInputEnumerator(enumerator);
    }
    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    count = 0;
    trie = new JaspellTernarySearchTrie { MatchAlmostDiff = editDistance };
    BytesRef spare;
    var charsSpare = new CharsRef();
    while (enumerator.MoveNext())
    {
        spare = enumerator.Current;
        long weight = enumerator.Weight;
        if (spare.Length == 0)
        {
            continue;
        }
        charsSpare.Grow(spare.Length);
        UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
        trie.Put(charsSpare.ToString(), weight);
    }
}
public override void Build(IInputEnumerator enumerator)
{
    // LUCENENET: Added guard clause for null
    if (enumerator is null)
    {
        throw new ArgumentNullException(nameof(enumerator));
    }
    if (enumerator.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }
    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    root = new TernaryTreeNode();

    // buffer first
#pragma warning disable 612, 618
    if (enumerator.Comparer != BytesRef.UTF8SortedAsUTF16Comparer)
    {
        // make sure it's sorted and the comparer uses UTF16 sort order
        enumerator = new SortedInputEnumerator(enumerator, BytesRef.UTF8SortedAsUTF16Comparer);
    }
#pragma warning restore 612, 618

    JCG.List<string> tokens = new JCG.List<string>();
    JCG.List<object> vals = new JCG.List<object>();
    BytesRef spare;
    CharsRef charsSpare = new CharsRef();
    while (enumerator.MoveNext())
    {
        spare = enumerator.Current;
        charsSpare.Grow(spare.Length);
        UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
        tokens.Add(charsSpare.ToString());
        vals.Add(enumerator.Weight);
    }
    autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count - 1, root);
}
/// <summary>
/// Adds terms and frequencies found in vector into the Map termFreqMap
/// </summary>
/// <param name="termFreqMap"> a Map of terms and their frequencies </param>
/// <param name="vector"> List of terms and their frequencies for a doc/field </param>
private void AddTermFrequencies(IDictionary<string, Int> termFreqMap, Terms vector)
{
    TermsEnum termsEnum = vector.Iterator(null);
    CharsRef spare = new CharsRef();
    BytesRef text;
    while ((text = termsEnum.Next()) != null)
    {
        UnicodeUtil.UTF8toUTF16(text, spare);
        string term = spare.ToString();
        if (IsNoiseWord(term))
        {
            continue;
        }
        int freq = (int)termsEnum.TotalTermFreq();

        // increment frequency (use TryGetValue: unlike Java's Map.get, the C#
        // dictionary indexer throws when the key is missing)
        Int cnt;
        if (!termFreqMap.TryGetValue(term, out cnt))
        {
            cnt = new Int();
            termFreqMap[term] = cnt;
            cnt.x = freq;
        }
        else
        {
            cnt.x += freq;
        }
    }
}
//public static void main( string[] args ) throws Exception {
//  Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT);
//  QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "f", analyzer );
//  Query query = parser.parse( "a x:b" );
//  FieldQuery fieldQuery = new FieldQuery( query, true, false );

//  Directory dir = new RAMDirectory();
//  IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer));
//  Document doc = new Document();
//  IndexableFieldType ft = new IndexableFieldType(TextField.TYPE_STORED);
//  ft.setStoreTermVectors(true);
//  ft.setStoreTermVectorOffsets(true);
//  ft.setStoreTermVectorPositions(true);
//  doc.add( new Field( "f", ft, "a a a b b c a b b c d e f" ) );
//  doc.add( new Field( "f", ft, "b a b a f" ) );
//  writer.addDocument( doc );
//  writer.close();

//  IndexReader reader = IndexReader.open(dir1);
//  new FieldTermStack( reader, 0, "f", fieldQuery );
//  reader.close();
//}

/// <summary>
/// a constructor.
/// </summary>
/// <param name="reader"><see cref="IndexReader"/> of the index</param>
/// <param name="docId">document id to be highlighted</param>
/// <param name="fieldName">field of the document to be highlighted</param>
/// <param name="fieldQuery"><see cref="FieldQuery"/> object</param>
/// <exception cref="IOException">If there is a low-level I/O error</exception>
public FieldTermStack(IndexReader reader, int docId, string fieldName, FieldQuery fieldQuery)
{
    this.fieldName = fieldName;

    ISet<string> termSet = fieldQuery.GetTermSet(fieldName);
    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    if (termSet == null)
    {
        return;
    }

    Fields vectors = reader.GetTermVectors(docId);
    if (vectors == null)
    {
        // null snippet
        return;
    }

    Terms vector = vectors.GetTerms(fieldName);
    if (vector == null)
    {
        // null snippet
        return;
    }

    CharsRef spare = new CharsRef();
    TermsEnum termsEnum = vector.GetEnumerator();
    DocsAndPositionsEnum dpEnum = null;
    BytesRef text;

    int numDocs = reader.MaxDoc;

    while (termsEnum.MoveNext())
    {
        text = termsEnum.Term;
        UnicodeUtil.UTF8toUTF16(text, spare);
        string term = spare.ToString();
        if (!termSet.Contains(term))
        {
            continue;
        }
        dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
        if (dpEnum == null)
        {
            // null snippet
            return;
        }

        dpEnum.NextDoc();

        // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
        float weight = (float)(Math.Log(numDocs / (double)(reader.DocFreq(new Term(fieldName, text)) + 1)) + 1.0);

        int freq = dpEnum.Freq;

        for (int i = 0; i < freq; i++)
        {
            int pos = dpEnum.NextPosition();

            if (dpEnum.StartOffset < 0)
            {
                return; // no offsets, null snippet
            }
            termList.Add(new TermInfo(term, dpEnum.StartOffset, dpEnum.EndOffset, pos, weight));
        }
    }

    // sort by position
    CollectionUtil.TimSort(termList);

    // now look for dups at the same position, linking them together
    int currentPos = -1;
    TermInfo previous = null;
    TermInfo first = null;

    for (int i = 0; i < termList.Count;)
    {
        TermInfo current = termList[i];
        if (current.Position == currentPos)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(previous != null);
            }
            previous.SetNext(current);
            previous = current;
            //iterator.Remove(); // LUCENENET NOTE: Remove, but don't advance the i position (since removing will advance to the next item)
            termList.RemoveAt(i);
        }
        else
        {
            if (previous != null)
            {
                previous.SetNext(first);
            }
            previous = first = current;
            currentPos = current.Position;
            // LUCENENET NOTE: Only increment the position if we don't do a delete.
            i++;
        }
    }

    if (previous != null)
    {
        previous.SetNext(first);
    }
}
public virtual void TestCopy()
{
    int numIters = AtLeast(10);
    for (int i = 0; i < numIters; i++)
    {
        CharsRef @ref = new CharsRef();
        char[] charArray = TestUtil.RandomRealisticUnicodeString(Random(), 1, 100).ToCharArray();
        int offset = Random().Next(charArray.Length);
        int length = charArray.Length - offset;
        string str = new string(charArray, offset, length);
        @ref.CopyChars(charArray, offset, length);
        Assert.AreEqual(str, @ref.ToString());
    }
}
public override void Build(InputIterator tfit)
{
    if (tfit.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }
    if (tfit.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    root = new TernaryTreeNode();

    // buffer first
    if (tfit.Comparator != BytesRef.UTF8SortedAsUTF16Comparator)
    {
        // make sure it's sorted and the comparator uses UTF16 sort order
        tfit = new SortedInputIterator(tfit, BytesRef.UTF8SortedAsUTF16Comparator);
    }

    List<string> tokens = new List<string>();
    List<object> vals = new List<object>(); // Java's Number has no C# counterpart, so box the weights
    BytesRef spare;
    CharsRef charsSpare = new CharsRef();
    while ((spare = tfit.Next()) != null)
    {
        charsSpare.Grow(spare.Length);
        UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
        tokens.Add(charsSpare.ToString());
        vals.Add(Convert.ToInt64(tfit.Weight));
    }
    autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count - 1, root);
}
private string ReadString(int offset, BytesRef scratch)
{
    UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + offset, scratch.Length - offset, _scratchUtf16);
    return _scratchUtf16.ToString();
}
public virtual void TestCopyCharsRef()
{
    char[] chars = new char[] { 'a', 'b', 'c', 'd' };
    CharsRef c = new CharsRef(chars, 1, 3); // bcd

    char[] otherchars = new char[] { 'b', 'c', 'd', 'e' };
    c.CopyChars(new CharsRef(otherchars, 0, 4));
    Assert.AreEqual("bcde", c.ToString());
}
public override void Build(IInputIterator tfit)
{
    if (tfit.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }
    if (tfit.Comparator != null)
    {
        // make sure it's unsorted
        // WTF - this could result in yet another sorted iteration....
        tfit = new UnsortedInputIterator(tfit);
    }
    if (tfit.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    count = 0;
    trie = new JaspellTernarySearchTrie { MatchAlmostDiff = editDistance };
    BytesRef spare;
    var charsSpare = new CharsRef();
    while ((spare = tfit.Next()) != null)
    {
        long weight = tfit.Weight;
        if (spare.Length == 0)
        {
            continue;
        }
        charsSpare.Grow(spare.Length);
        UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
        trie.Put(charsSpare.ToString(), Convert.ToInt64(weight));
    }
}
public override void Build(IInputIterator tfit)
{
    if (tfit.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }
    if (tfit.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    root = new TernaryTreeNode();

    // buffer first
#pragma warning disable 612, 618
    if (tfit.Comparator != BytesRef.UTF8SortedAsUTF16Comparer)
    {
        // make sure it's sorted and the comparator uses UTF16 sort order
        tfit = new SortedInputIterator(tfit, BytesRef.UTF8SortedAsUTF16Comparer);
    }
#pragma warning restore 612, 618

    List<string> tokens = new List<string>();
    List<object> vals = new List<object>(); // LUCENENET TODO: Should this be long? in Java it was Number, but we can probably do better than object
    BytesRef spare;
    CharsRef charsSpare = new CharsRef();
    while ((spare = tfit.Next()) != null)
    {
        charsSpare.Grow(spare.Length);
        UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
        tokens.Add(charsSpare.ToString());
        vals.Add(Convert.ToInt64(tfit.Weight));
    }
    autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count - 1, root);
}
/// <summary>
/// Retrieve suggestions.
/// </summary>
public virtual IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, int num)
{
    if (contexts != null)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }

    TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString());
    try
    {
        ITermToBytesRefAttribute termBytesAtt = ts.AddAttribute<ITermToBytesRefAttribute>();
        IOffsetAttribute offsetAtt = ts.AddAttribute<IOffsetAttribute>();
        IPositionLengthAttribute posLenAtt = ts.AddAttribute<IPositionLengthAttribute>();
        IPositionIncrementAttribute posIncAtt = ts.AddAttribute<IPositionIncrementAttribute>();
        ts.Reset();

        var lastTokens = new BytesRef[grams];
        //System.out.println("lookup: key='" + key + "'");

        // Run full analysis, but save only the
        // last 1gram, last 2gram, etc.:
        BytesRef tokenBytes = termBytesAtt.BytesRef;
        int maxEndOffset = -1;
        bool sawRealToken = false;
        while (ts.IncrementToken())
        {
            termBytesAtt.FillBytesRef();
            sawRealToken |= tokenBytes.Length > 0;
            // TODO: this is somewhat iffy; today, ShingleFilter
            // sets posLen to the gram count; maybe we should make
            // a separate dedicated att for this?
            int gramCount = posLenAtt.PositionLength;

            Debug.Assert(gramCount <= grams);

            // Safety: make sure the recalculated count "agrees":
            if (CountGrams(tokenBytes) != gramCount)
            {
                throw new ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
            }
            maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
            lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
        }
        ts.End();

        if (!sawRealToken)
        {
            throw new ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
        }

        // Carefully fill last tokens with _ tokens;
        // ShingleFilter apparently won't emit "only hole"
        // tokens:
        int endPosInc = posIncAtt.PositionIncrement;

        // Note this will also be true if input is the empty
        // string (in which case we saw no tokens and
        // maxEndOffset is still -1), which in fact works out OK
        // because we fill the unigram with an empty BytesRef
        // below:
        bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0;
        //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset);

        if (lastTokenEnded)
        {
            //System.out.println("  lastTokenEnded");
            // If user hit space after the last token, then
            // "upgrade" all tokens.  This way "foo " will suggest
            // all bigrams starting w/ foo, and not any unigrams
            // starting with "foo":
            for (int i = grams - 1; i > 0; i--)
            {
                BytesRef token = lastTokens[i - 1];
                if (token == null)
                {
                    continue;
                }
                token.Grow(token.Length + 1);
                token.Bytes[token.Length] = separator;
                token.Length++;
                lastTokens[i] = token;
            }
            lastTokens[0] = new BytesRef();
        }

        var arc = new FST.Arc<long?>();

        var bytesReader = fst.GetBytesReader();

        // Try highest order models first, and if they return
        // results, return that; else, fallback:
        double backoff = 1.0;

        List<LookupResult> results = new List<LookupResult>(num);

        // We only add a given suffix once, from the highest
        // order model that saw it; for subsequent lower order
        // models we skip it:
        var seen = new HashSet<BytesRef>();

        for (int gram = grams - 1; gram >= 0; gram--)
        {
            BytesRef token = lastTokens[gram];
            // Don't make unigram predictions from empty string:
            if (token == null || (token.Length == 0 && key.Length > 0))
            {
                // Input didn't have enough tokens:
                //System.out.println("  gram=" + gram + ": skip: not enough input");
                continue;
            }

            if (endPosInc > 0 && gram <= endPosInc)
            {
                // Skip hole-only predictions; in theory we
                // shouldn't have to do this, but we'd need to fix
                // ShingleFilter to produce only-hole tokens:
                //System.out.println("  break: only holes now");
                break;
            }

            //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

            // TODO: we could add fuzziness here

            // match the prefix portion exactly
            //Pair<Long,BytesRef> prefixOutput = null;
            long? prefixOutput = null;
            try
            {
                prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }
            //System.out.println("  prefixOutput=" + prefixOutput);

            if (prefixOutput == null)
            {
                // This model never saw this prefix, e.g. the
                // trigram model never saw context "purple mushroom"
                backoff *= ALPHA;
                continue;
            }

            // TODO: we could do this division at build time, and
            // bake it into the FST?

            // Denominator for computing scores from current
            // model's predictions:
            long contextCount = totTokens;

            BytesRef lastTokenFragment = null;

            for (int i = token.Length - 1; i >= 0; i--)
            {
                if (token.Bytes[token.Offset + i] == separator)
                {
                    BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                    long? output = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef()));
                    Debug.Assert(output != null);
                    contextCount = DecodeWeight(output);
                    lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                    break;
                }
            }

            BytesRef finalLastToken;
            if (lastTokenFragment == null)
            {
                finalLastToken = BytesRef.DeepCopyOf(token);
            }
            else
            {
                finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
            }
            Debug.Assert(finalLastToken.Offset == 0);

            CharsRef spare = new CharsRef();

            // complete top-N
            Util.Fst.Util.TopResults<long?> completions = null;
            try
            {
                // Because we store multiple models in one FST
                // (1gram, 2gram, 3gram), we must restrict the
                // search so that it only considers the current
                // model.  For highest order model, this is not
                // necessary since all completions in the FST
                // must be from this model, but for lower order
                // models we have to filter out the higher order
                // ones:

                // Must do num+seen.size() for queue depth because we may
                // reject up to seen.size() paths in acceptResult():
                Util.Fst.Util.TopNSearcher<long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken);

                // since this search is initialized with a single start node
                // it is okay to start with an empty input path here
                searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef());

                completions = searcher.Search();
                Debug.Assert(completions.IsComplete);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.ToString(), bogus);
            }

            int prefixLength = token.Length;

            BytesRef suffix = new BytesRef(8);
            //System.out.println("  " + completions.length + " completions");

            foreach (Util.Fst.Util.Result<long?> completion in completions)
            {
                token.Length = prefixLength;
                // append suffix
                Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                token.Append(suffix);

                //System.out.println("    completion " + token.utf8ToString());

                // Skip this path if a higher-order model already
                // saw/predicted its last token:
                BytesRef lastToken = token;
                for (int i = token.Length - 1; i >= 0; i--)
                {
                    if (token.Bytes[token.Offset + i] == separator)
                    {
                        Debug.Assert(token.Length - i - 1 > 0);
                        lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                        break;
                    }
                }
                if (seen.Contains(lastToken))
                {
                    //System.out.println("      skip dup " + lastToken.utf8ToString());
                    goto nextCompletionContinue;
                }
                seen.Add(BytesRef.DeepCopyOf(lastToken));
                spare.Grow(token.Length);
                UnicodeUtil.UTF8toUTF16(token, spare);
                LookupResult result = new LookupResult(spare.ToString(),
                    // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                    // return numbers that are greater than long.MaxValue, which results in a negative long number.
                    (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                results.Add(result);
                Debug.Assert(results.Count == seen.Count);
                //System.out.println("  add result=" + result);
                nextCompletionContinue: ;
            }
            backoff *= ALPHA;
        }

        results.Sort(new ComparerAnonymousInnerClassHelper(this));

        if (results.Count > num)
        {
            results.SubList(num, results.Count).Clear();
        }

        return results;
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(ts);
    }
}
public virtual void TestUTF8UTF16CharsRef()
{
    int num = AtLeast(3989);
    for (int i = 0; i < num; i++)
    {
        string unicode = TestUtil.RandomRealisticUnicodeString(Random());
        BytesRef @ref = new BytesRef(unicode);
        char[] arr = new char[1 + Random().Next(100)];
        int offset = Random().Next(arr.Length);
        int len = Random().Next(arr.Length - offset);
        CharsRef cRef = new CharsRef(arr, offset, len);
        UnicodeUtil.UTF8toUTF16(@ref, cRef);
        Assert.AreEqual(cRef.ToString(), unicode);
    }
}
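A compact round trip of what this test exercises, assuming the Lucene.NET APIs used throughout these examples. UTF8toUTF16 resets the destination's offset and grows its char buffer itself, which is why the test's arbitrarily sized and offset arr still works.

using Lucene.Net.Util;

internal static class RoundTripSketch
{
    public static void Main()
    {
        var utf8 = new BytesRef("héllo wörld"); // BytesRef(string) encodes UTF-8
        var utf16 = new CharsRef();
        UnicodeUtil.UTF8toUTF16(utf8, utf16);   // decodes back to UTF-16, growing utf16.Chars as needed
        System.Console.WriteLine(utf16.ToString()); // héllo wörld
    }
}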
/// <summary>
/// Suggest similar words.
///
/// <para>
/// Unlike <see cref="SpellChecker"/>, the similarity used to fetch the most
/// relevant terms is an edit distance, therefore typically a low value
/// for numSug will work very well.
/// </para>
/// </summary>
/// <param name="term"> Term you want to spell check on </param>
/// <param name="numSug"> the maximum number of suggested words </param>
/// <param name="ir"> IndexReader to find terms from </param>
/// <param name="suggestMode"> specifies when to return suggested words </param>
/// <param name="accuracy"> return only suggested words that match with this similarity </param>
/// <returns> sorted list of the suggested words according to the comparer </returns>
/// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
public virtual SuggestWord[] SuggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode, float accuracy)
{
    CharsRef spare = new CharsRef();
    string text = term.Text();
    if (minQueryLength > 0 && text.CodePointCount(0, text.Length) < minQueryLength)
    {
        return new SuggestWord[0];
    }

    if (lowerCaseTerms)
    {
        term = new Term(term.Field, text.ToLower());
    }

    int docfreq = ir.DocFreq(term);

    if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0)
    {
        return new SuggestWord[0];
    }

    int maxDoc = ir.MaxDoc;

    if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency)
    {
        return new SuggestWord[0];
    }
    else if (docfreq > (int)Math.Ceiling(maxQueryFrequency * maxDoc))
    {
        return new SuggestWord[0];
    }

    if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR)
    {
        docfreq = 0;
    }

    if (thresholdFrequency >= 1f)
    {
        docfreq = Math.Max(docfreq, (int)thresholdFrequency);
    }
    else if (thresholdFrequency > 0f)
    {
        docfreq = Math.Max(docfreq, (int)(thresholdFrequency * maxDoc) - 1);
    }

    IEnumerable<ScoreTerm> terms = null;
    int inspections = numSug * maxInspections;

    // try ed=1 first, in case we get lucky
    terms = SuggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
    if (maxEdits > 1 && terms.Count() < inspections)
    {
        var moreTerms = new HashSet<ScoreTerm>();
        moreTerms.AddAll(terms);
        moreTerms.AddAll(SuggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
        terms = moreTerms;
    }

    // create the suggestword response, sort it, and trim it to size.
    var suggestions = new SuggestWord[terms.Count()];
    int index = suggestions.Length - 1;
    foreach (ScoreTerm s in terms)
    {
        SuggestWord suggestion = new SuggestWord();
        if (s.TermAsString == null)
        {
            UnicodeUtil.UTF8toUTF16(s.Term, spare);
            s.TermAsString = spare.ToString();
        }
        suggestion.String = s.TermAsString;
        suggestion.Score = s.Score;
        suggestion.Freq = s.Docfreq;
        suggestions[index--] = suggestion;
    }

    ArrayUtil.TimSort(suggestions, Collections.ReverseOrder(comparer));
    if (numSug < suggestions.Length)
    {
        SuggestWord[] trimmed = new SuggestWord[numSug];
        Array.Copy(suggestions, 0, trimmed, 0, numSug);
        suggestions = trimmed;
    }
    return suggestions;
}
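A hedged usage sketch of the method above; the API shape matches DirectSpellChecker, but the field name, misspelling, suggestion count, and accuracy are invented values for illustration.

using System;
using Lucene.Net.Index;
using Lucene.Net.Search.Spell;

internal static class SpellCheckSketch
{
    // 'reader' must be an open IndexReader over the target index.
    public static void PrintSuggestions(IndexReader reader)
    {
        var spellChecker = new DirectSpellChecker();
        SuggestWord[] suggestions = spellChecker.SuggestSimilar(
            new Term("body", "lucenne"), 5, reader,
            SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, 0.5f);
        foreach (SuggestWord s in suggestions)
        {
            Console.WriteLine($"{s.String} (score={s.Score}, freq={s.Freq})");
        }
    }
}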
public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool onlyMorePopular, int num)
{
    if (contexts != null)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(num > 0);
    }

    if (onlyMorePopular)
    {
        throw new ArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (fst == null)
    {
        return Collections.EmptyList<LookupResult>();
    }

    BytesRef scratch = new BytesRef(key);
    int prefixLength = scratch.Length;
    FST.Arc<long?> arc = new FST.Arc<long?>();

    // match the prefix portion exactly
    long? prefixOutput = null;
    try
    {
        prefixOutput = LookupPrefix(scratch, arc);
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.ToString(), bogus);
    }

    if (prefixOutput == null)
    {
        return Collections.EmptyList<LookupResult>();
    }

    List<LookupResult> results = new List<LookupResult>(num);
    CharsRef spare = new CharsRef();
    if (exactFirst && arc.IsFinal)
    {
        spare.Grow(scratch.Length);
        UnicodeUtil.UTF8toUTF16(scratch, spare);
        results.Add(new LookupResult(spare.ToString(), DecodeWeight(prefixOutput.GetValueOrDefault() + arc.NextFinalOutput.GetValueOrDefault())));
        if (--num == 0)
        {
            return results; // that was quick
        }
    }

    // complete top-N
    Util.Fst.Util.TopResults<long?> completions = null;
    try
    {
        completions = Lucene.Net.Util.Fst.Util.ShortestPaths(fst, arc, prefixOutput, weightComparer, num, !exactFirst);
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(completions.IsComplete);
        }
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.ToString(), bogus);
    }

    BytesRef suffix = new BytesRef(8);
    foreach (Util.Fst.Util.Result<long?> completion in completions)
    {
        scratch.Length = prefixLength;
        // append suffix
        Lucene.Net.Util.Fst.Util.ToBytesRef(completion.Input, suffix);
        scratch.Append(suffix);
        spare.Grow(scratch.Length);
        UnicodeUtil.UTF8toUTF16(scratch, spare);
        results.Add(new LookupResult(spare.ToString(), DecodeWeight(completion.Output.GetValueOrDefault())));
    }
    return results;
}
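DecodeWeight above maps an FST output (a cost) back to a weight: ShortestPaths returns the smallest outputs first, so a high weight must be stored as a small cost. A sketch of that convention, along the lines of what Java Lucene's WFSTCompletionLookup does; treat this as an assumption about the snippet's surrounding class rather than verified code from it.

// Stored cost = int.MaxValue - weight, so the highest weight becomes the
// cheapest path and is found first by ShortestPaths.
private static long EncodeWeight(long value)
{
    if (value < 0 || value > int.MaxValue)
    {
        throw new NotSupportedException("cannot encode value: " + value);
    }
    return int.MaxValue - value;
}

private static long DecodeWeight(long encoded)
{
    return int.MaxValue - encoded;
}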
public override List<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool higherWeightsFirst, int num)
{
    if (contexts != null)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    IList<FSTCompletion.Completion> completions;
    if (higherWeightsFirst)
    {
        completions = higherWeightsCompletion.DoLookup(key, num);
    }
    else
    {
        completions = normalCompletion.DoLookup(key, num);
    }

    List<LookupResult> results = new List<LookupResult>(completions.Count);
    CharsRef spare = new CharsRef();
    foreach (FSTCompletion.Completion c in completions)
    {
        spare.Grow(c.utf8.Length);
        UnicodeUtil.UTF8toUTF16(c.utf8, spare);
        results.Add(new LookupResult(spare.ToString(), c.bucket));
    }
    return results;
}
/// <summary>
/// Builds an <see cref="SynonymMap"/> and returns it.
/// </summary>
public virtual SynonymMap Build()
{
    ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
    // TODO: are we using the best sharing options?
    var builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

    BytesRef scratch = new BytesRef(64);
    ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

    HashSet<int?> dedupSet;
    if (dedup)
    {
        dedupSet = new HashSet<int?>();
    }
    else
    {
        dedupSet = null;
    }

    var spare = new byte[5];

    ICollection<CharsRef> keys = workingSet.Keys;
    CharsRef[] sortedKeys = keys.ToArray();
#pragma warning disable 612, 618
    System.Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparer);
#pragma warning restore 612, 618

    Int32sRef scratchIntsRef = new Int32sRef();

    //System.out.println("fmap.build");
    for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++)
    {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet[input];

        int numEntries = output.ords.Count;
        // output size, assume the worst case
        int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

        scratch.Grow(estimatedSize);
        scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
        Debug.Assert(scratch.Offset == 0);

        // now write our output data:
        int count = 0;
        for (int i = 0; i < numEntries; i++)
        {
            if (dedupSet != null)
            {
                // box once
                int? ent = output.ords[i];
                if (dedupSet.Contains(ent))
                {
                    continue;
                }
                dedupSet.Add(ent);
            }
            scratchOutput.WriteVInt32(output.ords[i]);
            count++;
        }

        int pos = scratchOutput.Position;
        scratchOutput.WriteVInt32(count << 1 | (output.includeOrig ? 0 : 1));
        int pos2 = scratchOutput.Position;
        int vIntLen = pos2 - pos;

        // Move the count + includeOrig to the front of the byte[]:
        Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen);
        Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos);
        Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen);

        if (dedupSet != null)
        {
            dedupSet.Clear();
        }

        scratch.Length = scratchOutput.Position - scratch.Offset;
        //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
        builder.Add(Lucene.Net.Util.Fst.Util.ToUTF32(input.ToString(), scratchIntsRef), BytesRef.DeepCopyOf(scratch));
    }

    FST<BytesRef> fst = builder.Finish();
    return new SynonymMap(fst, words, maxHorizontalContext);
}
/// <summary>
/// Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
/// given selection of fields from terms with a document frequency greater than
/// the given maxDocFreq
/// </summary>
/// <param name="matchVersion"> Version to be used in <seealso cref="StopFilter"/> </param>
/// <param name="delegate"> Analyzer whose TokenStream will be filtered </param>
/// <param name="indexReader"> IndexReader to identify the stopwords from </param>
/// <param name="fields"> Selection of fields to calculate stopwords for </param>
/// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param>
/// <exception cref="IOException"> Can be thrown while reading from the IndexReader </exception>
public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, ICollection<string> fields, int maxDocFreq)
    : base(@delegate.Strategy)
{
    this.matchVersion = matchVersion;
    this.@delegate = @delegate;

    foreach (string field in fields)
    {
        var stopWords = new HashSet<string>();
        Terms terms = MultiFields.GetTerms(indexReader, field);
        CharsRef spare = new CharsRef();
        if (terms != null)
        {
            TermsEnum te = terms.Iterator(null);
            BytesRef text;
            while ((text = te.Next()) != null)
            {
                if (te.DocFreq() > maxDocFreq)
                {
                    UnicodeUtil.UTF8toUTF16(text, spare);
                    stopWords.Add(spare.ToString());
                }
            }
        }
        stopWordsPerField[field] = stopWords;
    }
}
/// <summary>
/// Provide spelling corrections based on several parameters.
/// </summary>
/// <param name="term"> The term to suggest spelling corrections for </param>
/// <param name="numSug"> The maximum number of spelling corrections </param>
/// <param name="ir"> The index reader to fetch the candidate spelling corrections from </param>
/// <param name="docfreq"> The minimum document frequency a potential suggestion need to have in order to be included </param>
/// <param name="editDistance"> The maximum edit distance candidates are allowed to have </param>
/// <param name="accuracy"> The minimum accuracy a suggested spelling correction needs to have in order to be included </param>
/// <param name="spare"> a chars scratch </param>
/// <returns> a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. </returns>
/// <exception cref="System.IO.IOException"> If I/O related errors occur </exception>
protected internal virtual IEnumerable<ScoreTerm> SuggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, CharsRef spare)
{
    var atts = new AttributeSource();
    IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();
    Terms terms = MultiFields.GetTerms(ir, term.Field);
    if (terms == null)
    {
        return new List<ScoreTerm>();
    }
    FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.Max(minPrefix, editDistance - 1), true);

    var stQueue = new Support.PriorityQueue<ScoreTerm>();

    BytesRef queryTerm = new BytesRef(term.Text());
    BytesRef candidateTerm;
    ScoreTerm st = new ScoreTerm();
    IBoostAttribute boostAtt = e.Attributes.AddAttribute<IBoostAttribute>();
    while ((candidateTerm = e.Next()) != null)
    {
        float boost = boostAtt.Boost;
        // ignore uncompetitive hits
        if (stQueue.Count >= numSug && boost <= stQueue.Peek().Boost)
        {
            continue;
        }
        // ignore exact match of the same term
        if (queryTerm.BytesEquals(candidateTerm))
        {
            continue;
        }
        int df = e.DocFreq;

        // check docFreq if required
        if (df <= docfreq)
        {
            continue;
        }

        float score;
        string termAsString;
        if (distance == INTERNAL_LEVENSHTEIN)
        {
            // delay creating strings until the end
            termAsString = null;
            // undo FuzzyTermsEnum's scale factor for a real scaled lev score
            score = boost / e.ScaleFactor + e.MinSimilarity;
        }
        else
        {
            UnicodeUtil.UTF8toUTF16(candidateTerm, spare);
            termAsString = spare.ToString();
            score = distance.GetDistance(term.Text(), termAsString);
        }

        if (score < accuracy)
        {
            continue;
        }

        // add new entry in PQ
        st.Term = BytesRef.DeepCopyOf(candidateTerm);
        st.Boost = boost;
        st.Docfreq = df;
        st.TermAsString = termAsString;
        st.Score = score;
        stQueue.Offer(st);
        // possibly drop entries from queue
        st = (stQueue.Count > numSug) ? stQueue.Poll() : new ScoreTerm();
        maxBoostAtt.MaxNonCompetitiveBoost = (stQueue.Count >= numSug) ? stQueue.Peek().Boost : float.NegativeInfinity;
    }

    return stQueue;
}
public override IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, bool onlyMorePopular, int num)
{
    if (contexts != null)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    Debug.Assert(num > 0);

    if (onlyMorePopular)
    {
        throw new ArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (fst == null)
    {
        return Collections.EmptyList<LookupResult>();
    }

    BytesRef scratch = new BytesRef(key);
    int prefixLength = scratch.Length;
    FST.Arc<long?> arc = new FST.Arc<long?>();

    // match the prefix portion exactly
    long? prefixOutput = null;
    try
    {
        prefixOutput = LookupPrefix(scratch, arc);
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.ToString(), bogus);
    }

    if (prefixOutput == null)
    {
        return Collections.EmptyList<LookupResult>();
    }

    IList<LookupResult> results = new List<LookupResult>(num);
    CharsRef spare = new CharsRef();
    if (exactFirst && arc.IsFinal)
    {
        spare.Grow(scratch.Length);
        UnicodeUtil.UTF8toUTF16(scratch, spare);
        results.Add(new LookupResult(spare.ToString(), DecodeWeight(prefixOutput.GetValueOrDefault() + arc.NextFinalOutput.GetValueOrDefault())));
        if (--num == 0)
        {
            return results; // that was quick
        }
    }

    // complete top-N
    TopResults<long?> completions = null;
    try
    {
        completions = Util.ShortestPaths(fst, arc, prefixOutput, weightComparator, num, !exactFirst);
        Debug.Assert(completions.IsComplete);
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.ToString(), bogus);
    }

    BytesRef suffix = new BytesRef(8);
    foreach (Result<long?> completion in completions)
    {
        scratch.Length = prefixLength;
        // append suffix
        Util.ToBytesRef(completion.Input, suffix);
        scratch.Append(suffix);
        spare.Grow(scratch.Length);
        UnicodeUtil.UTF8toUTF16(scratch, spare);
        results.Add(new LookupResult(spare.ToString(), DecodeWeight(completion.Output.GetValueOrDefault())));
    }
    return results;
}