/// <summary>
/// Reads the next line into <c>scratch</c> (asserted to start with <c>VALUE</c>) and passes the
/// text that follows that prefix to the <paramref name="visitor"/> callback selected by
/// <paramref name="type"/>. Nothing is reported when <paramref name="type"/> matches none of the
/// <c>TYPE_*</c> markers.
/// </summary>
/// <remarks>
/// The Java original declared <c>throws java.io.IOException</c>; .NET has no 'throws' clause.
/// </remarks>
/// <param name="type"> marker compared with <c>==</c> against the <c>TYPE_*</c> constants </param>
/// <param name="fieldInfo"> forwarded unchanged to the visitor callback </param>
/// <param name="visitor"> receives the decoded value </param>
private void readField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
{
    readLine();
    Debug.Assert(StringHelper.StartsWith(scratch, VALUE));

    if (type == TYPE_STRING)
    {
        // NOTE(review): converter leftover - StandardCharsets is a Java type and this string
        // constructor shape looks Java-specific; confirm the intended .NET decoding call.
        visitor.stringField(fieldInfo, new string(scratch.bytes, scratch.offset + VALUE.length, scratch.length - VALUE.length, StandardCharsets.UTF_8));
    }
    else if (type == TYPE_BINARY)
    {
        // Hand the visitor its own copy so later reuse of scratch cannot alter it.
        sbyte[] copy = new sbyte[scratch.length - VALUE.length];
        Array.Copy(scratch.bytes, scratch.offset + VALUE.length, copy, 0, copy.Length);
        visitor.binaryField(fieldInfo, copy);
    }
    else if (type == TYPE_INT)
    {
        UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset + VALUE.length, scratch.length - VALUE.length, scratchUTF16);
        // Numbers are parsed with the invariant culture so the result does not depend on the
        // current thread culture (sign, digit and decimal-separator conventions).
        visitor.intField(fieldInfo, Convert.ToInt32(scratchUTF16.ToString(), System.Globalization.CultureInfo.InvariantCulture));
    }
    else if (type == TYPE_LONG)
    {
        UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset + VALUE.length, scratch.length - VALUE.length, scratchUTF16);
        visitor.longField(fieldInfo, Convert.ToInt64(scratchUTF16.ToString(), System.Globalization.CultureInfo.InvariantCulture));
    }
    else if (type == TYPE_FLOAT)
    {
        UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset + VALUE.length, scratch.length - VALUE.length, scratchUTF16);
        visitor.floatField(fieldInfo, Convert.ToSingle(scratchUTF16.ToString(), System.Globalization.CultureInfo.InvariantCulture));
    }
    else if (type == TYPE_DOUBLE)
    {
        UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset + VALUE.length, scratch.length - VALUE.length, scratchUTF16);
        visitor.doubleField(fieldInfo, Convert.ToDouble(scratchUTF16.ToString(), System.Globalization.CultureInfo.InvariantCulture));
    }
}
/// <summary>
/// Looks up completions for <paramref name="key"/> and returns them as
/// <see cref="LookupResult"/>s built from each completion's UTF-8 text and bucket.
/// </summary>
/// <param name="key"> text to complete </param>
/// <param name="contexts"> must be <c>null</c>; this suggester rejects contexts </param>
/// <param name="higherWeightsFirst"> when <c>true</c> the lookup goes through
/// <c>higherWeightsCompletion</c>, otherwise through <c>normalCompletion</c> </param>
/// <param name="num"> passed through to the underlying completion lookup </param>
/// <exception cref="ArgumentException"> if <paramref name="contexts"/> is not <c>null</c> </exception>
public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool higherWeightsFirst, int num)
{
    if (contexts != null)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }

    IList<FSTCompletion.Completion> found = higherWeightsFirst
        ? higherWeightsCompletion.DoLookup(key, num)
        : normalCompletion.DoLookup(key, num);

    // One reusable UTF-16 scratch buffer for all conversions.
    CharsRef utf16 = new CharsRef();
    List<LookupResult> converted = new List<LookupResult>(found.Count);
    foreach (FSTCompletion.Completion completion in found)
    {
        utf16.Grow(completion.Utf8.Length);
        UnicodeUtil.UTF8toUTF16(completion.Utf8, utf16);
        converted.Add(new LookupResult(utf16.ToString(), completion.Bucket));
    }
    return converted;
}
/// <summary>
/// Decompresses <paramref name="value_Renamed"/> with <c>Decompress</c> and decodes the
/// resulting bytes as UTF-8 into a string (the inverse of compressString).
/// </summary>
/// <param name="value_Renamed"> compressed bytes </param>
/// <returns> the decoded string </returns>
public static System.String DecompressString(byte[] value_Renamed)
{
    byte[] utf8 = Decompress(value_Renamed);
    UnicodeUtil.UTF16Result decoded = new UnicodeUtil.UTF16Result();
    UnicodeUtil.UTF8toUTF16(utf8, 0, utf8.Length, decoded);
    return new System.String(decoded.result, 0, decoded.length);
}
/// <summary>
/// Reads the next term from <paramref name="input"/>: two VInts (<c>start</c> and
/// <c>length</c>), then <c>length</c> units written into the term buffer at <c>start</c>,
/// then a VInt field number that is resolved through <paramref name="fieldInfos"/>.
/// </summary>
/// <param name="input"> source of the encoded term </param>
/// <param name="fieldInfos"> used to map the trailing field number to a field name </param>
public void Read(IndexInput input, FieldInfos fieldInfos)
{
    this.term = null; // invalidate cache

    int offset = input.ReadVInt();
    int count = input.ReadVInt();
    int newLength = offset + count;

    if (!preUTF8Strings)
    {
        bool wasDirty = dirty;
        if (wasDirty)
        {
            // bytes is out of date: rebuild it completely from the current chars first
            UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
        }

        bytes.SetLength(newLength);
        input.ReadBytes(bytes.result, offset, count);

        if (wasDirty)
        {
            // convert everything, since the whole byte buffer was just rebuilt
            UnicodeUtil.UTF8toUTF16(bytes.result, 0, newLength, text);
            dirty = false;
        }
        else
        {
            // incrementally convert only the UTF8 bytes that are new
            UnicodeUtil.UTF8toUTF16(bytes.result, offset, count, text);
        }
    }
    else
    {
        // chars are read straight into the char buffer in this mode
        text.SetLength(newLength);
        input.ReadChars(text.result, offset, count);
    }

    this.field = fieldInfos.FieldName(input.ReadVInt());
}
/// <summary>
/// Builds a <see cref="LookupResult"/> from an FST output pair.
/// <para>
/// With payloads enabled, <paramref name="output2"/> is split at the first
/// <c>PAYLOAD_SEP</c> byte: the bytes before it are decoded as UTF-8 text and the bytes
/// after it are copied into a new payload <see cref="BytesRef"/>. Without payloads the
/// whole of <paramref name="output2"/> is decoded as text.
/// </para>
/// </summary>
/// <param name="output1"> encoded weight; <c>null</c> is treated as the default value 0 </param>
/// <param name="output2"> UTF-8 bytes (plus separator and payload when payloads are enabled) </param>
/// <param name="spare"> reusable UTF-16 scratch buffer; overwritten </param>
/// <returns> the decoded result </returns>
private LookupResult GetLookupResult(long? output1, BytesRef output2, CharsRef spare)
{
    if (!hasPayloads)
    {
        spare.Grow(output2.Length);
        UnicodeUtil.UTF8toUTF16(output2, spare);
        return new LookupResult(spare.ToString(), DecodeWeight(output1.GetValueOrDefault()));
    }

    // Position of the separator relative to output2.Offset.
    int sepIndex = -1;
    for (int i = 0; i < output2.Length; i++)
    {
        if (output2.Bytes[output2.Offset + i] == PAYLOAD_SEP)
        {
            sepIndex = i;
            break;
        }
    }
    Debug.Assert(sepIndex != -1);

    spare.Grow(sepIndex);
    int payloadLen = output2.Length - sepIndex - 1;
    UnicodeUtil.UTF8toUTF16(output2.Bytes, output2.Offset, sepIndex, spare);

    BytesRef payload = new BytesRef(payloadLen);
    // sepIndex is relative to output2.Offset, so the source index must include the offset;
    // the previous code copied from the wrong place whenever Offset was non-zero.
    Array.Copy(output2.Bytes, output2.Offset + sepIndex + 1, payload.Bytes, 0, payloadLen);
    payload.Length = payloadLen;

    return new LookupResult(spare.ToString(), DecodeWeight(output1.GetValueOrDefault()), payload);
}
/// <summary>
/// Rebuilds the ternary tree from <paramref name="tfit"/>: all terms and weights are buffered,
/// then inserted through <c>autocomplete.BalancedTree</c> under a fresh root.
/// </summary>
/// <param name="tfit"> source of terms and weights; re-wrapped in a
/// <see cref="SortedInputIterator"/> unless it already uses the UTF-16 order comparer </param>
/// <exception cref="System.ArgumentException"> if the input has payloads or contexts </exception>
public override void Build(IInputIterator tfit)
{
    if (tfit.HasPayloads)
    {
        throw new System.ArgumentException("this suggester doesn't support payloads");
    }
    if (tfit.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    root = new TernaryTreeNode();

    // buffer first
#pragma warning disable 612, 618
    bool alreadyUtf16Sorted = tfit.Comparer == BytesRef.UTF8SortedAsUTF16Comparer;
    if (!alreadyUtf16Sorted)
    {
        // make sure it's sorted and the comparer uses UTF16 sort order
        tfit = new SortedInputIterator(tfit, BytesRef.UTF8SortedAsUTF16Comparer);
    }
#pragma warning restore 612, 618

    List<string> keys = new List<string>();
    List<object> weights = new List<object>();
    CharsRef utf16 = new CharsRef();
    for (BytesRef utf8 = tfit.Next(); utf8 != null; utf8 = tfit.Next())
    {
        utf16.Grow(utf8.Length);
        UnicodeUtil.UTF8toUTF16(utf8.Bytes, utf8.Offset, utf8.Length, utf16);
        keys.Add(utf16.ToString());
        weights.Add(Convert.ToInt64(tfit.Weight));
    }

    autocomplete.BalancedTree(keys.ToArray(), weights.ToArray(), 0, keys.Count - 1, root);
}
/// <summary>
/// Rebuilds the Jaspell trie from <paramref name="tfit"/>, resetting <c>count</c> to 0 and
/// inserting every non-empty term with its weight.
/// </summary>
/// <param name="tfit"> source of terms and weights; wrapped in an
/// <see cref="UnsortedInputIterator"/> when it reports a comparator </param>
/// <exception cref="ArgumentException"> if the input has payloads or contexts </exception>
public override void Build(InputIterator tfit)
{
    if (tfit.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }

    bool isSorted = tfit.Comparator != null;
    if (isSorted)
    {
        // make sure it's unsorted
        // WTF - this could result in yet another sorted iteration....
        tfit = new UnsortedInputIterator(tfit);
    }

    if (tfit.HasContexts)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    count = 0;
    trie = new JaspellTernarySearchTrie();
    trie.MatchAlmostDiff = editDistance;

    CharsRef utf16 = new CharsRef();
    for (BytesRef utf8 = tfit.Next(); utf8 != null; utf8 = tfit.Next())
    {
        long weight = tfit.Weight;
        if (utf8.Length != 0)
        {
            utf16.Grow(utf8.Length);
            UnicodeUtil.UTF8toUTF16(utf8.Bytes, utf8.Offset, utf8.Length, utf16);
            trie.Put(utf16.ToString(), Convert.ToInt64(weight));
        }
        // empty terms are skipped
    }
}
/// <summary>
/// Accumulates the term frequencies of <paramref name="vector"/> into
/// <paramref name="termFreqMap"/>, skipping terms for which <c>IsNoiseWord</c> returns true.
/// </summary>
/// <param name="termFreqMap"> map from term text to its running frequency counter </param>
/// <param name="vector"> terms and their frequencies for a doc/field </param>
private void AddTermFrequencies(IDictionary<string, Int32> termFreqMap, Terms vector)
{
    var termsEnum = vector.GetIterator(null);
    var utf16 = new CharsRef();

    for (BytesRef utf8 = termsEnum.Next(); utf8 != null; utf8 = termsEnum.Next())
    {
        UnicodeUtil.UTF8toUTF16(utf8, utf16);
        var term = utf16.ToString();
        if (!IsNoiseWord(term))
        {
            var freq = (int)termsEnum.TotalTermFreq;

            // increment frequency, creating the counter on first sight of the term
            Int32 counter;
            if (termFreqMap.TryGetValue(term, out counter))
            {
                counter.x += freq;
            }
            else
            {
                counter = new Int32();
                termFreqMap[term] = counter;
                counter.x = freq;
            }
        }
    }
}
/// <summary>
/// Advances the wrapped stream and, for tokens not already marked as keywords, replaces the
/// term with its override from <c>stemmerOverrideMap</c> (if any) and marks it as a keyword.
/// </summary>
/// <returns> <c>false</c> when the input is exhausted, otherwise <c>true</c> </returns>
public override bool IncrementToken()
{
    if (!m_input.IncrementToken())
    {
        return false;
    }

    // No overrides
    if (fstReader == null)
    {
        return true;
    }

    // don't muck with already-keyworded terms
    if (keywordAtt.IsKeyword)
    {
        return true;
    }

    BytesRef stem = stemmerOverrideMap.Get(termAtt.Buffer, termAtt.Length, scratchArc, fstReader);
    if (stem == null)
    {
        return true;
    }

    // Decode directly into the term buffer; copy back only if the scratch had to switch arrays.
    char[] termBuffer = termAtt.Buffer;
    spare.Chars = termBuffer;
    UnicodeUtil.UTF8toUTF16(stem.Bytes, stem.Offset, stem.Length, spare);
    if (spare.Chars != termBuffer)
    {
        termAtt.CopyBuffer(spare.Chars, spare.Offset, spare.Length);
    }
    termAtt.Length = spare.Length;
    keywordAtt.IsKeyword = true;
    return true;
}
/// <summary>
/// Advances the wrapped stream and, for tokens not already marked as keywords, replaces the
/// term with its override from <c>stemmerOverrideMap</c> (if any) and marks it as a keyword.
/// </summary>
/// <remarks>
/// The Java original declared <c>throws java.io.IOException</c>; .NET has no 'throws' clause.
/// </remarks>
/// <returns> <c>false</c> when the input is exhausted, otherwise <c>true</c> </returns>
public override bool incrementToken()
{
    if (!input.incrementToken())
    {
        return false;
    }

    // No overrides
    if (fstReader == null)
    {
        return true;
    }

    // don't muck with already-keyworded terms
    if (keywordAtt.Keyword)
    {
        return true;
    }

    // ('final' in the Java original)
    BytesRef stem = stemmerOverrideMap.get(termAtt.buffer(), termAtt.length(), scratchArc, fstReader);
    if (stem == null)
    {
        return true;
    }

    // Decode directly into the term buffer; copy back only if the scratch had to switch arrays.
    char[] termBuffer = termAtt.buffer();
    spare.chars = termBuffer;
    UnicodeUtil.UTF8toUTF16(stem.bytes, stem.offset, stem.length, spare);
    if (spare.chars != termBuffer)
    {
        termAtt.copyBuffer(spare.chars, spare.offset, spare.length);
    }
    termAtt.Length = spare.length;
    keywordAtt.Keyword = true;
    return true;
}
/// <summary>
/// Round-trips random 20-char buffers through <c>UnicodeUtil.UTF16toUTF8</c> and
/// <c>UnicodeUtil.UTF8toUTF16</c>. When <c>FillUnicode</c> reports no illegal input, the
/// UTF-8 bytes are also compared against the bytes produced by <c>string.GetBytes</c>.
/// The decoded chars must always equal the <c>expected</c> buffer filled by <c>FillUnicode</c>.
/// </summary>
public virtual void TestRandomUnicodeStrings()
{
    char[] buffer = new char[20];
    char[] expected = new char[20];
    BytesRef utf8 = new BytesRef(20);
    CharsRef utf16 = new CharsRef(20);

    int num = AtLeast(100000);
    for (int iter = 0; iter < num; iter++)
    {
        bool hasIllegal = FillUnicode(buffer, expected, 0, 20);

        UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
        if (!hasIllegal)
        {
#pragma warning disable 612, 618
            var b = (new string(buffer, 0, 20)).GetBytes(IOUtils.CHARSET_UTF_8);
#pragma warning restore 612, 618
            Assert.AreEqual(b.Length, utf8.Length);
            for (int i = 0; i < b.Length; i++)
            {
                Assert.AreEqual(b[i], utf8.Bytes[i]);
            }
        }

        UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16);
        // expected value first, so a failure message reports the right "expected"/"actual"
        Assert.AreEqual(20, utf16.Length);
        for (int i = 0; i < 20; i++)
        {
            Assert.AreEqual(expected[i], utf16.Chars[i]);
        }
    }
}
/// <summary>
/// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> whose stopwords are, for each of the
/// given fields, the terms with a document frequency greater than
/// <paramref name="maxDocFreq"/>.
/// </summary>
/// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
/// <param name="delegate"> Analyzer whose TokenStream will be filtered </param>
/// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
/// <param name="fields"> Selection of fields to calculate stopwords for </param>
/// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param>
/// <exception cref="IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, ICollection<string> fields, int maxDocFreq)
    : base(@delegate.Strategy)
{
    this.matchVersion = matchVersion;
    this.@delegate = @delegate;

    foreach (string field in fields)
    {
        var frequentTerms = new JCG.HashSet<string>();
        Terms terms = MultiFields.GetTerms(indexReader, field);
        CharsRef utf16 = new CharsRef();

        if (terms != null)
        {
            TermsEnum termsEnum = terms.GetEnumerator();
            while (termsEnum.MoveNext())
            {
                if (termsEnum.DocFreq <= maxDocFreq)
                {
                    continue; // not frequent enough to be a stopword
                }
                UnicodeUtil.UTF8toUTF16(termsEnum.Term, utf16);
                frequentTerms.Add(utf16.ToString());
            }
        }

        // A field without terms still gets an (empty) entry.
        stopWordsPerField[field] = frequentTerms;
    }
}
/// <summary>
/// Decompresses the given slice of <paramref name="value"/> with <c>Decompress</c> and decodes
/// the resulting bytes as UTF-8 into a <see cref="string"/>
/// (the inverse of <see cref="CompressString(string)"/>).
/// </summary>
/// <param name="value"> buffer holding the compressed bytes </param>
/// <param name="offset"> passed to <c>Decompress</c> together with <paramref name="length"/> </param>
/// <param name="length"> passed to <c>Decompress</c> together with <paramref name="offset"/> </param>
/// <returns> the decoded string </returns>
public static string DecompressString(byte[] value, int offset, int length)
{
    byte[] utf8 = Decompress(value, offset, length);
    // Sized to the UTF-8 byte count up front.
    CharsRef decoded = new CharsRef(utf8.Length);
    UnicodeUtil.UTF8toUTF16(utf8, 0, utf8.Length, decoded);
    return new string(decoded.Chars, 0, decoded.Length);
}
// Currently used only by assert statement.
// Compares (lastFieldNumber, lastTerm) with (fieldNumber, term): first by field name
// (ordinal), then by the UTF-16 code units of the decoded terms. Returns a negative value,
// zero, or a positive value in the usual comparison sense.
private int CompareToLastTerm(int fieldNumber, BytesRef term)
{
    if (lastFieldNumber != fieldNumber)
    {
        int fieldOrder = FieldName(fieldInfos, lastFieldNumber).CompareToOrdinal(FieldName(fieldInfos, fieldNumber));
        // If there is a field named "" (empty string) then we
        // will get 0 on this comparison, yet, it's "OK". But
        // it's not OK if two different field numbers map to
        // the same name.
        if (fieldOrder != 0 || lastFieldNumber != -1)
        {
            return fieldOrder;
        }
    }

    scratchBytes.CopyBytes(term);

    // Both conversions below start at index 0, so both refs must have a zero offset.
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(lastTerm.Offset == 0);
    }
    UnicodeUtil.UTF8toUTF16(lastTerm.Bytes, 0, lastTerm.Length, utf16Result1);

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(scratchBytes.Offset == 0);
    }
    UnicodeUtil.UTF8toUTF16(scratchBytes.Bytes, 0, scratchBytes.Length, utf16Result2);

    int common = utf16Result1.Length < utf16Result2.Length ? utf16Result1.Length : utf16Result2.Length;
    for (int pos = 0; pos < common; pos++)
    {
        char previous = utf16Result1.Chars[pos];
        char current = utf16Result2.Chars[pos];
        if (previous != current)
        {
            return previous - current;
        }
    }

    if (utf16Result1.Length == 0 && lastFieldNumber == -1)
    {
        // If there is a field named "" (empty string) with a term text of "" (empty string) then we
        // will get 0 on this comparison, yet, it's "OK".
        return -1;
    }

    // Equal on the common prefix: the shorter term sorts first.
    return utf16Result1.Length - utf16Result2.Length;
}
/// <summary>
/// Returns the term of <paramref name="doc"/> from <c>m_termsIndex</c> decoded from UTF-8,
/// or <c>null</c> when the fetched term is empty.
/// </summary>
/// <param name="doc"> document to look up </param>
public override string StrVal(int doc)
{
    m_termsIndex.Get(doc, m_spare);

    if (m_spare.Length != 0)
    {
        UnicodeUtil.UTF8toUTF16(m_spare, m_spareChars);
        return m_spareChars.ToString();
    }

    return null;
}
/// <summary>
/// Reads the next position entry from the text input.
/// <para>
/// When positions are enabled a <c>POS</c> line is parsed, otherwise the position is -1.
/// When offsets are enabled a <c>START_OFFSET</c> and an <c>END_OFFSET</c> line are parsed
/// into <c>_startOffset</c> / <c>_endOffset</c>. Finally one more line is read: if it starts
/// with <c>PAYLOAD</c> its remainder becomes <c>_payload</c>; otherwise <c>_payload</c> is
/// cleared and the input is rewound to before that line.
/// </para>
/// </summary>
/// <returns> the parsed position, or -1 when positions are not read </returns>
public override int NextPosition()
{
    int pos;
    if (_readPositions)
    {
        SimpleTextUtil.ReadLine(_in, _scratch);
        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.POS), "got line=" + _scratch.Utf8ToString());
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.POS.Length, _scratch.Length - SimpleTextFieldsWriter.POS.Length, _scratchUtf162);
        pos = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length);
    }
    else
    {
        pos = -1;
    }

    if (_readOffsets)
    {
        SimpleTextUtil.ReadLine(_in, _scratch);
        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.START_OFFSET), "got line=" + _scratch.Utf8ToString());
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.START_OFFSET.Length, _scratch.Length - SimpleTextFieldsWriter.START_OFFSET.Length, _scratchUtf162);
        _startOffset = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length);

        SimpleTextUtil.ReadLine(_in, _scratch);
        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.END_OFFSET), "got line=" + _scratch.Utf8ToString());
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.END_OFFSET.Length, _scratch.Length - SimpleTextFieldsWriter.END_OFFSET.Length, _scratchUtf162);
        _endOffset = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length);
    }

    // Remember where the next line starts so it can be "un-read" if it is not a payload.
    long fp = _in.GetFilePointer();
    SimpleTextUtil.ReadLine(_in, _scratch);
    if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.PAYLOAD))
    {
        int len = _scratch.Length - SimpleTextFieldsWriter.PAYLOAD.Length;
        if (_scratch2.Bytes.Length < len)
        {
            _scratch2.Grow(len);
        }
        // Honour _scratch.Offset like every other read of _scratch in this method; the
        // previous code started at PAYLOAD.Length, which is only right when Offset is 0.
        Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.PAYLOAD.Length, _scratch2.Bytes, 0, len);
        _scratch2.Length = len;
        _payload = _scratch2;
    }
    else
    {
        _payload = null;
        _in.Seek(fp);
    }

    return pos;
}
/// <summary>
/// Reads the next position entry from the text input.
/// <para>
/// When positions are enabled a <c>POS</c> line is parsed, otherwise the position is -1.
/// When offsets are enabled a <c>START_OFFSET</c> and an <c>END_OFFSET</c> line are parsed
/// into <c>startOffset_Renamed</c> / <c>endOffset_Renamed</c>. Finally one more line is read:
/// if it starts with <c>PAYLOAD</c> its remainder becomes <c>payload</c>; otherwise
/// <c>payload</c> is cleared and the input is rewound to before that line.
/// </para>
/// </summary>
/// <returns> the parsed position, or -1 when positions are not read </returns>
public override int NextPosition()
{
    // NOTE(review): member casing is mixed here (bytes/Bytes, chars/Chars, length/Length);
    // it is preserved exactly as found - confirm against the current BytesRef/CharsRef API.
    int pos;
    if (readPositions)
    {
        SimpleTextUtil.ReadLine(@in, scratch);
        Debug.Assert(StringHelper.StartsWith(scratch, POS), "got line=" + scratch.Utf8ToString());
        UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.Offset + POS.length, scratch.Length - POS.length, scratchUTF16_2);
        pos = ArrayUtil.ParseInt(scratchUTF16_2.Chars, 0, scratchUTF16_2.length);
    }
    else
    {
        pos = -1;
    }

    if (readOffsets)
    {
        SimpleTextUtil.ReadLine(@in, scratch);
        Debug.Assert(StringHelper.StartsWith(scratch, START_OFFSET), "got line=" + scratch.Utf8ToString());
        UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.Offset + START_OFFSET.length, scratch.Length - START_OFFSET.length, scratchUTF16_2);
        startOffset_Renamed = ArrayUtil.ParseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);

        SimpleTextUtil.ReadLine(@in, scratch);
        Debug.Assert(StringHelper.StartsWith(scratch, END_OFFSET), "got line=" + scratch.Utf8ToString());
        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + END_OFFSET.length, scratch.Length - END_OFFSET.length, scratchUTF16_2);
        endOffset_Renamed = ArrayUtil.ParseInt(scratchUTF16_2.Chars, 0, scratchUTF16_2.length);
    }

    // Remember where the next line starts so it can be "un-read" if it is not a payload.
    long fp = @in.FilePointer;
    SimpleTextUtil.ReadLine(@in, scratch);
    if (StringHelper.StartsWith(scratch, PAYLOAD))
    {
        int len = scratch.Length - PAYLOAD.length;
        if (scratch2.Bytes.Length < len)
        {
            scratch2.Grow(len);
        }
        // Honour scratch.Offset like every other read of scratch in this method; the
        // previous code started at PAYLOAD.length, which is only right when Offset is 0.
        Array.Copy(scratch.Bytes, scratch.Offset + PAYLOAD.length, scratch2.Bytes, 0, len);
        scratch2.Length = len;
        payload = scratch2;
    }
    else
    {
        payload = null;
        @in.Seek(fp);
    }

    return pos;
}
/// <summary>
/// Build a minimal, deterministic automaton from a sorted list of <see cref="BytesRef"/>
/// representing strings in UTF-8. These strings must be binary-sorted.
/// </summary>
/// <param name="input"> the UTF-8 terms, each decoded to UTF-16 before being added </param>
/// <returns> an automaton flagged as deterministic </returns>
public static Automaton Build(ICollection<BytesRef> input)
{
    var automatonBuilder = new DaciukMihovAutomatonBuilder();
    var utf16 = new CharsRef();
    foreach (BytesRef utf8 in input)
    {
        UnicodeUtil.UTF8toUTF16(utf8, utf16);
        automatonBuilder.Add(utf16);
    }

    // Builder states are mapped to automaton states by identity.
    var visited = new JCG.Dictionary<State, Lucene.Net.Util.Automaton.State>(IdentityEqualityComparer<State>.Default);
    Automaton automaton = new Automaton();
    automaton.initial = Convert(automatonBuilder.Complete(), visited);
    automaton.deterministic = true;
    return automaton;
}
/// <summary>
/// Build a minimal, deterministic automaton from a sorted list of <seealso cref="BytesRef"/>
/// representing strings in UTF-8. These strings must be binary-sorted.
/// </summary>
/// <param name="input"> the UTF-8 terms, each decoded to UTF-16 before being added </param>
/// <returns> an automaton flagged as deterministic </returns>
public static Automaton Build(ICollection<BytesRef> input)
{
    var automatonBuilder = new DaciukMihovAutomatonBuilder();
    var utf16 = new CharsRef();
    foreach (BytesRef utf8 in input)
    {
        UnicodeUtil.UTF8toUTF16(utf8, utf16);
        automatonBuilder.Add(utf16);
    }

    return new Automaton
    {
        initial = Convert(automatonBuilder.Complete(), new IdentityHashMap<State, Lucene.Net.Util.Automaton.State>()),
        deterministic = true
    };
}
/// <summary>
/// Reads the next line into <c>_scratch</c> (asserted to start with <c>VALUE</c>) and passes
/// the text after that prefix to the <paramref name="visitor"/> callback selected by
/// <paramref name="type"/>. Numbers are parsed with invariant formatting. Nothing is
/// reported when <paramref name="type"/> equals none of the <c>TYPE_*</c> markers.
/// </summary>
/// <param name="type"> marker compared with <c>Equals</c> against the <c>TYPE_*</c> constants </param>
/// <param name="fieldInfo"> forwarded unchanged to the visitor callback </param>
/// <param name="visitor"> receives the decoded value </param>
private void ReadField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
{
    ReadLine();
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextStoredFieldsWriter.VALUE));
    }

    // The value is whatever follows the VALUE prefix on the line.
    int valueStart = _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length;
    int valueLength = _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length;

    if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_STRING))
    {
        string text = Encoding.UTF8.GetString(_scratch.Bytes, valueStart, valueLength);
        visitor.StringField(fieldInfo, text);
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_BINARY))
    {
        // Hand the visitor its own copy so later reuse of _scratch cannot alter it.
        var payload = new byte[valueLength];
        Array.Copy(_scratch.Bytes, valueStart, payload, 0, payload.Length);
        visitor.BinaryField(fieldInfo, payload);
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_INT))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, valueStart, valueLength, _scratchUtf16);
        int parsed = J2N.Numerics.Int32.Parse(_scratchUtf16.ToString(), NumberFormatInfo.InvariantInfo);
        visitor.Int32Field(fieldInfo, parsed);
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_LONG))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, valueStart, valueLength, _scratchUtf16);
        long parsed = J2N.Numerics.Int64.Parse(_scratchUtf16.ToString(), NumberFormatInfo.InvariantInfo);
        visitor.Int64Field(fieldInfo, parsed);
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_FLOAT))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, valueStart, valueLength, _scratchUtf16);
        float parsed = J2N.Numerics.Single.Parse(_scratchUtf16.ToString(), NumberStyle.Float, NumberFormatInfo.InvariantInfo);
        visitor.SingleField(fieldInfo, parsed);
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_DOUBLE))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, valueStart, valueLength, _scratchUtf16);
        double parsed = J2N.Numerics.Double.Parse(_scratchUtf16.ToString(), NumberStyle.Float, NumberFormatInfo.InvariantInfo);
        visitor.DoubleField(fieldInfo, parsed);
    }
}
/// <summary>
/// Rebuilds the Jaspell trie from <paramref name="enumerator"/>, resetting <c>count</c> to 0
/// and inserting every non-empty term with its weight.
/// </summary>
/// <param name="enumerator"> source of terms and weights; wrapped in an
/// <see cref="UnsortedInputEnumerator"/> when it reports a comparer </param>
/// <exception cref="ArgumentNullException"> if <paramref name="enumerator"/> is <c>null</c> </exception>
/// <exception cref="ArgumentException"> if the input has payloads or contexts </exception>
public override void Build(IInputEnumerator enumerator)
{
    // LUCENENET: Added guard clause for null
    if (enumerator is null)
    {
        throw new ArgumentNullException(nameof(enumerator));
    }
    if (enumerator.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }

    bool isSorted = enumerator.Comparer != null;
    if (isSorted)
    {
        // make sure it's unsorted
        // WTF - this could result in yet another sorted iteration....
        enumerator = new UnsortedInputEnumerator(enumerator);
    }

    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }

    count = 0;
    // Configure the new trie before publishing it to the field.
    var freshTrie = new JaspellTernarySearchTrie();
    freshTrie.MatchAlmostDiff = editDistance;
    trie = freshTrie;

    var utf16 = new CharsRef();
    while (enumerator.MoveNext())
    {
        BytesRef utf8 = enumerator.Current;
        long weight = enumerator.Weight;
        if (utf8.Length != 0)
        {
            utf16.Grow(utf8.Length);
            UnicodeUtil.UTF8toUTF16(utf8.Bytes, utf8.Offset, utf8.Length, utf16);
            trie.Put(utf16.ToString(), weight);
        }
        // empty terms are skipped
    }
}
/// <summary>
/// For every code point visited by the loop (0 up to, but not including, 0x10FFFF, with the
/// range 0xD800-0xDFFF skipped) checks that <c>UnicodeUtil.UTF16toUTF8</c> and
/// <c>UnicodeUtil.UTF8toUTF16</c> agree with <see cref="Encoding.UTF8"/>.
/// </summary>
public virtual void TestAllUnicodeChars()
{
    BytesRef utf8 = new BytesRef(10);
    CharsRef utf16 = new CharsRef(10);
    char[] chars = new char[2];

    for (int ch = 0; ch < 0x0010FFFF; ch++)
    {
        if (ch == 0xd800)
        {
            // Skip invalid code points
            ch = 0xe000;
        }

        int len;
        if (ch > 0xffff)
        {
            // Encode as a surrogate pair.
            int aboveBmp = ch - 0x0010000;
            chars[0] = (char)((aboveBmp >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
            chars[1] = (char)((aboveBmp & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
            len = 2;
        }
        else
        {
            chars[0] = (char)ch;
            len = 1;
        }

        UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8);

        string original = new string(chars, 0, len);
        string viaFramework = Encoding.UTF8.GetString(utf8.Bytes, utf8.Offset, utf8.Length);
        Assert.AreEqual(original, viaFramework, "codepoint " + ch);

        UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16);
        Assert.AreEqual(original, new string(utf16.Chars, 0, utf16.Length), "codepoint " + ch);

        var frameworkBytes = original.GetBytes(Encoding.UTF8);
        Assert.AreEqual(utf8.Length, frameworkBytes.Length);
        for (int j = 0; j < utf8.Length; j++)
        {
            Assert.AreEqual(utf8.Bytes[j], frameworkBytes[j]);
        }
    }
}
/// <summary>
/// Reads the next line into <c>_scratch</c> (asserted to start with <c>VALUE</c>) and passes
/// the text after that prefix to the <paramref name="visitor"/> callback selected by
/// <paramref name="type"/>. Numbers are parsed with the invariant culture. Nothing is
/// reported when <paramref name="type"/> equals none of the <c>TYPE_*</c> markers.
/// </summary>
/// <param name="type"> marker compared with <c>Equals</c> against the <c>TYPE_*</c> constants </param>
/// <param name="fieldInfo"> forwarded unchanged to the visitor callback </param>
/// <param name="visitor"> receives the decoded value </param>
private void ReadField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
{
    ReadLine();
    Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextStoredFieldsWriter.VALUE));

    // The value is whatever follows the VALUE prefix on the line.
    int valueStart = _scratch.Offset + SimpleTextStoredFieldsWriter.VALUE.Length;
    int valueLength = _scratch.Length - SimpleTextStoredFieldsWriter.VALUE.Length;

    if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_STRING))
    {
        string text = Encoding.UTF8.GetString(_scratch.Bytes, valueStart, valueLength);
        visitor.StringField(fieldInfo, text);
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_BINARY))
    {
        // Hand the visitor its own copy so later reuse of _scratch cannot alter it.
        var payload = new byte[valueLength];
        Array.Copy(_scratch.Bytes, valueStart, payload, 0, payload.Length);
        visitor.BinaryField(fieldInfo, payload);
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_INT))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, valueStart, valueLength, _scratchUtf16);
        int parsed = Convert.ToInt32(_scratchUtf16.ToString(), CultureInfo.InvariantCulture);
        visitor.Int32Field(fieldInfo, parsed);
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_LONG))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, valueStart, valueLength, _scratchUtf16);
        long parsed = Convert.ToInt64(_scratchUtf16.ToString(), CultureInfo.InvariantCulture);
        visitor.Int64Field(fieldInfo, parsed);
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_FLOAT))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, valueStart, valueLength, _scratchUtf16);
        float parsed = Convert.ToSingle(_scratchUtf16.ToString(), CultureInfo.InvariantCulture);
        visitor.SingleField(fieldInfo, parsed);
    }
    else if (Equals(type, SimpleTextStoredFieldsWriter.TYPE_DOUBLE))
    {
        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, valueStart, valueLength, _scratchUtf16);
        double parsed = Convert.ToDouble(_scratchUtf16.ToString(), CultureInfo.InvariantCulture);
        visitor.DoubleField(fieldInfo, parsed);
    }
}
/// <summary>
/// Rebuilds the ternary tree from <paramref name="enumerator"/>: all terms and weights are
/// buffered, then inserted through <c>autocomplete.BalancedTree</c> under a fresh root.
/// </summary>
/// <param name="enumerator"> source of terms and weights; re-wrapped in a
/// <see cref="SortedInputEnumerator"/> unless it already uses the UTF-16 order comparer </param>
/// <exception cref="ArgumentNullException"> if <paramref name="enumerator"/> is <c>null</c> </exception>
/// <exception cref="ArgumentException"> if the input has payloads or contexts </exception>
public override void Build(IInputEnumerator enumerator)
{
    // LUCENENET: Added guard clause for null
    if (enumerator is null)
    {
        throw new ArgumentNullException(nameof(enumerator));
    }
    if (enumerator.HasPayloads)
    {
        throw new ArgumentException("this suggester doesn't support payloads");
    }
    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }

    root = new TernaryTreeNode();

    // buffer first
#pragma warning disable 612, 618
    bool alreadyUtf16Sorted = enumerator.Comparer == BytesRef.UTF8SortedAsUTF16Comparer;
    if (!alreadyUtf16Sorted)
    {
        // make sure it's sorted and the comparer uses UTF16 sort order
        enumerator = new SortedInputEnumerator(enumerator, BytesRef.UTF8SortedAsUTF16Comparer);
    }
#pragma warning restore 612, 618

    JCG.List<string> keys = new JCG.List<string>();
    JCG.List<object> weights = new JCG.List<object>();
    CharsRef utf16 = new CharsRef();
    while (enumerator.MoveNext())
    {
        BytesRef utf8 = enumerator.Current;
        utf16.Grow(utf8.Length);
        UnicodeUtil.UTF8toUTF16(utf8.Bytes, utf8.Offset, utf8.Length, utf16);
        keys.Add(utf16.ToString());
        weights.Add(enumerator.Weight);
    }

    autocomplete.BalancedTree(keys.ToArray(), weights.ToArray(), 0, keys.Count - 1, root);
}
// Currently used only by assert statement.
// Compares (lastFieldNumber, lastTermBytes) with (fieldNumber, termBytes): first by field
// name (ordinal), then by the UTF-16 code units of the decoded terms. Returns a negative
// value, zero, or a positive value in the usual comparison sense.
private int CompareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
{
    if (lastFieldNumber != fieldNumber)
    {
        int fieldOrder = String.CompareOrdinal(fieldInfos.FieldName(lastFieldNumber), fieldInfos.FieldName(fieldNumber));
        // If there is a field named "" (empty string) then we
        // will get 0 on this comparison, yet, it's "OK". But
        // it's not OK if two different field numbers map to
        // the same name.
        if (fieldOrder != 0 || lastFieldNumber != -1)
        {
            return fieldOrder;
        }
    }

    UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
    UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);

    int common = utf16Result1.length < utf16Result2.length ? utf16Result1.length : utf16Result2.length;
    for (int pos = 0; pos < common; pos++)
    {
        char previous = utf16Result1.result[pos];
        char current = utf16Result2.result[pos];
        if (previous != current)
        {
            return previous - current;
        }
    }

    // Equal on the common prefix: the shorter term sorts first.
    return utf16Result1.length - utf16Result2.length;
}
/// <summary>
/// Adds terms and frequencies found in <paramref name="vector"/> into
/// <paramref name="termFreqMap"/>, skipping terms for which <c>IsNoiseWord</c> returns true.
/// </summary>
/// <param name="termFreqMap"> a map of terms and their frequencies </param>
/// <param name="vector"> List of terms and their frequencies for a doc/field </param>
private void AddTermFrequencies(IDictionary<string, Int> termFreqMap, Terms vector)
{
    TermsEnum termsEnum = vector.Iterator(null);
    CharsRef spare = new CharsRef();
    BytesRef text;
    while ((text = termsEnum.Next()) != null)
    {
        UnicodeUtil.UTF8toUTF16(text, spare);
        string term = spare.ToString();
        if (IsNoiseWord(term))
        {
            continue;
        }

        int freq = (int)termsEnum.TotalTermFreq();

        // increment frequency
        // Unlike Java's Map.get, the .NET IDictionary indexer throws KeyNotFoundException for
        // a missing key instead of returning null, so the lookup must go through TryGetValue.
        Int cnt;
        if (!termFreqMap.TryGetValue(term, out cnt))
        {
            cnt = new Int();
            termFreqMap[term] = cnt;
            cnt.x = freq;
        }
        else
        {
            cnt.x += freq;
        }
    }
}
/// <summary>
/// Provide spelling corrections based on several parameters.
/// </summary>
/// <param name="term"> The term to suggest spelling corrections for </param>
/// <param name="numSug"> The maximum number of spelling corrections </param>
/// <param name="ir"> The index reader to fetch the candidate spelling corrections from </param>
/// <param name="docfreq"> The minimum document frequency a potential suggestion need to have in order to be included </param>
/// <param name="editDistance"> The maximum edit distance candidates are allowed to have </param>
/// <param name="accuracy"> The minimum accuracy a suggested spelling correction needs to have in order to be included </param>
/// <param name="spare"> a chars scratch </param>
/// <returns> a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. </returns>
/// <exception cref="System.IO.IOException"> If I/O related errors occur </exception>
protected internal virtual IEnumerable<ScoreTerm> SuggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, CharsRef spare)
{
    var atts = new AttributeSource();
    IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();

    Terms fieldTerms = MultiFields.GetTerms(ir, term.Field);
    if (fieldTerms == null)
    {
        return new List<ScoreTerm>();
    }

    FuzzyTermsEnum fuzzyEnum = new FuzzyTermsEnum(fieldTerms, atts, term, editDistance, Math.Max(minPrefix, editDistance - 1), true);
    var queue = new Support.PriorityQueue<ScoreTerm>();
    BytesRef queryTerm = new BytesRef(term.Text());
    ScoreTerm slot = new ScoreTerm();
    IBoostAttribute boostAtt = fuzzyEnum.Attributes.AddAttribute<IBoostAttribute>();

    for (BytesRef candidate = fuzzyEnum.Next(); candidate != null; candidate = fuzzyEnum.Next())
    {
        float boost = boostAtt.Boost;

        // ignore uncompetitive hits, and ignore an exact match of the same term
        bool uncompetitive = queue.Count >= numSug && boost <= queue.Peek().Boost;
        if (uncompetitive || queryTerm.BytesEquals(candidate))
        {
            continue;
        }

        // check docFreq if required
        int df = fuzzyEnum.DocFreq;
        if (df <= docfreq)
        {
            continue;
        }

        float score;
        string termAsString;
        if (distance != INTERNAL_LEVENSHTEIN)
        {
            UnicodeUtil.UTF8toUTF16(candidate, spare);
            termAsString = spare.ToString();
            score = distance.GetDistance(term.Text(), termAsString);
        }
        else
        {
            // delay creating strings until the end
            termAsString = null;
            // undo FuzzyTermsEnum's scale factor for a real scaled lev score
            score = boost / fuzzyEnum.ScaleFactor + fuzzyEnum.MinSimilarity;
        }

        if (score < accuracy)
        {
            continue;
        }

        // add new entry in PQ
        slot.Term = BytesRef.DeepCopyOf(candidate);
        slot.Boost = boost;
        slot.Docfreq = df;
        slot.TermAsString = termAsString;
        slot.Score = score;
        queue.Offer(slot);

        // possibly drop entries from queue; a dropped entry is recycled as the next slot
        if (queue.Count > numSug)
        {
            slot = queue.Poll();
        }
        else
        {
            slot = new ScoreTerm();
        }

        if (queue.Count >= numSug)
        {
            maxBoostAtt.MaxNonCompetitiveBoost = queue.Peek().Boost;
        }
        else
        {
            maxBoostAtt.MaxNonCompetitiveBoost = float.NegativeInfinity;
        }
    }

    return queue;
}
/// <summary>
/// Suggest similar words.
///
/// <para>
/// Unlike <see cref="SpellChecker"/>, the similarity used to fetch the most
/// relevant terms is an edit distance, therefore typically a low value
/// for numSug will work very well.
/// </para>
/// </summary>
/// <param name="term"> Term you want to spell check on </param>
/// <param name="numSug"> the maximum number of suggested words </param>
/// <param name="ir"> IndexReader to find terms from </param>
/// <param name="suggestMode"> specifies when to return suggested words </param>
/// <param name="accuracy"> return only suggested words that match with this similarity </param>
/// <returns>
/// The suggested words, sorted with the reverse order of the configured comparer and
/// trimmed to at most <paramref name="numSug"/> entries. An empty array is returned when
/// the query is shorter than the configured minimum length, when the term is already in
/// the index and <paramref name="suggestMode"/> is
/// <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/>, or when the term's document
/// frequency exceeds the configured maximum query frequency.
/// </returns>
/// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
public virtual SuggestWord[] SuggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode, float accuracy)
{
    CharsRef spare = new CharsRef();
    string text = term.Text();

    // Query length is measured in code points, not UTF-16 code units.
    if (minQueryLength > 0 && text.CodePointCount(0, text.Length) < minQueryLength)
    {
        return new SuggestWord[0];
    }

    if (lowerCaseTerms)
    {
        // Lowercase with the invariant culture so that the term looked up in the index
        // does not depend on the current thread culture (e.g. Turkish dotless i).
        term = new Term(term.Field, text.ToLowerInvariant());
    }

    int docfreq = ir.DocFreq(term);

    if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0)
    {
        return new SuggestWord[0];
    }

    int maxDoc = ir.MaxDoc;

    // maxQueryFrequency >= 1 is compared as an absolute document count; otherwise it is
    // scaled by maxDoc before comparing.
    if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency)
    {
        return new SuggestWord[0];
    }
    else if (docfreq > (int)Math.Ceiling(maxQueryFrequency * maxDoc))
    {
        return new SuggestWord[0];
    }

    // The query term's own frequency is only used as a floor in SUGGEST_MORE_POPULAR mode.
    if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR)
    {
        docfreq = 0;
    }

    // Raise the floor to the configured threshold (absolute when >= 1, scaled by maxDoc otherwise).
    if (thresholdFrequency >= 1f)
    {
        docfreq = Math.Max(docfreq, (int)thresholdFrequency);
    }
    else if (thresholdFrequency > 0f)
    {
        docfreq = Math.Max(docfreq, (int)(thresholdFrequency * maxDoc) - 1);
    }

    IEnumerable<ScoreTerm> terms = null;
    int inspections = numSug * maxInspections;

    // try ed=1 first, in case we get lucky
    terms = SuggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
    if (maxEdits > 1 && terms.Count() < inspections)
    {
        // Not enough candidates at distance 1: merge them with the maxEdits candidates.
        var moreTerms = new HashSet<ScoreTerm>();
        moreTerms.AddAll(terms);
        moreTerms.AddAll(SuggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
        terms = moreTerms;
    }

    // create the suggestword response, sort it, and trim it to size.
    var suggestions = new SuggestWord[terms.Count()];
    int index = suggestions.Length - 1;
    foreach (ScoreTerm s in terms)
    {
        SuggestWord suggestion = new SuggestWord();
        if (s.TermAsString == null)
        {
            // The string form was not created during collection; build it now.
            UnicodeUtil.UTF8toUTF16(s.Term, spare);
            s.TermAsString = spare.ToString();
        }
        suggestion.String = s.TermAsString;
        suggestion.Score = s.Score;
        suggestion.Freq = s.Docfreq;
        // The array is filled from the last slot towards the first.
        suggestions[index--] = suggestion;
    }

    ArrayUtil.TimSort(suggestions, Collections.ReverseOrder(comparer));
    if (numSug < suggestions.Length)
    {
        SuggestWord[] trimmed = new SuggestWord[numSug];
        Array.Copy(suggestions, 0, trimmed, 0, numSug);
        suggestions = trimmed;
    }
    return suggestions;
}
/// <summary>
/// Looks up completions for <paramref name="key"/>: the key is matched exactly as a
/// prefix in the FST, then up to <paramref name="num"/> results are gathered from the
/// shortest paths leaving the matched arc.
/// </summary>
/// <param name="key"> The prefix to complete </param>
/// <param name="contexts"> Must be <c>null</c>; contexts are not supported </param>
/// <param name="onlyMorePopular"> Must be <c>false</c> </param>
/// <param name="num"> The maximum number of results; asserted to be positive when asserts are enabled </param>
/// <returns> The results found, or an empty list when there is no FST or the prefix does not match </returns>
/// <exception cref="ArgumentException">
/// If <paramref name="contexts"/> is not <c>null</c> or <paramref name="onlyMorePopular"/> is <c>true</c>
/// </exception>
public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool onlyMorePopular, int num)
{
    if (contexts != null)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(num > 0);
    }
    if (onlyMorePopular)
    {
        throw new ArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (fst == null)
    {
        return Collections.EmptyList<LookupResult>();
    }

    BytesRef keyBytes = new BytesRef(key);
    int prefixLength = keyBytes.Length;
    FST.Arc<long?> arc = new FST.Arc<long?>();

    // The prefix portion has to match exactly.
    long? prefixOutput;
    try
    {
        prefixOutput = LookupPrefix(keyBytes, arc);
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.ToString(), bogus);
    }

    if (prefixOutput == null)
    {
        return Collections.EmptyList<LookupResult>();
    }

    List<LookupResult> results = new List<LookupResult>(num);
    CharsRef utf16 = new CharsRef();

    if (exactFirst && arc.IsFinal)
    {
        // The key itself is a complete entry: report it before any completion.
        utf16.Grow(keyBytes.Length);
        UnicodeUtil.UTF8toUTF16(keyBytes, utf16);
        long exactOutput = prefixOutput.GetValueOrDefault() + arc.NextFinalOutput.GetValueOrDefault();
        results.Add(new LookupResult(utf16.ToString(), DecodeWeight(exactOutput)));

        num--;
        if (num == 0)
        {
            // The exact match already used up the whole budget.
            return results;
        }
    }

    // Gather the top-N completions below the matched arc.
    Util.Fst.Util.TopResults<long?> completions;
    try
    {
        completions = Lucene.Net.Util.Fst.Util.ShortestPaths(fst, arc, prefixOutput, weightComparer, num, !exactFirst);
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(completions.IsComplete);
        }
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.ToString(), bogus);
    }

    BytesRef suffix = new BytesRef(8);
    foreach (Util.Fst.Util.Result<long?> completion in completions)
    {
        // Rewind to the prefix, then append this completion's suffix.
        keyBytes.Length = prefixLength;
        Lucene.Net.Util.Fst.Util.ToBytesRef(completion.Input, suffix);
        keyBytes.Append(suffix);

        utf16.Grow(keyBytes.Length);
        UnicodeUtil.UTF8toUTF16(keyBytes, utf16);
        long completionOutput = completion.Output.GetValueOrDefault();
        results.Add(new LookupResult(utf16.ToString(), DecodeWeight(completionOutput)));
    }

    return results;
}
// Interleaves all output tokens onto the futureOutputs.
//
// "bytes" is decoded as: one vInt header (lowest bit clear => keep the original
// token, remaining bits => number of outputs) followed by one vInt word id per
// output. Each output is split on SynonymMap.WORD_SEPARATOR and its tokens are
// added to consecutive futureOutputs slots starting at nextRead. Finally the
// matchInputLength input slots starting at nextRead are flagged as matched.
private void AddOutput(BytesRef bytes, int matchInputLength, int matchEndOffset)
{
    bytesReader.Reset(bytes.Bytes, bytes.Offset, bytes.Length);

    int header = bytesReader.ReadVInt32();
    bool keepOriginal = (header & 0x1) == 0;
    int outputCount = (int)((uint)header >> 1);

    for (int outputIndex = 0; outputIndex < outputCount; outputIndex++)
    {
        synonyms.Words.Get(bytesReader.ReadVInt32(), scratchBytes);
        UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);

        int tokenStart = scratchChars.Offset;
        int charsEnd = tokenStart + scratchChars.Length;
        int slot = nextRead;

        // Walk one position past the last char so that the final token is flushed too.
        for (int pos = tokenStart; pos <= charsEnd; pos++)
        {
            bool atTokenBoundary = pos == charsEnd || scratchChars.Chars[pos] == SynonymMap.WORD_SEPARATOR;
            if (!atTokenBoundary)
            {
                continue;
            }

            int tokenLength = pos - tokenStart;

            // Caller is not allowed to have empty string in the output:
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(tokenLength > 0, "output contains empty string: {0}", scratchChars);
            }

            int endOffset;
            int posLen;
            bool isSingleToken = pos == charsEnd && tokenStart == scratchChars.Offset;
            if (isSingleToken)
            {
                // The rule produced exactly one output token, so it takes the end
                // offset of the last input token that was matched.
                endOffset = matchEndOffset;
                if (keepOriginal)
                {
                    posLen = matchInputLength;
                }
                else
                {
                    posLen = 1;
                }
            }
            else
            {
                // The rule produced several output tokens; no particular end offset
                // can be chosen, so -1 is stored and the offset is inherited from
                // the input token this output overlaps.
                endOffset = -1;
                posLen = 1;
            }

            futureOutputs[slot].Add(scratchChars.Chars, tokenStart, tokenLength, endOffset, posLen);

            // The next token starts right after the separator.
            tokenStart = 1 + pos;
            slot = RollIncr(slot);
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(futureOutputs[slot].posIncr == 1, "outputUpto={0} vs nextWrite={1}", slot, nextWrite);
            }
        }
    }

    // Flag every input slot covered by the match.
    int inputSlot = nextRead;
    for (int matched = 0; matched < matchInputLength; matched++)
    {
        futureInputs[inputSlot].keepOrig |= keepOriginal;
        futureInputs[inputSlot].matched = true;
        inputSlot = RollIncr(inputSlot);
    }
}