public virtual void TestMaxPosition3WithSynomyms() { foreach (bool consumeAll in new bool[] { true, false }) { MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false); // if we are consuming all tokens, we can use the checks, otherwise we can't tokenizer.EnableChecks = consumeAll; SynonymMap.Builder builder = new SynonymMap.Builder(true); builder.Add(new CharsRef("one"), new CharsRef("first"), true); builder.Add(new CharsRef("one"), new CharsRef("alpha"), true); builder.Add(new CharsRef("one"), new CharsRef("beguine"), true); CharsRef multiWordCharsRef = new CharsRef(); SynonymMap.Builder.Join(new string[] { "and", "indubitably", "single", "only" }, multiWordCharsRef); builder.Add(new CharsRef("one"), multiWordCharsRef, true); SynonymMap.Builder.Join(new string[] { "dopple", "ganger" }, multiWordCharsRef); builder.Add(new CharsRef("two"), multiWordCharsRef, true); SynonymMap synonymMap = builder.Build(); TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true); stream = new LimitTokenPositionFilter(stream, 3, consumeAll); // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3. AssertTokenStreamContents(stream, new string[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" }, new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 }); } }
public override void Build(IInputIterator tfit) { if (tfit.HasPayloads) { throw new System.ArgumentException("this suggester doesn't support payloads"); } if (tfit.HasContexts) { throw new System.ArgumentException("this suggester doesn't support contexts"); } root = new TernaryTreeNode(); // buffer first #pragma warning disable 612, 618 if (tfit.Comparer != BytesRef.UTF8SortedAsUTF16Comparer) { // make sure it's sorted and the comparer uses UTF16 sort order tfit = new SortedInputIterator(tfit, BytesRef.UTF8SortedAsUTF16Comparer); } #pragma warning restore 612, 618 List <string> tokens = new List <string>(); List <object> vals = new List <object>(); BytesRef spare; CharsRef charsSpare = new CharsRef(); while ((spare = tfit.Next()) != null) { charsSpare.Grow(spare.Length); UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare); tokens.Add(charsSpare.ToString()); vals.Add(Convert.ToInt64(tfit.Weight)); } autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count - 1, root); }
/// <summary> /// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for the /// given selection of fields from terms with a document frequency greater than /// the given <paramref name="maxDocFreq"/> /// </summary> /// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param> /// <param name="delegate"> Analyzer whose TokenStream will be filtered </param> /// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param> /// <param name="fields"> Selection of fields to calculate stopwords for </param> /// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param> /// <exception cref="IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception> public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, ICollection <string> fields, int maxDocFreq) : base(@delegate.Strategy) { this.matchVersion = matchVersion; this.@delegate = @delegate; foreach (string field in fields) { var stopWords = new JCG.HashSet <string>(); Terms terms = MultiFields.GetTerms(indexReader, field); CharsRef spare = new CharsRef(); if (terms != null) { TermsEnum te = terms.GetEnumerator(); while (te.MoveNext()) { if (te.DocFreq > maxDocFreq) { UnicodeUtil.UTF8toUTF16(te.Term, spare); stopWords.Add(spare.ToString()); } } } stopWordsPerField[field] = stopWords; } }
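A minimal usage sketch (not taken from the source above), assuming an already-built Lucene.NET 4.8 index; the directory path, field name, and document-frequency threshold are illustrative:

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Query;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Util;

// Wrap a StandardAnalyzer so that terms occurring in more than 50 documents of the
// "body" field are dropped from query-time token streams as stopwords.
using (IndexReader reader = DirectoryReader.Open(FSDirectory.Open("/path/to/index")))
{
    Analyzer wrapped = new StandardAnalyzer(LuceneVersion.LUCENE_48);
    Analyzer analyzer = new QueryAutoStopWordAnalyzer(
        LuceneVersion.LUCENE_48, wrapped, reader,
        new[] { "body" }, // fields to scan for frequent terms
        50);              // maxDocFreq: terms above this doc frequency become stopwords
    // ... hand `analyzer` to a query parser ...
}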
/// <summary> /// Copy <paramref name="current"/> into an internal buffer. /// </summary> private bool SetPrevious(CharsRef current) { // don't need to copy, once we fix https://issues.apache.org/jira/browse/LUCENE-3277 // still, called only from assert previous = CharsRef.DeepCopyOf(current); return(true); }
/// <summary> /// Sugar: just joins the provided terms with /// <see cref="SynonymMap.WORD_SEPARATOR"/>. reuse and its chars /// must not be null. /// </summary> public static CharsRef Join(string[] words, CharsRef reuse) { int upto = 0; char[] buffer = reuse.Chars; foreach (string word in words) { int wordLen = word.Length; int needed = (0 == upto ? wordLen : 1 + upto + wordLen); // Add 1 for WORD_SEPARATOR if (needed > buffer.Length) { reuse.Grow(needed); buffer = reuse.Chars; } if (upto > 0) { buffer[upto++] = SynonymMap.WORD_SEPARATOR; } word.CopyTo(0, buffer, upto, wordLen - 0); upto += wordLen; } reuse.Length = upto; return(reuse); }
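A short sketch of the typical caller pattern for Join, grounded on the Join/Add/Build calls shown in this section; the words and rule are illustrative. The joined CharsRef becomes the multi-word side of a synonym rule:

using Lucene.Net.Analysis.Synonym;
using Lucene.Net.Util;

var builder = new SynonymMap.Builder(true);                     // true = dedup duplicate rules
var scratch = new CharsRef();
SynonymMap.Builder.Join(new[] { "united", "states" }, scratch); // "united<U+0000>states"
builder.Add(new CharsRef("usa"), scratch, true);                // usa -> united states, keep original
SynonymMap map = builder.Build();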
public virtual void TestRandomUnicodeStrings() { char[] buffer = new char[20]; char[] expected = new char[20]; BytesRef utf8 = new BytesRef(20); CharsRef utf16 = new CharsRef(20); int num = AtLeast(100000); for (int iter = 0; iter < num; iter++) { bool hasIllegal = FillUnicode(buffer, expected, 0, 20); UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8); if (!hasIllegal) { #pragma warning disable 612, 618 var b = (new string(buffer, 0, 20)).GetBytes(IOUtils.CHARSET_UTF_8); #pragma warning restore 612, 618 Assert.AreEqual(b.Length, utf8.Length); for (int i = 0; i < b.Length; i++) { Assert.AreEqual(b[i], utf8.Bytes[i]); } } UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16); Assert.AreEqual(utf16.Length, 20); for (int i = 0; i < 20; i++) { Assert.AreEqual(expected[i], utf16.Chars[i]); } } }
public virtual void TestCharSequenceCharAt() { CharsRef c = new CharsRef("abc"); Assert.AreEqual('b', c.CharAt(1)); try { c.CharAt(-1); Assert.Fail(); } catch (System.IndexOutOfRangeException expected) { // expected exception } try { c.CharAt(3); Assert.Fail(); } catch (System.IndexOutOfRangeException expected) { // expected exception } }
public virtual void TestAppendChars() { char[] chars = new char[] { 'a', 'b', 'c', 'd' }; CharsRef c = new CharsRef(chars, 1, 3); // bcd c.Append(new char[] { 'e' }, 0, 1); Assert.AreEqual("bcde", c.ToString()); }
#pragma warning restore 612, 618 /// <summary> /// Add another character sequence to this automaton. The sequence must be /// lexicographically larger or equal compared to any previous sequences added /// to this automaton (the input must be sorted). /// </summary> public void Add(CharsRef current) { if (Debugging.AssertsEnabled) { Debugging.Assert(stateRegistry != null, "Automaton already built."); Debugging.Assert(previous == null || comparer.Compare(previous, current) <= 0, "Input must be in sorted UTF-8 order: {0} >= {1}", previous, current); Debugging.Assert(SetPrevious(current)); } // Descend in the automaton (find matching prefix). int pos = 0, max = current.Length; State next, state = root; while (pos < max && (next = state.LastChild(Character.CodePointAt(current, pos))) != null) { state = next; // todo, optimize me pos += Character.CharCount(Character.CodePointAt(current, pos)); } if (state.HasChildren) { ReplaceOrRegister(state); } AddSuffix(state, current, pos); }
public override void Reset() { input.Reset(); buffer.Reset(input); replacement = null; inputOff = 0; }

public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap <string> stemOverrideDict) { this.matchVersion = matchVersion; this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords)); this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable)); if (stemOverrideDict.Empty || !matchVersion.onOrAfter(Version.LUCENE_31)) { this.stemdict = null; this.origStemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict)); } else { this.origStemdict = null; // we don't need to ignore case here since we lowercase in this analyzer anyway StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false); CharArrayMap <string> .EntryIterator iter = stemOverrideDict.entrySet().GetEnumerator(); CharsRef spare = new CharsRef(); while (iter.hasNext()) { char[] nextKey = iter.nextKey(); spare.copyChars(nextKey, 0, nextKey.Length); builder.add(spare, iter.currentValue()); } try { this.stemdict = builder.build(); } catch (IOException ex) { throw new Exception("can not build stem dict", ex); } } }
private LookupResult GetLookupResult(long?output1, BytesRef output2, CharsRef spare) { LookupResult result; if (hasPayloads) { int sepIndex = -1; for (int i = 0; i < output2.Length; i++) { if (output2.Bytes[output2.Offset + i] == PAYLOAD_SEP) { sepIndex = i; break; } } Debug.Assert(sepIndex != -1); spare.Grow(sepIndex); int payloadLen = output2.Length - sepIndex - 1; UnicodeUtil.UTF8toUTF16(output2.Bytes, output2.Offset, sepIndex, spare); BytesRef payload = new BytesRef(payloadLen); Array.Copy(output2.Bytes, sepIndex + 1, payload.Bytes, 0, payloadLen); payload.Length = payloadLen; result = new LookupResult(spare.ToString(), DecodeWeight(output1.GetValueOrDefault()), payload); } else { spare.Grow(output2.Length); UnicodeUtil.UTF8toUTF16(output2, spare); result = new LookupResult(spare.ToString(), DecodeWeight(output1.GetValueOrDefault())); } return(result); }
/// <summary> /// Sugar: just joins the provided terms with /// <see cref="SynonymMap.WORD_SEPARATOR"/>. reuse and its chars /// must not be null. /// </summary> public static CharsRef join(string[] words, CharsRef reuse) { int upto = 0; char[] buffer = reuse.chars; foreach (string word in words) { int wordLen = word.Length; int needed = (0 == upto ? wordLen : 1 + upto + wordLen); // Add 1 for WORD_SEPARATOR if (needed > buffer.Length) { reuse.grow(needed); buffer = reuse.chars; } if (upto > 0) { buffer[upto++] = SynonymMap.WORD_SEPARATOR; } word.CopyTo(0, buffer, upto, wordLen - 0); upto += wordLen; } reuse.length = upto; return(reuse); }
public virtual void Add(char[] output, int offset, int len, int endOffset, int posLength) { if (count == outputs.Length) { CharsRef[] next = new CharsRef[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; Array.Copy(outputs, 0, next, 0, count); outputs = next; } if (count == endOffsets.Length) { int[] next = new int[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_INT32)]; Array.Copy(endOffsets, 0, next, 0, count); endOffsets = next; } if (count == posLengths.Length) { int[] next = new int[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_INT32)]; Array.Copy(posLengths, 0, next, 0, count); posLengths = next; } if (outputs[count] == null) { outputs[count] = new CharsRef(); } outputs[count].CopyChars(output, offset, len); // endOffset can be -1, in which case we should simply // use the endOffset of the input token, or X >= 0, in // which case we use X as the endOffset for this output endOffsets[count] = endOffset; posLengths[count] = posLength; count++; }
public override void Build(InputIterator tfit) { if (tfit.HasPayloads) { throw new ArgumentException("this suggester doesn't support payloads"); } if (tfit.Comparator != null) { // make sure it's unsorted // WTF - this could result in yet another sorted iteration.... tfit = new UnsortedInputIterator(tfit); } if (tfit.HasContexts) { throw new System.ArgumentException("this suggester doesn't support contexts"); } count = 0; trie = new JaspellTernarySearchTrie(); trie.MatchAlmostDiff = editDistance; BytesRef spare; CharsRef charsSpare = new CharsRef(); while ((spare = tfit.Next()) != null) { long weight = tfit.Weight; if (spare.Length == 0) { continue; } charsSpare.Grow(spare.Length); UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare); trie.Put(charsSpare.ToString(), Convert.ToInt64(weight)); } }
public override IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, bool higherWeightsFirst, int num) { if (contexts != null) { throw new ArgumentException("this suggester doesn't support contexts"); } IList <FSTCompletion.Completion> completions; if (higherWeightsFirst) { completions = higherWeightsCompletion.DoLookup(key, num); } else { completions = normalCompletion.DoLookup(key, num); } List <LookupResult> results = new List <LookupResult>(completions.Count); CharsRef spare = new CharsRef(); foreach (FSTCompletion.Completion c in completions) { spare.Grow(c.Utf8.Length); UnicodeUtil.UTF8toUTF16(c.Utf8, spare); results.Add(new LookupResult(spare.ToString(), c.Bucket)); } return(results); }
/// <summary> /// Adds terms and frequencies found in vector into the <see cref="T:IDictionary{string, Int}"/> <paramref name="termFreqMap"/> /// </summary> /// <param name="termFreqMap"> a <see cref="T:IDictionary{string, Int}"/> of terms and their frequencies </param> /// <param name="vector"> List of terms and their frequencies for a doc/field </param> private void AddTermFrequencies(IDictionary <string, Int32> termFreqMap, Terms vector) { var termsEnum = vector.GetIterator(null); var spare = new CharsRef(); BytesRef text; while ((text = termsEnum.Next()) != null) { UnicodeUtil.UTF8toUTF16(text, spare); var term = spare.ToString(); if (IsNoiseWord(term)) { continue; } var freq = (int)termsEnum.TotalTermFreq; // increment frequency Int32 cnt; if (!termFreqMap.TryGetValue(term, out cnt)) { cnt = new Int32(); termFreqMap[term] = cnt; cnt.x = freq; } else { cnt.x += freq; } } }
public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap <string> stemOverrideDict) { this.matchVersion = matchVersion; this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stopwords)); this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionTable)); #pragma warning disable 612, 618 if (stemOverrideDict.Count == 0 || !matchVersion.OnOrAfter(LuceneVersion.LUCENE_31)) #pragma warning restore 612, 618 { this.stemdict = null; this.origStemdict = CharArrayMap.UnmodifiableMap(CharArrayMap.Copy(matchVersion, stemOverrideDict)); } else { this.origStemdict = null; // we don't need to ignore case here since we lowercase in this analyzer anyway StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false); CharArrayMap <string> .EntryIterator iter = (CharArrayMap <string> .EntryIterator)stemOverrideDict.EntrySet().GetEnumerator(); CharsRef spare = new CharsRef(); while (iter.HasNext) { char[] nextKey = iter.NextKey(); spare.CopyChars(nextKey, 0, nextKey.Length); builder.Add(new string(spare.Chars), iter.CurrentValue); } try { this.stemdict = builder.Build(); } catch (IOException ex) { throw new Exception("can not build stem dict", ex); } } }
public override void Reset() { input.Reset(); buffer.Reset(input); replacement = null; inputOff = 0; }
/// <summary> /// Decompress the <see cref="byte"/> array previously returned by /// <see cref="CompressString(string)"/> back into a <see cref="string"/> /// </summary> public static string DecompressString(byte[] value, int offset, int length) { byte[] bytes = Decompress(value, offset, length); CharsRef result = new CharsRef(bytes.Length); UnicodeUtil.UTF8toUTF16(bytes, 0, bytes.Length, result); return(new string(result.Chars, 0, result.Length)); }
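A hedged round-trip sketch; it assumes the companion CompressString(string) overload on the same CompressionTools class referenced in the summary above:

using Lucene.Net.Documents;

string original = "some stored field text";
byte[] packed = CompressionTools.CompressString(original);                 // UTF-8 encode + deflate
string restored = CompressionTools.DecompressString(packed, 0, packed.Length);
// restored == original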
public override void Reset() { // LUCENENET: reset the BufferedCharFilter. _input.Reset(); buffer.Reset(_input); replacement = null; inputOff = 0; }
public override IBits ReadLiveDocs(Directory dir, SegmentCommitInfo info, IOContext context) { if (Debugging.AssertsEnabled) { Debugging.Assert(info.HasDeletions); } var scratch = new BytesRef(); var scratchUtf16 = new CharsRef(); var fileName = IndexFileNames.FileNameFromGeneration(info.Info.Name, LIVEDOCS_EXTENSION, info.DelGen); ChecksumIndexInput input = null; var success = false; try { input = dir.OpenChecksumInput(fileName, context); SimpleTextUtil.ReadLine(input, scratch); if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(scratch, SIZE)); } var size = ParseInt32At(scratch, SIZE.Length, scratchUtf16); var bits = new BitSet(size); SimpleTextUtil.ReadLine(input, scratch); while (!scratch.Equals(END)) { if (Debugging.AssertsEnabled) { Debugging.Assert(StringHelper.StartsWith(scratch, DOC)); } var docid = ParseInt32At(scratch, DOC.Length, scratchUtf16); bits.Set(docid); SimpleTextUtil.ReadLine(input, scratch); } SimpleTextUtil.CheckFooter(input); success = true; return(new SimpleTextBits(bits, size)); } finally { if (success) { IOUtils.Dispose(input); } else { IOUtils.DisposeWhileHandlingException(input); } } }
// NOTE: while it's tempting to make this public, since // caller's parser likely knows the // numInput/numOutputWords, sneaky exceptions, much later // on, will result if these values are wrong; so we always // recompute ourselves to be safe: internal virtual void Add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig) { // first convert to UTF-8 if (numInputWords <= 0) { throw new ArgumentOutOfRangeException(nameof(numInputWords), "numInputWords must be > 0 (got " + numInputWords + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) } if (input.Length <= 0) { throw new ArgumentOutOfRangeException(nameof(input.Length), "input.Length must be > 0 (got " + input.Length + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) } if (numOutputWords <= 0) { throw new ArgumentOutOfRangeException(nameof(numOutputWords), "numOutputWords must be > 0 (got " + numOutputWords + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) } if (output.Length <= 0) { throw new ArgumentOutOfRangeException(nameof(output.Length), "output.Length must be > 0 (got " + output.Length + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) } if (Debugging.AssertsEnabled) { Debugging.Assert(!HasHoles(input), "input has holes: {0}", input); Debugging.Assert(!HasHoles(output), "output has holes: {0}", output); } //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords); UnicodeUtil.UTF16toUTF8(output.Chars, output.Offset, output.Length, utf8Scratch); // lookup in hash int ord = words.Add(utf8Scratch); if (ord < 0) { // already exists in our hash ord = (-ord) - 1; //System.out.println(" output=" + output + " old ord=" + ord); } else { //System.out.println(" output=" + output + " new ord=" + ord); } if (!workingSet.TryGetValue(input, out MapEntry e) || e is null) { e = new MapEntry(); workingSet[CharsRef.DeepCopyOf(input)] = e; // make a copy, since we will keep around in our map } e.ords.Add(ord); e.includeOrig |= includeOrig; maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords); maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords); }
public override bool IncrementToken() { if (buffer != null && buffer.Count > 0) { CharsRef nextStem = buffer[0]; buffer.RemoveAt(0); RestoreState(savedState); posIncAtt.PositionIncrement = 0; termAtt.SetEmpty().Append(nextStem); return(true); } if (!m_input.IncrementToken()) { return(false); } if (keywordAtt.IsKeyword) { return(true); } buffer = new JCG.List <CharsRef>(dedup ? stemmer.UniqueStems(termAtt.Buffer, termAtt.Length) : stemmer.Stem(termAtt.Buffer, termAtt.Length)); if (buffer.Count == 0) // we do not know this word, return it unchanged { return(true); } if (longestOnly && buffer.Count > 1) { buffer.Sort(lengthComparer); } CharsRef stem = buffer[0]; buffer.RemoveAt(0); termAtt.SetEmpty().Append(stem); if (longestOnly) { buffer.Clear(); } else { if (buffer.Count > 0) { savedState = CaptureState(); } } return(true); }
// NOTE: while it's tempting to make this public, since // caller's parser likely knows the // numInput/numOutputWords, sneaky exceptions, much later // on, will result if these values are wrong; so we always // recompute ourselves to be safe: internal virtual void Add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig) { // first convert to UTF-8 if (numInputWords <= 0) { throw new System.ArgumentException("numInputWords must be > 0 (got " + numInputWords + ")"); } if (input.Length <= 0) { throw new System.ArgumentException("input.length must be > 0 (got " + input.Length + ")"); } if (numOutputWords <= 0) { throw new System.ArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")"); } if (output.Length <= 0) { throw new System.ArgumentException("output.length must be > 0 (got " + output.Length + ")"); } Debug.Assert(!HasHoles(input), "input has holes: " + input); Debug.Assert(!HasHoles(output), "output has holes: " + output); //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords); UnicodeUtil.UTF16toUTF8(output.Chars, output.Offset, output.Length, utf8Scratch); // lookup in hash int ord = words.Add(utf8Scratch); if (ord < 0) { // already exists in our hash ord = (-ord) - 1; //System.out.println(" output=" + output + " old ord=" + ord); } else { //System.out.println(" output=" + output + " new ord=" + ord); } MapEntry e = workingSet.ContainsKey(input) ? workingSet[input] : null; if (e == null) { e = new MapEntry(); workingSet[CharsRef.DeepCopyOf(input)] = e; // make a copy, since we will keep around in our map } e.ords.Add(ord); e.includeOrig |= includeOrig; maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords); maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords); }
public virtual CharsRef PullNext() { if (Debugging.AssertsEnabled) Debugging.Assert(upto < count); lastEndOffset = endOffsets[upto]; lastPosLength = posLengths[upto]; CharsRef result = outputs[upto++]; posIncr = 0; if (upto == count) { Reset(); } return result; }
public override bool incrementToken() { if (buffer != null && buffer.Count > 0) { CharsRef nextStem = buffer.Remove(0); restoreState(savedState); posIncAtt.PositionIncrement = 0; termAtt.setEmpty().append(nextStem); return(true); } if (!input.incrementToken()) { return(false); } if (keywordAtt.Keyword) { return(true); } buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length()); if (buffer.Count == 0) // we do not know this word, return it unchanged { return(true); } if (longestOnly && buffer.Count > 1) { buffer.Sort(lengthComparator); } CharsRef stem = buffer.Remove(0); termAtt.setEmpty().append(stem); if (longestOnly) { buffer.Clear(); } else { if (buffer.Count > 0) { savedState = captureState(); } } return(true); }
private CharsRef ParseSynonym(string line, CharsRef reuse) { if (reuse == null) { reuse = new CharsRef(8); } int start = line.IndexOf('\'') + 1; int end = line.LastIndexOf('\''); string text = line.Substring(start, end - start).Replace("''", "'"); return(Analyze(text, reuse)); }
internal virtual int CountWords(CharsRef chars) { int wordCount = 1; int upto = chars.Offset; int limit = chars.Offset + chars.Length; while (upto < limit) { if (chars.Chars[upto++] == SynonymMap.WORD_SEPARATOR) { wordCount++; } } return(wordCount); }
private void Add(string input, string output, bool keepOrig) { if (VERBOSE) { Console.WriteLine(" add input=" + input + " output=" + output + " keepOrig=" + keepOrig); } CharsRef inputCharsRef = new CharsRef(); SynonymMap.Builder.Join(space.Split(input), inputCharsRef); CharsRef outputCharsRef = new CharsRef(); SynonymMap.Builder.Join(space.Split(output), outputCharsRef); b.Add(inputCharsRef, outputCharsRef, keepOrig); }
/// <summary> /// Sugar: analyzes the text with the analyzer and /// separates by <see cref="SynonymMap.WORD_SEPARATOR"/>. /// reuse and its chars must not be null. /// </summary> public virtual CharsRef Analyze(string text, CharsRef reuse) { IOException priorException = null; TokenStream ts = analyzer.GetTokenStream("", text); try { var termAtt = ts.AddAttribute <ICharTermAttribute>(); var posIncAtt = ts.AddAttribute <IPositionIncrementAttribute>(); ts.Reset(); reuse.Length = 0; while (ts.IncrementToken()) { int length = termAtt.Length; if (length == 0) { throw new System.ArgumentException("term: " + text + " analyzed to a zero-length token"); } if (posIncAtt.PositionIncrement != 1) { throw new System.ArgumentException("term: " + text + " analyzed to a token with posinc != 1"); } reuse.Grow(reuse.Length + length + 1); // current + word + separator int end = reuse.Offset + reuse.Length; if (reuse.Length > 0) { reuse.Chars[end++] = SynonymMap.WORD_SEPARATOR; reuse.Length++; } Array.Copy(termAtt.Buffer, 0, reuse.Chars, end, length); reuse.Length += length; } ts.End(); } catch (IOException e) { priorException = e; } finally { IOUtils.CloseWhileHandlingException(priorException, ts); } if (reuse.Length == 0) { throw new System.ArgumentException("term: " + text + " was completely eliminated by analyzer"); } return(reuse); }
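A hedged sketch of calling Analyze directly; it assumes a SynonymMap.Parser subclass such as SolrSynonymParser (shown later in this section) constructed with a standard analyzer, and the input phrase is illustrative:

using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Synonym;
using Lucene.Net.Util;

var parser = new SolrSynonymParser(true, true, new StandardAnalyzer(LuceneVersion.LUCENE_48)); // dedup, expand
CharsRef phrase = parser.Analyze("solid state drive", new CharsRef());
// `phrase` now holds the analyzed words joined by SynonymMap.WORD_SEPARATOR (U+0000),
// i.e. "solid", "state", "drive" separated by '\0' with this analyzer.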
public override void Parse(TextReader @in) { int lineNumber = 0; TextReader br = @in; try { string line = null; string lastSynSetID = ""; CharsRef[] synset = new CharsRef[8]; int synsetSize = 0; while ((line = br.ReadLine()) != null) { lineNumber++; string synSetID = line.Substring(2, 9); if (!synSetID.Equals(lastSynSetID, StringComparison.Ordinal)) { AddInternal(synset, synsetSize); synsetSize = 0; } if (synset.Length <= synsetSize + 1) { CharsRef[] larger = new CharsRef[synset.Length * 2]; Array.Copy(synset, 0, larger, 0, synsetSize); synset = larger; } synset[synsetSize] = ParseSynonym(line, synset[synsetSize]); synsetSize++; lastSynSetID = synSetID; } // final synset in the file AddInternal(synset, synsetSize); } catch (ArgumentException e) { throw new Exception("Invalid synonym rule at line " + lineNumber.ToString(), e); } finally { br.Dispose(); } }
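For context, a hedged sketch of the WordNet prolog input this parser consumes (the 9-character synset id sits at offset 2 of each s(...) line, so consecutive lines sharing an id form one synset) and of a typical invocation; the file path, sample lines, and analyzer choice are illustrative:

using System.IO;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Synonym;
using Lucene.Net.Util;

// Sample wn_s.pl lines (illustrative):
//   s(100002137,1,'abstraction',n,6,0).
//   s(100002137,2,'abstract entity',n,1,0).
var parser = new WordnetSynonymParser(true, true, new StandardAnalyzer(LuceneVersion.LUCENE_48)); // dedup, expand
parser.Parse(new StreamReader("/path/to/wn_s.pl"));
SynonymMap map = parser.Build();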
public virtual void TestAppend() { CharsRef @ref = new CharsRef(); StringBuilder builder = new StringBuilder(); int numStrings = AtLeast(10); for (int i = 0; i < numStrings; i++) { char[] charArray = TestUtil.RandomRealisticUnicodeString(Random(), 1, 100).ToCharArray(); int offset = Random().Next(charArray.Length); int length = charArray.Length - offset; builder.Append(charArray, offset, length); @ref.Append(charArray, offset, length); } Assert.AreEqual(builder.ToString(), @ref.ToString()); }
public virtual CharsRef pullNext() { Debug.Assert(upto < count); lastEndOffset = endOffsets[upto]; lastPosLength = posLengths[upto]; CharsRef result = outputs[upto++]; posIncr = 0; if (upto == count) { reset(); } return(result); }
private void Add(string input, string output, bool keepOrig) { if (VERBOSE) { Console.WriteLine(" add input=" + input + " output=" + output + " keepOrig=" + keepOrig); } CharsRef inputCharsRef = new CharsRef(); SynonymMap.Builder.Join(input.Split(new string[] { " +" }, StringSplitOptions.RemoveEmptyEntries), inputCharsRef); CharsRef outputCharsRef = new CharsRef(); SynonymMap.Builder.Join(output.Split(new string[] { " +" }, StringSplitOptions.RemoveEmptyEntries), outputCharsRef); b.Add(inputCharsRef, outputCharsRef, keepOrig); }
public override void Parse(TextReader @in) { int lineNumber = 0; TextReader br = @in; try { string line = null; string lastSynSetID = ""; CharsRef[] synset = new CharsRef[8]; int synsetSize = 0; while ((line = br.ReadLine()) != null) { lineNumber++; string synSetID = line.Substring(2, 9); if (!synSetID.Equals(lastSynSetID)) { AddInternal(synset, synsetSize); synsetSize = 0; } if (synset.Length <= synsetSize + 1) { CharsRef[] larger = new CharsRef[synset.Length * 2]; Array.Copy(synset, 0, larger, 0, synsetSize); synset = larger; } synset[synsetSize] = ParseSynonym(line, synset[synsetSize]); synsetSize++; lastSynSetID = synSetID; } // final synset in the file AddInternal(synset, synsetSize); } catch (System.ArgumentException e) { throw new Exception("Invalid synonym rule at line " + lineNumber.ToString(), e); } finally { br.Dispose(); } }
public override void Parse(Reader @in) { LineNumberReader br = new LineNumberReader(@in); try { string line = null; string lastSynSetID = ""; CharsRef[] synset = new CharsRef[8]; int synsetSize = 0; while ((line = br.readLine()) != null) { string synSetID = line.Substring(2, 9); if (!synSetID.Equals(lastSynSetID)) { addInternal(synset, synsetSize); synsetSize = 0; } if (synset.Length <= synsetSize+1) { CharsRef[] larger = new CharsRef[synset.Length * 2]; Array.Copy(synset, 0, larger, 0, synsetSize); synset = larger; } synset[synsetSize] = parseSynonym(line, synset[synsetSize]); synsetSize++; lastSynSetID = synSetID; } // final synset in the file addInternal(synset, synsetSize); } catch (System.ArgumentException e) { ParseException ex = new ParseException("Invalid synonym rule at line " + br.LineNumber, 0); ex.initCause(e); throw ex; } finally { br.close(); } }
// NOTE: while it's tempting to make this public, since // caller's parser likely knows the // numInput/numOutputWords, sneaky exceptions, much later // on, will result if these values are wrong; so we always // recompute ourselves to be safe: internal virtual void Add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig) { // first convert to UTF-8 if (numInputWords <= 0) { throw new System.ArgumentException("numInputWords must be > 0 (got " + numInputWords + ")"); } if (input.Length <= 0) { throw new System.ArgumentException("input.length must be > 0 (got " + input.Length + ")"); } if (numOutputWords <= 0) { throw new System.ArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")"); } if (output.Length <= 0) { throw new System.ArgumentException("output.length must be > 0 (got " + output.Length + ")"); } Debug.Assert(!HasHoles(input), "input has holes: " + input); Debug.Assert(!HasHoles(output), "output has holes: " + output); //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords); UnicodeUtil.UTF16toUTF8(output.Chars, output.Offset, output.Length, utf8Scratch); // lookup in hash int ord = words.Add(utf8Scratch); if (ord < 0) { // already exists in our hash ord = (-ord) - 1; //System.out.println(" output=" + output + " old ord=" + ord); } else { //System.out.println(" output=" + output + " new ord=" + ord); } MapEntry e = workingSet[input]; if (e == null) { e = new MapEntry(); workingSet[CharsRef.DeepCopyOf(input)] = e; // make a copy, since we will keep around in our map } e.ords.Add(ord); e.includeOrig |= includeOrig; maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords); maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords); }
/// <summary> /// Encode characters from a char[] source, starting at /// offset for length chars. After encoding, result.offset will always be 0. /// </summary> // TODO: broken if incoming result.offset != 0 public static void UTF16toUTF8(CharsRef source, int offset, int length, BytesRef result) { int upto = 0; int i = offset; int end = offset + length; var @out = result.Bytes; // Pre-allocate for worst case 4-for-1 int maxLen = length * 4; if (@out.Length < maxLen) { @out = result.Bytes = new byte[maxLen]; } result.Offset = 0; while (i < end) { int code = (int)source.CharAt(i++); if (code < 0x80) { @out[upto++] = (byte)code; } else if (code < 0x800) { @out[upto++] = (byte)(0xC0 | (code >> 6)); @out[upto++] = (byte)(0x80 | (code & 0x3F)); } else if (code < 0xD800 || code > 0xDFFF) { @out[upto++] = (byte)(0xE0 | (code >> 12)); @out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)); @out[upto++] = (byte)(0x80 | (code & 0x3F)); } else { // surrogate pair // confirm valid high surrogate if (code < 0xDC00 && i < end) { var utf32 = (int)source.CharAt(i); // confirm valid low surrogate and write pair if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { utf32 = (code << 10) + utf32 + SURROGATE_OFFSET; i++; @out[upto++] = (byte)(0xF0 | (utf32 >> 18)); @out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)); @out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)); @out[upto++] = (byte)(0x80 | (utf32 & 0x3F)); continue; } } // replace unpaired surrogate or out-of-order low surrogate // with substitution character @out[upto++] = unchecked((byte)0xEF); @out[upto++] = unchecked((byte)0xBF); @out[upto++] = unchecked((byte)0xBD); } } //assert matches(source, offset, length, out, upto); result.Length = upto; }
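A small round-trip sketch exercising this method together with UTF8toUTF16 below; the sample text is arbitrary:

using Lucene.Net.Util;

var utf16 = new CharsRef("café");
var utf8 = new BytesRef(10);
UnicodeUtil.UTF16toUTF8(utf16, 0, utf16.Length, utf8);              // chars -> UTF-8 bytes
var roundTripped = new CharsRef(10);
UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, roundTripped);  // UTF-8 bytes -> chars
// new string(roundTripped.Chars, 0, roundTripped.Length) == "café"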
private void AddInternal(CharsRef[] synset, int size) { if (size <= 1) { return; // nothing to do } if (expand) { for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) { Add(synset[i], synset[j], false); } } } else { for (int i = 0; i < size; i++) { Add(synset[i], synset[0], false); } } }
/// <summary> /// Interprets the given byte array as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. /// <para/> /// NOTE: Full characters are read, even if this reads past the length passed (and /// can result in an IndexOutOfRangeException if invalid UTF-8 is passed). /// Explicit checks for valid UTF-8 are not performed. /// </summary> // TODO: broken if chars.offset != 0 public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) { int out_offset = chars.Offset = 0; char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length); int limit = offset + length; while (offset < limit) { int b = ((sbyte)utf8[offset++]) & 0xff; if (b < 0xc0) { Debug.Assert(b < 0x80); @out[out_offset++] = (char)b; } else if (b < 0xe0) { @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f)); } else if (b < 0xf0) { @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f)); offset += 2; } else { Debug.Assert(b < 0xf8, "b = 0x" + b.ToString("x")); int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f); offset += 3; if (ch < UNI_MAX_BMP) { @out[out_offset++] = (char)ch; } else { int chHalf = ch - 0x0010000; @out[out_offset++] = (char)((chHalf >> 10) + 0xD800); @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00); } } } chars.Length = out_offset - chars.Offset; }
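The common caller pattern, visible in several snippets in this section, decodes a term's UTF-8 bytes into a reusable CharsRef; a brief sketch (the term value is illustrative):

using Lucene.Net.Util;

var spare = new CharsRef();
BytesRef term = new BytesRef("example");   // in practice, the current term from a TermsEnum
UnicodeUtil.UTF8toUTF16(term.Bytes, term.Offset, term.Length, spare);
string text = spare.ToString();            // "example"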
private CharsRef ParseSynonym(string line, CharsRef reuse) { if (reuse == null) { reuse = new CharsRef(8); } int start = line.IndexOf('\'') + 1; int end = line.LastIndexOf('\''); string text = line.Substring(start, end - start).Replace("''", "'"); return Analyze(text, reuse); }
/// <summary> /// Utility method for <see cref="UTF8toUTF16(byte[], int, int, CharsRef)"/>. /// </summary> /// <seealso cref="UTF8toUTF16(byte[], int, int, CharsRef)"/> public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars) { UTF8toUTF16(bytesRef.Bytes, bytesRef.Offset, bytesRef.Length, chars); }
public override List<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool higherWeightsFirst, int num) { if (contexts != null) { throw new ArgumentException("this suggester doesn't support contexts"); } IList<FSTCompletion.Completion> completions; if (higherWeightsFirst) { completions = higherWeightsCompletion.DoLookup(key, num); } else { completions = normalCompletion.DoLookup(key, num); } List<LookupResult> results = new List<LookupResult>(completions.Count); CharsRef spare = new CharsRef(); foreach (FSTCompletion.Completion c in completions) { spare.Grow(c.utf8.Length); UnicodeUtil.UTF8toUTF16(c.utf8, spare); results.Add(new LookupResult(spare.ToString(), c.bucket)); } return results; }
public virtual void TestAllUnicodeChars() { BytesRef utf8 = new BytesRef(10); CharsRef utf16 = new CharsRef(10); char[] chars = new char[2]; for (int ch = 0; ch < 0x0010FFFF; ch++) { if (ch == 0xd800) // Skip invalid code points { ch = 0xe000; } int len = 0; if (ch <= 0xffff) { chars[len++] = (char)ch; } else { chars[len++] = (char)(((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START); chars[len++] = (char)(((ch - 0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START); } UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8); string s1 = new string(chars, 0, len); string s2 = Encoding.UTF8.GetString(utf8.Bytes, utf8.Offset, utf8.Length); Assert.AreEqual(s1, s2, "codepoint " + ch); UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16); Assert.AreEqual(s1, new string(utf16.Chars, 0, utf16.Length), "codepoint " + ch); var b = s1.GetBytes(Encoding.UTF8); Assert.AreEqual(utf8.Length, b.Length); for (int j = 0; j < utf8.Length; j++) { Assert.AreEqual(utf8.Bytes[j], b[j]); } } }
public virtual void Add(char[] output, int offset, int len, int endOffset, int posLength) { if (count == outputs.Length) { CharsRef[] next = new CharsRef[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; Array.Copy(outputs, 0, next, 0, count); outputs = next; } if (count == endOffsets.Length) { int[] next = new int[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_INT)]; Array.Copy(endOffsets, 0, next, 0, count); endOffsets = next; } if (count == posLengths.Length) { int[] next = new int[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_INT)]; Array.Copy(posLengths, 0, next, 0, count); posLengths = next; } if (outputs[count] == null) { outputs[count] = new CharsRef(); } outputs[count].CopyChars(output, offset, len); // endOffset can be -1, in which case we should simply // use the endOffset of the input token, or X >= 0, in // which case we use X as the endOffset for this output endOffsets[count] = endOffset; posLengths[count] = posLength; count++; }
public override void Parse(TextReader @in) { int lineNumber = 0; try { string line = null; while ((line = @in.ReadLine()) != null) { lineNumber++; if (line.Length == 0 || line[0] == '#') { continue; // ignore empty lines and comments } CharsRef[] inputs; CharsRef[] outputs; // TODO: we could process this more efficiently. string[] sides = Split(line, "=>"); if (sides.Length > 1) // explicit mapping { if (sides.Length != 2) { throw new System.ArgumentException("more than one explicit mapping specified on the same line"); } string[] inputStrings = Split(sides[0], ","); inputs = new CharsRef[inputStrings.Length]; for (int i = 0; i < inputs.Length; i++) { inputs[i] = Analyze(Unescape(inputStrings[i]).Trim(), new CharsRef()); } string[] outputStrings = Split(sides[1], ","); outputs = new CharsRef[outputStrings.Length]; for (int i = 0; i < outputs.Length; i++) { outputs[i] = Analyze(Unescape(outputStrings[i]).Trim(), new CharsRef()); } } else { string[] inputStrings = Split(line, ","); inputs = new CharsRef[inputStrings.Length]; for (int i = 0; i < inputs.Length; i++) { inputs[i] = Analyze(Unescape(inputStrings[i]).Trim(), new CharsRef()); } if (expand) { outputs = inputs; } else { outputs = new CharsRef[1]; outputs[0] = inputs[0]; } } // currently we include the term itself in the map, // and use includeOrig = false always. // this is how the existing filter does it, but its actually a bug, // especially if combined with ignoreCase = true for (int i = 0; i < inputs.Length; i++) { for (int j = 0; j < outputs.Length; j++) { Add(inputs[i], outputs[j], false); } } } } catch (System.ArgumentException e) { throw new Exception("Invalid synonym rule at line " + lineNumber, e); //ex.initCause(e); //throw ex; } finally { @in.Dispose(); } }
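A hedged end-to-end sketch of the Solr rule syntax this parser accepts (comma-separated equivalents, optional "=>" explicit mappings, '#' comments); the rules and analyzer choice are illustrative:

using System.IO;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Synonym;
using Lucene.Net.Util;

string rules =
    "# comments and blank lines are ignored\n" +
    "i-pod, ipod, i pod\n" +          // all three become equivalent when expand = true
    "sea biscuit => seabiscuit\n";    // explicit one-way mapping
var parser = new SolrSynonymParser(true, true, new WhitespaceAnalyzer(LuceneVersion.LUCENE_48)); // dedup, expand
parser.Parse(new StringReader(rules));
SynonymMap map = parser.Build();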
/// <summary> /// Decompress the byte array previously returned by /// compressString back into a String /// </summary> public static string DecompressString(byte[] value, int offset, int length) { byte[] bytes = Decompress(value, offset, length); CharsRef result = new CharsRef(bytes.Length); UnicodeUtil.UTF8toUTF16(bytes, 0, bytes.Length, result); return new string(result.Chars, 0, result.Length); }
internal virtual int countWords(CharsRef chars) { int wordCount = 1; int upto = chars.Offset; int limit = chars.Offset + chars.Length; while (upto < limit) { if (chars.Chars[upto++] == SynonymMap.WORD_SEPARATOR) { wordCount++; } } return wordCount; }
/// <summary> /// Add a phrase->phrase synonym mapping. /// Phrases are character sequences where words are /// separated with character zero (U+0000). Empty words /// (two U+0000s in a row) are not allowed in the input nor /// the output! /// </summary> /// <param name="input"> input phrase </param> /// <param name="output"> output phrase </param> /// <param name="includeOrig"> true if the original should be included </param> public virtual void Add(CharsRef input, CharsRef output, bool includeOrig) { Add(input, countWords(input), output, countWords(output), includeOrig); }
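A short sketch combining Add with the SynonymFilter consumer, mirroring the test at the top of this section; the token text and ignoreCase choice are illustrative:

using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Synonym;
using Lucene.Net.Util;

var b = new SynonymMap.Builder(true);                      // true = dedup
b.Add(new CharsRef("one"), new CharsRef("first"), true);   // single-word rule, keep the original token
SynonymMap map = b.Build();

var tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_48, new StringReader("one two three"));
TokenStream stream = new SynonymFilter(tokenizer, map, true); // true = ignoreCase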
/// <summary> /// Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the /// given selection of fields from terms with a document frequency greater than /// the given maxDocFreq /// </summary> /// <param name="matchVersion"> Version to be used in <seealso cref="StopFilter"/> </param> /// <param name="delegate"> Analyzer whose TokenStream will be filtered </param> /// <param name="indexReader"> IndexReader to identify the stopwords from </param> /// <param name="fields"> Selection of fields to calculate stopwords for </param> /// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param> /// <exception cref="IOException"> Can be thrown while reading from the IndexReader </exception> public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, ICollection<string> fields, int maxDocFreq) : base(@delegate.Strategy) { this.matchVersion = matchVersion; this.@delegate = @delegate; foreach (string field in fields) { var stopWords = new HashSet<string>(); Terms terms = MultiFields.GetTerms(indexReader, field); CharsRef spare = new CharsRef(); if (terms != null) { TermsEnum te = terms.Iterator(null); BytesRef text; while ((text = te.Next()) != null) { if (te.DocFreq() > maxDocFreq) { UnicodeUtil.UTF8toUTF16(text, spare); stopWords.Add(spare.ToString()); } } } stopWordsPerField[field] = stopWords; } }
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET: //ORIGINAL LINE: private void addInternal(java.io.BufferedReader in) throws java.io.IOException private void addInternal(BufferedReader @in) { string line = null; while ((line = @in.readLine()) != null) { if (line.Length == 0 || line[0] == '#') { continue; // ignore empty lines and comments } CharsRef[] inputs; CharsRef[] outputs; // TODO: we could process this more efficiently. string[] sides = Split(line, "=>"); if (sides.Length > 1) // explicit mapping { if (sides.Length != 2) { throw new System.ArgumentException("more than one explicit mapping specified on the same line"); } string[] inputStrings = Split(sides[0], ","); inputs = new CharsRef[inputStrings.Length]; for (int i = 0; i < inputs.Length; i++) { inputs[i] = analyze(unescape(inputStrings[i]).Trim(), new CharsRef()); } string[] outputStrings = Split(sides[1], ","); outputs = new CharsRef[outputStrings.Length]; for (int i = 0; i < outputs.Length; i++) { outputs[i] = analyze(unescape(outputStrings[i]).Trim(), new CharsRef()); } } else { string[] inputStrings = Split(line, ","); inputs = new CharsRef[inputStrings.Length]; for (int i = 0; i < inputs.Length; i++) { inputs[i] = analyze(unescape(inputStrings[i]).Trim(), new CharsRef()); } if (expand) { outputs = inputs; } else { outputs = new CharsRef[1]; outputs[0] = inputs[0]; } } // currently we include the term itself in the map, // and use includeOrig = false always. // this is how the existing filter does it, but its actually a bug, // especially if combined with ignoreCase = true for (int i = 0; i < inputs.Length; i++) { for (int j = 0; j < outputs.Length; j++) { add(inputs[i], outputs[j], false); } } } }
/// <summary> /// only used for asserting! /// </summary> internal virtual bool HasHoles(CharsRef chars) { int end = chars.Offset + chars.Length; for (int idx = chars.Offset + 1;idx < end;idx++) { if (chars.Chars[idx] == SynonymMap.WORD_SEPARATOR && chars.Chars[idx - 1] == SynonymMap.WORD_SEPARATOR) { return true; } } if (chars.Chars[chars.Offset] == '\u0000') { return true; } if (chars.Chars[chars.Offset + chars.Length - 1] == '\u0000') { return true; } return false; }
public virtual void TestRandomUnicodeStrings() { char[] buffer = new char[20]; char[] expected = new char[20]; BytesRef utf8 = new BytesRef(20); CharsRef utf16 = new CharsRef(20); int num = AtLeast(100000); for (int iter = 0; iter < num; iter++) { bool hasIllegal = FillUnicode(buffer, expected, 0, 20); UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8); if (!hasIllegal) { var b = (new string(buffer, 0, 20)).GetBytes(IOUtils.CHARSET_UTF_8); Assert.AreEqual(b.Length, utf8.Length); for (int i = 0; i < b.Length; i++) { Assert.AreEqual(b[i], utf8.Bytes[i]); } } UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16); Assert.AreEqual(utf16.Length, 20); for (int i = 0; i < 20; i++) { Assert.AreEqual(expected[i], utf16.Chars[i]); } } }
public virtual void TestUTF8UTF16CharsRef() { int num = AtLeast(3989); for (int i = 0; i < num; i++) { string unicode = TestUtil.RandomRealisticUnicodeString(Random()); BytesRef @ref = new BytesRef(unicode); char[] arr = new char[1 + Random().Next(100)]; int offset = Random().Next(arr.Length); int len = Random().Next(arr.Length - offset); CharsRef cRef = new CharsRef(arr, offset, len); UnicodeUtil.UTF8toUTF16(@ref, cRef); Assert.AreEqual(cRef.ToString(), unicode); } }
public override void Build(IInputIterator tfit) { if (tfit.HasPayloads) { throw new System.ArgumentException("this suggester doesn't support payloads"); } if (tfit.HasContexts) { throw new System.ArgumentException("this suggester doesn't support contexts"); } root = new TernaryTreeNode(); // buffer first #pragma warning disable 612, 618 if (tfit.Comparator != BytesRef.UTF8SortedAsUTF16Comparer) { // make sure it's sorted and the comparator uses UTF16 sort order tfit = new SortedInputIterator(tfit, BytesRef.UTF8SortedAsUTF16Comparer); } #pragma warning restore 612, 618 List<string> tokens = new List<string>(); List<object> vals = new List<object>(); // LUCENENET TODO: Should this be long? in Java it was Number, but we can probably do better than object BytesRef spare; CharsRef charsSpare = new CharsRef(); while ((spare = tfit.Next()) != null) { charsSpare.Grow(spare.Length); UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare); tokens.Add(charsSpare.ToString()); vals.Add(Convert.ToInt64(tfit.Weight)); } autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count - 1, root); }
public override int Read() { //System.out.println("\nread"); while (true) { if (replacement != null && replacementPointer < replacement.Length) { //System.out.println(" return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]); return replacement.chars[replacement.offset + replacementPointer++]; } // TODO: a more efficient approach would be Aho/Corasick's // algorithm // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm) // or this generalizatio: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps // // I think this would be (almost?) equivalent to 1) adding // epsilon arcs from all final nodes back to the init // node in the FST, 2) adding a .* (skip any char) // loop on the initial node, and 3) determinizing // that. Then we would not have to restart matching // at each position. int lastMatchLen = -1; CharsRef lastMatch = null; int firstCH = buffer.Get(inputOff); if (firstCH != -1) { FST.Arc<CharsRef> arc = cachedRootArcs[Convert.ToChar((char) firstCH)]; if (arc != null) { if (!FST.TargetHasArcs(arc)) { // Fast pass for single character match: Debug.Assert(arc.Final); lastMatchLen = 1; lastMatch = arc.Output; } else { int lookahead = 0; CharsRef output = arc.Output; while (true) { lookahead++; if (arc.Final) { // Match! (to node is final) lastMatchLen = lookahead; lastMatch = outputs.Add(output, arc.NextFinalOutput); // Greedy: keep searching to see if there's a // longer match... } if (!FST.TargetHasArcs(arc)) { break; } int ch = buffer.Get(inputOff + lookahead); if (ch == -1) { break; } if ((arc = map.FindTargetArc(ch, arc, scratchArc, fstReader)) == null) { // Dead end break; } output = outputs.Add(output, arc.Output); } } } } if (lastMatch != null) { inputOff += lastMatchLen; //System.out.println(" match! len=" + lastMatchLen + " repl=" + lastMatch); int diff = lastMatchLen - lastMatch.Length; if (diff != 0) { int prevCumulativeDiff = LastCumulativeDiff; if (diff > 0) { // Replacement is shorter than matched input: AddOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff); } else { // Replacement is longer than matched input: remap // the "extra" chars all back to the same input // offset: //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final': //ORIGINAL LINE: final int outputStart = inputOff - prevCumulativeDiff; int outputStart = inputOff - prevCumulativeDiff; for (int extraIDX = 0;extraIDX < -diff;extraIDX++) { AddOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1); } } } replacement = lastMatch; replacementPointer = 0; } else { int ret = buffer.Get(inputOff); if (ret != -1) { inputOff++; buffer.FreeBefore(inputOff); } return ret; } } }
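The Read loop above walks an FST of character mappings; a hedged sketch of how such a mapping char filter is typically built and fed, assuming the NormalizeCharMap/MappingCharFilter API (the mappings and input text are illustrative):

using System.IO;
using Lucene.Net.Analysis.CharFilters;

var builder = new NormalizeCharMap.Builder();
builder.Add("&", "and");   // replacement longer than the match
builder.Add("ß", "ss");
var filter = new MappingCharFilter(builder.Build(), new StringReader("fish & chips"));
// Reading `filter` to the end yields "fish and chips"; the offset corrections map output
// positions back to the original input, e.g. for highlighting.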
/// <summary> /// Sugar: analyzes the text with the analyzer and /// separates by <seealso cref="SynonymMap#WORD_SEPARATOR"/>. /// reuse and its chars must not be null. /// </summary> public virtual CharsRef Analyze(string text, CharsRef reuse) { IOException priorException = null; TokenStream ts = analyzer.TokenStream("", text); try { var termAtt = ts.AddAttribute < ICharTermAttribute>(); var posIncAtt = ts.AddAttribute < IPositionIncrementAttribute>(); ts.Reset(); reuse.Length = 0; while (ts.IncrementToken()) { int length = termAtt.Length; if (length == 0) { throw new System.ArgumentException("term: " + text + " analyzed to a zero-length token"); } if (posIncAtt.PositionIncrement != 1) { throw new System.ArgumentException("term: " + text + " analyzed to a token with posinc != 1"); } reuse.Grow(reuse.Length + length + 1); // current + word + separator int end = reuse.Offset + reuse.Length; if (reuse.Length > 0) { reuse.Chars[end++] = SynonymMap.WORD_SEPARATOR; reuse.Length++; } Array.Copy(termAtt.Buffer(), 0, reuse.Chars, end, length); reuse.Length += length; } ts.End(); } catch (IOException e) { priorException = e; } finally { IOUtils.CloseWhileHandlingException(priorException, ts); } if (reuse.Length == 0) { throw new System.ArgumentException("term: " + text + " was completely eliminated by analyzer"); } return reuse; }
/// <summary> /// Sugar: just joins the provided terms with /// <see cref="SynonymMap.WORD_SEPARATOR"/>. reuse and its chars /// must not be null. /// </summary> public static CharsRef join(string[] words, CharsRef reuse) { int upto = 0; char[] buffer = reuse.Chars; foreach (string word in words) { int wordLen = word.Length; int needed = (0 == upto ? wordLen : 1 + upto + wordLen); // Add 1 for WORD_SEPARATOR if (needed > buffer.Length) { reuse.Grow(needed); buffer = reuse.Chars; } if (upto > 0) { buffer[upto++] = SynonymMap.WORD_SEPARATOR; } word.CopyTo(0, buffer, upto, wordLen - 0); upto += wordLen; } reuse.Length = upto; return reuse; }