示例#1
0
        /// <summary>
        /// LimitTokenPositionFilter with a limit of 3 must stop emitting tokens from a
        /// multi-word synonym once their position exceeds 3; exercised both when the
        /// stream is fully consumed and when it is not.
        /// </summary>
        public virtual void TestMaxPosition3WithSynomyms()
        {
            foreach (bool consumeAll in new bool[] { true, false })
            {
                MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false);
                // if we are consuming all tokens, we can use the checks, otherwise we can't
                tokenizer.EnableChecks = consumeAll;

                // "one" maps to three single-word synonyms plus the 4-word synonym
                // "and indubitably single only"; "two" maps to "dopple ganger".
                // The trailing 'true' arguments keep the original token as well.
                SynonymMap.Builder builder = new SynonymMap.Builder(true);
                builder.Add(new CharsRef("one"), new CharsRef("first"), true);
                builder.Add(new CharsRef("one"), new CharsRef("alpha"), true);
                builder.Add(new CharsRef("one"), new CharsRef("beguine"), true);
                CharsRef multiWordCharsRef = new CharsRef();
                SynonymMap.Builder.Join(new string[] { "and", "indubitably", "single", "only" }, multiWordCharsRef);
                builder.Add(new CharsRef("one"), multiWordCharsRef, true);
                SynonymMap.Builder.Join(new string[] { "dopple", "ganger" }, multiWordCharsRef);
                builder.Add(new CharsRef("two"), multiWordCharsRef, true);
                SynonymMap  synonymMap = builder.Build();
                TokenStream stream     = new SynonymFilter(tokenizer, synonymMap, true);
                stream = new LimitTokenPositionFilter(stream, 3, consumeAll);

                // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
                AssertTokenStreamContents(stream, new string[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" }, new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 });
            }
        }
示例#2
0
        /// <summary>
        /// Builds the suggester's ternary tree from the given term/weight source.
        /// Terms are re-sorted into UTF-16 order if necessary, buffered into
        /// parallel token/weight lists, and then inserted as a balanced tree.
        /// </summary>
        /// <param name="tfit"> Term/weight source; payloads and contexts are not supported. </param>
        /// <exception cref="System.ArgumentException"> If the source has payloads or contexts. </exception>
        public override void Build(IInputIterator tfit)
        {
            if (tfit.HasPayloads)
            {
                throw new System.ArgumentException("this suggester doesn't support payloads");
            }
            if (tfit.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }
            root = new TernaryTreeNode();
            // buffer first
#pragma warning disable 612, 618
            if (tfit.Comparer != BytesRef.UTF8SortedAsUTF16Comparer)
            {
                // make sure it's sorted and the comparer uses UTF16 sort order
                tfit = new SortedInputIterator(tfit, BytesRef.UTF8SortedAsUTF16Comparer);
            }
#pragma warning restore 612, 618

            List <string> tokens = new List <string>();
            List <object> vals   = new List <object>();
            BytesRef      spare;
            CharsRef      charsSpare = new CharsRef();
            while ((spare = tfit.Next()) != null)
            {
                // decode the UTF-8 term into UTF-16 before storing it as a string
                charsSpare.Grow(spare.Length);
                UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
                tokens.Add(charsSpare.ToString());
                vals.Add(Convert.ToInt64(tfit.Weight));
            }
            // middle-out insertion (0 .. Count-1) keeps the ternary tree balanced
            autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count - 1, root);
        }
示例#3
0
        /// <summary>
        /// Creates a new <see cref="QueryAutoStopWordAnalyzer"/> with stopwords calculated for the
        /// given selection of fields from terms with a document frequency greater than
        /// the given <paramref name="maxDocFreq"/>
        /// </summary>
        /// <param name="matchVersion"> Version to be used in <see cref="StopFilter"/> </param>
        /// <param name="delegate"> Analyzer whose TokenStream will be filtered </param>
        /// <param name="indexReader"> <see cref="IndexReader"/> to identify the stopwords from </param>
        /// <param name="fields"> Selection of fields to calculate stopwords for </param>
        /// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param>
        /// <exception cref="IOException"> Can be thrown while reading from the <see cref="IndexReader"/> </exception>
        public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, ICollection <string> fields, int maxDocFreq)
            : base(@delegate.Strategy)
        {
            this.matchVersion = matchVersion;
            this.@delegate    = @delegate;

            // Hoisted out of the loop: the scratch CharsRef is fully overwritten by
            // UTF8toUTF16 on each use, so a single instance can be reused across all
            // fields instead of allocating a fresh one per field.
            CharsRef spare = new CharsRef();
            foreach (string field in fields)
            {
                var   stopWords = new JCG.HashSet <string>();
                Terms terms     = MultiFields.GetTerms(indexReader, field);
                if (terms != null)
                {
                    TermsEnum te = terms.GetEnumerator();
                    while (te.MoveNext())
                    {
                        // any term appearing in more than maxDocFreq documents is a stopword
                        if (te.DocFreq > maxDocFreq)
                        {
                            UnicodeUtil.UTF8toUTF16(te.Term, spare);
                            stopWords.Add(spare.ToString());
                        }
                    }
                }
                stopWordsPerField[field] = stopWords;
            }
        }
示例#4
0
 /// <summary>
 /// Copy <paramref name="current"/> into an internal buffer.
 /// </summary>
 /// <returns> Always <c>true</c>, so the call can be wrapped in an assert. </returns>
 private bool SetPrevious(CharsRef current)
 {
     // don't need to copy, once we fix https://issues.apache.org/jira/browse/LUCENE-3277
     // still, called only from assert
     previous = CharsRef.DeepCopyOf(current);
     return(true);
 }
示例#5
0
            /// <summary>
            /// Sugar: just joins the provided terms with
            /// <see cref="SynonymMap.WORD_SEPARATOR"/>. reuse and its chars
            /// must not be null.
            /// </summary>
            public static CharsRef Join(string[] words, CharsRef reuse)
            {
                int length = 0;

                char[] dest = reuse.Chars;
                foreach (string word in words)
                {
                    int wordLen = word.Length;
                    // capacity needed: existing content + one separator (after the
                    // first word) + this word
                    int required = length == 0 ? wordLen : length + 1 + wordLen;
                    if (required > dest.Length)
                    {
                        reuse.Grow(required);
                        dest = reuse.Chars; // Grow may reallocate the backing array
                    }
                    if (length > 0)
                    {
                        dest[length++] = SynonymMap.WORD_SEPARATOR;
                    }

                    word.CopyTo(0, dest, length, wordLen);
                    length += wordLen;
                }
                reuse.Length = length;
                return reuse;
            }
        /// <summary>
        /// Verifies that LimitTokenPositionFilter (limit 3) drops tokens from a
        /// multi-word synonym expansion once their position passes the limit,
        /// both with and without consuming the remainder of the stream.
        /// </summary>
        public virtual void TestMaxPosition3WithSynomyms()
        {
            foreach (bool consumeAll in new bool[] { true, false })
            {
                MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false);
                // if we are consuming all tokens, we can use the checks, otherwise we can't
                tokenizer.EnableChecks = consumeAll;

                // "one" expands to three single-word synonyms plus a 4-word synonym;
                // "two" expands to a 2-word synonym; originals are kept (true).
                SynonymMap.Builder builder = new SynonymMap.Builder(true);
                builder.Add(new CharsRef("one"), new CharsRef("first"), true);
                builder.Add(new CharsRef("one"), new CharsRef("alpha"), true);
                builder.Add(new CharsRef("one"), new CharsRef("beguine"), true);
                CharsRef multiWordCharsRef = new CharsRef();
                SynonymMap.Builder.Join(new string[] { "and", "indubitably", "single", "only" }, multiWordCharsRef);
                builder.Add(new CharsRef("one"), multiWordCharsRef, true);
                SynonymMap.Builder.Join(new string[] { "dopple", "ganger" }, multiWordCharsRef);
                builder.Add(new CharsRef("two"), multiWordCharsRef, true);
                SynonymMap synonymMap = builder.Build();
                TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
                stream = new LimitTokenPositionFilter(stream, 3, consumeAll);

                // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
                AssertTokenStreamContents(stream, new string[] { "one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger" }, new int[] { 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0 });
            }
        }
示例#7
0
        /// <summary>
        /// Round-trips random 20-char UTF-16 buffers through UTF16toUTF8/UTF8toUTF16.
        /// When the random buffer contains no illegal (unpaired surrogate) sequences,
        /// the UTF-8 encoding is also compared byte-for-byte against the framework encoder.
        /// </summary>
        public virtual void TestRandomUnicodeStrings()
        {
            char[] buffer   = new char[20];
            char[] expected = new char[20];

            BytesRef utf8  = new BytesRef(20);
            CharsRef utf16 = new CharsRef(20);

            int num = AtLeast(100000);

            for (int iter = 0; iter < num; iter++)
            {
                // FillUnicode reports whether it produced an illegal UTF-16 sequence
                bool hasIllegal = FillUnicode(buffer, expected, 0, 20);

                UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
                if (!hasIllegal)
                {
#pragma warning disable 612, 618
                    var b = (new string(buffer, 0, 20)).GetBytes(IOUtils.CHARSET_UTF_8);
#pragma warning restore 612, 618
                    // must agree with the framework's UTF-8 encoding exactly
                    Assert.AreEqual(b.Length, utf8.Length);
                    for (int i = 0; i < b.Length; i++)
                    {
                        Assert.AreEqual(b[i], utf8.Bytes[i]);
                    }
                }

                // decoding back must reproduce the expected chars exactly
                UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16);
                Assert.AreEqual(utf16.Length, 20);
                for (int i = 0; i < 20; i++)
                {
                    Assert.AreEqual(expected[i], utf16.Chars[i]);
                }
            }
        }
示例#8
0
        /// <summary>
        /// Verifies CharAt returns the correct character for a valid index and
        /// throws <see cref="System.IndexOutOfRangeException"/> for out-of-bounds
        /// indices (-1 and Length).
        /// </summary>
        public virtual void TestCharSequenceCharAt()
        {
            CharsRef c = new CharsRef("abc");

            Assert.AreEqual('b', c.CharAt(1));

            try
            {
                c.CharAt(-1);
                Assert.Fail();
            }
            catch (System.IndexOutOfRangeException) // LUCENENET: unused variable removed (CS0168)
            {
                // expected exception
            }

            try
            {
                c.CharAt(3); // one past the last valid index
                Assert.Fail();
            }
            catch (System.IndexOutOfRangeException)
            {
                // expected exception
            }
        }
示例#9
0
 /// <summary>
 /// Appending chars to a CharsRef that views a sub-range ("bcd" of 'a'..'d')
 /// must yield "bcde", i.e. Append respects the ref's offset/length window.
 /// </summary>
 public virtual void TestAppendChars()
 {
     char[] chars = new char[] { 'a', 'b', 'c', 'd' };
     CharsRef c = new CharsRef(chars, 1, 3); // bcd
     c.Append(new char[] { 'e' }, 0, 1);
     Assert.AreEqual("bcde", c.ToString());
 }
示例#10
0
#pragma warning restore 612, 618

        /// <summary>
        /// Add another character sequence to this automaton. The sequence must be
        /// lexicographically larger or equal compared to any previous sequences added
        /// to this automaton (the input must be sorted).
        /// </summary>
        public void Add(CharsRef current)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(stateRegistry != null, "Automaton already built.");
                Debugging.Assert(previous == null || comparer.Compare(previous, current) <= 0, "Input must be in sorted UTF-8 order: {0} >= {1}", previous, current);
                // SetPrevious always returns true; wrapped so the copy only happens with asserts on
                Debugging.Assert(SetPrevious(current));
            }

            // Descend in the automaton (find matching prefix).
            int   pos = 0, max = current.Length;
            State next, state = root;

            // walk code point by code point while an outgoing edge matches
            while (pos < max && (next = state.LastChild(Character.CodePointAt(current, pos))) != null)
            {
                state = next;
                // todo, optimize me
                pos += Character.CharCount(Character.CodePointAt(current, pos));
            }

            // the previous input's suffix below this state is final now;
            // merge equivalent subtrees before branching off
            if (state.HasChildren)
            {
                ReplaceOrRegister(state);
            }

            // append the remaining (unmatched) suffix of the current sequence
            AddSuffix(state, current, pos);
        }
示例#11
0
	  /// <summary>
	  /// Resets the filter for a fresh stream: resets the wrapped input,
	  /// re-attaches the lookahead buffer to it and clears pending state.
	  /// </summary>
	  public override void Reset()
	  {
		input.Reset();
		buffer.reset(input); // NOTE(review): lowercase 'reset' looks like raw Java-converter output — confirm the buffer type's API
		replacement = null;
		inputOff = 0;
	  }
示例#12
0
 /// <summary>
 /// Builds an analyzer with the given stop words, stem exclusion table and stem
 /// override dictionary. For Lucene 3.1+ with a non-empty override dictionary the
 /// overrides are compiled into a StemmerOverrideFilter builder; otherwise they
 /// are kept as an unmodifiable map.
 /// NOTE(review): the lowercase member calls (unmodifiableSet, onOrAfter, hasNext,
 /// copyChars, ...) look like raw Java-converter output — confirm they compile
 /// against this port's API before relying on this snippet.
 /// </summary>
 public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap <string> stemOverrideDict)
 {
     this.matchVersion = matchVersion;
     // defensive copies so callers cannot mutate the analyzer's tables afterwards
     this.stoptable    = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
     this.excltable    = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
     if (stemOverrideDict.Empty || !matchVersion.onOrAfter(Version.LUCENE_31))
     {
         // pre-3.1 (or no overrides): keep the raw map, no compiled filter
         this.stemdict     = null;
         this.origStemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict));
     }
     else
     {
         this.origStemdict = null;
         // we don't need to ignore case here since we lowercase in this analyzer anyway
         StemmerOverrideFilter.Builder        builder = new StemmerOverrideFilter.Builder(false);
         CharArrayMap <string> .EntryIterator iter    = stemOverrideDict.entrySet().GetEnumerator();
         CharsRef spare = new CharsRef();
         while (iter.hasNext())
         {
             char[] nextKey = iter.nextKey();
             spare.copyChars(nextKey, 0, nextKey.Length);
             builder.add(spare, iter.currentValue());
         }
         try
         {
             this.stemdict = builder.build();
         }
         catch (IOException ex)
         {
             // building the override FST failed; surface as a runtime error
             throw new Exception("can not build stem dict", ex);
         }
     }
 }
示例#13
0
        /// <summary>
        /// Converts an FST output pair into a <c>LookupResult</c>. When payloads are
        /// enabled, <paramref name="output2"/> holds "surface-form PAYLOAD_SEP payload"
        /// and is split at the separator byte; otherwise all of output2 is the surface form.
        /// </summary>
        /// <param name="output1"> Encoded weight; decoded via DecodeWeight. </param>
        /// <param name="output2"> UTF-8 surface form, optionally followed by separator + payload. </param>
        /// <param name="spare"> Scratch buffer reused for the UTF-8 to UTF-16 conversion. </param>
        private LookupResult GetLookupResult(long?output1, BytesRef output2, CharsRef spare)
        {
            LookupResult result;

            if (hasPayloads)
            {
                // locate the payload separator byte within output2
                int sepIndex = -1;
                for (int i = 0; i < output2.Length; i++)
                {
                    if (output2.Bytes[output2.Offset + i] == PAYLOAD_SEP)
                    {
                        sepIndex = i;
                        break;
                    }
                }
                Debug.Assert(sepIndex != -1);
                spare.Grow(sepIndex);

                // everything after the separator is the raw payload bytes
                int payloadLen = output2.Length - sepIndex - 1;
                UnicodeUtil.UTF8toUTF16(output2.Bytes, output2.Offset, sepIndex, spare);
                BytesRef payload = new BytesRef(payloadLen);
                Array.Copy(output2.Bytes, sepIndex + 1, payload.Bytes, 0, payloadLen);
                payload.Length = payloadLen;
                result         = new LookupResult(spare.ToString(), DecodeWeight(output1.GetValueOrDefault()), payload);
            }
            else
            {
                // no payloads: the whole output is the surface form
                spare.Grow(output2.Length);
                UnicodeUtil.UTF8toUTF16(output2, spare);
                result = new LookupResult(spare.ToString(), DecodeWeight(output1.GetValueOrDefault()));
            }

            return(result);
        }
示例#14
0
            /// <summary>
            /// Sugar: just joins the provided terms with {@link
            ///  SynonymMap#WORD_SEPARATOR}.  reuse and its chars
            ///  must not be null.
            /// </summary>
            public static CharsRef join(string[] words, CharsRef reuse)
            {
                int upto = 0;

                char[] buffer = reuse.chars;
                foreach (string word in words)
                {
                    int wordLen = word.Length;
                    // capacity needed: existing content + one separator (after the first word) + this word
                    int needed = (0 == upto ? wordLen : 1 + upto + wordLen);     // Add 1 for WORD_SEPARATOR
                    if (needed > buffer.Length)
                    {
                        reuse.grow(needed);
                        buffer = reuse.chars; // grow may have reallocated the backing array
                    }
                    if (upto > 0)
                    {
                        buffer[upto++] = SynonymMap.WORD_SEPARATOR;
                    }

                    word.CopyTo(0, buffer, upto, wordLen - 0);
                    upto += wordLen;
                }
                reuse.length = upto;
                return(reuse);
            }
示例#15
0
 /// <summary>
 /// Buffers one output token: its chars, end offset and position length.
 /// The three parallel arrays are grown on demand, and CharsRef slots are
 /// reused across cycles to avoid reallocation.
 /// </summary>
 /// <param name="endOffset"> -1 means "use the end offset of the input token". </param>
 public virtual void Add(char[] output, int offset, int len, int endOffset, int posLength)
 {
     // grow the outputs array if full
     if (count == outputs.Length)
     {
         CharsRef[] next = new CharsRef[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
         Array.Copy(outputs, 0, next, 0, count);
         outputs = next;
     }
     // grow the endOffsets array if full
     if (count == endOffsets.Length)
     {
         int[] next = new int[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_INT32)];
         Array.Copy(endOffsets, 0, next, 0, count);
         endOffsets = next;
     }
     // grow the posLengths array if full
     if (count == posLengths.Length)
     {
         int[] next = new int[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_INT32)];
         Array.Copy(posLengths, 0, next, 0, count);
         posLengths = next;
     }
     // lazily allocate the CharsRef slot; existing slots are reused
     if (outputs[count] == null)
     {
         outputs[count] = new CharsRef();
     }
     outputs[count].CopyChars(output, offset, len);
     // endOffset can be -1, in which case we should simply
     // use the endOffset of the input token, or X >= 0, in
     // which case we use X as the endOffset for this output
     endOffsets[count] = endOffset;
     posLengths[count] = posLength;
     count++;
 }
示例#16
0
        /// <summary>
        /// Builds the Jaspell ternary search trie from the given entries.
        /// Payloads and contexts are unsupported; a sorted source is deliberately
        /// re-wrapped as unsorted before insertion, and empty terms are skipped.
        /// </summary>
        /// <exception cref="ArgumentException"> If the source has payloads or contexts. </exception>
        public override void Build(InputIterator tfit)
        {
            if (tfit.HasPayloads)
            {
                throw new ArgumentException("this suggester doesn't support payloads");
            }
            if (tfit.Comparator != null)
            {
                // make sure it's unsorted
                // WTF - this could result in yet another sorted iteration....
                tfit = new UnsortedInputIterator(tfit);
            }
            if (tfit.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }
            count = 0;
            trie  = new JaspellTernarySearchTrie();
            trie.MatchAlmostDiff = editDistance;
            BytesRef spare;

            CharsRef charsSpare = new CharsRef();

            while ((spare = tfit.Next()) != null)
            {
                long weight = tfit.Weight;
                // skip empty terms — they cannot be looked up
                if (spare.Length == 0)
                {
                    continue;
                }
                // decode the UTF-8 term into UTF-16 before inserting as a string key
                charsSpare.Grow(spare.Length);
                UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
                trie.Put(charsSpare.ToString(), Convert.ToInt64(weight));
            }
        }
示例#17
0
        /// <summary>
        /// Looks up suggestions for <paramref name="key"/>, choosing between the
        /// weight-ordered and the natural-ordered completion source, and converts
        /// each UTF-8 completion back to a UTF-16 <c>LookupResult</c>.
        /// </summary>
        /// <exception cref="ArgumentException"> If contexts are supplied (unsupported). </exception>
        public override IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, bool higherWeightsFirst, int num)
        {
            if (contexts != null)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }

            // pick the completion source matching the requested ordering
            IList <FSTCompletion.Completion> completions = higherWeightsFirst
                ? higherWeightsCompletion.DoLookup(key, num)
                : normalCompletion.DoLookup(key, num);

            var results = new List <LookupResult>(completions.Count);
            var spare   = new CharsRef();

            foreach (FSTCompletion.Completion c in completions)
            {
                // decode the UTF-8 completion into UTF-16 before exposing it
                spare.Grow(c.Utf8.Length);
                UnicodeUtil.UTF8toUTF16(c.Utf8, spare);
                results.Add(new LookupResult(spare.ToString(), c.Bucket));
            }
            return results;
        }
示例#18
0
        /// <summary>
        /// Adds terms and frequencies found in vector into the <see cref="T:IDictionary{string, Int}"/> <paramref name="termFreqMap"/>
        /// </summary>
        /// <param name="termFreqMap"> a <see cref="T:IDictionary{string, Int}"/> of terms and their frequencies </param>
        /// <param name="vector"> List of terms and their frequencies for a doc/field </param>
        private void AddTermFrequencies(IDictionary <string, Int32> termFreqMap, Terms vector)
        {
            var      termsEnum = vector.GetIterator(null);
            var      spare     = new CharsRef();
            BytesRef text;

            while ((text = termsEnum.Next()) != null)
            {
                // decode the UTF-8 term bytes into UTF-16 before using as a string key
                UnicodeUtil.UTF8toUTF16(text, spare);
                var term = spare.ToString();
                // noise words (stopwords / too short / too long, etc.) are skipped
                if (IsNoiseWord(term))
                {
                    continue;
                }
                var freq = (int)termsEnum.TotalTermFreq;

                // increment frequency
                // NOTE(review): Int32 appears to be a mutable int wrapper (field 'x') — confirm
                Int32 cnt;
                if (!termFreqMap.TryGetValue(term, out cnt))
                {
                    cnt = new Int32();
                    termFreqMap[term] = cnt;
                    cnt.x             = freq;
                }
                else
                {
                    cnt.x += freq;
                }
            }
        }
示例#19
0
        /// <summary>
        /// Builds an analyzer with the given stop words and stem exclusion table,
        /// plus an optional stem-override dictionary: for Lucene 3.1+ the overrides
        /// are compiled into a <c>StemmerOverrideFilter</c>; for older versions they
        /// are kept as an unmodifiable map.
        /// </summary>
        public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap <string> stemOverrideDict)
        {
            this.matchVersion = matchVersion;
            // defensive copies so callers cannot mutate the analyzer's tables afterwards
            this.stoptable    = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stopwords));
            this.excltable    = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionTable));
#pragma warning disable 612, 618
            if (stemOverrideDict.Count == 0 || !matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                // pre-3.1 (or no overrides): keep the raw map, no compiled filter
                this.stemdict     = null;
                this.origStemdict = CharArrayMap.UnmodifiableMap(CharArrayMap.Copy(matchVersion, stemOverrideDict));
            }
            else
            {
                this.origStemdict = null;
                // we don't need to ignore case here since we lowercase in this analyzer anyway
                StemmerOverrideFilter.Builder        builder = new StemmerOverrideFilter.Builder(false);
                CharArrayMap <string> .EntryIterator iter    = (CharArrayMap <string> .EntryIterator)stemOverrideDict.EntrySet().GetEnumerator();
                CharsRef spare = new CharsRef();
                while (iter.HasNext)
                {
                    char[] nextKey = iter.NextKey();
                    spare.CopyChars(nextKey, 0, nextKey.Length);
                    // FIX: build the string from exactly spare.Length chars.
                    // 'new string(spare.Chars)' used the whole backing array, which
                    // CopyChars/Grow may over-allocate, so keys could carry trailing
                    // garbage once the scratch buffer grew past the key's length.
                    builder.Add(new string(spare.Chars, 0, spare.Length), iter.CurrentValue);
                }
                try
                {
                    this.stemdict = builder.Build();
                }
                catch (IOException ex)
                {
                    // building the override FST failed; surface as a runtime error
                    throw new Exception("can not build stem dict", ex);
                }
            }
        }
示例#20
0
 /// <summary>
 /// Resets this filter for a fresh stream: resets the wrapped input,
 /// re-attaches the lookahead buffer to it and clears pending state.
 /// </summary>
 public override void Reset()
 {
     input.Reset();
     buffer.Reset(input);
     replacement = null;
     inputOff    = 0;
 }
示例#21
0
        /// <summary>
        /// Decompress the <see cref="byte"/> array previously returned by
        /// <see cref="CompressString(string)"/> back into a <see cref="string"/>
        /// </summary>
        public static string DecompressString(byte[] value, int offset, int length)
        {
            byte[] decompressed = Decompress(value, offset, length);
            // convert the raw UTF-8 bytes to UTF-16 chars before building the string
            var utf16 = new CharsRef(decompressed.Length);
            UnicodeUtil.UTF8toUTF16(decompressed, 0, decompressed.Length, utf16);
            return new string(utf16.Chars, 0, utf16.Length);
        }
示例#22
0
 /// <summary>
 /// Resets this filter for a fresh stream: resets the underlying
 /// BufferedCharFilter, re-attaches the lookahead buffer and clears
 /// pending replacement state.
 /// </summary>
 public override void Reset()
 {
     // LUCENENET: reset the BufferedCharFilter.
     _input.Reset();
     buffer.Reset(_input);
     replacement = null;
     inputOff    = 0;
 }
        /// <summary>
        /// Reads the SimpleText live-docs file for <paramref name="info"/>: a SIZE
        /// header followed by one DOC line per live document, terminated by END.
        /// The file is read via a checksumming input and footer-verified.
        /// </summary>
        public override IBits ReadLiveDocs(Directory dir, SegmentCommitInfo info, IOContext context)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(info.HasDeletions);
            }
            // scratch buffers reused for each line read and each int parsed
            var scratch      = new BytesRef();
            var scratchUtf16 = new CharsRef();

            var fileName             = IndexFileNames.FileNameFromGeneration(info.Info.Name, LIVEDOCS_EXTENSION, info.DelGen);
            ChecksumIndexInput input = null;
            var success = false;

            try
            {
                input = dir.OpenChecksumInput(fileName, context);

                // header: "SIZE <n>" gives the bit set capacity
                SimpleTextUtil.ReadLine(input, scratch);
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(StringHelper.StartsWith(scratch, SIZE));
                }
                var size = ParseInt32At(scratch, SIZE.Length, scratchUtf16);

                var bits = new BitSet(size);

                // body: one "DOC <id>" line per live document, until END
                SimpleTextUtil.ReadLine(input, scratch);
                while (!scratch.Equals(END))
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(StringHelper.StartsWith(scratch, DOC));
                    }
                    var docid = ParseInt32At(scratch, DOC.Length, scratchUtf16);
                    bits.Set(docid);
                    SimpleTextUtil.ReadLine(input, scratch);
                }

                // verify the checksum footer before trusting the data
                SimpleTextUtil.CheckFooter(input);

                success = true;
                return(new SimpleTextBits(bits, size));
            }
            finally
            {
                // on failure, dispose without masking the original exception
                if (success)
                {
                    IOUtils.Dispose(input);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(input);
                }
            }
        }
示例#24
0
            // NOTE: while it's tempting to make this public, since
            // caller's parser likely knows the
            // numInput/numOutputWords, sneaky exceptions, much later
            // on, will result if these values are wrong; so we always
            // recompute ourselves to be safe:
            /// <summary>
            /// Registers one synonym rule mapping <paramref name="input"/> to
            /// <paramref name="output"/>. The output is interned (UTF-8) into the
            /// shared words hash, its ordinal is appended to the working-set entry
            /// for the input, and the horizontal context is widened as needed.
            /// </summary>
            /// <exception cref="ArgumentOutOfRangeException"> If a word count or length is not positive. </exception>
            internal virtual void Add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig)
            {
                // first convert to UTF-8
                if (numInputWords <= 0)
                {
                    throw new ArgumentOutOfRangeException(nameof(numInputWords), "numInputWords must be > 0 (got " + numInputWords + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
                }
                if (input.Length <= 0)
                {
                    throw new ArgumentOutOfRangeException(nameof(input.Length), "input.Length must be > 0 (got " + input.Length + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
                }
                if (numOutputWords <= 0)
                {
                    throw new ArgumentOutOfRangeException(nameof(numOutputWords), "numOutputWords must be > 0 (got " + numOutputWords + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
                }
                if (output.Length <= 0)
                {
                    throw new ArgumentOutOfRangeException(nameof(output.Length), "output.Length must be > 0 (got " + output.Length + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(!HasHoles(input), "input has holes: {0}", input);
                    Debugging.Assert(!HasHoles(output), "output has holes: {0}", output);
                }

                //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
                UnicodeUtil.UTF16toUTF8(output.Chars, output.Offset, output.Length, utf8Scratch);
                // lookup in hash
                int ord = words.Add(utf8Scratch);

                if (ord < 0)
                {
                    // already exists in our hash
                    ord = (-ord) - 1;
                    //System.out.println("  output=" + output + " old ord=" + ord);
                }
                else
                {
                    //System.out.println("  output=" + output + " new ord=" + ord);
                }

                // get or create the working-set entry for this input phrase
                if (!workingSet.TryGetValue(input, out MapEntry e) || e is null)
                {
                    e = new MapEntry();
                    workingSet[CharsRef.DeepCopyOf(input)] = e; // make a copy, since we will keep around in our map
                }

                e.ords.Add(ord);
                e.includeOrig       |= includeOrig;
                maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords);
                maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords);
            }
示例#25
0
        /// <summary>
        /// Emits any buffered stems first (as position-increment-0 tokens restored
        /// from the saved state); otherwise consumes the next input token, stems it
        /// (unless it is a keyword), replaces the token with the first stem and
        /// buffers the remaining stems — or discards them when longestOnly is set.
        /// </summary>
        public override bool IncrementToken()
        {
            // drain buffered stems from the previous input token first
            if (buffer != null && buffer.Count > 0)
            {
                CharsRef nextStem = buffer[0];
                buffer.RemoveAt(0);
                RestoreState(savedState);
                posIncAtt.PositionIncrement = 0; // stacked at the same position as the original
                termAtt.SetEmpty().Append(nextStem);
                return(true);
            }

            if (!m_input.IncrementToken())
            {
                return(false);
            }

            // keywords pass through unstemmed
            if (keywordAtt.IsKeyword)
            {
                return(true);
            }

            buffer = new JCG.List <CharsRef>(dedup ? stemmer.UniqueStems(termAtt.Buffer, termAtt.Length) : stemmer.Stem(termAtt.Buffer, termAtt.Length));

            if (buffer.Count == 0) // we do not know this word, return it unchanged
            {
                return(true);
            }

            // with longestOnly, sort so the longest stem comes first
            if (longestOnly && buffer.Count > 1)
            {
                buffer.Sort(lengthComparer);
            }

            CharsRef stem = buffer[0];

            buffer.RemoveAt(0);
            termAtt.SetEmpty().Append(stem);

            if (longestOnly)
            {
                // only the single (longest) stem is emitted; drop the rest
                buffer.Clear();
            }
            else
            {
                // remember the stream state so buffered stems can be restored later
                if (buffer.Count > 0)
                {
                    savedState = CaptureState();
                }
            }

            return(true);
        }
示例#26
0
            // NOTE: while it's tempting to make this public, since
            // caller's parser likely knows the
            // numInput/numOutputWords, sneaky exceptions, much later
            // on, will result if these values are wrong; so we always
            // recompute ourselves to be safe:
            /// <summary>
            /// Registers one synonym rule mapping <paramref name="input"/> to
            /// <paramref name="output"/>. The output is interned (UTF-8) into the
            /// shared words hash, its ordinal is appended to the working-set entry
            /// for the input, and the horizontal context is widened as needed.
            /// </summary>
            /// <exception cref="System.ArgumentException"> If a word count or length is not positive. </exception>
            internal virtual void Add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig)
            {
                // first convert to UTF-8
                if (numInputWords <= 0)
                {
                    throw new System.ArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
                }
                if (input.Length <= 0)
                {
                    throw new System.ArgumentException("input.length must be > 0 (got " + input.Length + ")");
                }
                if (numOutputWords <= 0)
                {
                    throw new System.ArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
                }
                if (output.Length <= 0)
                {
                    throw new System.ArgumentException("output.length must be > 0 (got " + output.Length + ")");
                }

                Debug.Assert(!HasHoles(input), "input has holes: " + input);
                Debug.Assert(!HasHoles(output), "output has holes: " + output);

                //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
                UnicodeUtil.UTF16toUTF8(output.Chars, output.Offset, output.Length, utf8Scratch);
                // lookup in hash
                int ord = words.Add(utf8Scratch);

                if (ord < 0)
                {
                    // already exists in our hash
                    ord = (-ord) - 1;
                    //System.out.println("  output=" + output + " old ord=" + ord);
                }
                else
                {
                    //System.out.println("  output=" + output + " new ord=" + ord);
                }

                // FIX: single TryGetValue lookup instead of ContainsKey + indexer
                // (the original performed two hash lookups per call — CA1854).
                if (!workingSet.TryGetValue(input, out MapEntry e) || e == null)
                {
                    e = new MapEntry();
                    workingSet[CharsRef.DeepCopyOf(input)] = e; // make a copy, since we will keep around in our map
                }

                e.ords.Add(ord);
                e.includeOrig       |= includeOrig;
                maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords);
                maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords);
            }
示例#27
0
 /// <summary>
 /// Returns the next buffered output and records its end offset and position
 /// length; resets the buffer once the last output has been handed out.
 /// Caller must ensure outputs remain (asserted: upto &lt; count).
 /// </summary>
 public virtual CharsRef PullNext()
 {
     if (Debugging.AssertsEnabled) Debugging.Assert(upto < count);
     lastEndOffset = endOffsets[upto];
     lastPosLength = posLengths[upto];
     CharsRef result = outputs[upto++];
     posIncr = 0; // subsequent outputs stack at the same position
     if (upto == count)
     {
         Reset();
     }
     return result;
 }
示例#28
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Emits any stems buffered from the previous input token first (as
        /// zero-position-increment tokens stacked on the same position), then pulls
        /// the next input token, stems it, emits the first stem and buffers the rest.
        /// NOTE(review): this is a mechanical Java port — restoreState/captureState,
        /// termAtt.setEmpty().append(...) and buffer.Remove(0) follow Java naming and
        /// Java List.remove(int) semantics; verify the buffer type actually exposes a
        /// CharsRef-returning Remove(int) before relying on this compiling.
        /// </summary>
        public override bool incrementToken()
        {
            // drain stems left over from the previous token
            if (buffer != null && buffer.Count > 0)
            {
                CharsRef nextStem = buffer.Remove(0);
                restoreState(savedState);
                posIncAtt.PositionIncrement = 0; // stack on the previous token's position
                termAtt.setEmpty().append(nextStem);
                return(true);
            }

            if (!input.incrementToken())
            {
                return(false);
            }

            // keyword-marked tokens pass through unstemmed
            if (keywordAtt.Keyword)
            {
                return(true);
            }

            buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length());

            if (buffer.Count == 0)     // we do not know this word, return it unchanged
            {
                return(true);
            }

            if (longestOnly && buffer.Count > 1)
            {
                buffer.Sort(lengthComparator);
            }

            CharsRef stem = buffer.Remove(0);

            termAtt.setEmpty().append(stem);

            if (longestOnly)
            {
                // only the longest stem is wanted: discard the remainder
                buffer.Clear();
            }
            else
            {
                if (buffer.Count > 0)
                {
                    // remember attribute state so the buffered stems can restore it
                    savedState = captureState();
                }
            }

            return(true);
        }
示例#29
0
        /// <summary>
        /// Extracts the single-quoted term from a synonym file line (un-escaping
        /// doubled quotes) and analyzes it into <paramref name="reuse"/>,
        /// allocating a buffer when none is supplied.
        /// </summary>
        private CharsRef ParseSynonym(string line, CharsRef reuse)
        {
            if (reuse == null)
            {
                reuse = new CharsRef(8);
            }

            // the term sits between the first and last single quote;
            // "''" is the escape for a literal quote
            int quoteStart = line.IndexOf('\'') + 1;
            int quoteEnd = line.LastIndexOf('\'');
            string text = line.Substring(quoteStart, quoteEnd - quoteStart).Replace("''", "'");
            return Analyze(text, reuse);
        }
示例#30
0
            /// <summary>
            /// Counts the words in a phrase: one more than the number of
            /// <see cref="SynonymMap.WORD_SEPARATOR"/> characters it contains.
            /// </summary>
            internal virtual int CountWords(CharsRef chars)
            {
                int separators = 0;
                int end = chars.Offset + chars.Length;
                for (int i = chars.Offset; i < end; i++)
                {
                    if (chars.Chars[i] == SynonymMap.WORD_SEPARATOR)
                    {
                        separators++;
                    }
                }
                return separators + 1;
            }
        /// <summary>
        /// Joins the words of <paramref name="input"/> and <paramref name="output"/>
        /// (split by the <c>space</c> splitter) into phrase CharsRefs and registers
        /// the mapping on the builder <c>b</c>.
        /// </summary>
        private void Add(string input, string output, bool keepOrig)
        {
            if (VERBOSE)
            {
                Console.WriteLine("  add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
            }

            var inputPhrase = new CharsRef();
            var outputPhrase = new CharsRef();
            SynonymMap.Builder.Join(space.Split(input), inputPhrase);
            SynonymMap.Builder.Join(space.Split(output), outputPhrase);
            b.Add(inputPhrase, outputPhrase, keepOrig);
        }
示例#32
0
            /// <summary>
            /// Sugar: analyzes the text with the analyzer and
            /// separates by <see cref="SynonymMap.WORD_SEPARATOR"/>.
            /// reuse and its chars must not be null.
            /// </summary>
            /// <param name="text">phrase to analyze; every produced token must be non-empty with posinc == 1</param>
            /// <param name="reuse">receives the separator-joined tokens; must not be null</param>
            /// <returns><paramref name="reuse"/>, filled with the analyzed tokens</returns>
            /// <exception cref="System.ArgumentException">if the analyzer produces a zero-length token,
            /// a token with posinc != 1, or eliminates every token</exception>
            public virtual CharsRef Analyze(string text, CharsRef reuse)
            {
                IOException priorException = null;
                TokenStream ts             = analyzer.GetTokenStream("", text);

                try
                {
                    var termAtt   = ts.AddAttribute <ICharTermAttribute>();
                    var posIncAtt = ts.AddAttribute <IPositionIncrementAttribute>();
                    ts.Reset();
                    reuse.Length = 0;
                    while (ts.IncrementToken())
                    {
                        int length = termAtt.Length;
                        if (length == 0)
                        {
                            throw new System.ArgumentException("term: " + text + " analyzed to a zero-length token");
                        }
                        if (posIncAtt.PositionIncrement != 1)
                        {
                            throw new System.ArgumentException("term: " + text + " analyzed to a token with posinc != 1");
                        }
                        reuse.Grow(reuse.Length + length + 1); // current + word + separator
                        int end = reuse.Offset + reuse.Length;
                        if (reuse.Length > 0)
                        {
                            // not the first token: prepend a separator before the word
                            reuse.Chars[end++] = SynonymMap.WORD_SEPARATOR;
                            reuse.Length++;
                        }
                        Array.Copy(termAtt.Buffer, 0, reuse.Chars, end, length);
                        reuse.Length += length;
                    }
                    ts.End();
                }
                catch (IOException e)
                {
                    // defer the exception: the stream must still be closed below,
                    // and IOUtils rethrows the prior exception afterwards
                    priorException = e;
                }
                finally
                {
                    IOUtils.CloseWhileHandlingException(priorException, ts);
                }
                if (reuse.Length == 0)
                {
                    throw new System.ArgumentException("term: " + text + " was completely eliminated by analyzer");
                }
                return(reuse);
            }
示例#33
0
        /// <summary>
        /// Parses a WordNet-style prolog synonyms file: one entry per line, entries
        /// of the same synset sharing the 9-character synset id read from column 2.
        /// When the id changes the completed synset is flushed via AddInternal.
        /// NOTE(review): the fixed Substring(2, 9) assumes the wn_s.pl line layout —
        /// confirm against the actual file format.
        /// </summary>
        /// <exception cref="Exception">wraps an ArgumentException with the offending line number</exception>
        public override void Parse(TextReader @in)
        {
            int        lineNumber = 0;
            TextReader br         = @in;

            try
            {
                string     line         = null;
                string     lastSynSetID = "";
                CharsRef[] synset       = new CharsRef[8];
                int        synsetSize   = 0;


                while ((line = br.ReadLine()) != null)
                {
                    lineNumber++;
                    string synSetID = line.Substring(2, 9);

                    // a new synset id means the previous synset is complete: flush it
                    // (the very first flush is a no-op because synsetSize is 0)
                    if (!synSetID.Equals(lastSynSetID, StringComparison.Ordinal))
                    {
                        AddInternal(synset, synsetSize);
                        synsetSize = 0;
                    }

                    // double the scratch array when the current synset no longer fits
                    if (synset.Length <= synsetSize + 1)
                    {
                        CharsRef[] larger = new CharsRef[synset.Length * 2];
                        Array.Copy(synset, 0, larger, 0, synsetSize);
                        synset = larger;
                    }

                    synset[synsetSize] = ParseSynonym(line, synset[synsetSize]);
                    synsetSize++;
                    lastSynSetID = synSetID;
                }

                // final synset in the file
                AddInternal(synset, synsetSize);
            }
            catch (ArgumentException e)
            {
                throw new Exception("Invalid synonym rule at line " + lineNumber.ToString(), e);
            }
            finally
            {
                br.Dispose();
            }
        }
示例#34
0
        /// <summary>
        /// Verifies that CharsRef.Append accumulates random character slices
        /// exactly like StringBuilder.Append does.
        /// </summary>
        public virtual void TestAppend()
        {
            CharsRef actual = new CharsRef();
            StringBuilder expected = new StringBuilder();
            int iterations = AtLeast(10);
            for (int i = 0; i < iterations; i++)
            {
                char[] source = TestUtil.RandomRealisticUnicodeString(Random(), 1, 100).ToCharArray();
                int start = Random().Next(source.Length);
                int sliceLength = source.Length - start;
                expected.Append(source, start, sliceLength);
                actual.Append(source, start, sliceLength);
            }

            Assert.AreEqual(expected.ToString(), actual.ToString());
        }
示例#35
0
            /// <summary>
            /// Returns the next queued output, recording its end offset and position
            /// length; resets the queue once the last entry has been consumed.
            /// </summary>
            public virtual CharsRef pullNext()
            {
                Debug.Assert(upto < count);
                CharsRef next = outputs[upto];
                lastEndOffset = endOffsets[upto];
                lastPosLength = posLengths[upto];
                upto++;
                posIncr = 0;
                if (upto == count)
                {
                    reset();
                }
                return next;
            }
        /// <summary>
        /// Joins the whitespace-separated words of <paramref name="input"/> and
        /// <paramref name="output"/> into phrase CharsRefs and registers the
        /// mapping on the builder <c>b</c>.
        /// </summary>
        private void Add(string input, string output, bool keepOrig)
        {
            if (VERBOSE)
            {
                Console.WriteLine("  add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
            }
            // BUG FIX: the Java original split on the regex " +" (one or more
            // spaces). The straight port passed " +" as a *literal* separator
            // string, which essentially never matches, so multi-word phrases were
            // never split. Split on space characters and drop empty entries,
            // which matches the run-of-spaces semantics of the original regex.
            CharsRef inputCharsRef = new CharsRef();
            SynonymMap.Builder.Join(input.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries), inputCharsRef);

            CharsRef outputCharsRef = new CharsRef();
            SynonymMap.Builder.Join(output.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries), outputCharsRef);

            b.Add(inputCharsRef, outputCharsRef, keepOrig);
        }
        /// <summary>
        /// Parses a WordNet-style prolog synonyms file: one entry per line, entries
        /// of the same synset sharing the 9-character synset id read from column 2.
        /// When the id changes the completed synset is flushed via AddInternal.
        /// </summary>
        /// <exception cref="Exception">wraps an ArgumentException with the offending line number</exception>
        public override void Parse(TextReader @in)
        {
            int lineNumber = 0;
            TextReader br = @in;
            try
            {
                string line = null;
                string lastSynSetID = "";
                CharsRef[] synset = new CharsRef[8];
                int synsetSize = 0;


                while ((line = br.ReadLine()) != null)
                {
                    lineNumber++;
                    string synSetID = line.Substring(2, 9);

                    // FIX: compare synset ids ordinally — they are machine-generated
                    // keys, not linguistic text, and the culture-sensitive Equals
                    // overload was inconsistent with the sibling parser in this file.
                    if (!synSetID.Equals(lastSynSetID, StringComparison.Ordinal))
                    {
                        // new synset id: flush the completed synset
                        AddInternal(synset, synsetSize);
                        synsetSize = 0;
                    }

                    // double the scratch array when the current synset no longer fits
                    if (synset.Length <= synsetSize + 1)
                    {
                        CharsRef[] larger = new CharsRef[synset.Length * 2];
                        Array.Copy(synset, 0, larger, 0, synsetSize);
                        synset = larger;
                    }

                    synset[synsetSize] = ParseSynonym(line, synset[synsetSize]);
                    synsetSize++;
                    lastSynSetID = synSetID;
                }

                // final synset in the file
                AddInternal(synset, synsetSize);
            }
            catch (System.ArgumentException e)
            {
                throw new Exception("Invalid synonym rule at line " + lineNumber.ToString(), e);
            }
            finally
            {
                br.Dispose();
            }
        }
示例#38
0
	  /// <summary>
	  /// Parses a WordNet prolog synonyms file grouped by 9-character synset id.
	  /// NOTE(review): this block is an unfinished Java-to-C# conversion —
	  /// LineNumberReader, br.readLine(), ex.initCause() and br.close() are Java
	  /// APIs and will not compile as C#; compare with the working
	  /// TextReader-based Parse overload elsewhere in this file.
	  /// Also note the culture-sensitive Equals and the missing StringComparison.
	  /// </summary>
	  public override void Parse(Reader @in)
	  {
		LineNumberReader br = new LineNumberReader(@in);
		try
		{
		  string line = null;
		  string lastSynSetID = "";
		  CharsRef[] synset = new CharsRef[8];
		  int synsetSize = 0;

		  while ((line = br.readLine()) != null)
		  {
			string synSetID = line.Substring(2, 9);

			// a new synset id means the previous synset is complete: flush it
			if (!synSetID.Equals(lastSynSetID))
			{
			  addInternal(synset, synsetSize);
			  synsetSize = 0;
			}

			// double the scratch array when the current synset no longer fits
			if (synset.Length <= synsetSize+1)
			{
			  CharsRef[] larger = new CharsRef[synset.Length * 2];
			  Array.Copy(synset, 0, larger, 0, synsetSize);
			  synset = larger;
			}

			synset[synsetSize] = parseSynonym(line, synset[synsetSize]);
			synsetSize++;
			lastSynSetID = synSetID;
		  }

		  // final synset in the file
		  addInternal(synset, synsetSize);
		}
		catch (System.ArgumentException e)
		{
		  ParseException ex = new ParseException("Invalid synonym rule at line " + br.LineNumber, 0);
		  ex.initCause(e);
		  throw ex;
		}
		finally
		{
		  br.close();
		}
	  }
示例#39
0
		// NOTE: while it's tempting to make this public, since
		// caller's parser likely knows the
		// numInput/numOutputWords, sneaky exceptions, much later
		// on, will result if these values are wrong; so we always
		// recompute ourselves to be safe:
		/// <summary>
		/// Registers a phrase-to-phrase mapping: validates word counts and lengths,
		/// interns the UTF-8 form of <paramref name="output"/> into the ord table,
		/// and records the ord (plus the includeOrig flag) under
		/// <paramref name="input"/> in the working set.
		/// </summary>
		/// <exception cref="System.ArgumentException">if a word count or phrase length is not positive</exception>
		internal virtual void Add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig)
		{
		  // first convert to UTF-8
		  if (numInputWords <= 0)
		  {
			throw new System.ArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
		  }
		  if (input.Length <= 0)
		  {
			throw new System.ArgumentException("input.length must be > 0 (got " + input.Length + ")");
		  }
		  if (numOutputWords <= 0)
		  {
			throw new System.ArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
		  }
		  if (output.Length <= 0)
		  {
			throw new System.ArgumentException("output.length must be > 0 (got " + output.Length + ")");
		  }

		  Debug.Assert(!HasHoles(input), "input has holes: " + input);
		  Debug.Assert(!HasHoles(output), "output has holes: " + output);

		  UnicodeUtil.UTF16toUTF8(output.Chars, output.Offset, output.Length, utf8Scratch);
		  // lookup in hash
		  int ord = words.Add(utf8Scratch);
		  if (ord < 0)
		  {
			// already exists in our hash: decode the existing ord
			ord = (-ord) - 1;
		  }

		  // BUG FIX: the straight Java port read workingSet[input]; unlike Java's
		  // Map.get(), the C# dictionary indexer throws KeyNotFoundException for a
		  // missing key, so the intended null-check branch was unreachable and the
		  // first mapping for any input crashed. Use TryGetValue instead.
		  MapEntry e;
		  if (!workingSet.TryGetValue(input, out e))
		  {
			e = new MapEntry();
			workingSet[CharsRef.DeepCopyOf(input)] = e; // make a copy, since we will keep around in our map
		  }

		  e.ords.Add(ord);
		  e.includeOrig |= includeOrig;
		  maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords);
		  maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords);
		}
示例#40
0
        /// <summary>
        /// Encode characters from a char[] source, starting at
        ///  offset for length chars. After encoding, result.offset will always be 0.
        /// </summary>
        /// <param name="source">characters to encode (read via <c>source.CharAt</c>)</param>
        /// <param name="offset">index of the first char to encode</param>
        /// <param name="length">number of chars to encode</param>
        /// <param name="result">receives the UTF-8 bytes; its buffer is replaced when too small</param>
        // TODO: broken if incoming result.offset != 0
        public static void UTF16toUTF8(CharsRef source, int offset, int length, BytesRef result)
        {
            int upto = 0;
            int i = offset;
            int end = offset + length;
            var @out = result.Bytes;
            // Pre-allocate for worst case 4-for-1
            int maxLen = length * 4;
            if (@out.Length < maxLen)
            {
                @out = result.Bytes = new byte[maxLen];
            }
            result.Offset = 0;

            while (i < end)
            {
                int code = (int)source.CharAt(i++);

                if (code < 0x80)
                {
                    // ASCII: single byte
                    @out[upto++] = (byte)code;
                }
                else if (code < 0x800)
                {
                    // 2-byte sequence
                    @out[upto++] = (byte)(0xC0 | (code >> 6));
                    @out[upto++] = (byte)(0x80 | (code & 0x3F));
                }
                else if (code < 0xD800 || code > 0xDFFF)
                {
                    // 3-byte sequence (BMP, non-surrogate)
                    @out[upto++] = (byte)(0xE0 | (code >> 12));
                    @out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
                    @out[upto++] = (byte)(0x80 | (code & 0x3F));
                }
                else
                {
                    // surrogate pair
                    // confirm valid high surrogate
                    if (code < 0xDC00 && i < end)
                    {
                        var utf32 = (int)source.CharAt(i);
                        // confirm valid low surrogate and write pair
                        if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
                        {
                            utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
                            i++;
                            @out[upto++] = (byte)(0xF0 | (utf32 >> 18));
                            @out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
                            @out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
                            @out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
                            continue;
                        }
                    }
                    // replace unpaired surrogate or out-of-order low surrogate
                    // with substitution character U+FFFD (bytes EF BF BD)
                    @out[upto++] = unchecked((byte)0xEF);
                    @out[upto++] = unchecked((byte)0xBF);
                    @out[upto++] = unchecked((byte)0xBD);
                }
            }
            //assert matches(source, offset, length, out, upto);
            result.Length = upto;
        }
        /// <summary>
        /// Registers one completed synset. In "expand" mode every member maps to
        /// every member; otherwise all members collapse onto the first entry.
        /// </summary>
        private void AddInternal(CharsRef[] synset, int size)
        {
            if (size <= 1)
            {
                return; // a single-entry synset maps nothing
            }

            // one shared double loop: in non-expand mode only synset[0] is an output
            int outputCount = expand ? size : 1;
            for (int i = 0; i < size; i++)
            {
                for (int j = 0; j < outputCount; j++)
                {
                    Add(synset[i], synset[j], false);
                }
            }
        }
示例#42
0
 /// <summary>
 /// Interprets the given byte array as UTF-8 and converts to UTF-16. The <seealso cref="CharsRef"/> will be extended if
 /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
 /// <para>
 /// NOTE: Full characters are read, even if this reads past the length passed (and
 /// can result in an ArrayOutOfBoundsException if invalid UTF-8 is passed).
 /// Explicit checks for valid UTF-8 are not performed.
 /// </para>
 /// </summary>
 /// <param name="utf8">source bytes</param>
 /// <param name="offset">index of the first byte to decode</param>
 /// <param name="length">number of bytes to decode</param>
 /// <param name="chars">receives the chars; its Offset is forced to 0</param>
 // TODO: broken if chars.offset != 0
 public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars)
 {
     int out_offset = chars.Offset = 0;
     char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length);
     int limit = offset + length;
     while (offset < limit)
     {
         int b = ((sbyte)utf8[offset++]) & 0xff;
         if (b < 0xc0)
         {
             // single-byte sequence (0xxxxxxx)
             Debug.Assert(b < 0x80);
             @out[out_offset++] = (char)b;
         }
         else if (b < 0xe0)
         {
             // 2-byte sequence (110xxxxx 10xxxxxx)
             @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f));
         }
         else if (b < 0xf0)
         {
             // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
             @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f));
             offset += 2;
         }
         else
         {
             // 4-byte sequence: decode the codepoint, then emit either a single
             // BMP char or a surrogate pair
             Debug.Assert(b < 0xf8, "b = 0x" + b.ToString("x"));
             int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f);
             offset += 3;
             if (ch < UNI_MAX_BMP)
             {
                 @out[out_offset++] = (char)ch;
             }
             else
             {
                 int chHalf = ch - 0x0010000;
                 @out[out_offset++] = (char)((chHalf >> 10) + 0xD800);
                 @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00);
             }
         }
     }
     chars.Length = out_offset - chars.Offset;
 }
        /// <summary>
        /// Pulls the single-quoted synonym text out of <paramref name="line"/>
        /// (un-escaping doubled quotes) and analyzes it into
        /// <paramref name="reuse"/>, allocating a buffer when none is supplied.
        /// </summary>
        private CharsRef ParseSynonym(string line, CharsRef reuse)
        {
            CharsRef target = reuse ?? new CharsRef(8);

            // the text lives between the first and last single quote; "''" escapes "'"
            int begin = line.IndexOf('\'') + 1;
            int finish = line.LastIndexOf('\'');
            string word = line.Substring(begin, finish - begin).Replace("''", "'");

            return Analyze(word, target);
        }
示例#44
0
 /// <summary>
 /// Utility method for <seealso cref="#UTF8toUTF16(byte[], int, int, CharsRef)"/>:
 /// decodes the whole span of <paramref name="bytesRef"/> (its Bytes, Offset and
 /// Length) into UTF-16 in <paramref name="chars"/>. </summary>
 /// <seealso cref= #UTF8toUTF16(byte[], int, int, CharsRef) </seealso>
 public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars)
 {
     UTF8toUTF16(bytesRef.Bytes, bytesRef.Offset, bytesRef.Length, chars);
 }
        /// <summary>
        /// Looks up <paramref name="key"/> in either the higher-weights or the
        /// normal completion automaton and converts each UTF-8 completion into a
        /// <c>LookupResult</c>. Contexts are not supported.
        /// </summary>
        /// <exception cref="ArgumentException">if <paramref name="contexts"/> is non-null</exception>
        public override List<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool higherWeightsFirst, int num)
        {
            if (contexts != null)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }

            var completer = higherWeightsFirst ? higherWeightsCompletion : normalCompletion;
            IList<FSTCompletion.Completion> completions = completer.DoLookup(key, num);

            var results = new List<LookupResult>(completions.Count);
            var spare = new CharsRef();
            foreach (FSTCompletion.Completion completion in completions)
            {
                // decode the UTF-8 key into the reusable char buffer
                spare.Grow(completion.utf8.Length);
                UnicodeUtil.UTF8toUTF16(completion.utf8, spare);
                results.Add(new LookupResult(spare.ToString(), completion.bucket));
            }
            return results;
        }
        /// <summary>
        /// Round-trips every valid Unicode code point (skipping the surrogate
        /// range U+D800..U+DFFF) through UTF16toUTF8 and UTF8toUTF16, checking
        /// both directions against the framework's own UTF-8 codec.
        /// </summary>
        public virtual void TestAllUnicodeChars()
        {
            BytesRef utf8 = new BytesRef(10);
            CharsRef utf16 = new CharsRef(10);
            char[] chars = new char[2];
            for (int ch = 0; ch < 0x0010FFFF; ch++)
            {
                if (ch == 0xd800)
                // Skip invalid code points
                {
                    ch = 0xe000;
                }

                int len = 0;
                if (ch <= 0xffff)
                {
                    chars[len++] = (char)ch;
                }
                else
                {
                    // supplementary plane: encode as a surrogate pair
                    chars[len++] = (char)(((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
                    chars[len++] = (char)(((ch - 0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
                }

                UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8);

                // UTF-16 -> UTF-8 must agree with System.Text.Encoding.UTF8
                string s1 = new string(chars, 0, len);
                string s2 = Encoding.UTF8.GetString(utf8.Bytes, utf8.Offset, utf8.Length);
                Assert.AreEqual(s1, s2, "codepoint " + ch);

                // UTF-8 -> UTF-16 must restore the original string
                UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16);
                Assert.AreEqual(s1, new string(utf16.Chars, 0, utf16.Length), "codepoint " + ch);

                // byte-for-byte comparison against the framework encoder
                var b = s1.GetBytes(Encoding.UTF8);
                Assert.AreEqual(utf8.Length, b.Length);
                for (int j = 0; j < utf8.Length; j++)
                {
                    Assert.AreEqual(utf8.Bytes[j], b[j]);
                }
            }
        }
示例#47
0
 /// <summary>
 /// Queues one output: copies the chars into a (possibly recycled) CharsRef
 /// and records its end offset and position length in the parallel arrays,
 /// growing all three arrays on demand.
 /// </summary>
 public virtual void Add(char[] output, int offset, int len, int endOffset, int posLength)
 {
     // grow each parallel array so it can hold one more entry
     if (count == outputs.Length)
     {
         var grownOutputs = new CharsRef[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
         Array.Copy(outputs, 0, grownOutputs, 0, count);
         outputs = grownOutputs;
     }
     if (count == endOffsets.Length)
     {
         var grownEndOffsets = new int[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_INT)];
         Array.Copy(endOffsets, 0, grownEndOffsets, 0, count);
         endOffsets = grownEndOffsets;
     }
     if (count == posLengths.Length)
     {
         var grownPosLengths = new int[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_INT)];
         Array.Copy(posLengths, 0, grownPosLengths, 0, count);
         posLengths = grownPosLengths;
     }

     if (outputs[count] == null)
     {
         outputs[count] = new CharsRef();
     }
     outputs[count].CopyChars(output, offset, len);

     // endOffset can be -1, in which case we should simply
     // use the endOffset of the input token, or X >= 0, in
     // which case we use X as the endOffset for this output
     endOffsets[count] = endOffset;
     posLengths[count] = posLength;
     count++;
 }
        /// <summary>
        /// Parses a Solr-format synonyms file: each non-empty, non-comment line is
        /// either a comma-separated equivalence set ("a,b,c") or an explicit
        /// mapping ("a,b => x,y"). Every term is analyzed via <c>Analyze</c>
        /// before being registered with <c>Add</c>.
        /// </summary>
        /// <exception cref="Exception">wraps an ArgumentException with the line number of the bad rule</exception>
        public override void Parse(TextReader @in)
        {
            int lineNumber = 0;
            try
            {
                string line = null;
                while ((line = @in.ReadLine()) != null)
                {
                    lineNumber++;
                    if (line.Length == 0 || line[0] == '#')
                    {
                        continue; // ignore empty lines and comments
                    }

                    CharsRef[] inputs;
                    CharsRef[] outputs;

                    // TODO: we could process this more efficiently.
                    string[] sides = Split(line, "=>");
                    if (sides.Length > 1) // explicit mapping
                    {
                        if (sides.Length != 2)
                        {
                            throw new System.ArgumentException("more than one explicit mapping specified on the same line");
                        }
                        string[] inputStrings = Split(sides[0], ",");
                        inputs = new CharsRef[inputStrings.Length];
                        for (int i = 0; i < inputs.Length; i++)
                        {
                            inputs[i] = Analyze(Unescape(inputStrings[i]).Trim(), new CharsRef());
                        }

                        string[] outputStrings = Split(sides[1], ",");
                        outputs = new CharsRef[outputStrings.Length];
                        for (int i = 0; i < outputs.Length; i++)
                        {
                            outputs[i] = Analyze(Unescape(outputStrings[i]).Trim(), new CharsRef());
                        }
                    }
                    else
                    {
                        // no "=>" present: a plain comma-separated synonym set
                        string[] inputStrings = Split(line, ",");
                        inputs = new CharsRef[inputStrings.Length];
                        for (int i = 0; i < inputs.Length; i++)
                        {
                            inputs[i] = Analyze(Unescape(inputStrings[i]).Trim(), new CharsRef());
                        }
                        if (expand)
                        {
                            outputs = inputs;
                        }
                        else
                        {
                            // non-expand: everything maps onto the first term only
                            outputs = new CharsRef[1];
                            outputs[0] = inputs[0];
                        }
                    }

                    // currently we include the term itself in the map,
                    // and use includeOrig = false always.
                    // this is how the existing filter does it, but its actually a bug,
                    // especially if combined with ignoreCase = true
                    for (int i = 0; i < inputs.Length; i++)
                    {
                        for (int j = 0; j < outputs.Length; j++)
                        {
                            Add(inputs[i], outputs[j], false);
                        }
                    }
                }
            }
            catch (System.ArgumentException e)
            {
                throw new Exception("Invalid synonym rule at line " + lineNumber, e);
                //ex.initCause(e);
                //throw ex;
            }
            finally
            {
                @in.Dispose();
            }
        }
示例#49
0
 /// <summary>
 /// Decompress the byte array previously returned by
 ///  compressString back into a String
 /// </summary>
 public static string DecompressString(byte[] value, int offset, int length)
 {
     var utf8Bytes = Decompress(value, offset, length);
     var utf16 = new CharsRef(utf8Bytes.Length);
     UnicodeUtil.UTF8toUTF16(utf8Bytes, 0, utf8Bytes.Length, utf16);
     return new string(utf16.Chars, 0, utf16.Length);
 }
示例#50
0
		/// <summary>
		/// Counts the words in a phrase: one more than the number of
		/// <see cref="SynonymMap.WORD_SEPARATOR"/> characters it contains.
		/// </summary>
		internal virtual int countWords(CharsRef chars)
		{
			int separators = 0;
			int end = chars.Offset + chars.Length;
			for (int i = chars.Offset; i < end; i++)
			{
				if (chars.Chars[i] == SynonymMap.WORD_SEPARATOR)
				{
					separators++;
				}
			}
			return separators + 1;
		}
示例#51
0
		/// <summary>
		/// Add a phrase->phrase synonym mapping.
		/// Phrases are character sequences where words are
		/// separated with character zero (U+0000).  Empty words
		/// (two U+0000s in a row) are not allowed in the input nor
		/// the output!
		/// </summary>
		/// <param name="input"> input phrase </param>
		/// <param name="output"> output phrase </param>
		/// <param name="includeOrig"> true if the original should be included </param>
		public virtual void Add(CharsRef input, CharsRef output, bool includeOrig)
		{
			// recount the words ourselves rather than trusting the caller
			int inputWords = countWords(input);
			int outputWords = countWords(output);
			Add(input, inputWords, output, outputWords, includeOrig);
		}
        /// <summary>
        /// Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
        /// given selection of fields from terms with a document frequency greater than
        /// the given maxDocFreq
        /// </summary>
        /// <param name="matchVersion"> Version to be used in <seealso cref="StopFilter"/> </param>
        /// <param name="delegate"> Analyzer whose TokenStream will be filtered </param>
        /// <param name="indexReader"> IndexReader to identify the stopwords from </param>
        /// <param name="fields"> Selection of fields to calculate stopwords for </param>
        /// <param name="maxDocFreq"> Document frequency terms should be above in order to be stopwords </param>
        /// <exception cref="IOException"> Can be thrown while reading from the IndexReader </exception>
        public QueryAutoStopWordAnalyzer(LuceneVersion matchVersion, Analyzer @delegate, IndexReader indexReader, ICollection<string> fields, int maxDocFreq)
            : base(@delegate.Strategy)
        {
            this.matchVersion = matchVersion;
            this.@delegate = @delegate;

            foreach (string field in fields)
            {
                var frequentTerms = new HashSet<string>();
                Terms fieldTerms = MultiFields.GetTerms(indexReader, field);
                CharsRef scratch = new CharsRef();
                if (fieldTerms != null)
                {
                    TermsEnum termsEnum = fieldTerms.Iterator(null);
                    BytesRef termBytes;
                    while ((termBytes = termsEnum.Next()) != null)
                    {
                        // any term appearing in more than maxDocFreq docs is a stopword
                        if (termsEnum.DocFreq() > maxDocFreq)
                        {
                            UnicodeUtil.UTF8toUTF16(termBytes, scratch);
                            frequentTerms.Add(scratch.ToString());
                        }
                    }
                }
                stopWordsPerField[field] = frequentTerms;
            }
        }
示例#53
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: private void addInternal(java.io.BufferedReader in) throws java.io.IOException
	  /// <summary>
	  /// Reads Solr-format synonym rules from the reader: skips blank/comment
	  /// lines, splits each line on "=>" (explicit mapping) or "," (equivalence
	  /// set), analyzes every term and cross-registers all input/output pairs.
	  /// NOTE(review): this is a raw Java-to-C# conversion — BufferedReader and
	  /// @in.readLine() are Java APIs, and the lowercase analyze/unescape/add
	  /// calls follow Java naming; compare with the working TextReader-based
	  /// Parse elsewhere in this file before using.
	  /// </summary>
	  private void addInternal(BufferedReader @in)
	  {
		string line = null;
		while ((line = @in.readLine()) != null)
		{
		  if (line.Length == 0 || line[0] == '#')
		  {
			continue; // ignore empty lines and comments
		  }

		  CharsRef[] inputs;
		  CharsRef[] outputs;

		  // TODO: we could process this more efficiently.
		  string[] sides = Split(line, "=>");
		  if (sides.Length > 1) // explicit mapping
		  {
			if (sides.Length != 2)
			{
			  throw new System.ArgumentException("more than one explicit mapping specified on the same line");
			}
			string[] inputStrings = Split(sides[0], ",");
			inputs = new CharsRef[inputStrings.Length];
			for (int i = 0; i < inputs.Length; i++)
			{
			  inputs[i] = analyze(unescape(inputStrings[i]).Trim(), new CharsRef());
			}

			string[] outputStrings = Split(sides[1], ",");
			outputs = new CharsRef[outputStrings.Length];
			for (int i = 0; i < outputs.Length; i++)
			{
			  outputs[i] = analyze(unescape(outputStrings[i]).Trim(), new CharsRef());
			}
		  }
		  else
		  {
			// no "=>": a plain comma-separated synonym set
			string[] inputStrings = Split(line, ",");
			inputs = new CharsRef[inputStrings.Length];
			for (int i = 0; i < inputs.Length; i++)
			{
			  inputs[i] = analyze(unescape(inputStrings[i]).Trim(), new CharsRef());
			}
			if (expand)
			{
			  outputs = inputs;
			}
			else
			{
			  outputs = new CharsRef[1];
			  outputs[0] = inputs[0];
			}
		  }

		  // currently we include the term itself in the map,
		  // and use includeOrig = false always.
		  // this is how the existing filter does it, but its actually a bug,
		  // especially if combined with ignoreCase = true
		  for (int i = 0; i < inputs.Length; i++)
		  {
			for (int j = 0; j < outputs.Length; j++)
			{
			  add(inputs[i], outputs[j], false);
			}
		  }
		}
	  }
示例#54
0
		/// <summary>
		/// Only used for asserting! Reports whether <paramref name="chars"/>
		/// has a "hole": it begins or ends with a '\u0000' char, or contains
		/// two adjacent <seealso cref="SynonymMap#WORD_SEPARATOR"/> chars.
		/// </summary>
		internal virtual bool HasHoles(CharsRef chars)
		{
		  int start = chars.Offset;
		  int limit = start + chars.Length;

		  // A separator at the very start or very end is a hole.
		  if (chars.Chars[start] == '\u0000')
		  {
			return true;
		  }
		  if (chars.Chars[limit - 1] == '\u0000')
		  {
			return true;
		  }

		  // Two separators in a row (an empty word) is a hole.
		  for (int idx = start + 1; idx < limit; idx++)
		  {
			if (chars.Chars[idx - 1] == SynonymMap.WORD_SEPARATOR && chars.Chars[idx] == SynonymMap.WORD_SEPARATOR)
			{
			  return true;
			}
		  }

		  return false;
		}
        public virtual void TestRandomUnicodeStrings()
        {
            // Round-trips random 20-char sequences UTF-16 -> UTF-8 -> UTF-16.
            // When the sequence contains no illegal surrogates, the UTF-8
            // bytes are also checked against the framework's own encoder.
            var source = new char[20];
            var expectedChars = new char[20];

            var asUtf8 = new BytesRef(20);
            var asUtf16 = new CharsRef(20);

            int iters = AtLeast(100000);
            for (int iter = 0; iter < iters; iter++)
            {
                bool hasIllegal = FillUnicode(source, expectedChars, 0, 20);

                UnicodeUtil.UTF16toUTF8(source, 0, 20, asUtf8);
                if (!hasIllegal)
                {
                    // Compare byte-for-byte against the reference encoding.
                    var reference = (new string(source, 0, 20)).GetBytes(IOUtils.CHARSET_UTF_8);
                    Assert.AreEqual(reference.Length, asUtf8.Length);
                    for (int i = 0; i < reference.Length; i++)
                    {
                        Assert.AreEqual(reference[i], asUtf8.Bytes[i]);
                    }
                }

                // Decoding back must reproduce the expected chars exactly.
                UnicodeUtil.UTF8toUTF16(asUtf8.Bytes, 0, asUtf8.Length, asUtf16);
                Assert.AreEqual(20, asUtf16.Length);
                for (int i = 0; i < 20; i++)
                {
                    Assert.AreEqual(expectedChars[i], asUtf16.Chars[i]);
                }
            }
        }
示例#56
0
 public virtual void TestUTF8UTF16CharsRef()
 {
     // Decoding random realistic strings from UTF-8 into a CharsRef that
     // starts at an arbitrary offset/length must recover the original text.
     int rounds = AtLeast(3989);
     for (int round = 0; round < rounds; round++)
     {
         string expected = TestUtil.RandomRealisticUnicodeString(Random());
         var utf8 = new BytesRef(expected);

         // Target buffer with a deliberately random offset and length;
         // Random() calls are kept in the original order for repeatability.
         var target = new char[1 + Random().Next(100)];
         int off = Random().Next(target.Length);
         int len = Random().Next(target.Length - off);
         var utf16 = new CharsRef(target, off, len);

         UnicodeUtil.UTF8toUTF16(utf8, utf16);
         Assert.AreEqual(utf16.ToString(), expected);
     }
 }
示例#57
0
        /// <summary>
        /// Builds the suggester's balanced ternary tree from the given input
        /// iterator. Payloads and contexts are not supported and are rejected.
        /// </summary>
        public override void Build(IInputIterator tfit)
        {
            if (tfit.HasPayloads)
            {
                throw new System.ArgumentException("this suggester doesn't support payloads");
            }
            if (tfit.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }
            root = new TernaryTreeNode();

            #pragma warning disable 612, 618
            // The tree requires UTF-16 sort order; re-sort the input if its
            // comparator differs.
            if (tfit.Comparator != BytesRef.UTF8SortedAsUTF16Comparer)
            {
                tfit = new SortedInputIterator(tfit, BytesRef.UTF8SortedAsUTF16Comparer);
            }
            #pragma warning restore 612, 618

            // Buffer every term (decoded to UTF-16) with its weight first.
            var tokens = new List<string>();
            var vals = new List<object>(); // LUCENENET TODO: Should this be long? in Java it was Number, but we can probably do better than object
            var charsSpare = new CharsRef();
            for (BytesRef spare = tfit.Next(); spare != null; spare = tfit.Next())
            {
                charsSpare.Grow(spare.Length);
                UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
                tokens.Add(charsSpare.ToString());
                vals.Add(Convert.ToInt64(tfit.Weight));
            }
            autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count - 1, root);
        }
示例#58
0
        /// <summary>
        /// Returns the next output character, or -1 at end of input.
        /// Greedily finds the longest mapping-rule match starting at the
        /// current input offset; matched input is replaced by the rule's
        /// output (recording offset corrections so downstream offsets still
        /// point into the original text), and unmatched characters pass
        /// through unchanged.
        /// </summary>
        public override int Read()
        {
            while (true)
            {

              // First drain any replacement text still pending from a prior match.
              if (replacement != null && replacementPointer < replacement.Length)
              {
            return replacement.chars[replacement.offset + replacementPointer++];
              }

              // TODO: a more efficient approach would be Aho/Corasick's
              // algorithm
              // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
              // or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
              //
              // I think this would be (almost?) equivalent to 1) adding
              // epsilon arcs from all final nodes back to the init
              // node in the FST, 2) adding a .* (skip any char)
              // loop on the initial node, and 3) determinizing
              // that.  Then we would not have to restart matching
              // at each position.

              // Length of input consumed by the best match so far (-1 = none),
              // and that match's output.
              int lastMatchLen = -1;
              CharsRef lastMatch = null;

              int firstCH = buffer.Get(inputOff);
              if (firstCH != -1)
              {
            // Root arcs are cached per first character for fast dispatch.
            FST.Arc<CharsRef> arc = cachedRootArcs[Convert.ToChar((char) firstCH)];
            if (arc != null)
            {
              if (!FST.TargetHasArcs(arc))
              {
                // Fast pass for single character match:
                Debug.Assert(arc.Final);
                lastMatchLen = 1;
                lastMatch = arc.Output;
              }
              else
              {
                // Walk the FST as far as the input allows, remembering the
                // longest match seen along the way.
                int lookahead = 0;
                CharsRef output = arc.Output;
                while (true)
                {
                  lookahead++;

                  if (arc.Final)
                  {
                    // Match! (to node is final)
                    lastMatchLen = lookahead;
                    lastMatch = outputs.Add(output, arc.NextFinalOutput);
                    // Greedy: keep searching to see if there's a
                    // longer match...
                  }

                  if (!FST.TargetHasArcs(arc))
                  {
                    break;
                  }

                  int ch = buffer.Get(inputOff + lookahead);
                  if (ch == -1)
                  {
                    // End of input.
                    break;
                  }
                  if ((arc = map.FindTargetArc(ch, arc, scratchArc, fstReader)) == null)
                  {
                    // Dead end
                    break;
                  }
                  output = outputs.Add(output, arc.Output);
                }
              }
            }
              }

              if (lastMatch != null)
              {
            inputOff += lastMatchLen;
            // Record an offset correction when the replacement length
            // differs from the consumed input length.
            int diff = lastMatchLen - lastMatch.Length;

            if (diff != 0)
            {
              int prevCumulativeDiff = LastCumulativeDiff;
              if (diff > 0)
              {
                // Replacement is shorter than matched input:
                AddOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
              }
              else
              {
                // Replacement is longer than matched input: remap
                // the "extra" chars all back to the same input
                // offset:
                int outputStart = inputOff - prevCumulativeDiff;
                for (int extraIDX = 0;extraIDX < -diff;extraIDX++)
                {
                  AddOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
                }
              }
            }

            // Emit the match's output on subsequent Read() calls.
            replacement = lastMatch;
            replacementPointer = 0;

              }
              else
              {
            // No rule matched: pass the next input char straight through
            // and let the buffer discard what we no longer need.
            int ret = buffer.Get(inputOff);
            if (ret != -1)
            {
              inputOff++;
              buffer.FreeBefore(inputOff);
            }
            return ret;
              }
            }
        }
示例#59
0
		/// <summary>
		/// Sugar: analyzes <paramref name="text"/> with the analyzer and joins
		/// the resulting tokens with <seealso cref="SynonymMap#WORD_SEPARATOR"/>.
		/// <paramref name="reuse"/> and its chars must not be null.
		/// </summary>
		public virtual CharsRef Analyze(string text, CharsRef reuse)
		{
		  IOException priorException = null;
		  TokenStream ts = analyzer.TokenStream("", text);
		  try
		  {
			var termAtt = ts.AddAttribute<ICharTermAttribute>();
			var posIncAtt = ts.AddAttribute<IPositionIncrementAttribute>();
			ts.Reset();
			reuse.Length = 0;
			while (ts.IncrementToken())
			{
			  int tokenLen = termAtt.Length;
			  if (tokenLen == 0)
			  {
				throw new System.ArgumentException("term: " + text + " analyzed to a zero-length token");
			  }
			  // Multi-position tokens (graphs, stacked synonyms) are not supported here.
			  if (posIncAtt.PositionIncrement != 1)
			  {
				throw new System.ArgumentException("term: " + text + " analyzed to a token with posinc != 1");
			  }
			  // Reserve room for current content + this token + a separator.
			  reuse.Grow(reuse.Length + tokenLen + 1);
			  int writePos = reuse.Offset + reuse.Length;
			  if (reuse.Length > 0)
			  {
				// Not the first token: emit the word separator first.
				reuse.Chars[writePos++] = SynonymMap.WORD_SEPARATOR;
				reuse.Length++;
			  }
			  Array.Copy(termAtt.Buffer(), 0, reuse.Chars, writePos, tokenLen);
			  reuse.Length += tokenLen;
			}
			ts.End();
		  }
		  catch (IOException e)
		  {
			priorException = e;
		  }
		  finally
		  {
			// Closes the stream, rethrowing any IOException captured above.
			IOUtils.CloseWhileHandlingException(priorException, ts);
		  }
		  if (reuse.Length == 0)
		  {
			throw new System.ArgumentException("term: " + text + " was completely eliminated by analyzer");
		  }
		  return reuse;
		}
示例#60
0
		/// <summary>
		/// Sugar: just joins the provided terms with
		/// <seealso cref="SynonymMap#WORD_SEPARATOR"/>. reuse and its chars
		/// must not be null.
		/// </summary>
		public static CharsRef join(string[] words, CharsRef reuse)
		{
		  int written = 0;
		  char[] dest = reuse.Chars;
		  foreach (string word in words)
		  {
			int wordLen = word.Length;
			// Every word after the first needs one extra char for the separator.
			int required = written == 0 ? wordLen : written + wordLen + 1;
			if (required > dest.Length)
			{
			  reuse.Grow(required);
			  dest = reuse.Chars;
			}
			if (written > 0)
			{
			  dest[written++] = SynonymMap.WORD_SEPARATOR;
			}

			word.CopyTo(0, dest, written, wordLen);
			written += wordLen;
		  }
		  reuse.Length = written;
		  return reuse;
		}