Example #1
        public void TestOffsetsWithTokenizer()
        {
            const string input = @"test1 <a href=""foo"">testlink</a> test2 test3";

            Tokenizer t = new WhitespaceTokenizer(new HTMLStripCharFilter(CharReader.Get(new StringReader(input))));

            OffsetAttribute att = (OffsetAttribute)t.GetAttribute(typeof(OffsetAttribute));

            t.IncrementToken();
            Assert.AreEqual(0, att.StartOffset());
            Assert.AreEqual(5, att.EndOffset() - att.StartOffset());

            t.IncrementToken();
            Assert.AreEqual(20, att.StartOffset());
            Assert.AreEqual(8, att.EndOffset() - att.StartOffset());

            t.IncrementToken();
            Assert.AreEqual(33, att.StartOffset());
            Assert.AreEqual(5, att.EndOffset() - att.StartOffset());

            t.IncrementToken();
            Assert.AreEqual(39, att.StartOffset());
            Assert.AreEqual(5, att.EndOffset() - att.StartOffset());
        }
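A minimal sketch of why those asserted numbers line up, using the same Lucene.Net-era API as the test above: HTMLStripCharFilter corrects token offsets to point into the original markup, so Substring over the raw input recovers each token's surface text.

        // Sketch only. "testlink" starts at offset 20 because the corrected
        // offsets skip the surrounding <a href="foo"> markup.
        Tokenizer t = new WhitespaceTokenizer(new HTMLStripCharFilter(CharReader.Get(new StringReader(input))));
        OffsetAttribute att = (OffsetAttribute)t.GetAttribute(typeof(OffsetAttribute));
        while (t.IncrementToken())
        {
            Console.WriteLine(input.Substring(att.StartOffset(), att.EndOffset() - att.StartOffset()));
        }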
Example #2
 public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
 {
     this.outerInstance = outerInstance;
     this.attSource     = attSource;
     this.termAtt       = attSource.GetAttribute(typeof(CharTermAttribute));
     this.offsetAtt     = attSource.GetAttribute(typeof(OffsetAttribute));
 }
Example #3
 public SingleCharTokenizer(TokenStream input) : base(input)
 {
     _input                      = input;
     _termAttribute              = (TermAttribute)AddAttribute(typeof(TermAttribute));
     _offsetAttribute            = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
     _positionIncrementAttribute = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
 }
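For context, a hedged sketch of the IncrementToken side such a tokenizer presumably pairs with this constructor. The attribute fields come from the snippet above; _termSource, _termStart, _position, and AdvanceToNextInputTerm are assumed names, and the body is illustrative only.

 // Illustrative only: emit one token per character of the current input term.
 public override bool IncrementToken()
 {
     if (_position >= _termSource.Length && !AdvanceToNextInputTerm()) // hypothetical helper
     {
         return false;
     }
     ClearAttributes();
     _termAttribute.SetTermBuffer(_termSource, _position, 1);                        // single character
     _offsetAttribute.SetOffset(_termStart + _position, _termStart + _position + 1); // width-1 offsets
     _positionIncrementAttribute.SetPositionIncrement(1);
     _position++;
     return true;
 }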
Example #4
        public T ReadMetadata <T>()
        {
            using (var reader = CreateReader())
            {
                var header = new MetadataHeader(reader); //MetadataHeader self registers to reader

                using (var vreader = (DependencyReader)reader.CreateVirtualReader(header.Header.HeaderSize))
                {
                    var mainBlock = header.StructureDefinitions.First(s => s.Type == StructureType.Main).TargetIndex;

                    vreader.Seek(header.DataBlocks[mainBlock].Offset, SeekOrigin.Begin);
                    var result = vreader.ReadObject <T>();

                    var blockProps = typeof(T).GetProperties()
                                     .Where(p => p.PropertyType.IsGenericType && p.PropertyType.GetGenericTypeDefinition() == typeof(BlockCollection <>));

                    foreach (var prop in blockProps)
                    {
                        var collection = prop.GetValue(result) as IBlockCollection;
                        var offset     = OffsetAttribute.ValueFor(prop);
                        collection.LoadBlocks(mainBlock, offset, vreader);
                    }

                    return(result);
                }
            }
        }
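To make the reflection step concrete, a hypothetical metadata class of the shape ReadMetadata<T> expects: the blockProps query above finds BlockCollection<> properties, and OffsetAttribute.ValueFor(prop) reads back a declared offset. The type name and offset values here are invented for illustration.

        public class RenderModel // hypothetical
        {
            [Offset(16)] // OffsetAttribute.ValueFor(prop) would return 16 here
            public BlockCollection<Region> Regions { get; set; }

            [Offset(28)]
            public BlockCollection<Material> Materials { get; set; }
        }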
Example #5
        // we only check a few core attributes here.
        // TODO: test other things
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void assertEquals(String s, org.apache.lucene.analysis.TokenStream left, org.apache.lucene.analysis.TokenStream right) throws Exception
        public virtual void assertEquals(string s, TokenStream left, TokenStream right)
        {
            left.reset();
            right.reset();
            CharTermAttribute          leftTerm    = left.addAttribute(typeof(CharTermAttribute));
            CharTermAttribute          rightTerm   = right.addAttribute(typeof(CharTermAttribute));
            OffsetAttribute            leftOffset  = left.addAttribute(typeof(OffsetAttribute));
            OffsetAttribute            rightOffset = right.addAttribute(typeof(OffsetAttribute));
            PositionIncrementAttribute leftPos     = left.addAttribute(typeof(PositionIncrementAttribute));
            PositionIncrementAttribute rightPos    = right.addAttribute(typeof(PositionIncrementAttribute));

            while (left.incrementToken())
            {
                assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
                assertEquals("wrong term text for input: " + s, leftTerm.ToString(), rightTerm.ToString());
                assertEquals("wrong position for input: " + s, leftPos.PositionIncrement, rightPos.PositionIncrement);
                assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
                assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
            }
            assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
            left.end();
            right.end();
            assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
            left.close();
            right.close();
        }
Example #6
 public LuceneTokenizerAdapter(Tokenizer tokenizer)
        {
            this.tokenizer = tokenizer;

            this.termAttr   = (CharTermAttribute)this.tokenizer.GetAttribute <ICharTermAttribute>();
            this.offsetAttr = (OffsetAttribute)this.tokenizer.GetAttribute <IOffsetAttribute>();
        }
Example #7
        // LUCENE-3642: normalize BMP->SMP and check that offsets are correct
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testCrossPlaneNormalization2() throws java.io.IOException
        public virtual void testCrossPlaneNormalization2()
        {
            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper2(this);
            int      num      = 1000 * RANDOM_MULTIPLIER;

            for (int i = 0; i < num; i++)
            {
                string      s  = TestUtil.randomUnicodeString(random());
                TokenStream ts = analyzer.tokenStream("foo", s);
                try
                {
                    ts.reset();
                    OffsetAttribute offsetAtt = ts.addAttribute(typeof(OffsetAttribute));
                    while (ts.incrementToken())
                    {
                        string highlightedText = StringHelperClass.SubstringSpecial(s, offsetAtt.startOffset(), offsetAtt.endOffset());
                        for (int j = 0; j < highlightedText.Length; j += char.IsSurrogatePair(highlightedText, j) ? 2 : 1)
                        {
                            // char.ConvertToUtf32 and char.IsLetter(string, int) both handle surrogate pairs
                            int cp = char.ConvertToUtf32(highlightedText, j);
                            assertTrue("non-letter:" + cp.ToString("x"), char.IsLetter(highlightedText, j));
                        }
                    }
                    ts.end();
                }
                finally
                {
                    IOUtils.closeWhileHandlingException(ts);
                }
            }
            // just for fun
            checkRandomData(random(), analyzer, num);
        }
Example #8
 public BoostingTokenFilter(TokenStream input, int[] startOffsets, float[] boosts)
     : base(input)
 {
     _startOffsets = startOffsets;
     _boosts       = boosts;
     _offsetAttr   = (OffsetAttribute)base.addAttribute(typeof(OffsetAttribute));
     _payloadAttr  = (PayloadAttribute)base.addAttribute(typeof(PayloadAttribute));
 }
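A sketch of how a filter like this might use the two attributes it registers: when a token's start offset appears in _startOffsets, encode the matching boost into the payload. The payload encoding and the matching rule are assumptions, not taken from the original source (casing follows the Lucene.Net 2.9 API; the snippet above came through a Java converter).

 public override bool IncrementToken()
 {
     if (!input.IncrementToken())
     {
         return false;
     }
     int idx = Array.IndexOf(_startOffsets, _offsetAttr.StartOffset());
     if (idx >= 0)
     {
         // Assumed encoding: the raw little-endian bytes of the float boost.
         _payloadAttr.Payload = new Payload(BitConverter.GetBytes(_boosts[idx]));
     }
     return true;
 }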
Example #9
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
        public override void reset()
        {
            base.reset();
            hasMoreTokensInClone = false;
            clonedToken          = null;
            clonedTermAtt        = null;
            clonedOffsetAtt      = null;
        }
Example #10
        public void TestOffsetValueExpression()
        {
            Assert.AreEqual(0, OffsetAttribute.ValueFor((DataClass01 obj) => obj.Property1));

            Assert.AreEqual(10, OffsetAttribute.ValueFor((DataClass01 obj) => obj.Property2, 15));
            Assert.AreEqual(15, OffsetAttribute.ValueFor((DataClass01 obj) => obj.Property2, 25));

            Assert.AreEqual(20, OffsetAttribute.ValueFor((DataClass01 obj) => obj.Property3, 15));
        }
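For reference, DataClass01 presumably looks something like the sketch below, reconstructed from the expected values only. The attribute argument names are guesses; the second ValueFor argument appears to select between version-dependent offsets (10 below some threshold, 15 at or above it).

        public class DataClass01 // reconstructed, not the original
        {
            [Offset(0)]
            public int Property1 { get; set; }

            [Offset(10)]                  // default offset
            [Offset(15, MinVersion = 25)] // guessed semantics for the second argument
            public int Property2 { get; set; }

            [Offset(20)]
            public int Property3 { get; set; }
        }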
Example #11
        private static IntPtr ParseField(FieldInfo field, PatternFinder pf)
        {
            OffsetAttribute offset = (OffsetAttribute)Attribute.GetCustomAttributes(field, typeof(OffsetAttribute))
                                     .FirstOrDefault();
            OffsetValueCN valcn = (OffsetValueCN)Attribute.GetCustomAttributes(field, typeof(OffsetValueCN))
                                  .FirstOrDefault();
            OffsetValueNA valna = (OffsetValueNA)Attribute.GetCustomAttributes(field, typeof(OffsetValueNA))
                                  .FirstOrDefault();

            IntPtr result = IntPtr.Zero;

            if (Constants.Lang == Language.Chn)
            {
                if (valcn != null)
                {
                    return((IntPtr)valcn.Value);
                }

                if (offset == null)
                {
                    return(IntPtr.Zero);
                }

                bool     b1      = true;
                IntPtr[] results = pf.FindMany(offset.PatternCN, ref b1);
                if (results != null)
                {
                    result = results[0];
                }
            }
            else
            {
                if (valna != null)
                {
                    return((IntPtr)valna.Value);
                }

                if (offset == null)
                {
                    return(IntPtr.Zero);
                }

                bool     b1      = true;
                IntPtr[] results = pf.FindMany(offset.Pattern, ref b1);
                if (results != null)
                {
                    result = results[0];
                }
            }

            Logger.Info("[OffsetManager][{0,27}] {1}", field.Name, result.ToString("X"));

            return(result);
        }
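A hypothetical field declaration of the kind ParseField consumes. The attribute and property names come from the code above; the signature bytes and the fixed CN value are placeholders.

        public static class GameOffsets // illustrative
        {
            [Offset(Pattern   = "48 8B 0D ?? ?? ?? ??",  // scanned on non-CN clients
                    PatternCN = "48 8B 15 ?? ?? ?? ??")]  // scanned on CN clients
            [OffsetValueCN(0x12345678)] // CN clients skip the scan and use this value
            public static IntPtr PlayerBase;
        }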
Example #12
 public void SetTokenStream(TokenStream ts)
 {
     this.ts = ts;
     if (this.ts.HasAttribute <IOffsetAttribute>())
     {
         this.offsetAttr = (OffsetAttribute)this.ts.GetAttribute <IOffsetAttribute>();
     }
     if (this.ts.HasAttribute <ICharTermAttribute>())
     {
         this.termAttr = (CharTermAttribute)this.ts.GetAttribute <ICharTermAttribute>();
     }
 }
Example #13
        private void Init(int gramSize)
        {
            if (gramSize < 1)
            {
                throw new ArgumentException(
                          "gramSize must be greater than zero");
            }
            _mGramSize = gramSize;

            _mTermAtt   = (TermAttribute)AddAttribute(typeof(TermAttribute));
            _mOffsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
        }
Example #14
        public IntMetaDataTokenStream(string tokenText)
        {
            _tokenText = tokenText;

            // NOTE: Calling the AddAttribute<T> method failed, so
            // switched to using AddAttributeImpl.
            _termAttribute   = new TermAttribute();
            _offsetAttribute = new OffsetAttribute();
            _payloadAtt      = new PayloadAttribute();
            base.AddAttributeImpl(_termAttribute);
            base.AddAttributeImpl(_offsetAttribute);
            base.AddAttributeImpl(_payloadAtt);
        }
Example #15
        public void TestOffsetValue()
        {
            var prop1 = typeof(DataClass01).GetProperty(nameof(DataClass01.Property1));
            var prop2 = typeof(DataClass01).GetProperty(nameof(DataClass01.Property2));
            var prop3 = typeof(DataClass01).GetProperty(nameof(DataClass01.Property3));

            Assert.AreEqual(0, OffsetAttribute.ValueFor(prop1));

            Assert.AreEqual(10, OffsetAttribute.ValueFor(prop2, 15));
            Assert.AreEqual(15, OffsetAttribute.ValueFor(prop2, 25));

            Assert.AreEqual(20, OffsetAttribute.ValueFor(prop3, 15));
        }
Example #16
            /**
             * Creates NGramTokenFilter with given min and max n-grams.
             * <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
             * <param name="minGram">the smallest n-gram to generate</param>
             * <param name="maxGram">the largest n-gram to generate</param>
             */
            public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
                : base(input)
            {
                if (minGram < 1)
                {
                    throw new System.ArgumentException("minGram must be greater than zero");
                }
                if (minGram > maxGram)
                {
                    throw new System.ArgumentException("minGram must not be greater than maxGram");
                }
                this.minGram = minGram;
                this.maxGram = maxGram;

                this.termAtt   = (TermAttribute)AddAttribute(typeof(TermAttribute));
                this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
            }
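A quick usage sketch for this filter, assuming the Lucene.Net 2.9-style API the snippet uses; each gram carries its own offsets into the original token.

            TokenStream ts = new NGramTokenFilter(new KeywordTokenizer(new StringReader("abcd")), 2, 2);
            var term   = (TermAttribute)ts.GetAttribute(typeof(TermAttribute));
            var offset = (OffsetAttribute)ts.GetAttribute(typeof(OffsetAttribute));
            while (ts.IncrementToken())
            {
                // Expected: "ab" [0,2), "bc" [1,3), "cd" [2,4)
                Console.WriteLine("{0} [{1},{2})", term.Term(), offset.StartOffset(), offset.EndOffset());
            }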
Example #17
        /**
         * Creates NGramTokenFilter with given min and max n-grams.
         * <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
         * <param name="minGram">the smallest n-gram to generate</param>
         * <param name="maxGram">the largest n-gram to generate</param>
         */
        public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
            : base(input)
        {
            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }
            _minGram = minGram;
            _maxGram = maxGram;

            _termAtt   = (TermAttribute)AddAttribute <ITermAttribute>();
            _offsetAtt = (OffsetAttribute)AddAttribute <IOffsetAttribute>();
        }
Example #18
        /**
         * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
         *
         * @param input {@link TokenStream} holding the input to be tokenized
         * @param side the {@link Side} from which to chop off an n-gram
         * @param minGram the smallest n-gram to generate
         * @param maxGram the largest n-gram to generate
         */
        public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram)
            : base(input)
        {
            if (minGram < 1)
            {
                throw new IllegalArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram)
            {
                throw new IllegalArgumentException("minGram must not be greater than maxGram");
            }

            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side    = side;
            termAtt      = (TermAttribute)addAttribute(typeof(TermAttribute));
            offsetAtt    = (OffsetAttribute)addAttribute(typeof(OffsetAttribute));
        }
Example #19
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testSupplementaryCharacters() throws java.io.IOException
        public virtual void testSupplementaryCharacters()
        {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final String s = org.apache.lucene.util.TestUtil.randomUnicodeString(random(), 10);
            string s = TestUtil.randomUnicodeString(random(), 10);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int codePointCount = s.codePointCount(0, s.length());
            int codePointCount = s.codePointCount(0, s.Length);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int minGram = org.apache.lucene.util.TestUtil.nextInt(random(), 1, 3);
            int minGram = TestUtil.Next(random(), 1, 3);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int maxGram = org.apache.lucene.util.TestUtil.nextInt(random(), minGram, 10);
            int         maxGram = TestUtil.Next(random(), minGram, 10);
            TokenStream tk      = new KeywordTokenizer(new StringReader(s));

            tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.tokenattributes.CharTermAttribute termAtt = tk.addAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
            CharTermAttribute termAtt = tk.addAttribute(typeof(CharTermAttribute));
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.tokenattributes.OffsetAttribute offsetAtt = tk.addAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
            OffsetAttribute offsetAtt = tk.addAttribute(typeof(OffsetAttribute));

            tk.reset();
            for (int start = 0; start < codePointCount; ++start)
            {
                for (int end = start + minGram; end <= Math.Min(codePointCount, start + maxGram); ++end)
                {
                    assertTrue(tk.incrementToken());
                    assertEquals(0, offsetAtt.startOffset());
                    assertEquals(s.Length, offsetAtt.endOffset());
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int startIndex = Character.offsetByCodePoints(s, 0, start);
                    int startIndex = char.offsetByCodePoints(s, 0, start);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int endIndex = Character.offsetByCodePoints(s, 0, end);
                    int endIndex = char.offsetByCodePoints(s, 0, end);
                    assertEquals(s.Substring(startIndex, endIndex - startIndex), termAtt.ToString());
                }
            }
            assertFalse(tk.incrementToken());
        }
Example #20
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testOtherLetterOffset() throws java.io.IOException
        public virtual void testOtherLetterOffset()
        {
            string           s         = "a天b";
            ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));

            int             correctStartOffset = 0;
            int             correctEndOffset   = 1;
            OffsetAttribute offsetAtt          = tokenizer.getAttribute(typeof(OffsetAttribute));

            tokenizer.reset();
            while (tokenizer.incrementToken())
            {
                assertEquals(correctStartOffset, offsetAtt.startOffset());
                assertEquals(correctEndOffset, offsetAtt.endOffset());
                correctStartOffset++;
                correctEndOffset++;
            }
            tokenizer.end();
            tokenizer.close();
        }
Example #21
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testFilterTokens() throws Exception
        public virtual void testFilterTokens()
        {
            SnowballFilter             filter     = new SnowballFilter(new TestTokenStream(this), "English");
            CharTermAttribute          termAtt    = filter.getAttribute(typeof(CharTermAttribute));
            OffsetAttribute            offsetAtt  = filter.getAttribute(typeof(OffsetAttribute));
            TypeAttribute              typeAtt    = filter.getAttribute(typeof(TypeAttribute));
            PayloadAttribute           payloadAtt = filter.getAttribute(typeof(PayloadAttribute));
            PositionIncrementAttribute posIncAtt  = filter.getAttribute(typeof(PositionIncrementAttribute));
            FlagsAttribute             flagsAtt   = filter.getAttribute(typeof(FlagsAttribute));

            filter.incrementToken();

            assertEquals("accent", termAtt.ToString());
            assertEquals(2, offsetAtt.startOffset());
            assertEquals(7, offsetAtt.endOffset());
            assertEquals("wrd", typeAtt.type());
            assertEquals(3, posIncAtt.PositionIncrement);
            assertEquals(77, flagsAtt.Flags);
            assertEquals(new BytesRef(new sbyte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
        }
Example #22
        internal void LoadBlocks(int currentBlock, long collectionOffset, DependencyReader reader)
        {
            if (blockCount == 0)
            {
                return;
            }

            var structdef = metadata.StructureDefinitions.First(s => s.FieldBlock == currentBlock && s.FieldOffset == collectionOffset);

            if (structdef.TargetIndex < 0)
            {
                return;
            }

            var block = metadata.DataBlocks[structdef.TargetIndex];

            var blockSize = FixedSizeAttribute.ValueFor(typeof(T));

            reader.Seek(block.Offset, SeekOrigin.Begin);
            for (int i = 0; i < blockCount; i++)
            {
                Add(reader.ReadObject <T>());
            }

            var blockProps = typeof(T).GetProperties()
                             .Where(p => p.PropertyType.IsGenericType && p.PropertyType.GetGenericTypeDefinition() == typeof(BlockCollection <>));

            int index = 0;

            foreach (var item in this)
            {
                var adjustedBase = blockSize * index++;
                foreach (var prop in blockProps)
                {
                    var collection = prop.GetValue(item) as IBlockCollection;
                    var offset     = OffsetAttribute.ValueFor(prop);
                    collection.LoadBlocks(structdef.TargetIndex, adjustedBase + offset, reader);
                }
            }
        }
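The recursion above bottoms out through IBlockCollection. A minimal reconstruction of that interface from its call sites in Examples #4 and #22 (the real declaration may carry more members):

        internal interface IBlockCollection
        {
            void LoadBlocks(int currentBlock, long collectionOffset, DependencyReader reader);
        }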
Example #23
        /*
         * Need to worry about multiple scenarios:
         *  - need to go for the longest match
         *    a b => foo      #shouldn't match if "a b" is followed by "c d"
         *    a b c d => bar
         *  - need to backtrack - retry matches for tokens already read
         *     a b c d => foo
         *       b c => bar
         *     If the input stream is "a b c x", one will consume "a b c d"
         *     trying to match the first rule... all but "a" should be
         *     pushed back so a match may be made on "b c".
         *  - don't try and match generated tokens (thus need separate queue)
         *    matching is not recursive.
         *  - handle optional generation of original tokens in all these cases,
         *    merging token streams to preserve token positions.
         *  - preserve original positionIncrement of first matched token
         */
        public override bool IncrementToken()
        {
            while (true)
            {
                // if there are any generated tokens, return them... don't try any
                // matches against them, as we specifically don't want recursion.
                if (replacement != null && replacement.MoveNext())
                {
                    copy(this, replacement.Current);
                    return(true);
                }

                // common case fast-path of first token not matching anything
                AttributeSource firstTok = nextTok();
                if (firstTok == null)
                {
                    return(false);
                }
                var            termAtt = firstTok.AddAttribute <ICharTermAttribute>();
                SlowSynonymMap result  = map.submap != null ? map.submap.Get(termAtt.Buffer(), 0, termAtt.Length) : null;

                if (result == null)
                {
                    copy(this, firstTok);
                    return(true);
                }

                // fast-path failed, clone ourselves if needed
                if (firstTok == this)
                {
                    firstTok = CloneAttributes();
                }
                // OK, we matched a token, so find the longest match.

                matched = new LinkedList <AttributeSource>();

                result = match(result);

                if (result == null)
                {
                    // no match, simply return the first token read.
                    copy(this, firstTok);
                    return(true);
                }

                // reuse, or create new one each time?
                List <AttributeSource> generated = new List <AttributeSource>(result.synonyms.Length + matched.Count + 1);

                //
                // there was a match... let's generate the new tokens, merging
                // in the matched tokens (position increments need adjusting)
                //
                AttributeSource lastTok     = matched.Count == 0 ? firstTok : matched.Last.Value;
                bool            includeOrig = result.IncludeOrig;

                AttributeSource            origTok        = includeOrig ? firstTok : null;
                PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(typeof(PositionIncrementAttribute));
                int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
                int repPos  = 0;                                // curr position in replacement token stream
                int pos     = 0;                                // current position in merged token stream

                for (int i = 0; i < result.synonyms.Length; i++)
                {
                    Token                      repTok       = result.synonyms[i];
                    AttributeSource            newTok       = firstTok.cloneAttributes();
                    CharTermAttribute          newTermAtt   = newTok.addAttribute(typeof(CharTermAttribute));
                    OffsetAttribute            newOffsetAtt = newTok.addAttribute(typeof(OffsetAttribute));
                    PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(typeof(PositionIncrementAttribute));

                    OffsetAttribute lastOffsetAtt = lastTok.addAttribute(typeof(OffsetAttribute));

                    newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
                    newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
                    repPos += repTok.PositionIncrement;
                    if (i == 0)     // make position of first token equal to original
                    {
                        repPos = origPos;
                    }

                    // if necessary, insert original tokens and adjust position increment
                    while (origTok != null && origPos <= repPos)
                    {
                        PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                        origPosInc.PositionIncrement = origPos - pos;
                        generated.Add(origTok);
                        pos    += origPosInc.PositionIncrement;
                        // LinkedList<T>.RemoveFirst() returns void in .NET, so read First.Value before removing
                        origTok = matched.Count == 0 ? null : matched.First.Value;
                        if (origTok != null)
                        {
                            matched.RemoveFirst();
                            origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                            origPos   += origPosInc.PositionIncrement;
                        }
                    }

                    newPosIncAtt.PositionIncrement = repPos - pos;
                    generated.Add(newTok);
                    pos += newPosIncAtt.PositionIncrement;
                }

                // finish up any leftover original tokens
                while (origTok != null)
                {
                    PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                    origPosInc.PositionIncrement = origPos - pos;
                    generated.Add(origTok);
                    pos    += origPosInc.PositionIncrement;
                    origTok = matched.Count == 0 ? null : matched.First.Value;
                    if (origTok != null)
                    {
                        matched.RemoveFirst();
                        origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                        origPos   += origPosInc.PositionIncrement;
                    }
                }

                // what if we replaced a longer sequence with a shorter one?
                // a/0 b/5 =>  foo/0
                // should I re-create the gap on the next buffered token?

                replacement = generated.GetEnumerator();
                // Now return to the top of the loop to read and return the first
                // generated token.. The reason this is done is that we may have generated
                // nothing at all, and may need to continue with more matching logic.
            }
        }
Example #24
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        public override bool incrementToken()
        {
            if (hasMoreTokensInClone)
            {
                int start = breaker.current();
                int end   = breaker.next();
                if (end != BreakIterator.DONE)
                {
                    clonedToken.copyTo(this);
                    termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
                    if (hasIllegalOffsets)
                    {
                        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
                    }
                    else
                    {
                        offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
                    }
                    if (handlePosIncr)
                    {
                        posAtt.PositionIncrement = 1;
                    }
                    return(true);
                }
                hasMoreTokensInClone = false;
            }

            if (!input.incrementToken())
            {
                return(false);
            }

            // Character.UnicodeBlock.THAI in the original Java covers U+0E00..U+0E7F
            if (termAtt.length() == 0 || termAtt.charAt(0) < '\u0E00' || termAtt.charAt(0) > '\u0E7F')
            {
                return(true);
            }

            hasMoreTokensInClone = true;

            // if the length implied by the start and end offsets doesn't match the term
            // text, assume this is a synonym and don't adjust the offsets.
            hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();

            // we lazy init the cloned token, as in ctor not all attributes may be added
            if (clonedToken == null)
            {
                clonedToken     = cloneAttributes();
                clonedTermAtt   = clonedToken.getAttribute(typeof(CharTermAttribute));
                clonedOffsetAtt = clonedToken.getAttribute(typeof(OffsetAttribute));
            }
            else
            {
                this.copyTo(clonedToken);
            }

            // reinit CharacterIterator
            charIterator.setText(clonedTermAtt.buffer(), 0, clonedTermAtt.length());
            breaker.Text = charIterator;
            int end = breaker.next();

            if (end != BreakIterator.DONE)
            {
                termAtt.Length = end;
                if (hasIllegalOffsets)
                {
                    offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
                }
                else
                {
                    offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
                }
                // position increment keeps as it is for first token
                return(true);
            }
            return(false);
        }
Example #25
            public override void CopyTo(IAttribute target)
            {
                OffsetAttribute t = (OffsetAttribute)target;

                t.SetOffset(start, end);
            }
Example #26
        /// <summary>
        /// Retrieve suggestions.
        /// </summary>
        public virtual IList <LookupResult> Lookup(string key, HashSet <BytesRef> contexts, int num)
        {
            if (contexts != null)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            TokenStream ts = queryAnalyzer.TokenStream("", key.ToString());

            try
            {
                TermToBytesRefAttribute    termBytesAtt = ts.AddAttribute <TermToBytesRefAttribute>();
                OffsetAttribute            offsetAtt    = ts.AddAttribute <OffsetAttribute>();
                PositionLengthAttribute    posLenAtt    = ts.AddAttribute <PositionLengthAttribute>();
                PositionIncrementAttribute posIncAtt    = ts.AddAttribute <PositionIncrementAttribute>();
                ts.Reset();

                var lastTokens = new BytesRef[grams];
                //System.out.println("lookup: key='" + key + "'");

                // Run full analysis, but save only the
                // last 1gram, last 2gram, etc.:
                BytesRef tokenBytes   = termBytesAtt.BytesRef;
                int      maxEndOffset = -1;
                bool     sawRealToken = false;
                while (ts.IncrementToken())
                {
                    termBytesAtt.FillBytesRef();
                    sawRealToken |= tokenBytes.Length > 0;
                    // TODO: this is somewhat iffy; today, ShingleFilter
                    // sets posLen to the gram count; maybe we should make
                    // a separate dedicated att for this?
                    int gramCount = posLenAtt.PositionLength;

                    Debug.Assert(gramCount <= grams);

                    // Safety: make sure the recalculated count "agrees":
                    if (CountGrams(tokenBytes) != gramCount)
                    {
                        throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
                    }
                    maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset());
                    lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
                }
                ts.End();

                if (!sawRealToken)
                {
                    throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
                }

                // Carefully fill last tokens with _ tokens;
                // ShingleFilter apparently won't emit "only hole"
                // tokens:
                int endPosInc = posIncAtt.PositionIncrement;

                // Note this will also be true if input is the empty
                // string (in which case we saw no tokens and
                // maxEndOffset is still -1), which in fact works out OK
                // because we fill the unigram with an empty BytesRef
                // below:
                bool lastTokenEnded = offsetAtt.EndOffset() > maxEndOffset || endPosInc > 0;
                //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());

                if (lastTokenEnded)
                {
                    //System.out.println("  lastTokenEnded");
                    // If user hit space after the last token, then
                    // "upgrade" all tokens.  This way "foo " will suggest
                    // all bigrams starting w/ foo, and not any unigrams
                    // starting with "foo":
                    for (int i = grams - 1; i > 0; i--)
                    {
                        BytesRef token = lastTokens[i - 1];
                        if (token == null)
                        {
                            continue;
                        }
                        token.Grow(token.Length + 1);
                        token.Bytes[token.Length] = separator;
                        token.Length++;
                        lastTokens[i] = token;
                    }
                    lastTokens[0] = new BytesRef();
                }

                var arc = new FST.Arc <long?>();

                var bytesReader = fst.BytesReader;

                // Try highest order models first, and if they return
                // results, return that; else, fallback:
                double backoff = 1.0;

                List <LookupResult> results = new List <LookupResult>(num);

                // We only add a given suffix once, from the highest
                // order model that saw it; for subsequent lower order
                // models we skip it:
                var seen = new HashSet <BytesRef>();

                for (int gram = grams - 1; gram >= 0; gram--)
                {
                    BytesRef token = lastTokens[gram];
                    // Don't make unigram predictions from empty string:
                    if (token == null || (token.Length == 0 && key.Length > 0))
                    {
                        // Input didn't have enough tokens:
                        //System.out.println("  gram=" + gram + ": skip: not enough input");
                        continue;
                    }

                    if (endPosInc > 0 && gram <= endPosInc)
                    {
                        // Skip hole-only predictions; in theory we
                        // shouldn't have to do this, but we'd need to fix
                        // ShingleFilter to produce only-hole tokens:
                        //System.out.println("  break: only holes now");
                        break;
                    }

                    //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

                    // TODO: we could add fuzziness here
                    // match the prefix portion exactly
                    //Pair<Long,BytesRef> prefixOutput = null;
                    long? prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
                    //System.out.println("  prefixOutput=" + prefixOutput);

                    if (prefixOutput == null)
                    {
                        // This model never saw this prefix, e.g. the
                        // trigram model never saw context "purple mushroom"
                        backoff *= ALPHA;
                        continue;
                    }

                    // TODO: we could do this division at build time, and
                    // bake it into the FST?

                    // Denominator for computing scores from current
                    // model's predictions:
                    long contextCount = totTokens;

                    BytesRef lastTokenFragment = null;

                    for (int i = token.Length - 1; i >= 0; i--)
                    {
                        if (token.Bytes[token.Offset + i] == separator)
                        {
                            BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                            long?    output  = Util.Get(fst, Lucene.Net.Util.Fst.Util.ToIntsRef(context, new IntsRef()));
                            Debug.Assert(output != null);
                            contextCount      = DecodeWeight(output);
                            lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                            break;
                        }
                    }

                    BytesRef finalLastToken;

                    if (lastTokenFragment == null)
                    {
                        finalLastToken = BytesRef.DeepCopyOf(token);
                    }
                    else
                    {
                        finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
                    }
                    Debug.Assert(finalLastToken.Offset == 0);

                    CharsRef spare = new CharsRef();

                    // complete top-N
                    Util.Fst.Util.TopResults <long?> completions = null;
                    try
                    {
                        // Because we store multiple models in one FST
                        // (1gram, 2gram, 3gram), we must restrict the
                        // search so that it only considers the current
                        // model.  For highest order model, this is not
                        // necessary since all completions in the FST
                        // must be from this model, but for lower order
                        // models we have to filter out the higher order
                        // ones:

                        // Must do num+seen.size() for queue depth because we may
                        // reject up to seen.size() paths in acceptResult():
                        Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparator, seen, finalLastToken);

                        // since this search is initialized with a single start node
                        // it is okay to start with an empty input path here
                        searcher.AddStartPaths(arc, prefixOutput, true, new IntsRef());

                        completions = searcher.Search();
                        Debug.Assert(completions.IsComplete);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.Message, bogus);
                    }

                    int prefixLength = token.Length;

                    BytesRef suffix = new BytesRef(8);
                    //System.out.println("    " + completions.length + " completions");

                    foreach (Util.Fst.Util.Result <long?> completion in completions)
                    {
                        token.Length = prefixLength;
                        // append suffix
                        Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                        token.Append(suffix);

                        //System.out.println("    completion " + token.utf8ToString());

                        // Skip this path if a higher-order model already
                        // saw/predicted its last token:
                        BytesRef lastToken = token;
                        for (int i = token.Length - 1; i >= 0; i--)
                        {
                            if (token.Bytes[token.Offset + i] == separator)
                            {
                                Debug.Assert(token.Length - i - 1 > 0);
                                lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                                break;
                            }
                        }
                        if (seen.Contains(lastToken))
                        {
                            //System.out.println("      skip dup " + lastToken.utf8ToString());
                            goto nextCompletionContinue;
                        }
                        seen.Add(BytesRef.DeepCopyOf(lastToken));
                        spare.Grow(token.Length);
                        UnicodeUtil.UTF8toUTF16(token, spare);
                        LookupResult result = new LookupResult(spare.ToString(), (long)(long.MaxValue * backoff * ((double)DecodeWeight(completion.Output)) / contextCount));
                        results.Add(result);
                        Debug.Assert(results.Count == seen.Count);
                        //System.out.println("  add result=" + result);
                        nextCompletionContinue:;
                    }

                    backoff *= ALPHA;
                }

                results.Sort(new ComparatorAnonymousInnerClassHelper(this));

                if (results.Count > num)
                {
                    results.RemoveRange(num, results.Count - num);
                }

                return(results);
            }
            finally
            {
                IOUtils.CloseWhileHandlingException(ts);
            }
        }
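A hypothetical call site for the method above; `suggester` stands in for whatever class hosts this Lookup implementation.

        IList<LookupResult> hits = suggester.Lookup("foo b", null, 5);
        foreach (LookupResult hit in hits)
        {
            Console.WriteLine(hit); // each result pairs a suggestion string with its weight
        }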
Example #27
        public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int? finalOffset)
        {
            Assert.IsNotNull(output);
            CheckClearAttributesAttribute checkClearAtt = (CheckClearAttributesAttribute)ts.AddAttribute(typeof(CheckClearAttributesAttribute));

            Assert.IsTrue(ts.HasAttribute(typeof(TermAttribute)), "has no TermAttribute");
            TermAttribute termAtt = (TermAttribute)ts.GetAttribute(typeof(TermAttribute));

            OffsetAttribute offsetAtt = null;

            if (startOffsets != null || endOffsets != null || finalOffset != null)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(OffsetAttribute)), "has no OffsetAttribute");
                offsetAtt = (OffsetAttribute)ts.GetAttribute(typeof(OffsetAttribute));
            }

            TypeAttribute typeAtt = null;

            if (types != null)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(TypeAttribute)), "has no TypeAttribute");
                typeAtt = (TypeAttribute)ts.GetAttribute(typeof(TypeAttribute));
            }

            PositionIncrementAttribute posIncrAtt = null;

            if (posIncrements != null)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(PositionIncrementAttribute)), "has no PositionIncrementAttribute");
                posIncrAtt = (PositionIncrementAttribute)ts.GetAttribute(typeof(PositionIncrementAttribute));
            }

            ts.Reset();
            for (int i = 0; i < output.Length; i++)
            {
                // extra safety to enforce that the state is not preserved: assign bogus values
                ts.ClearAttributes();
                termAtt.SetTermBuffer("bogusTerm");
                if (offsetAtt != null)
                {
                    offsetAtt.SetOffset(14584724, 24683243);
                }
                if (typeAtt != null)
                {
                    typeAtt.SetType("bogusType");
                }
                if (posIncrAtt != null)
                {
                    posIncrAtt.SetPositionIncrement(45987657);
                }

                checkClearAtt.GetAndResetClearCalled(); // reset it, because we called ClearAttributes() above
                Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
                Assert.IsTrue(checkClearAtt.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[i], termAtt.Term(), "term " + i);
                if (startOffsets != null)
                {
                    Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
                }
                if (endOffsets != null)
                {
                    Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
                }
                if (types != null)
                {
                    Assert.AreEqual(types[i], typeAtt.Type(), "type " + i);
                }
                if (posIncrements != null)
                {
                    Assert.AreEqual(posIncrements[i], posIncrAtt.GetPositionIncrement(), "posIncrement " + i);
                }
            }
            Assert.IsFalse(ts.IncrementToken(), "end of stream");
            ts.End();
            if (finalOffset.HasValue)
            {
                Assert.AreEqual(finalOffset, offsetAtt.EndOffset(), "finalOffset ");
            }
            ts.Close();
        }
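A typical invocation of this helper (a sketch; offsets below are computed by hand for "hello world"):

        TokenStream ts = new WhitespaceTokenizer(new StringReader("hello world"));
        AssertTokenStreamContents(ts,
                                  new System.String[] { "hello", "world" },
                                  new int[] { 0, 6 },  // startOffsets
                                  new int[] { 5, 11 }, // endOffsets
                                  null,                // skip type checks
                                  new int[] { 1, 1 },  // posIncrements
                                  11);                 // finalOffset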
Example #28
        private void btnAnalyze_Click(object sender, System.EventArgs e)
        {
            ShowError("");
            try
            {
                if (cmbAnalyzers.SelectedItem == null)
                {
                    return;
                }

                Analyzer analyzer = null;
                try
                {
                    // Trying to create type from executing assembly
                    Type analyzerType = (Type)analyzers[cmbAnalyzers.SelectedItem];

                    if (null == analyzerType)
                    {
                        // Trying to create type from Lucene.Net assembly
                        Assembly a = Assembly.GetAssembly(typeof(Lucene.Net.Analysis.Analyzer));
                        analyzerType = a.GetType((string)cmbAnalyzers.SelectedItem);
                    }

                    // Trying to create with default constructor
                    analyzer = (Analyzer)Activator.CreateInstance(analyzerType, Util.Version.LUCENE_30);
                }
                catch (Exception)
                {}

                if (null == analyzer)
                {
                    ShowError("Couldn't instantiate analyzer - public zero-argument constructor required");
                    return;
                }

                txtOutput.Text = txtText.Text;

                lstResults.BeginUpdate();
                try
                {
                    TokenStream ts     = analyzer.TokenStream("Analyze", new StringReader(txtText.Text));
                    var         token  = ts.GetAttribute <ITermAttribute>();
                    var         offset = ts.GetAttribute <IOffsetAttribute>();

                    lstResults.Items.Clear();
                    tokens.Clear();

                    while (ts.IncrementToken())
                    {
                        lstResults.Items.Add(token.Term);
                        var a = new OffsetAttribute();
                        a.SetOffset(offset.StartOffset, offset.EndOffset);
                        tokens.Add(a);
                    }
                }
                finally
                {
                    lstResults.EndUpdate();
                }
            }
            catch (Exception exc)
            {
                ShowError("Error analyzing: " + exc.Message);
            }
        }
Example #29
        //JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
        //ORIGINAL LINE: public void testBasic() throws Exception
        public virtual void testBasic()
        {
            b = new SynonymMap.Builder(true);
            add("a", "foo", true);
            add("a b", "bar fee", true);
            add("b c", "dog collar", true);
            add("c d", "dog harness holder extras", true);
            add("m c e", "dog barks loudly", false);
            add("i j k", "feep", true);

            add("e f", "foo bar", false);
            add("e f", "baz bee", false);

            add("z", "boo", false);
            add("y", "bee", true);

            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.reset();
            assertTrue(tokensIn.incrementToken());
            assertFalse(tokensIn.incrementToken());
            tokensIn.end();
            tokensIn.close();

            tokensOut = new SynonymFilter(tokensIn, b.build(), true);
            termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
            posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
            posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
            offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

            verify("a b c", "a/bar b/fee c");

            // syn output extends beyond input tokens
            verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");

            verify("a b a", "a/bar b/fee a/foo");

            // outputs that add to one another:
            verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");

            // two outputs for same input
            verify("e f", "foo/baz bar/bee");

            // verify multi-word / single-output offsets:
            verify("g i j k g", "g i/feep:7_3 j k g");

            // mixed keepOrig true/false:
            verify("a m c e x", "a/foo dog barks loudly x");
            verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
            assertTrue(tokensOut.CaptureCount > 0);

            // no captureStates when no syns matched
            verify("p q r s t", "p q r s t");
            assertEquals(0, tokensOut.CaptureCount);

            // no captureStates when only single-input syns, w/ no
            // lookahead needed, matched
            verify("p q z y t", "p q boo y/bee t");
            assertEquals(0, tokensOut.CaptureCount);
        }
Example #30
        /// <summary>
        /// <para>Get the next token from the input stream.
        /// </para>
        /// <para>If the next token has <code>positionIncrement > 1</code>,
        /// <code>positionIncrement - 1</code> <seealso cref="#fillerToken"/>s are
        /// inserted first.
        /// </para>
        /// </summary>
        /// <param name="target"> Where to put the new token; if null, a new instance is created. </param>
        /// <returns> On success, the populated token; null otherwise </returns>
        /// <exception cref="IOException"> if the input stream has a problem </exception>
        private InputWindowToken getNextToken(InputWindowToken target)
        {
            InputWindowToken newTarget = target;

            if (numFillerTokensToInsert > 0)
            {
                if (null == target)
                {
                    newTarget = new InputWindowToken(this, nextInputStreamToken.cloneAttributes());
                }
                else
                {
                    nextInputStreamToken.copyTo(target.attSource);
                }
                // A filler token occupies no space
                newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.startOffset());
                newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.Length);
                newTarget.isFiller = true;
                --numFillerTokensToInsert;
            }
            else if (isNextInputStreamToken)
            {
                if (null == target)
                {
                    newTarget = new InputWindowToken(this, nextInputStreamToken.cloneAttributes());
                }
                else
                {
                    nextInputStreamToken.copyTo(target.attSource);
                }
                isNextInputStreamToken = false;
                newTarget.isFiller     = false;
            }
            else if (!exhausted)
            {
                if (input.incrementToken())
                {
                    if (null == target)
                    {
                        newTarget = new InputWindowToken(this, cloneAttributes());
                    }
                    else
                    {
                        this.copyTo(target.attSource);
                    }
                    if (posIncrAtt.PositionIncrement > 1)
                    {
                        // Each output shingle must contain at least one input token,
                        // so no more than (maxShingleSize - 1) filler tokens will be inserted.
                        numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);
                        // Save the current token as the next input stream token
                        if (null == nextInputStreamToken)
                        {
                            nextInputStreamToken = cloneAttributes();
                        }
                        else
                        {
                            this.copyTo(nextInputStreamToken);
                        }
                        isNextInputStreamToken = true;
                        // A filler token occupies no space
                        newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
                        newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.Length);
                        newTarget.isFiller = true;
                        --numFillerTokensToInsert;
                    }
                    else
                    {
                        newTarget.isFiller = false;
                    }
                }
                else
                {
                    exhausted = true;
                    input.end();
                    endState = captureState();
                    numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
                    if (numFillerTokensToInsert > 0)
                    {
                        nextInputStreamToken = new AttributeSource(AttributeFactory);
                        nextInputStreamToken.addAttribute(typeof(CharTermAttribute));
                        OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(typeof(OffsetAttribute));
                        newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
                        // Recurse/loop just once:
                        return(getNextToken(target));
                    }
                    else
                    {
                        newTarget = null;
                    }
                }
            }
            else
            {
                newTarget = null;
            }
            return(newTarget);
        }
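        // The filler behavior documented above is easiest to see from the consuming
        // side. Below is a minimal, hypothetical sketch (not part of this example;
        // it assumes the same Java-flavored Lucene.NET surface as these snippets,
        // and exact method casing varies by version). With maxShingleSize = 2 and
        // default unigram output, the stream emits "please", "please divide",
        // "divide", "divide this", "this".
        private void demoConsumeShingles()
        {
            TokenStream ts = new ShingleFilter(
                new WhitespaceTokenizer(new StringReader("please divide this")), 2);
            CharTermAttribute termAtt = (CharTermAttribute)ts.addAttribute(typeof(CharTermAttribute));
            OffsetAttribute offsetAtt = (OffsetAttribute)ts.addAttribute(typeof(OffsetAttribute));
            ts.reset();
            while (ts.incrementToken())
            {
                Console.WriteLine(termAtt + " [" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + ")");
            }
            ts.end();
            ts.close();
        }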
        public virtual void testRandom()
        {
            int alphabetSize = TestUtil.Next(random(), 2, 7);

            int docLen = atLeast(3000);
            //int docLen = 50;

            string document = getRandomString('a', alphabetSize, docLen);

            if (VERBOSE)
            {
              Console.WriteLine("TEST: doc=" + document);
            }

            int numSyn = atLeast(5);
            //int numSyn = 2;

            IDictionary<string, OneSyn> synMap = new Dictionary<string, OneSyn>();
            IList<OneSyn> syns = new List<OneSyn>();
            bool dedup = random().nextBoolean();
            if (VERBOSE)
            {
              Console.WriteLine("  dedup=" + dedup);
            }
            b = new SynonymMap.Builder(dedup);
            for (int synIDX = 0; synIDX < numSyn; synIDX++)
            {
                string synIn = getRandomString('a', alphabetSize, TestUtil.Next(random(), 1, 5)).Trim();
                OneSyn s;
                if (!synMap.TryGetValue(synIn, out s))
                {
                    // First time we see this input: create the synonym entry.
                    s = new OneSyn();
                    s.@in = synIn;
                    syns.Add(s);
                    s.@out = new List<string>();
                    synMap[synIn] = s;
                    s.keepOrig = random().nextBoolean();
                }
                string synOut = getRandomString('0', 10, TestUtil.Next(random(), 1, 5)).Trim();
                s.@out.Add(synOut);
                add(synIn, synOut, s.keepOrig);
                if (VERBOSE)
                {
                    Console.WriteLine("  syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig);
                }
            }

            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.reset();
            assertTrue(tokensIn.incrementToken());
            assertFalse(tokensIn.incrementToken());
            tokensIn.end();
            tokensIn.close();

            tokensOut = new SynonymFilter(tokensIn, b.build(), true);
            termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
            posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
            posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
            offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

            if (dedup)
            {
              pruneDups(syns);
            }

            string expected = slowSynMatcher(document, syns, 5);

            if (VERBOSE)
            {
              Console.WriteLine("TEST: expected=" + expected);
            }

            verify(document, expected);
        }
        public virtual void testOutputHangsOffEnd()
        {
            b = new SynonymMap.Builder(true);
            const bool keepOrig = false;
            // b hangs off the end (no input token under it):
            add("a", "a b", keepOrig);
            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.reset();
            assertTrue(tokensIn.incrementToken());
            assertFalse(tokensIn.incrementToken());
            tokensIn.end();
            tokensIn.close();

            tokensOut = new SynonymFilter(tokensIn, b.build(), true);
            termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
            posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
            offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));
            posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));

            // Make sure endOffset inherits from previous input token:
            verify("a", "a b:1");
        }
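        // For reference: the add(in, out, keepOrig) helper used by these tests
        // presumably wraps the CharsRef-based builder API along the lines below
        // (a hedged sketch only; names, casing, and signatures vary across
        // Lucene.NET versions, so treat this as an assumption, not the test's code).
        private void addHangingSynonymSketch(SynonymMap.Builder builder)
        {
            // Multi-token sides are joined with the builder's word separator rather
            // than a literal space; here "a" maps to the two-token output "a b".
            CharsRef output = SynonymMap.Builder.join(new string[] { "a", "b" }, new CharsRef());
            builder.add(new CharsRef("a"), output, false); // false = do not keep the original token
        }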
        public virtual void testBasic2()
        {
            b = new SynonymMap.Builder(true);
            const bool keepOrig = false;
            add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
            add("bbb", "bbbb1 bbbb2", keepOrig);
            tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
            tokensIn.reset();
            assertTrue(tokensIn.incrementToken());
            assertFalse(tokensIn.incrementToken());
            tokensIn.end();
            tokensIn.close();

            tokensOut = new SynonymFilter(tokensIn, b.build(), true);
            termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
            posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
            posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
            offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

            if (keepOrig)
            {
              verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold");
              verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold");
            }
            else
            {
              verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold");
              verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold");
            }
        }
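        // How to read the expected strings passed to verify, as these tests appear
        // to use them: spaces advance to the next position, and '/' stacks tokens
        // on a single position, so "xyzzy bbbb1 pot/bbbb2 of gold" stacks bbbb2
        // over "pot". A ":n" suffix asserts the token's end offset, which is what
        // "a b:1" checks above: "b" hangs past the input and inherits end offset 1
        // from "a".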
            public override void CopyTo(Attribute target)
            {
                OffsetAttribute t = (OffsetAttribute)target;

                t.SetOffset(Start, End);
            }
Example #35
0
        public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
        {
            this.outerInstance = outerInstance;
            this.attSource = attSource;
            this.termAtt = attSource.getAttribute(typeof(CharTermAttribute));
            this.offsetAtt = attSource.getAttribute(typeof(OffsetAttribute));
        }