Пример #1
0
 /// <summary>
 /// Wires the anonymous analyzer helper to its owner and captures the
 /// tokenization settings it will use when building components.
 /// </summary>
 /// <param name="outerInstance"> owning test instance </param>
 /// <param name="dfa"> tokenizer automaton to use </param>
 /// <param name="lowercase"> whether emitted terms are lowercased </param>
 /// <param name="limit"> maximum token length </param>
 public AnalyzerAnonymousInnerClassHelper2(TestMockAnalyzer outerInstance, CharacterRunAutomaton dfa, bool lowercase, int limit)
 {
     OuterInstance = outerInstance;
     Dfa = dfa;
     Lowercase = lowercase;
     Limit = limit;
 }
Пример #2
0
        /// <summary>
        /// Terms of five or more characters match the ".{5,}" stop automaton and
        /// are removed; position increments must record the resulting holes.
        /// </summary>
        public virtual void TestLength()
        {
            CharacterRunAutomaton length5 = new CharacterRunAutomaton(new RegExp(".{5,}").ToAutomaton());
            Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, true, length5);

            AssertAnalyzesTo(analyzer, "ok toolong fine notfine",
                new string[] { "ok", "fine" },
                new int[] { 1, 2 });
        }
Пример #3
0
        /// <summary>
        /// Keep-words expressed as the complement of {"foo","bar"}: anything NOT
        /// in the set matches the stop automaton and is removed; increments
        /// record the gaps left behind.
        /// </summary>
        public virtual void TestKeep()
        {
            var union = Automaton.Union(new Automaton[] { BasicAutomata.MakeString("foo"), BasicAutomata.MakeString("bar") });
            CharacterRunAutomaton keepWords = new CharacterRunAutomaton(BasicOperations.Complement(union));
            Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true, keepWords);

            AssertAnalyzesTo(analyzer, "quick foo brown bar bar fox foo",
                new string[] { "foo", "bar", "bar", "foo" },
                new int[] { 2, 2, 1, 2 });
        }
Пример #4
0
 /// <summary>
 /// checks condition of the concatenation of two strings </summary>
 // note: this is pretty stupid, we really should subtract strip from the condition up front and just check the stem
 // but this is a little bit more complicated.
 /// <summary>
 /// Checks whether the concatenation of two character ranges is accepted by
 /// the condition pattern at index <paramref name="condition"/>.
 /// A condition of 0 means "no condition" and always passes.
 /// </summary>
 // note: this is pretty stupid, we really should subtract strip from the condition up front and just check the stem
 // but this is a little bit more complicated.
 private bool CheckCondition(int condition, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len)
 {
     if (condition == 0)
     {
         return true; // no condition attached to this affix
     }

     CharacterRunAutomaton pattern = dictionary.patterns[condition];
     int state = pattern.InitialState;

     // run the automaton over both ranges in sequence; -1 is the dead state
     if (!StepRange(pattern, ref state, c1, c1off, c1len))
     {
         return false;
     }
     if (!StepRange(pattern, ref state, c2, c2off, c2len))
     {
         return false;
     }
     return pattern.IsAccept(state);
 }

 /// <summary>
 /// Advances <paramref name="state"/> across chars[off, off+len); returns
 /// false as soon as the automaton reaches the dead state (-1).
 /// </summary>
 private static bool StepRange(CharacterRunAutomaton pattern, ref int state, char[] chars, int off, int len)
 {
     for (int i = off; i < off + len; i++)
     {
         state = pattern.Step(state, chars[i]);
         if (state == -1)
         {
             return false;
         }
     }
     return true;
 }
Пример #5
0
 /// <summary>
 /// Create a new MockTokenFilter.
 /// </summary>
 /// <param name="input"> TokenStream to filter </param>
 /// <param name="filter"> DFA representing the terms that should be removed. </param>
 public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter)
     : base(input)
 {
     this.Filter = filter;
     TermAtt     = AddAttribute <ICharTermAttribute>();
     PosIncrAtt  = AddAttribute <IPositionIncrementAttribute>();
 }
Пример #6
0
 /// <summary>
 /// Create a new MockTokenFilter.
 /// </summary>
 /// <param name="input"> TokenStream to filter </param>
 /// <param name="filter"> DFA representing the terms that should be removed. </param>
 public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter)
     : base(input)
 {
     this.Filter = filter;
     TermAtt = AddAttribute<ICharTermAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
 }
Пример #7
0
                /// <summary>
                /// Wraps the given TermsEnum, compiling the outer query's regexp
                /// automaton and seeking to the very first term.
                /// </summary>
                internal SimpleAutomatonTermsEnum(TestRegexpRandom2.DumbRegexpQuery outerInstance, TermsEnum tenum)
                    : base(tenum)
                {
                    this.outerInstance = outerInstance;

                    // compile the query's automaton once for per-term matching
                    runAutomaton = new CharacterRunAutomaton(outerInstance.automaton);
                    // empty BytesRef = start filtering from the first term
                    SetInitialSeekTerm(new BytesRef(""));
                }
Пример #8
0
        /// <summary>
        /// With a "." tokenizer automaton every character becomes its own token;
        /// start/end offsets advance one character at a time.
        /// </summary>
        public virtual void TestSingleChar()
        {
            CharacterRunAutomaton single = new CharacterRunAutomaton(new RegExp(".").ToAutomaton());
            Analyzer analyzer = new MockAnalyzer(Random, single, false);

            AssertAnalyzesTo(analyzer, "foobar",
                new[] { "f", "o", "o", "b", "a", "r" },
                new[] { 0, 1, 2, 3, 4, 5 },
                new[] { 1, 2, 3, 4, 5, 6 });
            CheckRandomData(Random, analyzer, 100);
        }
Пример #9
0
 /// <summary>
 /// Creates a new MockAnalyzer.
 /// </summary>
 /// <param name="random"> Random for payloads behavior </param>
 /// <param name="runAutomaton"> DFA describing how tokenization should happen (e.g. [a-zA-Z]+) </param>
 /// <param name="lowerCase"> true if the tokenizer should lowercase terms </param>
 /// <param name="filter"> DFA describing how terms should be filtered (set of stopwords, etc) </param>
 public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, bool lowerCase, CharacterRunAutomaton filter)
     : base(PER_FIELD_REUSE_STRATEGY)
 {
     // TODO: this should be solved in a different way; Random should not be shared (!).
     this.Random = new Random(random.Next());
     this.RunAutomaton = runAutomaton;
     this.LowerCase = lowerCase;
     this.Filter = filter;
 }
Пример #10
0
 /// <summary>
 /// Creates a new MockAnalyzer.
 /// </summary>
 /// <param name="random"> Random for payloads behavior </param>
 /// <param name="runAutomaton"> DFA describing how tokenization should happen (e.g. [a-zA-Z]+) </param>
 /// <param name="lowerCase"> true if the tokenizer should lowercase terms </param>
 /// <param name="filter"> DFA describing how terms should be filtered (set of stopwords, etc) </param>
 public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, bool lowerCase, CharacterRunAutomaton filter)
     : base(PER_FIELD_REUSE_STRATEGY)
 {
     // TODO: this should be solved in a different way; Random should not be shared (!).
     this.Random       = new Random(random.Next());
     this.RunAutomaton = runAutomaton;
     this.LowerCase    = lowerCase;
     this.Filter       = filter;
 }
Пример #11
0
        /// <summary>
        /// The "[A-Z][a-z]*" tokenizer splits on uppercase boundaries and drops
        /// any leading lowercase run that cannot start a match.
        /// </summary>
        public virtual void TestUppercase()
        {
            var tokenizerDfa = new CharacterRunAutomaton(new RegExp("[A-Z][a-z]*").ToAutomaton());
            Analyzer analyzer = new MockAnalyzer(Random, tokenizerDfa, false);

            AssertAnalyzesTo(analyzer, "FooBarBAZ",
                new string[] { "Foo", "Bar", "B", "A", "Z" },
                new int[] { 0, 3, 6, 7, 8 },
                new int[] { 3, 6, 7, 8, 9 });
            AssertAnalyzesTo(analyzer, "aFooBar",
                new string[] { "Foo", "Bar" },
                new int[] { 1, 4 },
                new int[] { 4, 7 });
            CheckRandomData(Random, analyzer, 100);
        }
Пример #12
0
        /// <summary>
        /// "..." groups input into fixed three-character tokens; also verifies
        /// End() reports the true input length when the tail is a partial match.
        /// </summary>
        public virtual void TestThreeChars()
        {
            var tokenizerDfa = new CharacterRunAutomaton(new RegExp("...").ToAutomaton());
            Analyzer analyzer = new MockAnalyzer(Random, tokenizerDfa, false);

            AssertAnalyzesTo(analyzer, "foobar", new string[] { "foo", "bar" }, new int[] { 0, 3 }, new int[] { 3, 6 });
            // make sure when last term is a "partial" match that End() is correct
            AssertTokenStreamContents(analyzer.GetTokenStream("bogus", new StringReader("fooba")),
                new string[] { "foo" }, new int[] { 0 }, new int[] { 3 }, new int[] { 1 }, new int?(5));
            CheckRandomData(Random, analyzer, 100);
        }
Пример #13
0
 /// <summary>
 /// Creates a MockTokenizer bound to the given attribute factory and reader.
 /// </summary>
 /// <param name="factory"> attribute factory used to create token attributes </param>
 /// <param name="input"> reader supplying the characters to tokenize </param>
 /// <param name="runAutomaton"> DFA describing which character runs form tokens </param>
 /// <param name="lowerCase"> true to lowercase emitted terms </param>
 /// <param name="maxTokenLength"> maximum length of a single emitted token </param>
 public MockTokenizer(AttributeFactory factory, TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength)
     : base(factory, input)
 {
     RunAutomaton = runAutomaton;
     LowerCase = lowerCase;
     MaxTokenLength = maxTokenLength;
     // start in the automaton's initial state, awaiting a reader
     state = runAutomaton.InitialState;
     StreamState = State.SETREADER;
     TermAtt = AddAttribute<ICharTermAttribute>();
     OffsetAtt = AddAttribute<IOffsetAttribute>();
 }
Пример #14
0
        /// <summary>
        /// A stopword between two phrase terms must surface as a position gap:
        /// "1 stop 2" parses to a phrase with "2" at position increment 2.
        /// </summary>
        public virtual void TestPhraseQueryPositionIncrements()
        {
            PhraseQuery expected = new PhraseQuery();
            expected.Add(new Term("field", "1"));
            expected.Add(new Term("field", "2"), 2);

            // case-insensitive "stop" as the stopword automaton
            CharacterRunAutomaton stopList = new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").ToAutomaton());
            Analyzer analyzer = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false, stopList);

            QueryBuilder builder = new QueryBuilder(analyzer);
            Assert.AreEqual(expected, builder.CreatePhraseQuery("field", "1 stop 2"));
        }
Пример #15
0
        /// <summary>
        /// One-time suite setup: builds an index of random field contents with a
        /// single random stopword, deletes ~5% of the docs, and opens two
        /// searchers over the same reader for comparison tests.
        /// </summary>
        public override void BeforeClass()
        {
            base.BeforeClass();


            Random random = Random;

            m_directory = NewDirectory();
            // a single random character acts as the suite's stopword
            m_stopword  = "" + GetRandomChar();
            CharacterRunAutomaton stopset = new CharacterRunAutomaton(BasicAutomata.MakeString(m_stopword));

            m_analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset);
            RandomIndexWriter iw    = new RandomIndexWriter(random, m_directory, m_analyzer);
            Document          doc   = new Document();
            Field             id    = new StringField("id", "", Field.Store.NO);
            Field             field = new TextField("field", "", Field.Store.NO);

            doc.Add(id);
            doc.Add(field);

            // index some docs
            int numDocs = AtLeast(1000);

            for (int i = 0; i < numDocs; i++)
            {
                id.SetStringValue(Convert.ToString(i, CultureInfo.InvariantCulture));
                field.SetStringValue(RandomFieldContents());
                iw.AddDocument(doc);
            }

            // delete some docs
            int numDeletes = numDocs / 20;

            for (int i = 0; i < numDeletes; i++)
            {
                // delete by Term or by TermQuery at random to exercise both paths
                Term toDelete = new Term("id", Convert.ToString(random.Next(numDocs), CultureInfo.InvariantCulture));
                if (random.NextBoolean())
                {
                    iw.DeleteDocuments(toDelete);
                }
                else
                {
                    iw.DeleteDocuments(new TermQuery(toDelete));
                }
            }

            // both searchers share one reader; tests compare their results
            m_reader = iw.GetReader();
            m_s1     = NewSearcher(m_reader);
            m_s2     = NewSearcher(m_reader);
            iw.Dispose();
        }
Пример #16
0
        /// <summary>
        /// Smoke-tests the tokenizer by running random text through analyzers
        /// built from random automatons with random lowercase/limit settings.
        /// </summary>
        public virtual void TestRandomRegexps()
        {
            int iterations = AtLeast(30);

            for (int iter = 0; iter < iterations; iter++)
            {
                CharacterRunAutomaton dfa = new CharacterRunAutomaton(AutomatonTestUtil.RandomAutomaton(Random()));
                bool lowercase = Random().NextBoolean();
                int limit = TestUtil.NextInt(Random(), 0, 500);
                Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper2(this, dfa, lowercase, limit);
                CheckRandomData(Random(), analyzer, 100);
                analyzer.Dispose();
            }
        }
Пример #17
0
        /// <summary>
        /// Keep-words expressed as the complement of {"foo","bar"}: every term
        /// NOT in the set matches the stop automaton and is removed.
        /// (The Java original's Operations.DEFAULT_MAX_DETERMINIZED_STATES
        /// argument has no counterpart here.)
        /// </summary>
        public void TestKeep()
        {
            var keepSet = BasicOperations.Union(
                BasicAutomata.MakeString("foo"), BasicAutomata.MakeString("bar"));
            CharacterRunAutomaton keepWords = new CharacterRunAutomaton(BasicOperations.Complement(keepSet));
            Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true, keepWords);

            AssertAnalyzesTo(analyzer, "quick foo brown bar bar fox foo",
                             new String[] { "foo", "bar", "bar", "foo" },
                             new int[] { 2, 2, 1, 2 });
        }
        /// <summary>
        /// One-time suite setup: builds an index of random field contents with a
        /// single random stopword, deletes ~5% of the docs, and opens two
        /// searchers over the same reader for comparison tests.
        /// </summary>
        public static void BeforeClass()
        {
            Random random = Random();

            Directory = NewDirectory();
            // a single random character acts as the suite's stopword
            Stopword  = "" + RandomChar();
            CharacterRunAutomaton stopset = new CharacterRunAutomaton(BasicAutomata.MakeString(Stopword));

            Analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset);
            RandomIndexWriter iw    = new RandomIndexWriter(random, Directory, Analyzer);
            Document          doc   = new Document();
            Field             id    = new StringField("id", "", Field.Store.NO);
            Field             field = new TextField("field", "", Field.Store.NO);

            doc.Add(id);
            doc.Add(field);

            // index some docs
            int numDocs = AtLeast(1000);

            for (int i = 0; i < numDocs; i++)
            {
                id.StringValue    = Convert.ToString(i);
                field.StringValue = RandomFieldContents();
                iw.AddDocument(doc);
            }

            // delete some docs
            int numDeletes = numDocs / 20;

            for (int i = 0; i < numDeletes; i++)
            {
                // delete by Term or by TermQuery at random to exercise both paths
                Term toDelete = new Term("id", Convert.ToString(random.Next(numDocs)));
                if (random.NextBoolean())
                {
                    iw.DeleteDocuments(toDelete);
                }
                else
                {
                    iw.DeleteDocuments(new TermQuery(toDelete));
                }
            }

            // both searchers share one reader; tests compare their results
            Reader = iw.Reader;
            S1     = NewSearcher(Reader);
            S2     = NewSearcher(Reader);
            iw.Dispose();
        }
Пример #19
0
        /// <summary>
        /// Smoke-tests MockTokenizer by running random text through analyzers
        /// built from random automatons with random lowercase/limit settings.
        /// Iteration count is reduced outside nightly runs.
        /// </summary>
        public void TestRandomRegexps()
        {
            int iters = TEST_NIGHTLY ? AtLeast(30) : AtLeast(1);

            for (int i = 0; i < iters; i++)
            {
                // int.MaxValue determinization limit of the Java original has no counterpart here
                CharacterRunAutomaton dfa = new CharacterRunAutomaton(AutomatonTestUtil.RandomAutomaton(Random) /*, int.MaxValue*/);
                bool     lowercase        = Random.nextBoolean();
                int      limit            = TestUtil.NextInt32(Random, 0, 500);
                Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => {
                    Tokenizer t = new MockTokenizer(reader, dfa, lowercase, limit);
                    return(new TokenStreamComponents(t, t));
                });
                CheckRandomData(Random, a, 100);
                a.Dispose();
            }
        }
        /// <summary>
        /// One-time suite setup: builds an index of random field contents with a
        /// single random stopword, deletes ~5% of the docs, and opens two
        /// searchers over the same reader for comparison tests.
        /// </summary>
        public void BeforeClass()
        {
            Random random = Random();
            Directory = NewDirectory();
            // a single random character acts as the suite's stopword
            Stopword = "" + RandomChar();
            CharacterRunAutomaton stopset = new CharacterRunAutomaton(BasicAutomata.MakeString(Stopword));
            Analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset);
            RandomIndexWriter iw = new RandomIndexWriter(random, Directory, Analyzer, ClassEnvRule.Similarity, ClassEnvRule.TimeZone);
            Document doc = new Document();
            Field id = new StringField("id", "", Field.Store.NO);
            Field field = new TextField("field", "", Field.Store.NO);
            doc.Add(id);
            doc.Add(field);

            // index some docs
            int numDocs = AtLeast(1000);
            for (int i = 0; i < numDocs; i++)
            {
                id.StringValue = Convert.ToString(i);
                field.StringValue = RandomFieldContents();
                iw.AddDocument(doc);
            }

            // delete some docs
            int numDeletes = numDocs / 20;
            for (int i = 0; i < numDeletes; i++)
            {
                // delete by Term or by TermQuery at random to exercise both paths
                Term toDelete = new Term("id", Convert.ToString(random.Next(numDocs)));
                if (random.NextBoolean())
                {
                    iw.DeleteDocuments(toDelete);
                }
                else
                {
                    iw.DeleteDocuments(new TermQuery(toDelete));
                }
            }

            // both searchers share one reader; tests compare their results
            Reader = iw.Reader;
            S1 = NewSearcher(Reader);
            S2 = NewSearcher(Reader);
            iw.Dispose();
        }
Пример #21
0
        /// <summary>
        /// Fuzzes MockTokenizer with analyzers built over random automatons.
        /// </summary>
        public void TestRandomRegexps()
        {
            // LUCENENET specific - reduced Nightly iterations from 30 to 15
            // to keep it under the 1 hour free limit of Azure DevOps
            int iterations = TestNightly ? AtLeast(15) : AtLeast(1);

            for (int iter = 0; iter < iterations; iter++)
            {
                CharacterRunAutomaton dfa = new CharacterRunAutomaton(AutomatonTestUtil.RandomAutomaton(Random) /*, int.MaxValue*/);
                bool lowercase = Random.NextBoolean();
                int limit = TestUtil.NextInt32(Random, 0, 500);
                Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    Tokenizer tokenizer = new MockTokenizer(reader, dfa, lowercase, limit);
                    return new TokenStreamComponents(tokenizer, tokenizer);
                });
                CheckRandomData(Random, analyzer, 100);
                analyzer.Dispose();
            }
        }
Пример #22
0
        /// <summary>
        /// ".." produces fixed two-character tokens; the trailing odd character
        /// is a partial match, and end() must still report the full input length.
        /// </summary>
        public void TestTwoChars()
        {
            var pairTokenizer = new CharacterRunAutomaton(new RegExp("..").ToAutomaton());
            Analyzer analyzer = new MockAnalyzer(Random, pairTokenizer, false);

            AssertAnalyzesTo(analyzer, "foobar",
                             new String[] { "fo", "ob", "ar" },
                             new int[] { 0, 2, 4 },
                             new int[] { 2, 4, 6 });
            // make sure when last term is a "partial" match that end() is correct
            AssertTokenStreamContents(analyzer.GetTokenStream("bogus", "fooba"),
                                      new String[] { "fo", "ob" },
                                      new int[] { 0, 2 },
                                      new int[] { 2, 4 },
                                      new int[] { 1, 1 },
                                      5);
            CheckRandomData(Random, analyzer, 100);
        }
Пример #23
0
        /// <summary>
        /// Verifies SpanFirstQuery respects stopword position gaps: "the" is
        /// removed by analysis, so "quick" sits at position 1 in doc1 and
        /// position 0 in doc2; a SpanNot over the wider SpanFirst must then
        /// match exactly one doc in each case.
        /// </summary>
        public virtual void TestStartPositions()
        {
            Directory dir = NewDirectory();

            // mimic StopAnalyzer
            CharacterRunAutomaton stopSet = new CharacterRunAutomaton((new RegExp("the|a|of")).ToAutomaton());
            Analyzer analyzer             = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true, stopSet);

            RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                Random, dir, analyzer);
            Document doc = new Document();

            doc.Add(NewTextField("field", "the quick brown fox", Field.Store.NO));
            writer.AddDocument(doc);
            Document doc2 = new Document();

            doc2.Add(NewTextField("field", "quick brown fox", Field.Store.NO));
            writer.AddDocument(doc2);

            IndexReader   reader   = writer.GetReader();
            IndexSearcher searcher = NewSearcher(reader);

            // user queries on "starts-with quick"
            SpanQuery sfq = new SpanFirstQuery(new SpanTermQuery(new Term("field", "quick")), 1);

            Assert.AreEqual(1, searcher.Search(sfq, 10).TotalHits);

            // user queries on "starts-with the quick"
            SpanQuery include = new SpanFirstQuery(new SpanTermQuery(new Term("field", "quick")), 2);

            sfq = new SpanNotQuery(include, sfq);
            Assert.AreEqual(1, searcher.Search(sfq, 10).TotalHits);

            writer.Dispose();
            reader.Dispose();
            dir.Dispose();
        }
        /// <summary>
        /// Builds <c>jvmLetter</c>: a run automaton accepting any non-empty
        /// string of code points this runtime considers letters.
        /// </summary>
        public override void SetUp()
        {
            base.SetUp();
            // build an automaton matching this jvm's letter definition
            State initial = new State();
            State accept  = new State();

            accept.Accept = true;
            // Add one transition per contiguous run of letter code points rather
            // than one per code point: same accepted language, far fewer
            // Transition objects to allocate and reduce.
            int runStart = -1;
            for (int i = 0; i <= 0x10FFFF; i++)
            {
                if (Character.IsLetter(i))
                {
                    if (runStart == -1)
                    {
                        runStart = i; // open a new run of letters
                    }
                }
                else if (runStart != -1)
                {
                    initial.AddTransition(new Transition(runStart, i - 1, accept));
                    runStart = -1;
                }
            }
            if (runStart != -1)
            {
                // close a run that extends to the last code point
                initial.AddTransition(new Transition(runStart, 0x10FFFF, accept));
            }
            Automaton single = new Automaton(initial);

            single.Reduce();
            Automaton repeat = BasicOperations.Repeat(single);

            jvmLetter = new CharacterRunAutomaton(repeat);
        }
Пример #25
0
        /// <summary>
        /// Boost values attached with ^ must survive parsing, including on terms
        /// that analysis would otherwise drop (the stopword "on").
        /// </summary>
        public void TestBoost()
        {
            CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.MakeString("on"));
            Analyzer oneStopAnalyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true, stopSet);

            PrecedenceQueryParser parser = new PrecedenceQueryParser();
            parser.Analyzer = oneStopAnalyzer;

            Query q = parser.Parse("on^1.0", "field");
            assertNotNull(q);

            q = parser.Parse("\"hello\"^2.0", "field");
            assertNotNull(q);
            assertEquals(q.Boost, (float)2.0, (float)0.5);

            q = parser.Parse("hello^2.0", "field");
            assertNotNull(q);
            assertEquals(q.Boost, (float)2.0, (float)0.5);

            q = parser.Parse("\"on\"^1.0", "field");
            assertNotNull(q);

            q = GetParser(new MockAnalyzer(Random, MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))
                .Parse("the^3", "field");
            assertNotNull(q);
        }
Пример #26
0
        /// <summary>
        /// Highlights a single field for each requested document, returning a
        /// map from docid to the formatted highlight result. Documents yielding
        /// no passages (and no default summary) get no entry.
        /// </summary>
        /// <param name="field"> field name to highlight </param>
        /// <param name="contents"> stored field content, parallel to <paramref name="docids"/> </param>
        /// <param name="bi"> break iterator used to split passages </param>
        /// <param name="terms"> query terms to highlight </param>
        /// <param name="docids"> docids to process, in increasing order </param>
        /// <param name="leaves"> index leaves covering the docids </param>
        /// <param name="maxPassages"> maximum passages per document </param>
        /// <param name="query"> original query, used for multi-term extraction </param>
        private IDictionary <int, object> HighlightField(string field, string[] contents, BreakIterator bi, BytesRef[] terms, int[] docids, IList <AtomicReaderContext> leaves, int maxPassages, Query query)
        {
            IDictionary <int, object> highlights = new Dictionary <int, object>();

            PassageFormatter fieldFormatter = GetFormatter(field);

            if (fieldFormatter == null)
            {
                // A missing formatter is a configuration error; never throw
                // NullReferenceException directly (CA2201).
                throw new InvalidOperationException("PassageFormatter cannot be null");
            }

            // check if we should do any multiterm processing
            Analyzer analyzer = GetIndexAnalyzer(field);

            CharacterRunAutomaton[] automata = new CharacterRunAutomaton[0];
            if (analyzer != null)
            {
                automata = MultiTermHighlighting.ExtractAutomata(query, field);
            }

            // resize 'terms', where the last term is the multiterm matcher
            if (automata.Length > 0)
            {
                BytesRef[] newTerms = new BytesRef[terms.Length + 1];
                System.Array.Copy(terms, 0, newTerms, 0, terms.Length);
                terms = newTerms;
            }

            // we are processing in increasing docid order, so we only need to reinitialize stuff on segment changes
            // otherwise, we will just advance() existing enums to the new document in the same segment.
            DocsAndPositionsEnum[] postings = null;
            TermsEnum termsEnum = null;
            int lastLeaf = -1;

            for (int i = 0; i < docids.Length; i++)
            {
                string content = contents[i];
                if (content.Length == 0)
                {
                    continue; // nothing to do
                }
                bi.SetText(content);
                int doc  = docids[i];
                int leaf = ReaderUtil.SubIndex(doc, leaves);
                AtomicReaderContext subContext = leaves[leaf];
                AtomicReader        r          = subContext.AtomicReader;

                Debug.Assert(leaf >= lastLeaf); // increasing order

                // if the segment has changed, we must initialize new enums.
                if (leaf != lastLeaf)
                {
                    Terms t = r.GetTerms(field);
                    if (t != null)
                    {
                        termsEnum = t.GetIterator(null);
                        postings  = new DocsAndPositionsEnum[terms.Length];
                    }
                }
                if (termsEnum == null)
                {
                    continue; // no terms for this field, nothing to do
                }

                // if there are multi-term matches, we have to initialize the "fake" enum for each document
                if (automata.Length > 0)
                {
                    DocsAndPositionsEnum dp = MultiTermHighlighting.GetDocsEnum(analyzer.GetTokenStream(field, content), automata);
                    dp.Advance(doc - subContext.DocBase);
                    postings[terms.Length - 1] = dp; // last term is the multiterm matcher
                }

                Passage[] passages = HighlightDoc(field, terms, content.Length, bi, doc - subContext.DocBase, termsEnum, postings, maxPassages);

                if (passages.Length == 0)
                {
                    // no passages were returned, so ask for a default summary
                    passages = GetEmptyHighlight(field, bi, maxPassages);
                }

                if (passages.Length > 0)
                {
                    highlights[doc] = fieldFormatter.Format(passages, content);
                }

                lastLeaf = leaf;
            }

            return highlights;
        }
Пример #27
0
 /// <summary>
 /// Convenience constructor: uses <c>DEFAULT_MAX_TOKEN_LENGTH</c> as the
 /// maximum token length.
 /// </summary>
 public MockTokenizer(TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase)
     : this(input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH)
 {
 }
Пример #28
0
 /// <summary>
 /// Creates a MockTokenizer bound to the given attribute factory and reader.
 /// </summary>
 /// <param name="factory"> attribute factory used to create token attributes </param>
 /// <param name="input"> reader supplying the characters to tokenize </param>
 /// <param name="runAutomaton"> DFA describing which character runs form tokens </param>
 /// <param name="lowerCase"> true to lowercase emitted terms </param>
 /// <param name="maxTokenLength"> maximum length of a single emitted token </param>
 public MockTokenizer(AttributeFactory factory, TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength)
     : base(factory, input)
 {
     this.RunAutomaton = runAutomaton;
     this.LowerCase = lowerCase;
     // start in the automaton's initial state, awaiting a reader
     this.state = runAutomaton.InitialState;
     this.StreamState = State.SETREADER;
     this.MaxTokenLength = maxTokenLength;
     TermAtt = AddAttribute<ICharTermAttribute>();
     OffsetAtt = AddAttribute<IOffsetAttribute>();
 }
Пример #29
0
 /// <summary>
 /// The "[A-Z][a-z]*" tokenizer splits on uppercase boundaries and drops any
 /// leading lowercase run that cannot start a match.
 /// </summary>
 public virtual void TestUppercase()
 {
     CharacterRunAutomaton single = new CharacterRunAutomaton((new RegExp("[A-Z][a-z]*")).ToAutomaton());
     Analyzer a = new MockAnalyzer(Random(), single, false);
     AssertAnalyzesTo(a, "FooBarBAZ", new string[] { "Foo", "Bar", "B", "A", "Z" }, new int[] { 0, 3, 6, 7, 8 }, new int[] { 3, 6, 7, 8, 9 });
     AssertAnalyzesTo(a, "aFooBar", new string[] { "Foo", "Bar" }, new int[] { 1, 4 }, new int[] { 4, 7 });
     CheckRandomData(Random(), a, 100);
 }
Пример #30
0
 /// <summary>
 /// Convenience constructor: uses the default attribute factory.
 /// </summary>
 public MockTokenizer(TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength)
     : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, runAutomaton, lowerCase, maxTokenLength)
 {
 }
Пример #31
0
 /// <summary>
 /// Compiles the outer instance's automaton into a run automaton.
 /// NOTE(review): presumably called after OuterInstance is assigned — a pattern
 /// the Java-to-C# conversion uses for initializers that read the outer
 /// instance; confirm against the constructor.
 /// </summary>
 private void InitializeInstanceFields()
 {
     RunAutomaton = new CharacterRunAutomaton(OuterInstance.Automaton);
 }
Пример #32
0
 /// <summary>
 /// Creates a new MockAnalyzer, equivalent to calling
 /// <c>MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET)</c>
 /// (i.e. no stopword filtering).
 /// </summary>
 public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, bool lowerCase)
     : this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET)
 {
 }
Пример #33
0
        /// <summary>
        /// Parses a specific affix rule putting the result into the provided affix map
        /// </summary>
        /// <param name="affixes"> Map where the result of the parsing will be put </param>
        /// <param name="header"> Header line of the affix rule </param>
        /// <param name="reader"> BufferedReader to read the content of the rule from </param>
        /// <param name="conditionPattern"> <see cref="string.Format(System.IFormatProvider, string, object[])"/> pattern to be used to generate the condition regex
        ///                         pattern </param>
        /// <param name="seenPatterns"> map from condition -> index of patterns, for deduplication. </param>
        /// <param name="seenStrips"> map from strip string -> ordinal, for deduplication. </param>
        /// <exception cref="IOException"> Can be thrown while reading the rule </exception>
        private void ParseAffix(SortedDictionary <string, IList <char?> > affixes, string header, TextReader reader, string conditionPattern, IDictionary <string, int?> seenPatterns, IDictionary <string, int?> seenStrips)
        {
            BytesRef      scratch = new BytesRef();
            StringBuilder sb      = new StringBuilder();

            string[] args = whitespacePattern.Split(header);

            bool crossProduct = args[2].Equals("Y");

            int numLines = int.Parse(args[3], CultureInfo.InvariantCulture);

            // each affix entry is encoded as four shorts (8 bytes), hence << 3
            affixData = ArrayUtil.Grow(affixData, (currentAffix << 3) + (numLines << 3));
            ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

            for (int i = 0; i < numLines; i++)
            {
                Debug.Assert(affixWriter.Position == currentAffix << 3);
                string   line     = reader.ReadLine();
                string[] ruleArgs = whitespacePattern.Split(line);

                // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
                // condition is optional
                if (ruleArgs.Length < 4)
                {
                    throw new Exception("The affix file contains a rule with less than four elements: " + line /*, reader.LineNumber */);// LUCENENET TODO: LineNumberReader
                }

                char   flag        = flagParsingStrategy.ParseFlag(ruleArgs[1]);
                string strip       = ruleArgs[2].Equals("0") ? "" : ruleArgs[2];
                string affixArg    = ruleArgs[3];
                char[] appendFlags = null;

                // an affix of the form "arg/flags" carries continuation flags after '/'
                int flagSep = affixArg.LastIndexOf('/');
                if (flagSep != -1)
                {
                    string flagPart = affixArg.Substring(flagSep + 1);
                    affixArg = affixArg.Substring(0, flagSep);

                    if (aliasCount > 0)
                    {
                        flagPart = GetAliasValue(int.Parse(flagPart, CultureInfo.InvariantCulture));
                    }

                    appendFlags = flagParsingStrategy.ParseFlags(flagPart);
                    Array.Sort(appendFlags);
                    twoStageAffix = true;
                }

                // TODO: add test and fix zero-affix handling!

                string condition = ruleArgs.Length > 4 ? ruleArgs[4] : ".";
                // at least the gascon affix file has this issue
                if (condition.StartsWith("[", StringComparison.Ordinal) && !condition.EndsWith("]", StringComparison.Ordinal))
                {
                    condition = condition + "]";
                }
                // "dash hasn't got special meaning" (we must escape it)
                if (condition.IndexOf('-') >= 0)
                {
                    condition = condition.Replace("-", "\\-");
                }

                string regex;
                if (".".Equals(condition))
                {
                    regex = ".*"; // Zero condition is indicated by dot
                }
                else if (condition.Equals(strip))
                {
                    regex = ".*"; // TODO: optimize this better:
                                  // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                                  // but this is complicated...
                }
                else
                {
                    regex = string.Format(CultureInfo.InvariantCulture, conditionPattern, condition);
                }

                // deduplicate patterns (TryGetValue avoids the ContainsKey + indexer double lookup)
                int? patternIndex;
                seenPatterns.TryGetValue(regex, out patternIndex);
                if (patternIndex == null)
                {
                    patternIndex = patterns.Count;
                    if (patternIndex > short.MaxValue)
                    {
                        throw new System.NotSupportedException("Too many patterns, please report this to [email protected]");
                    }
                    seenPatterns[regex] = patternIndex;
                    CharacterRunAutomaton pattern = new CharacterRunAutomaton((new RegExp(regex, RegExp.NONE)).ToAutomaton());
                    patterns.Add(pattern);
                }

                // deduplicate strip strings into ordinals
                int? stripOrd;
                seenStrips.TryGetValue(strip, out stripOrd);
                if (stripOrd == null)
                {
                    stripOrd          = seenStrips.Count;
                    seenStrips[strip] = stripOrd;
                    if (stripOrd > char.MaxValue)
                    {
                        throw new System.NotSupportedException("Too many unique strips, please report this to [email protected]");
                    }
                }

                if (appendFlags == null)
                {
                    appendFlags = NOFLAGS;
                }

                EncodeFlags(scratch, appendFlags);
                int appendFlagsOrd = flagLookup.Add(scratch);
                if (appendFlagsOrd < 0)
                {
                    // already exists in our hash
                    appendFlagsOrd = (-appendFlagsOrd) - 1;
                }
                else if (appendFlagsOrd > short.MaxValue)
                {
                    // this limit is probably flexible, but its a good sanity check too
                    throw new System.NotSupportedException("Too many unique append flags, please report this to [email protected]");
                }

                affixWriter.WriteShort((short)flag);
                affixWriter.WriteShort((short)stripOrd);
                // encode crossProduct into patternIndex
                int patternOrd = (int)patternIndex << 1 | (crossProduct ? 1 : 0);
                affixWriter.WriteShort((short)patternOrd);
                affixWriter.WriteShort((short)appendFlagsOrd);

                if (needsInputCleaning)
                {
                    // CleanInput already returns a string; no extra ToString needed
                    affixArg = CleanInput(affixArg, sb);
                }

                IList <char?> list;
                if (!affixes.TryGetValue(affixArg, out list))
                {
                    list = new List <char?>();
                    affixes[affixArg] = list;
                }

                list.Add((char)currentAffix);
                currentAffix++;
            }
        }
Пример #34
0
 /// <summary>
 /// Convenience constructor: delegates to the full constructor, supplying
 /// <c>DEFAULT_MAX_TOKEN_LENGTH</c> as the maximum token length.
 /// </summary>
 /// <param name="factory">attribute factory used to create token attributes</param>
 /// <param name="input">reader supplying the text to tokenize</param>
 /// <param name="runAutomaton">automaton defining which character runs form tokens</param>
 /// <param name="lowerCase">lowercasing flag, forwarded unchanged to the full constructor</param>
 public MockTokenizer(AttributeFactory factory, TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase)
     : this(factory, input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH)
 {
 }
Пример #35
0
 /// <summary>
 /// Tokens matching the ".{5,}" automaton (five or more characters) are removed
 /// by the analyzer; the assertion shows only "ok" and "fine" survive, with the
 /// dropped tokens reflected in the position increments.
 /// </summary>
 public virtual void TestLength()
 {
     var fiveOrMore = new CharacterRunAutomaton(new RegExp(".{5,}").ToAutomaton());
     Analyzer analyzer = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, true, fiveOrMore);

     // "toolong" and "notfine" are filtered; "fine" therefore has increment 2.
     AssertAnalyzesTo(analyzer, "ok toolong fine notfine", new string[] { "ok", "fine" }, new int[] { 1, 2 });
 }
Пример #36
0
 /// <summary>
 /// A tokenizer built from the "..." automaton splits input into consecutive
 /// three-character tokens; a trailing partial chunk yields no token, but the
 /// stream's final offset still covers the full consumed input.
 /// </summary>
 public virtual void TestThreeChars()
 {
     var threeChars = new CharacterRunAutomaton(new RegExp("...").ToAutomaton());
     Analyzer analyzer = new MockAnalyzer(Random(), threeChars, false);

     AssertAnalyzesTo(analyzer, "foobar", new string[] { "foo", "bar" }, new int[] { 0, 3 }, new int[] { 3, 6 });

     // make sure when last term is a "partial" match that End() is correct
     AssertTokenStreamContents(analyzer.TokenStream("bogus", new StringReader("fooba")), new string[] { "foo" }, new int[] { 0 }, new int[] { 3 }, new int[] { 1 }, new int?(5));

     CheckRandomData(Random(), analyzer, 100);
 }
Пример #37
0
 /// <summary>
 /// Uses the complement of {"foo","bar"} as the filter automaton, so every
 /// token except "foo" and "bar" is removed from the stream.
 /// </summary>
 public virtual void TestKeep()
 {
     // Complementing the union of the two words gives an automaton matching
     // everything BUT "foo"/"bar"; matching tokens are filtered out.
     Automaton fooOrBar = Automaton.Union(Arrays.AsList(BasicAutomata.MakeString("foo"), BasicAutomata.MakeString("bar")));
     var keepWords = new CharacterRunAutomaton(BasicOperations.Complement(fooOrBar));
     Analyzer analyzer = new MockAnalyzer(Random(), MockTokenizer.SIMPLE, true, keepWords);

     AssertAnalyzesTo(analyzer, "quick foo brown bar bar fox foo", new string[] { "foo", "bar", "bar", "foo" }, new int[] { 2, 2, 1, 2 });
 }
Пример #38
0
 /// <summary>
 /// A tokenizer built from the "." automaton emits each character of the input
 /// as its own single-character token.
 /// </summary>
 public virtual void TestSingleChar()
 {
     var oneChar = new CharacterRunAutomaton(new RegExp(".").ToAutomaton());
     Analyzer analyzer = new MockAnalyzer(Random(), oneChar, false);

     AssertAnalyzesTo(analyzer, "foobar", new[] { "f", "o", "o", "b", "a", "r" }, new[] { 0, 1, 2, 3, 4, 5 }, new[] { 1, 2, 3, 4, 5, 6 });
     CheckRandomData(Random(), analyzer, 100);
 }
Пример #39
0
        /// <summary>
        /// A stop word removed by the analyzer must leave a position gap: the phrase
        /// query built from "1 stop 2" places "2" two positions after "1".
        /// </summary>
        public virtual void TestPhraseQueryPositionIncrements()
        {
            // Expected: term "2" added with position increment 2, accounting
            // for the removed "stop" token between "1" and "2".
            PhraseQuery expected = new PhraseQuery();
            expected.Add(new Term("field", "1"));
            expected.Add(new Term("field", "2"), 2);

            // Automaton matching the word "stop" in any letter casing.
            var stopList = new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").ToAutomaton());
            Analyzer analyzer = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false, stopList);

            QueryBuilder builder = new QueryBuilder(analyzer);
            Assert.AreEqual(expected, builder.CreatePhraseQuery("field", "1 stop 2"));
        }
Пример #40
0
 /// <summary>
 /// Captures the enclosing test instance plus the analyzer configuration
 /// (the run automaton, the lowercase flag, and the limit value) for use by
 /// this anonymous-class helper. All parameters are stored verbatim.
 /// </summary>
 public AnalyzerAnonymousInnerClassHelper2(TestMockAnalyzer outerInstance, CharacterRunAutomaton dfa, bool lowercase, int limit)
 {
     this.OuterInstance = outerInstance;
     this.Dfa = dfa;
     this.Lowercase = lowercase;
     this.Limit = limit;
 }
Пример #41
0
 /// <summary>
 /// Convenience constructor (StreamReader overload): delegates to the full
 /// constructor, supplying <c>DEFAULT_MAX_TOKEN_LENGTH</c> as the maximum
 /// token length.
 /// </summary>
 public MockTokenizer(AttributeFactory factory, StreamReader input, CharacterRunAutomaton runAutomaton, bool lowerCase)
     : this(factory, input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH)
 {
 }
Пример #42
0
 /// <summary>
 /// Fuzz test: for a number of rounds, builds an analyzer from a randomly
 /// generated automaton with a random lowercase flag and a random limit in
 /// [0, 500], runs it over random data, then disposes it.
 /// </summary>
 public virtual void TestRandomRegexps()
 {
     int rounds = AtLeast(30);
     for (int round = 0; round < rounds; round++)
     {
         var randomDfa = new CharacterRunAutomaton(AutomatonTestUtil.RandomAutomaton(Random()));
         bool toLower = Random().NextBoolean();
         int tokenLimit = TestUtil.NextInt(Random(), 0, 500);

         Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper2(this, randomDfa, toLower, tokenLimit);
         CheckRandomData(Random(), analyzer, 100);
         analyzer.Dispose();
     }
 }
Пример #43
0
 /// <summary>
 /// Convenience constructor: delegates to the overload taking a maximum token
 /// length, supplying <c>DEFAULT_MAX_TOKEN_LENGTH</c>.
 /// </summary>
 public MockTokenizer(TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase)
     : this(input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH)
 {
 }
Пример #44
0
 /// <summary>
 /// Delegates to the full constructor, supplying the default attribute factory
 /// (<c>AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY</c>); all other arguments
 /// are forwarded unchanged.
 /// </summary>
 public MockTokenizer(TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength)
     : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, runAutomaton, lowerCase, maxTokenLength)
 {
 }
Пример #45
0
 /// <summary>
 /// Deferred field initialization: wraps the outer instance's Automaton in a
 /// CharacterRunAutomaton. NOTE(review): this pattern is typically emitted by
 /// the Java-to-C# conversion to preserve Java field-initializer order — confirm.
 /// </summary>
 internal virtual void InitializeInstanceFields()
 {
     RunAutomaton = new CharacterRunAutomaton(OuterInstance.Automaton);
 }
Пример #46
0
 /// <summary>
 /// Deferred field initialization: builds RunAutomaton from the outer
 /// instance's Automaton. NOTE(review): presumably generated to emulate Java
 /// initialization order during porting — confirm against the converter output.
 /// </summary>
 internal virtual void InitializeInstanceFields()
 {
     RunAutomaton = new CharacterRunAutomaton(OuterInstance.Automaton);
 }
Пример #47
0
 /// <summary>
 /// Deferred field initialization (private variant): assigns RunAutomaton from
 /// the outer instance's Automaton. NOTE(review): likely converter-generated —
 /// confirm before refactoring into a constructor.
 /// </summary>
 private void InitializeInstanceFields()
 {
     RunAutomaton = new CharacterRunAutomaton(OuterInstance.Automaton);
 }
Пример #48
0
 /// <summary>
 /// Creates a new <c>MockAnalyzer</c>. Equivalent to calling
 /// <c>MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET)</c>,
 /// i.e. the delegated constructor is invoked with an empty stop set.
 /// </summary>
 public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, bool lowerCase)
     : this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET)
 {
 }