public AnalyzerAnonymousInnerClassHelper2(TestMockAnalyzer outerInstance, CharacterRunAutomaton dfa, bool lowercase, int limit) { this.OuterInstance = outerInstance; this.Dfa = dfa; this.Lowercase = lowercase; this.Limit = limit; }
public virtual void TestLength() { CharacterRunAutomaton length5 = new CharacterRunAutomaton((new RegExp(".{5,}")).ToAutomaton()); Analyzer a = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, true, length5); AssertAnalyzesTo(a, "ok toolong fine notfine", new string[] { "ok", "fine" }, new int[] { 1, 2 }); }
public virtual void TestKeep() { CharacterRunAutomaton keepWords = new CharacterRunAutomaton(BasicOperations.Complement(Automaton.Union(new Automaton[] { BasicAutomata.MakeString("foo"), BasicAutomata.MakeString("bar") }))); Analyzer a = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true, keepWords); AssertAnalyzesTo(a, "quick foo brown bar bar fox foo", new string[] { "foo", "bar", "bar", "foo" }, new int[] { 2, 2, 1, 2 }); }
/// <summary> /// checks condition of the concatenation of two strings </summary> // note: this is pretty stupid, we really should subtract strip from the condition up front and just check the stem // but this is a little bit more complicated. private bool CheckCondition(int condition, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len) { if (condition != 0) { CharacterRunAutomaton pattern = dictionary.patterns[condition]; int state = pattern.InitialState; for (int i = c1off; i < c1off + c1len; i++) { state = pattern.Step(state, c1[i]); if (state == -1) { return false; } } for (int i = c2off; i < c2off + c2len; i++) { state = pattern.Step(state, c2[i]); if (state == -1) { return false; } } return pattern.IsAccept(state); } return true; }
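A minimal sketch of the same DFA-stepping pattern applied to a single string, using only the CharacterRunAutomaton members seen above (InitialState, Step, IsAccept); the Matches helper and the example regex are illustrative, not part of the source:
// Step the DFA one character at a time and reject as soon as no transition exists.
static bool Matches(CharacterRunAutomaton pattern, string text) { int state = pattern.InitialState; for (int i = 0; i < text.Length; i++) { state = pattern.Step(state, text[i]); if (state == -1) { return false; } } return pattern.IsAccept(state); }
// e.g. Matches(new CharacterRunAutomaton(new RegExp("ab+").ToAutomaton()), "abbb") is expected to accept, while "abc" is not.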
/// <summary> /// Create a new MockTokenFilter. /// </summary> /// <param name="input"> TokenStream to filter </param> /// <param name="filter"> DFA representing the terms that should be removed. </param> public MockTokenFilter(TokenStream input, CharacterRunAutomaton filter) : base(input) { this.Filter = filter; TermAtt = AddAttribute<ICharTermAttribute>(); PosIncrAtt = AddAttribute<IPositionIncrementAttribute>(); }
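A hedged usage sketch for this constructor; the MockTokenizer overload, MockTokenizer.WHITESPACE, and the BasicOperations/BasicAutomata calls are copied from the other snippets here, while the sample text and expected output are illustrative:
// Build a DFA matching the terms to remove ("foo" or "bar"), tokenize on whitespace, then filter.
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicOperations.Union(BasicAutomata.MakeString("foo"), BasicAutomata.MakeString("bar")));
Tokenizer tokenizer = new MockTokenizer(new StringReader("foo quick bar brown"), MockTokenizer.WHITESPACE, false);
TokenStream stream = new MockTokenFilter(tokenizer, stopSet); // yields "quick" and "brown", each with a position increment of 2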
internal SimpleAutomatonTermsEnum(TestRegexpRandom2.DumbRegexpQuery outerInstance, TermsEnum tenum) : base(tenum) { this.outerInstance = outerInstance; runAutomaton = new CharacterRunAutomaton(outerInstance.automaton); SetInitialSeekTerm(new BytesRef("")); }
public virtual void TestSingleChar() { var single = new CharacterRunAutomaton((new RegExp(".")).ToAutomaton()); Analyzer a = new MockAnalyzer(Random, single, false); AssertAnalyzesTo(a, "foobar", new[] { "f", "o", "o", "b", "a", "r" }, new[] { 0, 1, 2, 3, 4, 5 }, new[] { 1, 2, 3, 4, 5, 6 }); CheckRandomData(Random, a, 100); }
/// <summary> /// Creates a new MockAnalyzer. /// </summary> /// <param name="random"> Random for payloads behavior </param> /// <param name="runAutomaton"> DFA describing how tokenization should happen (e.g. [a-zA-Z]+) </param> /// <param name="lowerCase"> true if the tokenizer should lowercase terms </param> /// <param name="filter"> DFA describing how terms should be filtered (set of stopwords, etc) </param> public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, bool lowerCase, CharacterRunAutomaton filter) : base(PER_FIELD_REUSE_STRATEGY) { // TODO: this should be solved in a different way; Random should not be shared (!). this.Random = new Random(random.Next()); this.RunAutomaton = runAutomaton; this.LowerCase = lowerCase; this.Filter = filter; }
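A hedged sketch of this constructor inside a test; AssertAnalyzesTo, the Random property, and MockTokenizer.WHITESPACE come from the Lucene.NET test framework exactly as used in the surrounding snippets, while the sample text and expected values are illustrative:
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.MakeString("the"));
Analyzer a = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, true, stopSet);
// "the" is filtered out, so "lazy" reports a position increment of 2.
AssertAnalyzesTo(a, "over the lazy dog", new string[] { "over", "lazy", "dog" }, new int[] { 1, 2, 1 });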
public virtual void TestUppercase() { CharacterRunAutomaton single = new CharacterRunAutomaton((new RegExp("[A-Z][a-z]*")).ToAutomaton()); Analyzer a = new MockAnalyzer(Random, single, false); AssertAnalyzesTo(a, "FooBarBAZ", new string[] { "Foo", "Bar", "B", "A", "Z" }, new int[] { 0, 3, 6, 7, 8 }, new int[] { 3, 6, 7, 8, 9 }); AssertAnalyzesTo(a, "aFooBar", new string[] { "Foo", "Bar" }, new int[] { 1, 4 }, new int[] { 4, 7 }); CheckRandomData(Random, a, 100); }
public virtual void TestThreeChars() { CharacterRunAutomaton single = new CharacterRunAutomaton((new RegExp("...")).ToAutomaton()); Analyzer a = new MockAnalyzer(Random, single, false); AssertAnalyzesTo(a, "foobar", new string[] { "foo", "bar" }, new int[] { 0, 3 }, new int[] { 3, 6 }); // make sure when last term is a "partial" match that End() is correct AssertTokenStreamContents(a.GetTokenStream("bogus", new StringReader("fooba")), new string[] { "foo" }, new int[] { 0 }, new int[] { 3 }, new int[] { 1 }, new int?(5)); CheckRandomData(Random, a, 100); }
public virtual void TestPhraseQueryPositionIncrements() { PhraseQuery expected = new PhraseQuery(); expected.Add(new Term("field", "1")); expected.Add(new Term("field", "2"), 2); CharacterRunAutomaton stopList = new CharacterRunAutomaton((new RegExp("[sS][tT][oO][pP]")).ToAutomaton()); Analyzer analyzer = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false, stopList); QueryBuilder builder = new QueryBuilder(analyzer); Assert.AreEqual(expected, builder.CreatePhraseQuery("field", "1 stop 2")); }
public override void BeforeClass() { base.BeforeClass(); Random random = Random; m_directory = NewDirectory(); m_stopword = "" + GetRandomChar(); CharacterRunAutomaton stopset = new CharacterRunAutomaton(BasicAutomata.MakeString(m_stopword)); m_analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset); RandomIndexWriter iw = new RandomIndexWriter(random, m_directory, m_analyzer); Document doc = new Document(); Field id = new StringField("id", "", Field.Store.NO); Field field = new TextField("field", "", Field.Store.NO); doc.Add(id); doc.Add(field); // index some docs int numDocs = AtLeast(1000); for (int i = 0; i < numDocs; i++) { id.SetStringValue(Convert.ToString(i, CultureInfo.InvariantCulture)); field.SetStringValue(RandomFieldContents()); iw.AddDocument(doc); } // delete some docs int numDeletes = numDocs / 20; for (int i = 0; i < numDeletes; i++) { Term toDelete = new Term("id", Convert.ToString(random.Next(numDocs), CultureInfo.InvariantCulture)); if (random.NextBoolean()) { iw.DeleteDocuments(toDelete); } else { iw.DeleteDocuments(new TermQuery(toDelete)); } } m_reader = iw.GetReader(); m_s1 = NewSearcher(m_reader); m_s2 = NewSearcher(m_reader); iw.Dispose(); }
public virtual void TestRandomRegexps() { int iters = AtLeast(30); for (int i = 0; i < iters; i++) { CharacterRunAutomaton dfa = new CharacterRunAutomaton(AutomatonTestUtil.RandomAutomaton(Random())); bool lowercase = Random().NextBoolean(); int limit = TestUtil.NextInt(Random(), 0, 500); Analyzer a = new AnalyzerAnonymousInnerClassHelper2(this, dfa, lowercase, limit); CheckRandomData(Random(), a, 100); a.Dispose(); } }
public void TestKeep() { CharacterRunAutomaton keepWords = new CharacterRunAutomaton( BasicOperations.Complement( BasicOperations.Union( BasicAutomata.MakeString("foo"), BasicAutomata.MakeString("bar")) /*, Operations.DEFAULT_MAX_DETERMINIZED_STATES*/)); Analyzer a = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true, keepWords); AssertAnalyzesTo(a, "quick foo brown bar bar fox foo", new String[] { "foo", "bar", "bar", "foo" }, new int[] { 2, 2, 1, 2 }); }
public static void BeforeClass() { Random random = Random(); Directory = NewDirectory(); Stopword = "" + RandomChar(); CharacterRunAutomaton stopset = new CharacterRunAutomaton(BasicAutomata.MakeString(Stopword)); Analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset); RandomIndexWriter iw = new RandomIndexWriter(random, Directory, Analyzer); Document doc = new Document(); Field id = new StringField("id", "", Field.Store.NO); Field field = new TextField("field", "", Field.Store.NO); doc.Add(id); doc.Add(field); // index some docs int numDocs = AtLeast(1000); for (int i = 0; i < numDocs; i++) { id.StringValue = Convert.ToString(i); field.StringValue = RandomFieldContents(); iw.AddDocument(doc); } // delete some docs int numDeletes = numDocs / 20; for (int i = 0; i < numDeletes; i++) { Term toDelete = new Term("id", Convert.ToString(random.Next(numDocs))); if (random.NextBoolean()) { iw.DeleteDocuments(toDelete); } else { iw.DeleteDocuments(new TermQuery(toDelete)); } } Reader = iw.Reader; S1 = NewSearcher(Reader); S2 = NewSearcher(Reader); iw.Dispose(); }
public void TestRandomRegexps() { int iters = TEST_NIGHTLY ? AtLeast(30) : AtLeast(1); for (int i = 0; i < iters; i++) { CharacterRunAutomaton dfa = new CharacterRunAutomaton(AutomatonTestUtil.RandomAutomaton(Random) /*, int.MaxValue*/); bool lowercase = Random.NextBoolean(); int limit = TestUtil.NextInt32(Random, 0, 500); Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => { Tokenizer t = new MockTokenizer(reader, dfa, lowercase, limit); return new TokenStreamComponents(t, t); }); CheckRandomData(Random, a, 100); a.Dispose(); } }
public void BeforeClass() { Random random = Random(); Directory = NewDirectory(); Stopword = "" + RandomChar(); CharacterRunAutomaton stopset = new CharacterRunAutomaton(BasicAutomata.MakeString(Stopword)); Analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset); RandomIndexWriter iw = new RandomIndexWriter(random, Directory, Analyzer, ClassEnvRule.Similarity, ClassEnvRule.TimeZone); Document doc = new Document(); Field id = new StringField("id", "", Field.Store.NO); Field field = new TextField("field", "", Field.Store.NO); doc.Add(id); doc.Add(field); // index some docs int numDocs = AtLeast(1000); for (int i = 0; i < numDocs; i++) { id.StringValue = Convert.ToString(i); field.StringValue = RandomFieldContents(); iw.AddDocument(doc); } // delete some docs int numDeletes = numDocs / 20; for (int i = 0; i < numDeletes; i++) { Term toDelete = new Term("id", Convert.ToString(random.Next(numDocs))); if (random.NextBoolean()) { iw.DeleteDocuments(toDelete); } else { iw.DeleteDocuments(new TermQuery(toDelete)); } } Reader = iw.Reader; S1 = NewSearcher(Reader); S2 = NewSearcher(Reader); iw.Dispose(); }
public void TestRandomRegexps() { //int iters = TestNightly ? AtLeast(30) : AtLeast(1); // LUCENENET specific - reduced Nightly iterations from 30 to 15 // to keep it under the 1 hour free limit of Azure DevOps int iters = TestNightly ? AtLeast(15) : AtLeast(1); for (int i = 0; i < iters; i++) { CharacterRunAutomaton dfa = new CharacterRunAutomaton(AutomatonTestUtil.RandomAutomaton(Random) /*, int.MaxValue*/); bool lowercase = Random.NextBoolean(); int limit = TestUtil.NextInt32(Random, 0, 500); Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => { Tokenizer t = new MockTokenizer(reader, dfa, lowercase, limit); return(new TokenStreamComponents(t, t)); }); CheckRandomData(Random, a, 100); a.Dispose(); } }
public void TestTwoChars() { CharacterRunAutomaton single = new CharacterRunAutomaton(new RegExp("..").ToAutomaton()); Analyzer a = new MockAnalyzer(Random, single, false); AssertAnalyzesTo(a, "foobar", new String[] { "fo", "ob", "ar" }, new int[] { 0, 2, 4 }, new int[] { 2, 4, 6 } ); // make sure when last term is a "partial" match that end() is correct AssertTokenStreamContents(a.GetTokenStream("bogus", "fooba"), new String[] { "fo", "ob" }, new int[] { 0, 2 }, new int[] { 2, 4 }, new int[] { 1, 1 }, 5 ); CheckRandomData(Random, a, 100); }
public virtual void TestStartPositions() { Directory dir = NewDirectory(); // mimic StopAnalyzer CharacterRunAutomaton stopSet = new CharacterRunAutomaton((new RegExp("the|a|of")).ToAutomaton()); Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true, stopSet); RandomIndexWriter writer = new RandomIndexWriter( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif Random, dir, analyzer); Document doc = new Document(); doc.Add(NewTextField("field", "the quick brown fox", Field.Store.NO)); writer.AddDocument(doc); Document doc2 = new Document(); doc2.Add(NewTextField("field", "quick brown fox", Field.Store.NO)); writer.AddDocument(doc2); IndexReader reader = writer.GetReader(); IndexSearcher searcher = NewSearcher(reader); // user queries on "starts-with quick" SpanQuery sfq = new SpanFirstQuery(new SpanTermQuery(new Term("field", "quick")), 1); Assert.AreEqual(1, searcher.Search(sfq, 10).TotalHits); // user queries on "starts-with the quick" SpanQuery include = new SpanFirstQuery(new SpanTermQuery(new Term("field", "quick")), 2); sfq = new SpanNotQuery(include, sfq); Assert.AreEqual(1, searcher.Search(sfq, 10).TotalHits); writer.Dispose(); reader.Dispose(); dir.Dispose(); }
public override void SetUp() { base.SetUp(); // build an automaton matching this jvm's letter definition State initial = new State(); State accept = new State(); accept.Accept = true; for (int i = 0; i <= 0x10FFFF; i++) { if (Character.IsLetter(i)) { initial.AddTransition(new Transition(i, i, accept)); } } Automaton single = new Automaton(initial); single.Reduce(); Automaton repeat = BasicOperations.Repeat(single); jvmLetter = new CharacterRunAutomaton(repeat); }
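A hedged usage note for the automaton built above; Run(string) is assumed here to be the whole-string matching entry point inherited from RunAutomaton (if that overload is unavailable, the char-by-char Step loop used in the other snippets gives the same answer):
// Every character of "abc" is a letter, so the repeated-letter automaton accepts it; the trailing digit rejects the second string.
Assert.IsTrue(jvmLetter.Run("abc"));
Assert.IsFalse(jvmLetter.Run("abc1"));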
public void TestBoost() { CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.MakeString("on")); Analyzer oneStopAnalyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true, stopSet); PrecedenceQueryParser qp = new PrecedenceQueryParser(); qp.Analyzer = (oneStopAnalyzer); Query q = qp.Parse("on^1.0", "field"); assertNotNull(q); q = qp.Parse("\"hello\"^2.0", "field"); assertNotNull(q); assertEquals(q.Boost, (float)2.0, (float)0.5); q = qp.Parse("hello^2.0", "field"); assertNotNull(q); assertEquals(q.Boost, (float)2.0, (float)0.5); q = qp.Parse("\"on\"^1.0", "field"); assertNotNull(q); q = GetParser(new MockAnalyzer(Random, MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)).Parse("the^3", "field"); assertNotNull(q); }
private IDictionary <int, object> HighlightField(string field, string[] contents, BreakIterator bi, BytesRef[] terms, int[] docids, IList <AtomicReaderContext> leaves, int maxPassages, Query query) { IDictionary <int, object> highlights = new Dictionary <int, object>(); PassageFormatter fieldFormatter = GetFormatter(field); if (fieldFormatter == null) { throw new NullReferenceException("PassageFormatter cannot be null"); } // check if we should do any multiterm processing Analyzer analyzer = GetIndexAnalyzer(field); CharacterRunAutomaton[] automata = new CharacterRunAutomaton[0]; if (analyzer != null) { automata = MultiTermHighlighting.ExtractAutomata(query, field); } // resize 'terms', where the last term is the multiterm matcher if (automata.Length > 0) { BytesRef[] newTerms = new BytesRef[terms.Length + 1]; System.Array.Copy(terms, 0, newTerms, 0, terms.Length); terms = newTerms; } // we are processing in increasing docid order, so we only need to reinitialize stuff on segment changes // otherwise, we will just advance() existing enums to the new document in the same segment. DocsAndPositionsEnum[] postings = null; TermsEnum termsEnum = null; int lastLeaf = -1; for (int i = 0; i < docids.Length; i++) { string content = contents[i]; if (content.Length == 0) { continue; // nothing to do } bi.SetText(content); int doc = docids[i]; int leaf = ReaderUtil.SubIndex(doc, leaves); AtomicReaderContext subContext = leaves[leaf]; AtomicReader r = subContext.AtomicReader; Debug.Assert(leaf >= lastLeaf); // increasing order // if the segment has changed, we must initialize new enums. if (leaf != lastLeaf) { Terms t = r.GetTerms(field); if (t != null) { termsEnum = t.GetIterator(null); postings = new DocsAndPositionsEnum[terms.Length]; } } if (termsEnum == null) { continue; // no terms for this field, nothing to do } // if there are multi-term matches, we have to initialize the "fake" enum for each document if (automata.Length > 0) { DocsAndPositionsEnum dp = MultiTermHighlighting.GetDocsEnum(analyzer.GetTokenStream(field, content), automata); dp.Advance(doc - subContext.DocBase); postings[terms.Length - 1] = dp; // last term is the multiterm matcher } Passage[] passages = HighlightDoc(field, terms, content.Length, bi, doc - subContext.DocBase, termsEnum, postings, maxPassages); if (passages.Length == 0) { // no passages were returned, so ask for a default summary passages = GetEmptyHighlight(field, bi, maxPassages); } if (passages.Length > 0) { highlights[doc] = fieldFormatter.Format(passages, content); } lastLeaf = leaf; } return(highlights); }
public MockTokenizer(TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase) : this(input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH) { }
public MockTokenizer(AttributeFactory factory, TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength) : base(factory, input) { this.RunAutomaton = runAutomaton; this.LowerCase = lowerCase; this.state = runAutomaton.InitialState; this.StreamState = State.SETREADER; this.MaxTokenLength = maxTokenLength; TermAtt = AddAttribute<ICharTermAttribute>(); OffsetAtt = AddAttribute<IOffsetAttribute>(); }
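A hedged sketch of consuming this tokenizer directly; the Reset/IncrementToken/End sequence is the standard Lucene.NET TokenStream contract, the GetAttribute call mirrors the attributes registered above, and the input text is illustrative:
using (MockTokenizer ts = new MockTokenizer(new StringReader("Foo BAR baz"), MockTokenizer.WHITESPACE, true))
{
    ICharTermAttribute term = ts.GetAttribute<ICharTermAttribute>();
    ts.Reset();
    while (ts.IncrementToken())
    {
        Console.WriteLine(term.ToString()); // prints "foo", then "bar", then "baz" because lowerCase is true
    }
    ts.End();
}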
public virtual void TestUppercase() { CharacterRunAutomaton single = new CharacterRunAutomaton((new RegExp("[A-Z][a-z]*")).ToAutomaton()); Analyzer a = new MockAnalyzer(Random(), single, false); AssertAnalyzesTo(a, "FooBarBAZ", new string[] { "Foo", "Bar", "B", "A", "Z" }, new int[] { 0, 3, 6, 7, 8 }, new int[] { 3, 6, 7, 8, 9 }); AssertAnalyzesTo(a, "aFooBar", new string[] { "Foo", "Bar" }, new int[] { 1, 4 }, new int[] { 4, 7 }); CheckRandomData(Random(), a, 100); }
public MockTokenizer(TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength) : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, runAutomaton, lowerCase, maxTokenLength) { }
private void InitializeInstanceFields() { RunAutomaton = new CharacterRunAutomaton(OuterInstance.Automaton); }
/// <summary> /// Calls <see cref="MockAnalyzer(Random, CharacterRunAutomaton, bool, CharacterRunAutomaton)"/> as /// MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET). /// </summary> public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, bool lowerCase) : this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET) { }
/// <summary> /// Parses a specific affix rule putting the result into the provided affix map /// </summary> /// <param name="affixes"> Map where the result of the parsing will be put </param> /// <param name="header"> Header line of the affix rule </param> /// <param name="reader"> BufferedReader to read the content of the rule from </param> /// <param name="conditionPattern"> <seealso cref="string.Format(string, object[])"/> pattern to be used to generate the condition regex /// pattern </param> /// <param name="seenPatterns"> map from condition -> index of patterns, for deduplication. </param> /// <exception cref="IOException"> Can be thrown while reading the rule </exception> private void ParseAffix(SortedDictionary<string, IList<char?>> affixes, string header, TextReader reader, string conditionPattern, IDictionary<string, int?> seenPatterns, IDictionary<string, int?> seenStrips) { BytesRef scratch = new BytesRef(); StringBuilder sb = new StringBuilder(); string[] args = whitespacePattern.Split(header); bool crossProduct = args[2].Equals("Y"); int numLines = int.Parse(args[3], CultureInfo.InvariantCulture); affixData = ArrayUtil.Grow(affixData, (currentAffix << 3) + (numLines << 3)); ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3); for (int i = 0; i < numLines; i++) { Debug.Assert(affixWriter.Position == currentAffix << 3); string line = reader.ReadLine(); string[] ruleArgs = whitespacePattern.Split(line); // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]] // condition is optional if (ruleArgs.Length < 4) { throw new Exception("The affix file contains a rule with less than four elements: " + line /*, reader.LineNumber */);// LUCENENET TODO: LineNumberReader } char flag = flagParsingStrategy.ParseFlag(ruleArgs[1]); string strip = ruleArgs[2].Equals("0") ? "" : ruleArgs[2]; string affixArg = ruleArgs[3]; char[] appendFlags = null; int flagSep = affixArg.LastIndexOf('/'); if (flagSep != -1) { string flagPart = affixArg.Substring(flagSep + 1); affixArg = affixArg.Substring(0, flagSep - 0); if (aliasCount > 0) { flagPart = GetAliasValue(int.Parse(flagPart, CultureInfo.InvariantCulture)); } appendFlags = flagParsingStrategy.ParseFlags(flagPart); Array.Sort(appendFlags); twoStageAffix = true; } // TODO: add test and fix zero-affix handling! string condition = ruleArgs.Length > 4 ? ruleArgs[4] : "."; // at least the gascon affix file has this issue if (condition.StartsWith("[", StringComparison.Ordinal) && !condition.EndsWith("]", StringComparison.Ordinal)) { condition = condition + "]"; } // "dash hasn't got special meaning" (we must escape it) if (condition.IndexOf('-') >= 0) { condition = condition.Replace("-", "\\-"); } string regex; if (".".Equals(condition)) { regex = ".*"; // Zero condition is indicated by dot } else if (condition.Equals(strip)) { regex = ".*"; // TODO: optimize this better: // if we remove 'strip' from condition, we don't have to append 'strip' to check it...! // but this is complicated... } else { regex = string.Format(CultureInfo.InvariantCulture, conditionPattern, condition); } // deduplicate patterns int? patternIndex = seenPatterns.ContainsKey(regex) ? seenPatterns[regex] : null; if (patternIndex == null) { patternIndex = patterns.Count; if (patternIndex > short.MaxValue) { throw new System.NotSupportedException("Too many patterns, please report this to [email protected]"); } seenPatterns[regex] = patternIndex; CharacterRunAutomaton pattern = new CharacterRunAutomaton((new RegExp(regex, RegExp.NONE)).ToAutomaton()); patterns.Add(pattern); } int? stripOrd = seenStrips.ContainsKey(strip) ? seenStrips[strip] : null; if (stripOrd == null) { stripOrd = seenStrips.Count; seenStrips[strip] = stripOrd; if (stripOrd > char.MaxValue) { throw new System.NotSupportedException("Too many unique strips, please report this to [email protected]"); } } if (appendFlags == null) { appendFlags = NOFLAGS; } EncodeFlags(scratch, appendFlags); int appendFlagsOrd = flagLookup.Add(scratch); if (appendFlagsOrd < 0) { // already exists in our hash appendFlagsOrd = (-appendFlagsOrd) - 1; } else if (appendFlagsOrd > short.MaxValue) { // this limit is probably flexible, but its a good sanity check too throw new System.NotSupportedException("Too many unique append flags, please report this to [email protected]"); } affixWriter.WriteShort((short)flag); affixWriter.WriteShort((short)stripOrd); // encode crossProduct into patternIndex int patternOrd = (int)patternIndex << 1 | (crossProduct ? 1 : 0); affixWriter.WriteShort((short)patternOrd); affixWriter.WriteShort((short)appendFlagsOrd); if (needsInputCleaning) { string cleaned = CleanInput(affixArg, sb); affixArg = cleaned.ToString(); } IList<char?> list = affixes.ContainsKey(affixArg) ? affixes[affixArg] : null; if (list == null) { list = new List<char?>(); affixes[affixArg] = list; } list.Add((char)currentAffix); currentAffix++; } }
public MockTokenizer(AttributeFactory factory, TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase) : this(factory, input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH) { }
public virtual void TestLength() { CharacterRunAutomaton length5 = new CharacterRunAutomaton((new RegExp(".{5,}")).ToAutomaton()); Analyzer a = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, true, length5); AssertAnalyzesTo(a, "ok toolong fine notfine", new string[] { "ok", "fine" }, new int[] { 1, 2 }); }
public virtual void TestThreeChars() { CharacterRunAutomaton single = new CharacterRunAutomaton((new RegExp("...")).ToAutomaton()); Analyzer a = new MockAnalyzer(Random(), single, false); AssertAnalyzesTo(a, "foobar", new string[] { "foo", "bar" }, new int[] { 0, 3 }, new int[] { 3, 6 }); // make sure when last term is a "partial" match that End() is correct AssertTokenStreamContents(a.TokenStream("bogus", new StringReader("fooba")), new string[] { "foo" }, new int[] { 0 }, new int[] { 3 }, new int[] { 1 }, new int?(5)); CheckRandomData(Random(), a, 100); }
public virtual void TestKeep() { CharacterRunAutomaton keepWords = new CharacterRunAutomaton(BasicOperations.Complement(Automaton.Union(Arrays.AsList(BasicAutomata.MakeString("foo"), BasicAutomata.MakeString("bar"))))); Analyzer a = new MockAnalyzer(Random(), MockTokenizer.SIMPLE, true, keepWords); AssertAnalyzesTo(a, "quick foo brown bar bar fox foo", new string[] { "foo", "bar", "bar", "foo" }, new int[] { 2, 2, 1, 2 }); }
public virtual void TestSingleChar() { var single = new CharacterRunAutomaton((new RegExp(".")).ToAutomaton()); Analyzer a = new MockAnalyzer(Random(), single, false); AssertAnalyzesTo(a, "foobar", new[] { "f", "o", "o", "b", "a", "r" }, new[] { 0, 1, 2, 3, 4, 5 }, new[] { 1, 2, 3, 4, 5, 6 }); CheckRandomData(Random(), a, 100); }
public MockTokenizer(AttributeFactory factory, StreamReader input, CharacterRunAutomaton runAutomaton, bool lowerCase) : this(factory, input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH) { }
internal virtual void InitializeInstanceFields() { RunAutomaton = new CharacterRunAutomaton(OuterInstance.Automaton); }