Example #1
        /// <param name="singleMatch"> the sequence of strings to match (an IList&lt;string&gt;) </param>
        /// <param name="replacement"> the list of tokens (IList&lt;Token&gt;) to use on a match </param>
        /// <param name="includeOrig">  sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens </param>
        /// <param name="mergeExisting"> merge the replacement tokens with any other mappings that exist </param>
        public virtual void Add(IList<string> singleMatch, IList<Token> replacement, bool includeOrig, bool mergeExisting)
        {
            var currMap = this;
            foreach (string str in singleMatch)
            {
                if (currMap.submap == null)
                {
                    // for now hardcode at 4.0, as it's what the old code did.
                    // would be nice to fix, but shouldn't store a version in each submap!!!
                    currMap.submap = new CharArrayMap<SlowSynonymMap>(Lucene.Net.Util.Version.LUCENE_CURRENT, 1, IgnoreCase());
                }

                var map = currMap.submap.Get(str);
                if (map == null)
                {
                    map = new SlowSynonymMap();
                    map.flags |= flags & IGNORE_CASE;
                    currMap.submap.Put(str, map);
                }

                currMap = map;
            }

            if (currMap.synonyms != null && !mergeExisting)
            {
                throw new System.ArgumentException("SynonymFilter: there is already a mapping for " + singleMatch);
            }
            IList<Token> superset = currMap.synonyms == null ? replacement : MergeTokens(currMap.synonyms, replacement);
            currMap.synonyms = superset.ToArray();
            if (includeOrig)
            {
                currMap.flags |= INCLUDE_ORIG;
            }
        }
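
A minimal usage sketch of Add (hypothetical values; the Token(text, startOffset, endOffset) constructor and the offsets are assumptions for illustration): mapping the two-word phrase "a b" walks/creates one nested submap level per source word and stores the replacement tokens at the leaf.

        // Sketch only: map the phrase "a b" to the single synonym token "ab".
        var map = new SlowSynonymMap(true);                // true: ignore case
        map.Add(
            new List<string> { "a", "b" },                 // walks/creates submap "a" -> submap "b"
            new List<Token> { new Token("ab", 0, 3) },     // replacement stored at the "b" leaf
            false,                                         // includeOrig: drop the matched tokens
            true);                                         // mergeExisting: merge with prior mappings
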
        public virtual void TestMultiWordSynonymsOld()
        {
            IList<string> rules = new List<string>();
            rules.Add("a b c,d");
            SlowSynonymMap synMap = new SlowSynonymMap(true);
            SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);

            SlowSynonymFilter ts = new SlowSynonymFilter(new MockTokenizer(new StringReader("a e"), MockTokenizer.WHITESPACE, false), synMap);
            // This fails because ["e","e"] is the value of the token stream
            AssertTokenStreamContents(ts, new string[] { "a", "e" });
        }
Example #3
	  private IEnumerator<AttributeSource> replacement; // iterator over generated tokens

	  public SlowSynonymFilter(TokenStream @in, SlowSynonymMap map) : base(@in)
	  {
		if (map == null)
		{
		  throw new System.ArgumentException("map is required", "map");
		}

		this.map = map;
		// just ensuring these attributes exist...
		AddAttribute<ICharTermAttribute>();
		AddAttribute<IPositionIncrementAttribute>();
		AddAttribute<IOffsetAttribute>();
		AddAttribute<ITypeAttribute>();
	  }
        public virtual void TestMatching()
        {
            SlowSynonymMap map = new SlowSynonymMap();

            bool orig = false;
            bool merge = true;
            map.Add(Strings("a b"), Tokens("ab"), orig, merge);
            map.Add(Strings("a c"), Tokens("ac"), orig, merge);
            map.Add(Strings("a"), Tokens("aa"), orig, merge);
            map.Add(Strings("b"), Tokens("bb"), orig, merge);
            map.Add(Strings("z x c v"), Tokens("zxcv"), orig, merge);
            map.Add(Strings("x c"), Tokens("xc"), orig, merge);

            AssertTokenizesTo(map, "$", new string[] { "$" });
            AssertTokenizesTo(map, "a", new string[] { "aa" });
            AssertTokenizesTo(map, "a $", new string[] { "aa", "$" });
            AssertTokenizesTo(map, "$ a", new string[] { "$", "aa" });
            AssertTokenizesTo(map, "a a", new string[] { "aa", "aa" });
            AssertTokenizesTo(map, "b", new string[] { "bb" });
            AssertTokenizesTo(map, "z x c v", new string[] { "zxcv" });
            AssertTokenizesTo(map, "z x c $", new string[] { "z", "xc", "$" });

            // repeats
            map.Add(Strings("a b"), Tokens("ab"), orig, merge);
            map.Add(Strings("a b"), Tokens("ab"), orig, merge);

            // FIXME: the test below was intended to produce { "ab" }
            AssertTokenizesTo(map, "a b", new string[] { "ab", "ab", "ab" });

            // check for lack of recursion
            map.Add(Strings("zoo"), Tokens("zoo"), orig, merge);
            AssertTokenizesTo(map, "zoo zoo $ zoo", new string[] { "zoo", "zoo", "$", "zoo" });
            map.Add(Strings("zoo"), Tokens("zoo zoo"), orig, merge);
            // FIXME: the test below was intended to produce { "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo" }
            // maybe this was just a typo in the old test?
            AssertTokenizesTo(map, "zoo zoo $ zoo", new string[] { "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" });
        }
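
The tests above (and below) rely on two helpers, Strings and Tokens, that are not shown in this excerpt. A plausible reconstruction, inferred from how the assertions encode tokens (text[,posInc[,startOffset,endOffset]] per entry, with "/" stacking tokens at the same position, as in "qq/ww/ee"):

        // Assumed helpers, reconstructed from usage in these tests; not verbatim source.
        internal static IList<string> Strings(string str)
        {
            return str.Split(' '); // "a b" -> ["a", "b"]
        }

        internal static IList<Token> Tokens(string str)
        {
            var result = new List<Token>();
            foreach (string group in str.Split(' '))
            {
                string[] stacked = group.Split('/'); // "/" = tokens at the same position
                for (int i = 0; i < stacked.Length; i++)
                {
                    // entry format: text[,posInc[,startOffset,endOffset]]
                    string[] p = stacked[i].Split(',');
                    var t = new Token(p[0],
                        p.Length > 3 ? int.Parse(p[2]) : 0,
                        p.Length > 3 ? int.Parse(p[3]) : 0)
                    {
                        PositionIncrement = i > 0 ? 0 : (p.Length > 1 ? int.Parse(p[1]) : 1)
                    };
                    result.Add(t);
                }
            }
            return result;
        }
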
Example #5
	  private SlowSynonymMap match(SlowSynonymMap map)
	  {
		SlowSynonymMap result = null;

		if (map.submap != null)
		{
		  AttributeSource tok = nextTok();
		  if (tok != null)
		  {
			// clone ourselves.
			if (tok == this)
			{
			  tok = CloneAttributes();
			}
			// check for positionIncrement!=1?  if>1, should not match, if==0, check multiple at this level?
			var termAtt = tok.GetAttribute<ICharTermAttribute>();
			SlowSynonymMap subMap = map.submap.Get(termAtt.Buffer, 0, termAtt.Length);

			if (subMap != null)
			{
			  // recurse
			  result = match(subMap);
			}

			if (result != null)
			{
			  matched.AddFirst(tok);
			}
			else
			{
			  // push back unmatched token
			  pushTok(tok);
			}
		  }
		}

		// no longer sequence matched, so if this node has synonyms, it's the match.
		if (result == null && map.synonyms != null)
		{
		  result = map;
		}

		return result;
	  }
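
match() drives a single-token pushback buffer through nextTok() and pushTok(). A sketch of how those helpers plausibly work, consistent with the calls above (the buffer field name and the TokenFilter input field, here `input`, are assumptions, not verbatim source):

	  private LinkedList<AttributeSource> buffer; // tokens pushed back by pushTok()

	  private AttributeSource nextTok()
	  {
		if (buffer != null && buffer.Count > 0)
		{
		  // consume a previously pushed-back token first
		  AttributeSource t = buffer.First.Value;
		  buffer.RemoveFirst();
		  return t;
		}
		// otherwise advance the wrapped stream; returning 'this' is why
		// match() clones the attribute state only when it has to
		return input.IncrementToken() ? (AttributeSource)this : null;
	  }

	  private void pushTok(AttributeSource t)
	  {
		if (buffer == null)
		{
		  buffer = new LinkedList<AttributeSource>();
		}
		buffer.AddFirst(t); // unread: the next nextTok() call returns it again
	  }
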
        internal static void ParseRules(IEnumerable<string> rules, SlowSynonymMap map, string mappingSep, string synSep, bool expansion, TokenizerFactory tokFactory)
        {
            int count = 0;
            foreach (string rule in rules)
            {
                // To use regexes, we need an expression that specifies an odd number of chars.
                // This can't really be done with string.split(), and since we need to
                // do unescaping at some point anyway, we wouldn't be saving any effort
                // by using regexes.

                IList<string> mapping = SplitSmart(rule, mappingSep, false);

                IList<IList<string>> source;
                IList<IList<string>> target;

                if (mapping.Count > 2)
                {
                    throw new System.ArgumentException("Invalid Synonym Rule: " + rule);
                }
                else if (mapping.Count == 2)
                {
                    source = GetSynList(mapping[0], synSep, tokFactory);
                    target = GetSynList(mapping[1], synSep, tokFactory);
                }
                else
                {
                    source = GetSynList(mapping[0], synSep, tokFactory);
                    if (expansion)
                    {
                        // expand to all arguments
                        target = source;
                    }
                    else
                    {
                        // reduce to first argument
                        target = new List<IList<string>>(1);
                        target.Add(source[0]);
                    }
                }

                bool includeOrig = false;
                foreach (IList<string> fromToks in source)
                {
                    count++;
                    foreach (IList<string> toToks in target)
                    {
                        map.Add(fromToks, SlowSynonymMap.MakeTokens(toToks), includeOrig, true);
                    }
                }
            }
        }
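
For reference, a sketch of the two rule shapes ParseRules accepts (the rule strings here are illustrative): a plain comma-separated synonym list, which `expansion` either expands to all entries or reduces to the first, and an explicit two-sided mapping split on mappingSep.

            // Illustrative rules; separators match what Inform() passes below.
            var rules = new List<string>
            {
                "couch,sofa,divan", // expansion=true: each word maps to all three;
                                    // expansion=false: all three reduce to "couch"
                "teh => the"        // explicit mapping split on "=>"
            };
            var map = new SlowSynonymMap(true);
            SlowSynonymFilterFactory.ParseRules(rules, map, "=>", ",", true, null);
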
        public void Inform(ResourceLoader loader)
        {
            TokenizerFactory tokFactory = null;
            if (tf != null)
            {
                tokFactory = LoadTokenizerFactory(loader, tf);
            }

            IEnumerable<string> wlist = LoadRules(synonyms, loader);

            synMap = new SlowSynonymMap(ignoreCase);
            ParseRules(wlist, synMap, "=>", ",", expand, tokFactory);
        }
        internal static void AssertTokenizesTo(SlowSynonymMap dict, IList<Token> input, string[] expected, int[] startOffsets, int[] endOffsets, int[] posIncs)
        {
            TokenStream tokenizer = new IterTokenStream(input);
            SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
            AssertTokenStreamContents(stream, expected, startOffsets, endOffsets, posIncs);
        }

        internal static void AssertTokenizesTo(SlowSynonymMap dict, string input, string[] expected, int[] posIncs)
        {
            Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
            SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
            AssertTokenStreamContents(stream, expected, posIncs);
        }
        public virtual void TestOffsetBug()
        {
            // With the following rules:
            // a a=>b
            // x=>y
            // analysing "a x" causes "y" to have a bad offset (end less than start)
            // SOLR-167
            SlowSynonymMap map = new SlowSynonymMap();

            bool orig = false;
            bool merge = true;

            map.Add(Strings("a a"), Tokens("b"), orig, merge);
            map.Add(Strings("x"), Tokens("y"), orig, merge);

            // "a a x" => "b y"
            AssertTokenizesTo(map, Tokens("a,1,0,1 a,1,2,3 x,1,4,5"), new string[] { "b", "y" }, new int[] { 0, 4 }, new int[] { 3, 5 }, new int[] { 1, 1 });
        }
        public virtual void TestPositionIncrementsWithOrig()
        {
            SlowSynonymMap map = new SlowSynonymMap();

            bool orig = true;
            bool merge = true;

            // test that generated tokens start at the same offset as the original
            map.Add(Strings("a"), Tokens("aa"), orig, merge);
            AssertTokenizesTo(map, Tokens("a,5"), new string[] { "a", "aa" }, new int[] { 5, 0 });
            AssertTokenizesTo(map, Tokens("b,1 a,0"), new string[] { "b", "a", "aa" }, new int[] { 1, 0, 0 });

            // test that offset of first replacement is ignored (always takes the orig offset)
            map.Add(Strings("b"), Tokens("bb,100"), orig, merge);
            AssertTokenizesTo(map, Tokens("b,5"), new string[] { "b", "bb" }, new int[] { 5, 0 });
            AssertTokenizesTo(map, Tokens("c,1 b,0"), new string[] { "c", "b", "bb" }, new int[] { 1, 0, 0 });

            // test that subsequent tokens are adjusted accordingly
            map.Add(Strings("c"), Tokens("cc,100 c2,2"), orig, merge);
            AssertTokenizesTo(map, Tokens("c,5"), new string[] { "c", "cc", "c2" }, new int[] { 5, 0, 2 });
            AssertTokenizesTo(map, Tokens("d,1 c,0"), new string[] { "d", "c", "cc", "c2" }, new int[] { 1, 0, 0, 2 });
        }
        public virtual void TestOverlap()
        {
            SlowSynonymMap map = new SlowSynonymMap();

            bool orig = false;
            bool merge = true;
            map.Add(Strings("qwe"), Tokens("qq/ww/ee"), orig, merge);
            map.Add(Strings("qwe"), Tokens("xx"), orig, merge);
            map.Add(Strings("qwe"), Tokens("yy"), orig, merge);
            map.Add(Strings("qwe"), Tokens("zz"), orig, merge);
            AssertTokenizesTo(map, "$", new string[] { "$" });
            AssertTokenizesTo(map, "qwe", new string[] { "qq", "ww", "ee", "xx", "yy", "zz" }, new int[] { 1, 0, 0, 0, 0, 0 });

            // test merging within the map

            map.Add(Strings("a"), Tokens("a5,5 a8,3 a10,2"), orig, merge);
            map.Add(Strings("a"), Tokens("a3,3 a7,4 a9,2 a11,2 a111,100"), orig, merge);
            AssertTokenizesTo(map, "a", new string[] { "a3", "a5", "a7", "a8", "a9", "a10", "a11", "a111" }, new int[] { 1, 2, 2, 1, 1, 1, 1, 100 });
        }
        public virtual void TestMapMerge()
        {
            SlowSynonymMap map = new SlowSynonymMap();

            bool orig = false;
            bool merge = true;
            map.Add(Strings("a"), Tokens("a5,5"), orig, merge);
            map.Add(Strings("a"), Tokens("a3,3"), orig, merge);

            AssertTokenizesTo(map, "a", new string[] { "a3", "a5" }, new int[] { 1, 2 });

            map.Add(Strings("b"), Tokens("b3,3"), orig, merge);
            map.Add(Strings("b"), Tokens("b5,5"), orig, merge);

            AssertTokenizesTo(map, "b", new string[] { "b3", "b5" }, new int[] { 1, 2 });

            map.Add(Strings("a"), Tokens("A3,3"), orig, merge);
            map.Add(Strings("a"), Tokens("A5,5"), orig, merge);

            AssertTokenizesTo(map, "a", new string[] { "a3", "A3", "a5", "A5" }, new int[] { 1, 0, 2, 0 });

            map.Add(Strings("a"), Tokens("a1"), orig, merge);
            AssertTokenizesTo(map, "a", new string[] { "a1", "a3", "A3", "a5", "A5" }, new int[] { 1, 2, 0, 2, 0 });

            map.Add(Strings("a"), Tokens("a2,2"), orig, merge);
            map.Add(Strings("a"), Tokens("a4,4 a6,2"), orig, merge);
            AssertTokenizesTo(map, "a", new string[] { "a1", "a2", "a3", "A3", "a4", "a5", "A5", "a6" }, new int[] { 1, 1, 1, 0, 1, 1, 0, 1 });
        }
        public virtual void TestIncludeOrig()
        {
            SlowSynonymMap map = new SlowSynonymMap();

            bool orig = true;
            bool merge = true;
            map.Add(Strings("a b"), Tokens("ab"), orig, merge);
            map.Add(Strings("a c"), Tokens("ac"), orig, merge);
            map.Add(Strings("a"), Tokens("aa"), orig, merge);
            map.Add(Strings("b"), Tokens("bb"), orig, merge);
            map.Add(Strings("z x c v"), Tokens("zxcv"), orig, merge);
            map.Add(Strings("x c"), Tokens("xc"), orig, merge);

            AssertTokenizesTo(map, "$", new string[] { "$" }, new int[] { 1 });
            AssertTokenizesTo(map, "a", new string[] { "a", "aa" }, new int[] { 1, 0 });
            AssertTokenizesTo(map, "a", new string[] { "a", "aa" }, new int[] { 1, 0 });
            AssertTokenizesTo(map, "$ a", new string[] { "$", "a", "aa" }, new int[] { 1, 1, 0 });
            AssertTokenizesTo(map, "a $", new string[] { "a", "aa", "$" }, new int[] { 1, 0, 1 });
            AssertTokenizesTo(map, "$ a !", new string[] { "$", "a", "aa", "!" }, new int[] { 1, 1, 0, 1 });
            AssertTokenizesTo(map, "a a", new string[] { "a", "aa", "a", "aa" }, new int[] { 1, 0, 1, 0 });
            AssertTokenizesTo(map, "b", new string[] { "b", "bb" }, new int[] { 1, 0 });
            AssertTokenizesTo(map, "z x c v", new string[] { "z", "zxcv", "x", "c", "v" }, new int[] { 1, 0, 1, 1, 1 });
            AssertTokenizesTo(map, "z x c $", new string[] { "z", "x", "xc", "c", "$" }, new int[] { 1, 1, 0, 1, 1 });

            // check for lack of recursion
            map.Add(Strings("zoo zoo"), Tokens("zoo"), orig, merge);
            // CHECKME: I think the previous test (with 4 zoo's) was just a typo.
            AssertTokenizesTo(map, "zoo zoo $ zoo", new string[] { "zoo", "zoo", "zoo", "$", "zoo" }, new int[] { 1, 0, 1, 1, 1 });

            map.Add(Strings("zoo"), Tokens("zoo zoo"), orig, merge);
            AssertTokenizesTo(map, "zoo zoo $ zoo", new string[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" }, new int[] { 1, 0, 1, 1, 1, 0, 1 });
        }