public virtual void TestIncludeOrig()
{
    SlowSynonymMap map = new SlowSynonymMap();

    bool orig = true;
    bool merge = true;
    map.Add(Strings("a b"), Tokens("ab"), orig, merge);
    map.Add(Strings("a c"), Tokens("ac"), orig, merge);
    map.Add(Strings("a"), Tokens("aa"), orig, merge);
    map.Add(Strings("b"), Tokens("bb"), orig, merge);
    map.Add(Strings("z x c v"), Tokens("zxcv"), orig, merge);
    map.Add(Strings("x c"), Tokens("xc"), orig, merge);

    AssertTokenizesTo(map, "$", new string[] { "$" }, new int[] { 1 });
    AssertTokenizesTo(map, "a", new string[] { "a", "aa" }, new int[] { 1, 0 });
    AssertTokenizesTo(map, "a", new string[] { "a", "aa" }, new int[] { 1, 0 });
    AssertTokenizesTo(map, "$ a", new string[] { "$", "a", "aa" }, new int[] { 1, 1, 0 });
    AssertTokenizesTo(map, "a $", new string[] { "a", "aa", "$" }, new int[] { 1, 0, 1 });
    AssertTokenizesTo(map, "$ a !", new string[] { "$", "a", "aa", "!" }, new int[] { 1, 1, 0, 1 });
    AssertTokenizesTo(map, "a a", new string[] { "a", "aa", "a", "aa" }, new int[] { 1, 0, 1, 0 });
    AssertTokenizesTo(map, "b", new string[] { "b", "bb" }, new int[] { 1, 0 });
    AssertTokenizesTo(map, "z x c v", new string[] { "z", "zxcv", "x", "c", "v" }, new int[] { 1, 0, 1, 1, 1 });
    AssertTokenizesTo(map, "z x c $", new string[] { "z", "x", "xc", "c", "$" }, new int[] { 1, 1, 0, 1, 1 });

    // check for lack of recursion
    map.Add(Strings("zoo zoo"), Tokens("zoo"), orig, merge);
    // CHECKME: I think the previous test (with 4 zoo's) was just a typo.
    AssertTokenizesTo(map, "zoo zoo $ zoo", new string[] { "zoo", "zoo", "zoo", "$", "zoo" }, new int[] { 1, 0, 1, 1, 1 });

    map.Add(Strings("zoo"), Tokens("zoo zoo"), orig, merge);
    AssertTokenizesTo(map, "zoo zoo $ zoo", new string[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" }, new int[] { 1, 0, 1, 1, 1, 0, 1 });
}
public virtual void TestBigramTokenizer()
{
    SlowSynonymMap synMap;

    // prepare bi-gram tokenizer factory
    IDictionary<string, string> args = new Dictionary<string, string>();
    args[AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM] = "4.4";
    args["minGramSize"] = "2";
    args["maxGramSize"] = "2";
    TokenizerFactory tf = new NGramTokenizerFactory(args);

    // (ab)->(bc)->(cd)->[ef][fg][gh]
    IList<string> rules = new List<string>();
    rules.Add("abcd=>efgh");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, tf);
    assertEquals(1, synMap.Submap.Count);
    assertEquals(1, GetSubSynonymMap(synMap, "ab").Submap.Count);
    assertEquals(1, GetSubSynonymMap(GetSubSynonymMap(synMap, "ab"), "bc").Submap.Count);
    AssertTokIncludes(GetSubSynonymMap(GetSubSynonymMap(synMap, "ab"), "bc"), "cd", "ef");
    AssertTokIncludes(GetSubSynonymMap(GetSubSynonymMap(synMap, "ab"), "bc"), "cd", "fg");
    AssertTokIncludes(GetSubSynonymMap(GetSubSynonymMap(synMap, "ab"), "bc"), "cd", "gh");
}
public virtual void TestMapMerge()
{
    SlowSynonymMap map = new SlowSynonymMap();

    bool orig = false;
    bool merge = true;

    // Tokens("a5,5") encodes term "a5" with a position increment of 5.
    map.Add(Strings("a"), Tokens("a5,5"), orig, merge);
    map.Add(Strings("a"), Tokens("a3,3"), orig, merge);
    AssertTokenizesTo(map, "a", new string[] { "a3", "a5" }, new int[] { 1, 2 });

    map.Add(Strings("b"), Tokens("b3,3"), orig, merge);
    map.Add(Strings("b"), Tokens("b5,5"), orig, merge);
    AssertTokenizesTo(map, "b", new string[] { "b3", "b5" }, new int[] { 1, 2 });

    map.Add(Strings("a"), Tokens("A3,3"), orig, merge);
    map.Add(Strings("a"), Tokens("A5,5"), orig, merge);
    AssertTokenizesTo(map, "a", new string[] { "a3", "A3", "a5", "A5" }, new int[] { 1, 0, 2, 0 });

    map.Add(Strings("a"), Tokens("a1"), orig, merge);
    AssertTokenizesTo(map, "a", new string[] { "a1", "a3", "A3", "a5", "A5" }, new int[] { 1, 2, 0, 2, 0 });

    map.Add(Strings("a"), Tokens("a2,2"), orig, merge);
    map.Add(Strings("a"), Tokens("a4,4 a6,2"), orig, merge);
    AssertTokenizesTo(map, "a", new string[] { "a1", "a2", "a3", "A3", "a4", "a5", "A5", "a6" }, new int[] { 1, 1, 1, 0, 1, 1, 0, 1 });
}
internal static void AssertTokenizesTo(SlowSynonymMap dict, string input, string[] expected, int[] posIncs)
{
    Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
    AssertTokenStreamContents(stream, expected, posIncs);
}
internal static void AssertTokenizesTo(SlowSynonymMap dict, IList<Token> input, string[] expected, int[] startOffsets, int[] endOffsets, int[] posIncs)
{
    TokenStream tokenizer = new IterTokenStream(input);
    SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
    AssertTokenStreamContents(stream, expected, startOffsets, endOffsets, posIncs);
}
/// <param name="singleMatch"> List<String>, the sequence of strings to match </param> /// <param name="replacement"> List<Token> the list of tokens to use on a match </param> /// <param name="includeOrig"> sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens </param> /// <param name="mergeExisting"> merge the replacement tokens with any other mappings that exist </param> public virtual void Add(IList<string> singleMatch, IList<Token> replacement, bool includeOrig, bool mergeExisting) { var currMap = this; foreach (string str in singleMatch) { if (currMap.submap == null) { // for now hardcode at 4.0, as its what the old code did. // would be nice to fix, but shouldn't store a version in each submap!!! currMap.submap = new CharArrayMap<SlowSynonymMap>(Lucene.Net.Util.Version.LUCENE_CURRENT, 1, IgnoreCase()); } var map = currMap.submap.Get(str); if (map == null) { map = new SlowSynonymMap(); map.flags |= flags & IGNORE_CASE; currMap.submap.put(str, map); } currMap = map; } if (currMap.synonyms != null && !mergeExisting) { throw new System.ArgumentException("SynonymFilter: there is already a mapping for " + singleMatch); } IList<Token> superset = currMap.synonyms == null ? replacement : MergeTokens(currMap.synonyms, replacement); currMap.synonyms = superset.ToArray(); if (includeOrig) { currMap.flags |= INCLUDE_ORIG; } }
public virtual void TestMultiWordSynonymsOld()
{
    IList<string> rules = new JCG.List<string>();
    rules.Add("a b c,d");
    SlowSynonymMap synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);

    SlowSynonymFilter ts = new SlowSynonymFilter(new MockTokenizer(new StringReader("a e"), MockTokenizer.WHITESPACE, false), synMap);
    // This fails because ["e","e"] is the value of the token stream
    AssertTokenStreamContents(ts, new string[] { "a", "e" });
}
internal static void ParseRules(IEnumerable<string> rules, SlowSynonymMap map, string mappingSep, string synSep, bool expansion, TokenizerFactory tokFactory)
{
    int count = 0;
    foreach (string rule in rules)
    {
        // To use regexes, we need an expression that specifies an odd number of chars.
        // This can't really be done with string.Split(), and since we need to
        // do unescaping at some point anyway, we wouldn't be saving any effort
        // by using regexes.

        IList<string> mapping = SplitSmart(rule, mappingSep, false);

        IList<IList<string>> source;
        IList<IList<string>> target;

        if (mapping.Count > 2)
        {
            throw new ArgumentException("Invalid Synonym Rule:" + rule);
        }
        else if (mapping.Count == 2)
        {
            source = GetSynList(mapping[0], synSep, tokFactory);
            target = GetSynList(mapping[1], synSep, tokFactory);
        }
        else
        {
            source = GetSynList(mapping[0], synSep, tokFactory);
            if (expansion)
            {
                // expand to all arguments
                target = source;
            }
            else
            {
                // reduce to first argument
                target = new List<IList<string>>(1) { source[0] };
            }
        }

        bool includeOrig = false;
        foreach (IList<string> fromToks in source)
        {
            count++;
            foreach (IList<string> toToks in target)
            {
                map.Add(fromToks, SlowSynonymMap.MakeTokens(toToks), includeOrig, true);
            }
        }
    }
}
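// A short sketch of the rule grammar ParseRules consumes (ParseExampleRules is a
// hypothetical helper; the expected structure follows the assertions in
// TestReadMappingRules and TestRead1waySynonymRules below):
//   "a=>b"   explicit mapping: (a)->[b]
//   "a,b=>c" both "a" and "b" map to "c"
//   "a,b"    with expansion=true every term maps to all terms;
//            with expansion=false every term maps to the first term only
internal static SlowSynonymMap ParseExampleRules()
{
    IList<string> rules = new List<string> { "a,b=>c" };
    SlowSynonymMap synMap = new SlowSynonymMap(true); // ignoreCase
    ParseRules(rules, synMap, "=>", ",", true, null);
    // synMap.Submap now contains entries for "a" and "b", each with synonym token "c".
    return synMap;
}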
public virtual void TestRead1waySynonymRules()
{
    SlowSynonymMap synMap;

    // (a)->[a]
    // (b)->[a]
    IList<string> rules = new List<string>();
    rules.Add("a,b");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", false, null);
    assertEquals(2, synMap.Submap.Count);
    AssertTokIncludes(synMap, "a", "a");
    AssertTokIncludes(synMap, "b", "a");

    // (a)->[a]
    // (b)->[a]
    // (c)->[a]
    rules.Clear();
    rules.Add("a,b,c");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", false, null);
    assertEquals(3, synMap.Submap.Count);
    AssertTokIncludes(synMap, "a", "a");
    AssertTokIncludes(synMap, "b", "a");
    AssertTokIncludes(synMap, "c", "a");

    // (a)->[a]
    // (b1)->(b2)->[a]
    rules.Clear();
    rules.Add("a,b1 b2");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", false, null);
    assertEquals(2, synMap.Submap.Count);
    AssertTokIncludes(synMap, "a", "a");
    assertEquals(1, GetSubSynonymMap(synMap, "b1").Submap.Count);
    AssertTokIncludes(GetSubSynonymMap(synMap, "b1"), "b2", "a");

    // (a1)->(a2)->[a1][a2]
    // (b)->[a1][a2]
    rules.Clear();
    rules.Add("a1 a2,b");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", false, null);
    assertEquals(2, synMap.Submap.Count);
    assertEquals(1, GetSubSynonymMap(synMap, "a1").Submap.Count);
    AssertTokIncludes(GetSubSynonymMap(synMap, "a1"), "a2", "a1");
    AssertTokIncludes(GetSubSynonymMap(synMap, "a1"), "a2", "a2");
    AssertTokIncludes(synMap, "b", "a1");
    AssertTokIncludes(synMap, "b", "a2");
}
private void AssertTokIncludes(SlowSynonymMap map, string src, string exp)
{
    Token[] tokens = map.Submap.Get(src).Synonyms;
    bool inc = false;
    foreach (Token token in tokens)
    {
        if (exp.Equals(new string(token.Buffer, 0, token.Length), StringComparison.Ordinal))
        {
            inc = true;
        }
    }
    assertTrue(inc);
}
public void Inform(IResourceLoader loader)
{
    TokenizerFactory tokFactory = null;
    if (tf != null)
    {
        tokFactory = LoadTokenizerFactory(loader, tf);
    }

    IEnumerable<string> wlist = LoadRules(synonyms, loader);

    synMap = new SlowSynonymMap(ignoreCase);
    ParseRules(wlist, synMap, "=>", ",", expand, tokFactory);
}
private IEnumerator<AttributeSource> replacement; // iterator over generated tokens

public SlowSynonymFilter(TokenStream @in, SlowSynonymMap map)
    : base(@in)
{
    if (map == null)
    {
        throw new System.ArgumentException("map is required", "map");
    }

    this.map = map;
    // just ensuring these attributes exist...
    AddAttribute<ICharTermAttribute>();
    AddAttribute<IPositionIncrementAttribute>();
    AddAttribute<IOffsetAttribute>();
    AddAttribute<ITypeAttribute>(); // AddAttribute requires the attribute interface, not the concrete type
}
public virtual void TestInvalidMappingRules()
{
    SlowSynonymMap synMap = new SlowSynonymMap(true);
    IList<string> rules = new List<string>(1);
    rules.Add("a=>b=>c");
    try
    {
        SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);
        fail("ArgumentException must be thrown.");
    }
    catch (ArgumentException)
    {
        // expected: a rule may contain at most one mapping separator
    }
}
public virtual void TestLoadRules()
{
    IDictionary<string, string> args = new Dictionary<string, string>();
    args["synonyms"] = "something.txt";
    SlowSynonymFilterFactory ff = new SlowSynonymFilterFactory(args);
    ff.Inform(new ResourceLoaderAnonymousInnerClassHelper());

    SlowSynonymMap synMap = ff.SynonymMap;
    assertEquals(2, synMap.Submap.Count);
    AssertTokIncludes(synMap, "a", "a");
    AssertTokIncludes(synMap, "a", "b");
    AssertTokIncludes(synMap, "b", "a");
    AssertTokIncludes(synMap, "b", "b");
}
public virtual void TestOffsetBug()
{
    // With the following rules:
    // a a=>b
    // x=>y
    // analysing "a x" causes "y" to have a bad offset (end less than start)
    // SOLR-167
    SlowSynonymMap map = new SlowSynonymMap();

    bool orig = false;
    bool merge = true;
    map.Add(Strings("a a"), Tokens("b"), orig, merge);
    map.Add(Strings("x"), Tokens("y"), orig, merge);

    // Tokens("a,1,0,1") encodes term, position increment, start offset, end offset.
    // "a a x" => "b y"
    AssertTokenizesTo(map, Tokens("a,1,0,1 a,1,2,3 x,1,4,5"), new string[] { "b", "y" }, new int[] { 0, 4 }, new int[] { 3, 5 }, new int[] { 1, 1 });
}
private SlowSynonymMap Match(SlowSynonymMap map)
{
    SlowSynonymMap result = null;

    if (map.Submap != null)
    {
        AttributeSource tok = NextTok();
        if (tok != null)
        {
            // clone ourselves.
            if (tok == this)
            {
                tok = CloneAttributes();
            }
            // check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
            var termAtt = tok.GetAttribute<ICharTermAttribute>();
            SlowSynonymMap subMap = map.Submap.Get(termAtt.Buffer, 0, termAtt.Length);

            if (subMap != null)
            {
                // recurse
                result = Match(subMap);
            }

            if (result != null)
            {
                matched.AddFirst(tok);
            }
            else
            {
                // push back unmatched token
                PushTok(tok);
            }
        }
    }

    // if no longer sequence matched, and this node has synonyms, it's the match.
    if (result == null && map.Synonyms != null)
    {
        result = map;
    }

    return result;
}
public virtual void TestOverlap()
{
    SlowSynonymMap map = new SlowSynonymMap();

    bool orig = false;
    bool merge = true;
    map.Add(Strings("qwe"), Tokens("qq/ww/ee"), orig, merge);
    map.Add(Strings("qwe"), Tokens("xx"), orig, merge);
    map.Add(Strings("qwe"), Tokens("yy"), orig, merge);
    map.Add(Strings("qwe"), Tokens("zz"), orig, merge);
    AssertTokenizesTo(map, "$", new string[] { "$" });
    AssertTokenizesTo(map, "qwe", new string[] { "qq", "ww", "ee", "xx", "yy", "zz" }, new int[] { 1, 0, 0, 0, 0, 0 });

    // test merging within the map
    map.Add(Strings("a"), Tokens("a5,5 a8,3 a10,2"), orig, merge);
    map.Add(Strings("a"), Tokens("a3,3 a7,4 a9,2 a11,2 a111,100"), orig, merge);
    AssertTokenizesTo(map, "a", new string[] { "a3", "a5", "a7", "a8", "a9", "a10", "a11", "a111" }, new int[] { 1, 2, 2, 1, 1, 1, 1, 100 });
}
public virtual void TestPositionIncrementsWithOrig()
{
    SlowSynonymMap map = new SlowSynonymMap();

    bool orig = true;
    bool merge = true;

    // test that generated tokens start at the same offset as the original
    map.Add(Strings("a"), Tokens("aa"), orig, merge);
    AssertTokenizesTo(map, Tokens("a,5"), new string[] { "a", "aa" }, new int[] { 5, 0 });
    AssertTokenizesTo(map, Tokens("b,1 a,0"), new string[] { "b", "a", "aa" }, new int[] { 1, 0, 0 });

    // test that offset of first replacement is ignored (always takes the orig offset)
    map.Add(Strings("b"), Tokens("bb,100"), orig, merge);
    AssertTokenizesTo(map, Tokens("b,5"), new string[] { "b", "bb" }, new int[] { 5, 0 });
    AssertTokenizesTo(map, Tokens("c,1 b,0"), new string[] { "c", "b", "bb" }, new int[] { 1, 0, 0 });

    // test that subsequent tokens are adjusted accordingly
    map.Add(Strings("c"), Tokens("cc,100 c2,2"), orig, merge);
    AssertTokenizesTo(map, Tokens("c,5"), new string[] { "c", "cc", "c2" }, new int[] { 5, 0, 2 });
    AssertTokenizesTo(map, Tokens("d,1 c,0"), new string[] { "d", "c", "cc", "c2" }, new int[] { 1, 0, 0, 2 });
}
public virtual void TestMatching()
{
    SlowSynonymMap map = new SlowSynonymMap();

    bool orig = false;
    bool merge = true;
    map.Add(Strings("a b"), Tokens("ab"), orig, merge);
    map.Add(Strings("a c"), Tokens("ac"), orig, merge);
    map.Add(Strings("a"), Tokens("aa"), orig, merge);
    map.Add(Strings("b"), Tokens("bb"), orig, merge);
    map.Add(Strings("z x c v"), Tokens("zxcv"), orig, merge);
    map.Add(Strings("x c"), Tokens("xc"), orig, merge);

    AssertTokenizesTo(map, "$", new string[] { "$" });
    AssertTokenizesTo(map, "a", new string[] { "aa" });
    AssertTokenizesTo(map, "a $", new string[] { "aa", "$" });
    AssertTokenizesTo(map, "$ a", new string[] { "$", "aa" });
    AssertTokenizesTo(map, "a a", new string[] { "aa", "aa" });
    AssertTokenizesTo(map, "b", new string[] { "bb" });
    AssertTokenizesTo(map, "z x c v", new string[] { "zxcv" });
    AssertTokenizesTo(map, "z x c $", new string[] { "z", "xc", "$" });

    // repeats
    map.Add(Strings("a b"), Tokens("ab"), orig, merge);
    map.Add(Strings("a b"), Tokens("ab"), orig, merge);

    // FIXME: the below test intended to be { "ab" }
    AssertTokenizesTo(map, "a b", new string[] { "ab", "ab", "ab" });

    // check for lack of recursion
    map.Add(Strings("zoo"), Tokens("zoo"), orig, merge);
    AssertTokenizesTo(map, "zoo zoo $ zoo", new string[] { "zoo", "zoo", "$", "zoo" });
    map.Add(Strings("zoo"), Tokens("zoo zoo"), orig, merge);
    // FIXME: the below test intended to be { "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo" }
    // maybe this was just a typo in the old test????
    AssertTokenizesTo(map, "zoo zoo $ zoo", new string[] { "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" });
}
/// <param name="singleMatch"> List<String>, the sequence of strings to match </param> /// <param name="replacement"> List<Token> the list of tokens to use on a match </param> /// <param name="includeOrig"> sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens </param> /// <param name="mergeExisting"> merge the replacement tokens with any other mappings that exist </param> public virtual void add(IList <string> singleMatch, IList <Token> replacement, bool includeOrig, bool mergeExisting) { SlowSynonymMap currMap = this; foreach (string str in singleMatch) { if (currMap.submap == null) { // for now hardcode at 4.0, as its what the old code did. // would be nice to fix, but shouldn't store a version in each submap!!! currMap.submap = new CharArrayMap <>(Version.LUCENE_CURRENT, 1, ignoreCase()); } SlowSynonymMap map = currMap.submap.get(str); if (map == null) { map = new SlowSynonymMap(); map.flags |= flags & IGNORE_CASE; currMap.submap.put(str, map); } currMap = map; } if (currMap.synonyms != null && !mergeExisting) { throw new System.ArgumentException("SynonymFilter: there is already a mapping for " + singleMatch); } IList <Token> superset = currMap.synonyms == null ? replacement : mergeTokens(currMap.synonyms, replacement); currMap.synonyms = superset.ToArray(); if (includeOrig_Renamed) { currMap.flags |= INCLUDE_ORIG; } }
private SlowSynonymMap GetSubSynonymMap(SlowSynonymMap map, string src)
{
    return map.Submap.Get(src);
}
public virtual void TestReadMappingRules()
{
    SlowSynonymMap synMap;

    // (a)->[b]
    IList<string> rules = new List<string>();
    rules.Add("a=>b");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);
    assertEquals(1, synMap.Submap.Count);
    AssertTokIncludes(synMap, "a", "b");

    // (a)->[c]
    // (b)->[c]
    rules.Clear();
    rules.Add("a,b=>c");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);
    assertEquals(2, synMap.Submap.Count);
    AssertTokIncludes(synMap, "a", "c");
    AssertTokIncludes(synMap, "b", "c");

    // (a)->[b][c]
    rules.Clear();
    rules.Add("a=>b,c");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);
    assertEquals(1, synMap.Submap.Count);
    AssertTokIncludes(synMap, "a", "b");
    AssertTokIncludes(synMap, "a", "c");

    // (a)->(b)->[a2]
    //      [a1]
    rules.Clear();
    rules.Add("a=>a1");
    rules.Add("a b=>a2");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);
    assertEquals(1, synMap.Submap.Count);
    AssertTokIncludes(synMap, "a", "a1");
    assertEquals(1, GetSubSynonymMap(synMap, "a").Submap.Count);
    AssertTokIncludes(GetSubSynonymMap(synMap, "a"), "b", "a2");

    // (a)->(b)->[a2]
    //      (c)->[a3]
    //      [a1]
    rules.Clear();
    rules.Add("a=>a1");
    rules.Add("a b=>a2");
    rules.Add("a c=>a3");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);
    assertEquals(1, synMap.Submap.Count);
    AssertTokIncludes(synMap, "a", "a1");
    assertEquals(2, GetSubSynonymMap(synMap, "a").Submap.Count);
    AssertTokIncludes(GetSubSynonymMap(synMap, "a"), "b", "a2");
    AssertTokIncludes(GetSubSynonymMap(synMap, "a"), "c", "a3");

    // (a)->(b)->[a2]
    //      [a1]
    // (b)->(c)->[b2]
    //      [b1]
    rules.Clear();
    rules.Add("a=>a1");
    rules.Add("a b=>a2");
    rules.Add("b=>b1");
    rules.Add("b c=>b2");
    synMap = new SlowSynonymMap(true);
    SlowSynonymFilterFactory.ParseRules(rules, synMap, "=>", ",", true, null);
    assertEquals(2, synMap.Submap.Count);
    AssertTokIncludes(synMap, "a", "a1");
    assertEquals(1, GetSubSynonymMap(synMap, "a").Submap.Count);
    AssertTokIncludes(GetSubSynonymMap(synMap, "a"), "b", "a2");
    AssertTokIncludes(synMap, "b", "b1");
    assertEquals(1, GetSubSynonymMap(synMap, "b").Submap.Count);
    AssertTokIncludes(GetSubSynonymMap(synMap, "b"), "c", "b2");
}
/*
 * Need to worry about multiple scenarios:
 *  - need to go for the longest match
 *    a b => foo      #shouldn't match if "a b" is followed by "c d"
 *    a b c d => bar
 *  - need to backtrack - retry matches for tokens already read
 *    a b c d => foo
 *    b c => bar
 *    If the input stream is "a b c x", one will consume "a b c d"
 *    trying to match the first rule... all but "a" should be
 *    pushed back so a match may be made on "b c".
 *  - don't try and match generated tokens (thus need separate queue)
 *    matching is not recursive.
 *  - handle optional generation of original tokens in all these cases,
 *    merging token streams to preserve token positions.
 *  - preserve original positionIncrement of first matched token
 */
public override bool IncrementToken()
{
    while (true)
    {
        // if there are any generated tokens, return them... don't try any
        // matches against them, as we specifically don't want recursion.
        if (replacement != null && replacement.MoveNext())
        {
            Copy(this, replacement.Current);
            return true;
        }

        // common case fast-path of first token not matching anything
        AttributeSource firstTok = NextTok();
        if (firstTok == null)
        {
            return false;
        }
        var termAtt = firstTok.AddAttribute<ICharTermAttribute>();
        SlowSynonymMap result = map.Submap != null ? map.Submap.Get(termAtt.Buffer, 0, termAtt.Length) : null;
        if (result == null)
        {
            Copy(this, firstTok);
            return true;
        }

        // fast-path failed, clone ourselves if needed
        if (firstTok == this)
        {
            firstTok = CloneAttributes();
        }
        // OK, we matched a token, so find the longest match.
        matched = new LinkedList<AttributeSource>();

        result = Match(result);

        if (result == null)
        {
            // no match, simply return the first token read.
            Copy(this, firstTok);
            return true;
        }

        // reuse, or create new one each time?
        IList<AttributeSource> generated = new JCG.List<AttributeSource>(result.Synonyms.Length + matched.Count + 1);

        //
        // there was a match... let's generate the new tokens, merging
        // in the matched tokens (position increments need adjusting)
        //
        AttributeSource lastTok = matched.Count == 0 ? firstTok : matched.Last.Value;
        bool includeOrig = result.IncludeOrig;

        AttributeSource origTok = includeOrig ? firstTok : null;
        IPositionIncrementAttribute firstPosIncAtt = firstTok.AddAttribute<IPositionIncrementAttribute>();
        int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
        int repPos = 0; // curr position in replacement token stream
        int pos = 0; // current position in merged token stream

        for (int i = 0; i < result.Synonyms.Length; i++)
        {
            Token repTok = result.Synonyms[i];
            AttributeSource newTok = firstTok.CloneAttributes();
            ICharTermAttribute newTermAtt = newTok.AddAttribute<ICharTermAttribute>();
            IOffsetAttribute newOffsetAtt = newTok.AddAttribute<IOffsetAttribute>();
            IPositionIncrementAttribute newPosIncAtt = newTok.AddAttribute<IPositionIncrementAttribute>();

            IOffsetAttribute lastOffsetAtt = lastTok.AddAttribute<IOffsetAttribute>();

            newOffsetAtt.SetOffset(newOffsetAtt.StartOffset, lastOffsetAtt.EndOffset);
            newTermAtt.CopyBuffer(repTok.Buffer, 0, repTok.Length);
            repPos += repTok.PositionIncrement;
            if (i == 0) // make position of first token equal to original
            {
                repPos = origPos;
            }

            // if necessary, insert original tokens and adjust position increment
            while (origTok != null && origPos <= repPos)
            {
                IPositionIncrementAttribute origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                origPosInc.PositionIncrement = origPos - pos;
                generated.Add(origTok);
                pos += origPosInc.PositionIncrement;
                // origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                if (matched.Count == 0)
                {
                    origTok = null;
                }
                else
                {
                    origTok = matched.First.Value;
                    matched.Remove(origTok);
                }
                if (origTok != null)
                {
                    origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                    origPos += origPosInc.PositionIncrement;
                }
            }

            newPosIncAtt.PositionIncrement = repPos - pos;
            generated.Add(newTok);
            pos += newPosIncAtt.PositionIncrement;
        }

        // finish up any leftover original tokens
        while (origTok != null)
        {
            IPositionIncrementAttribute origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
            origPosInc.PositionIncrement = origPos - pos;
            generated.Add(origTok);
            pos += origPosInc.PositionIncrement;
            if (matched.Count == 0)
            {
                origTok = null;
            }
            else
            {
                origTok = matched.First.Value;
                matched.Remove(origTok);
            }
            if (origTok != null)
            {
                origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                origPos += origPosInc.PositionIncrement;
            }
        }

        // what if we replaced a longer sequence with a shorter one?
        // a/0 b/5 => foo/0
        // should I re-create the gap on the next buffered token?

        replacement = generated.GetEnumerator();

        // Now return to the top of the loop to read and return the first
        // generated token.. The reason this is done is that we may have generated
        // nothing at all, and may need to continue with more matching logic.
    }
}