Builds a NormalizeCharMap: call add() once for each mapping you want, then call build() to obtain the resulting immutable NormalizeCharMap. @lucene.experimental
// TODO: this should use inputstreams from the loader, not File!
// Loads the mapping rules (either a single file or a comma-separated list of
// resource names) via the ResourceLoader and builds the normalization map.
// Fix: the Java-to-C# converter left java.io.File calls (`new File(...)`,
// `.exists()`) and the diamond `new List<>()` in place; translated to
// System.IO.File.Exists and an explicit List<string> so the method compiles.
public virtual void inform(ResourceLoader loader)
{
    if (mapping != null)
    {
        IList<string> wlist = null;
        if (File.Exists(mapping))
        {
            // single mapping file on disk
            wlist = getLines(loader, mapping);
        }
        else
        {
            // comma-separated list of resources, each resolved through the loader
            IList<string> files = splitFileNames(mapping);
            wlist = new List<string>();
            foreach (string file in files)
            {
                IList<string> lines = getLines(loader, file.Trim());
                wlist.AddRange(lines);
            }
        }
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        parseRules(wlist, builder);
        normMap = builder.build();
        if (normMap.map == null)
        {
            // if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
            // so just set the whole map to null
            normMap = null;
        }
    }
}
// TODO: this should use inputstreams from the loader, not File!
// Reads the configured mapping rules and compiles them into normMap;
// a map whose inner FST is empty is collapsed to null.
public virtual void Inform(IResourceLoader loader)
{
    if (mapping == null)
    {
        return; // no mapping configured; leave normMap untouched
    }

    IList<string> wlist;
    if (File.Exists(mapping))
    {
        // single rules file on disk
        wlist = new List<string>(GetLines(loader, mapping));
    }
    else
    {
        // comma-separated resource names, each resolved through the loader
        wlist = new List<string>();
        foreach (string resourceName in SplitFileNames(mapping))
        {
            wlist.AddRange(GetLines(loader, resourceName.Trim()));
        }
    }

    var mapBuilder = new NormalizeCharMap.Builder();
    ParseRules(wlist, mapBuilder);
    normMap = mapBuilder.Build();

    // if the inner FST is null, it means it accepts nothing (e.g. the file is
    // empty), so just set the whole map to null
    if (normMap.map == null)
    {
        normMap = null;
    }
}
// TODO: this should use inputstreams from the loader, not File!
// Loads mapping rules (single file, or comma-separated resource list) and
// builds m_normMap; an empty map (null inner FST) becomes null overall.
public virtual void Inform(IResourceLoader loader)
{
    if (mapping == null)
    {
        return; // nothing to do without a configured mapping
    }

    IList<string> ruleLines;
    if (File.Exists(mapping))
    {
        ruleLines = new List<string>(GetLines(loader, mapping));
    }
    else
    {
        // not a plain file: treat as a comma-separated list of loader resources
        ruleLines = new List<string>();
        foreach (string resource in SplitFileNames(mapping))
        {
            ruleLines.AddRange(GetLines(loader, resource.Trim()));
        }
    }

    var mapBuilder = new NormalizeCharMap.Builder();
    ParseRules(ruleLines, mapBuilder);
    m_normMap = mapBuilder.Build();

    // if the inner FST is null, it means it accepts nothing (e.g. the file is
    // empty), so just set the whole map to null
    if (m_normMap.map == null)
    {
        m_normMap = null;
    }
}
// Parses each textual rule of the form "source" => "target" (matched by the
// pattern p) and registers it with the builder; throws on malformed rules.
protected virtual void ParseRules(IList<string> rules, NormalizeCharMap.Builder builder)
{
    foreach (string rule in rules)
    {
        Match match = p.Match(rule);
        if (!match.Success)
        {
            throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "], file = " + mapping);
        }
        // group 1 = source sequence, group 2 = replacement; both may contain
        // escapes that ParseString expands
        string source = ParseString(match.Groups[1].Value);
        string target = ParseString(match.Groups[2].Value);
        builder.Add(source, target);
    }
}
// Parses each rule line ("source" => "target") against the pattern p and adds
// it to the builder; malformed lines raise ArgumentException.
// Fix: the converter left the java.util.regex API in place (Matcher,
// p.matcher(), m.find(), m.group(n)), which does not exist in .NET; translated
// to System.Text.RegularExpressions (Regex.Match searches anywhere in the
// input, matching Matcher.find() semantics).
protected internal virtual void parseRules(IList<string> rules, NormalizeCharMap.Builder builder)
{
    foreach (string rule in rules)
    {
        Match m = p.Match(rule);
        if (!m.Success)
        {
            throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "], file = " + mapping);
        }
        builder.add(parseString(m.Groups[1].Value), parseString(m.Groups[2].Value));
    }
}
// Regression test for final-offset handling when a rule deletes characters
// at the very end of the input.
public virtual void TestFinalOffsetSpecialCase()
{
    var builder = new NormalizeCharMap.Builder();
    builder.Add("t", "");
    // even though this below rule has no effect, the test passes if you remove it!!
    builder.Add("tmakdbl", "c");
    NormalizeCharMap map = builder.Build();

    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper2(this, map);

    const string text = "gzw f quaxot";
    CheckAnalysisConsistency(Random(), analyzer, false, text);
}
// Smoke test combining a protected-word set with several mapping rules
// (deletion, many-to-few, few-to-many) over a fixed input.
public virtual void Test()
{
    CharArraySet cas = new CharArraySet(TEST_VERSION_CURRENT, 3, false);
    foreach (string protectedTerm in new[] { "jjp", "wlmwoknt", "tcgyreo" })
    {
        cas.add(protectedTerm);
    }

    var builder = new NormalizeCharMap.Builder();
    builder.Add("mtqlpi", "");
    builder.Add("mwoknt", "jjp");
    builder.Add("tcgyreo", "zpfpajyws");
    NormalizeCharMap map = builder.Build();

    Analyzer a = new AnalyzerAnonymousInnerClassHelper(this, cas, map);
    CheckAnalysisConsistency(Random(), a, false, "wmgddzunizdomqyj");
}
// Regression test for final-offset handling when a rule deletes characters
// at the end of the input (delegate-based anonymous Analyzer variant).
public virtual void TestFinalOffsetSpecialCase()
{
    var builder = new NormalizeCharMap.Builder();
    builder.Add("t", "");
    // even though this below rule has no effect, the test passes if you remove it!!
    builder.Add("tmakdbl", "c");
    NormalizeCharMap map = builder.Build();

    Analyzer analyzer = Analyzer.NewAnonymous(
        createComponents: (fieldName, reader) =>
        {
            var tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, tokenizer);
        },
        initReader: (fieldName, reader) => new MappingCharFilter(map, reader));

    const string text = "gzw f quaxot";
    CheckAnalysisConsistency(Random, analyzer, false, text);
}
// Verifies that PatternTokenizer reports offsets corrected back through a
// preceding MappingCharFilter.
// Fix: the entity string "&uuml;" was corrupted to a literal "ü" by an
// encoding round-trip, making the mapping a no-op and shrinking INPUT to
// 23 chars — while the assertions below (end offsets 12/25/.../33) require
// the original 12-char token "G&uuml;nther" in the raw input. Restored the
// "&uuml;" => "ü" mapping and input from the upstream Lucene test.
public virtual void TestOffsetCorrection()
{
    const string INPUT = "G&uuml;nther G&uuml;nther is here";

    // create MappingCharFilter
    IList<string> mappingRules = new List<string>();
    mappingRules.Add("\"&uuml;\" => \"ü\"");
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.Add("&uuml;", "ü");
    NormalizeCharMap normMap = builder.Build();
    CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

    // create PatternTokenizer (split mode): offsets must map back into the
    // raw, pre-filter input
    TokenStream stream = new PatternTokenizer(charStream, new Regex("[,;/\\s]+", RegexOptions.Compiled), -1);
    AssertTokenStreamContents(stream,
        new string[] { "Günther", "Günther", "is", "here" },
        new int[] { 0, 13, 26, 29 },
        new int[] { 12, 25, 28, 33 },
        INPUT.Length);

    // group-capture mode over the filtered text; same offset correction applies
    charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
    stream = new PatternTokenizer(charStream, new Regex("Günther", RegexOptions.Compiled), 0);
    AssertTokenStreamContents(stream,
        new string[] { "Günther", "Günther" },
        new int[] { 0, 13 },
        new int[] { 12, 25 },
        INPUT.Length);
}
// Builds a NormalizeCharMap with up to 4 random, non-empty, distinct keys.
private NormalizeCharMap RandomMap()
{
    Random random = Random();
    var builder = new NormalizeCharMap.Builder();
    // we can't add duplicate keys, or NormalizeCharMap gets angry
    var keys = new HashSet<string>();

    int num = random.Next(5);
    for (int i = 0; i < num; i++)
    {
        string key = TestUtil.RandomSimpleString(random);
        if (key.Length == 0 || keys.Contains(key))
        {
            continue; // skip empty or already-used keys
        }
        string value = TestUtil.RandomSimpleString(random);
        builder.Add(key, value);
        keys.Add(key);
    }
    return builder.Build();
}
// Builds the fixture's normMap with shrinking, same-length, growing,
// deleting, non-BMP, and full-width mappings.
public override void SetUp()
{
    base.SetUp();

    var builder = new NormalizeCharMap.Builder();
    // shrinking rules
    builder.Add("aa", "a");
    builder.Add("bbb", "b");
    builder.Add("cccc", "cc");
    // same-length rule
    builder.Add("h", "i");
    // growing rules
    builder.Add("j", "jj");
    builder.Add("k", "kkk");
    builder.Add("ll", "llll");
    // rule whose input is deleted entirely
    builder.Add("empty", "");
    // non-BMP input (U+1D122), represented in the string as a surrogate pair:
    builder.Add(UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1), "fclef");
    builder.Add("\uff01", "full-width-exclamation");

    normMap = builder.Build();
}
// Maps Windows path separators to '/' with a MappingCharFilter, then checks
// PathHierarchyTokenizer emits offsets into the original (backslash) input.
public virtual void TestNormalizeWinDelimToLinuxDelim()
{
    var builder = new NormalizeCharMap.Builder();
    builder.Add("\\", "/");
    NormalizeCharMap normMap = builder.Build();

    const string path = "c:\\a\\b\\c";
    Reader cs = new MappingCharFilter(normMap, new StringReader(path));
    var tokenizer = new PathHierarchyTokenizer(cs);

    AssertTokenStreamContents(tokenizer,
        new string[] { "c:", "c:/a", "c:/a/b", "c:/a/b/c" },
        new int[] { 0, 0, 0, 0 },
        new int[] { 2, 4, 6, 8 },
        new int[] { 1, 0, 0, 0 },
        path.Length);
}
// Produces a NormalizeCharMap containing up to 4 randomly generated mappings;
// empty and duplicate keys are skipped.
private NormalizeCharMap RandomMap()
{
    Random random = Random();
    var builder = new NormalizeCharMap.Builder();
    // NormalizeCharMap.Builder rejects duplicate keys, so remember what we added
    ISet<string> seen = new HashSet<string>();

    int mappingCount = random.Next(5);
    for (int i = 0; i < mappingCount; i++)
    {
        string key = TestUtil.RandomSimpleString(random);
        if (seen.Contains(key) || key.Length == 0)
        {
            continue;
        }
        builder.Add(key, TestUtil.RandomSimpleString(random));
        seen.Add(key);
    }
    return builder.Build();
}
// Checks that a lengthening charfilter mapping ("ü" -> "ue") combined with
// decompounding still produces offsets bounded by the original input.
public virtual void TestInvalidOffsets()
{
    CharArraySet dict = makeDictionary("fall");

    var builder = new NormalizeCharMap.Builder();
    builder.Add("ü", "ue");
    NormalizeCharMap normMap = builder.Build();

    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, dict, normMap);

    // both the full token and the decompounded part map back to the whole
    // 12-char original input
    AssertAnalyzesTo(analyzer, "banküberfall",
        new string[] { "bankueberfall", "fall" },
        new int[] { 0, 0 },
        new int[] { 12, 12 });
}
// Randomized end-to-end test for MappingCharFilter: builds a random set of
// greedy mappings over a restricted alphabet, computes the expected output and
// output->input offset table with a slow reference implementation, then reads
// the filter (in randomly sized chunks) and compares both the text and the
// CorrectOffset() results against the reference.
public virtual void TestRandomMaps2()
{
    Random random = Random();
    int numIterations = AtLeast(3);
    for (int iter = 0; iter < numIterations; iter++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST iter=" + iter);
        }
        // small alphabet ['a'..endLetter] makes rule collisions likely
        char endLetter = (char)TestUtil.NextInt(random, 'b', 'z');
        IDictionary<string, string> map = new Dictionary<string, string>();
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        int numMappings = AtLeast(5);
        if (VERBOSE)
        {
            Console.WriteLine(" mappings:");
        }
        // generate distinct, non-empty keys (values may repeat)
        while (map.Count < numMappings)
        {
            string key = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, 7);
            if (key.Length != 0 && !map.ContainsKey(key))
            {
                string value = TestUtil.RandomSimpleString(random);
                map[key] = value;
                builder.Add(key, value);
                if (VERBOSE)
                {
                    Console.WriteLine(" " + key + " -> " + value);
                }
            }
        }
        NormalizeCharMap charMap = builder.Build();
        if (VERBOSE)
        {
            Console.WriteLine(" test random documents...");
        }
        for (int iter2 = 0; iter2 < 100; iter2++)
        {
            string content = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, AtLeast(1000));
            if (VERBOSE)
            {
                Console.WriteLine(" content=" + content);
            }
            // Do stupid dog-slow mapping:
            // Output string:
            StringBuilder output = new StringBuilder();
            // Maps output offset to input offset:
            IList<int?> inputOffsets = new List<int?>();
            // cumDiff = (input chars consumed) - (output chars produced) so far
            int cumDiff = 0;
            int charIdx = 0;
            while (charIdx < content.Length)
            {
                int matchLen = -1;
                string matchRepl = null;
                // brute-force scan of every rule at the current position
                foreach (KeyValuePair<string, string> ent in map)
                {
                    string match = ent.Key;
                    if (charIdx + match.Length <= content.Length)
                    {
                        int limit = charIdx + match.Length;
                        bool matches = true;
                        for (int charIdx2 = charIdx; charIdx2 < limit; charIdx2++)
                        {
                            if (match[charIdx2 - charIdx] != content[charIdx2])
                            {
                                matches = false;
                                break;
                            }
                        }
                        if (matches)
                        {
                            string repl = ent.Value;
                            if (match.Length > matchLen)
                            {
                                // Greedy: longer match wins
                                matchLen = match.Length;
                                matchRepl = repl;
                            }
                        }
                    }
                }
                if (matchLen != -1)
                {
                    // We found a match here!
                    if (VERBOSE)
                    {
                        Console.WriteLine(" match=" + content.Substring(charIdx, matchLen) + " @ off=" + charIdx + " repl=" + matchRepl);
                    }
                    output.Append(matchRepl);
                    int minLen = Math.Min(matchLen, matchRepl.Length);
                    // Common part, directly maps back to input offset:
                    for (int outIdx = 0; outIdx < minLen; outIdx++)
                    {
                        inputOffsets.Add(output.Length - matchRepl.Length + outIdx + cumDiff);
                    }
                    cumDiff += matchLen - matchRepl.Length;
                    charIdx += matchLen;
                    if (matchRepl.Length < matchLen)
                    {
                        // Replacement string is shorter than matched input: nothing to do
                    }
                    else if (matchRepl.Length > matchLen)
                    {
                        // Replacement string is longer than matched input: for all the
                        // "extra" chars we map back to a single input offset:
                        for (int outIdx = matchLen; outIdx < matchRepl.Length; outIdx++)
                        {
                            inputOffsets.Add(output.Length + cumDiff - 1);
                        }
                    }
                    else
                    {
                        // Same length: no change to offset
                    }
                    Debug.Assert(inputOffsets.Count == output.Length, "inputOffsets.size()=" + inputOffsets.Count + " vs output.length()=" + output.Length);
                }
                else
                {
                    // no rule fired: copy the char through, offset shifts by cumDiff
                    inputOffsets.Add(output.Length + cumDiff);
                    output.Append(content[charIdx]);
                    charIdx++;
                }
            }
            string expected = output.ToString();
            if (VERBOSE)
            {
                Console.Write(" expected:");
                for (int charIdx2 = 0; charIdx2 < expected.Length; charIdx2++)
                {
                    Console.Write(" " + expected[charIdx2] + "/" + inputOffsets[charIdx2]);
                }
                Console.WriteLine();
            }
            MappingCharFilter mapFilter = new MappingCharFilter(charMap, new StringReader(content));
            StringBuilder actualBuilder = new StringBuilder();
            IList<int?> actualInputOffsets = new List<int?>();
            // Now consume the actual mapFilter, somewhat randomly:
            while (true)
            {
                // NOTE(review): random.Next(0, 1) always returns 0 in .NET (the upper
                // bound is exclusive), so the single-char Read() branch below is never
                // exercised — presumably the original used a coin flip; confirm and
                // consider random.Next(0, 2).
                if (random.Next(0, 1) == 1)
                {
                    int ch = mapFilter.Read();
                    if (ch == -1)
                    {
                        break;
                    }
                    actualBuilder.Append((char)ch);
                }
                else
                {
                    // block read into a random window [off, buffer.Length) of the buffer
                    char[] buffer = new char[TestUtil.NextInt(random, 1, 100)];
                    int off = buffer.Length == 1 ? 0 : random.Next(buffer.Length - 1);
                    int count = mapFilter.Read(buffer, off, buffer.Length - off);
                    if (count == -1)
                    {
                        break;
                    }
                    else
                    {
                        actualBuilder.Append(buffer, off, count);
                    }
                }
                if (random.Next(10) == 7)
                {
                    // Map offsets (occasionally, mid-stream)
                    while (actualInputOffsets.Count < actualBuilder.Length)
                    {
                        actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
                    }
                }
            }
            // Finish mapping offsets
            while (actualInputOffsets.Count < actualBuilder.Length)
            {
                actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
            }
            string actual = actualBuilder.ToString();
            // Verify:
            assertEquals(expected, actual);
            assertEquals(inputOffsets, actualInputOffsets);
        }
    }
}
// Randomized end-to-end test for MappingCharFilter: generates random greedy
// mapping rules, computes the expected mapped text and output->input offsets
// with a slow reference loop, then consumes the filter in random-sized reads
// and checks both the produced text and CorrectOffset() against the reference.
public virtual void TestRandomMaps2()
{
    Random random = Random();
    int numIterations = AtLeast(3);
    for (int iter = 0; iter < numIterations; iter++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST iter=" + iter);
        }
        // restrict the alphabet to ['a'..endLetter] so rules overlap often
        char endLetter = (char)TestUtil.NextInt(random, 'b', 'z');
        IDictionary<string, string> map = new Dictionary<string, string>();
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        int numMappings = AtLeast(5);
        if (VERBOSE)
        {
            Console.WriteLine(" mappings:");
        }
        // build distinct, non-empty keys; values are unrestricted
        while (map.Count < numMappings)
        {
            string key = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, 7);
            if (key.Length != 0 && !map.ContainsKey(key))
            {
                string value = TestUtil.RandomSimpleString(random);
                map[key] = value;
                builder.Add(key, value);
                if (VERBOSE)
                {
                    Console.WriteLine(" " + key + " -> " + value);
                }
            }
        }
        NormalizeCharMap charMap = builder.Build();
        if (VERBOSE)
        {
            Console.WriteLine(" test random documents...");
        }
        for (int iter2 = 0; iter2 < 100; iter2++)
        {
            string content = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, AtLeast(1000));
            if (VERBOSE)
            {
                Console.WriteLine(" content=" + content);
            }
            // Do stupid dog-slow mapping:
            // Output string:
            StringBuilder output = new StringBuilder();
            // Maps output offset to input offset:
            IList<int?> inputOffsets = new List<int?>();
            // running difference between input chars consumed and output produced
            int cumDiff = 0;
            int charIdx = 0;
            while (charIdx < content.Length)
            {
                int matchLen = -1;
                string matchRepl = null;
                // try every rule at the current input position
                foreach (KeyValuePair<string, string> ent in map)
                {
                    string match = ent.Key;
                    if (charIdx + match.Length <= content.Length)
                    {
                        int limit = charIdx + match.Length;
                        bool matches = true;
                        for (int charIdx2 = charIdx; charIdx2 < limit; charIdx2++)
                        {
                            if (match[charIdx2 - charIdx] != content[charIdx2])
                            {
                                matches = false;
                                break;
                            }
                        }
                        if (matches)
                        {
                            string repl = ent.Value;
                            if (match.Length > matchLen)
                            {
                                // Greedy: longer match wins
                                matchLen = match.Length;
                                matchRepl = repl;
                            }
                        }
                    }
                }
                if (matchLen != -1)
                {
                    // We found a match here!
                    if (VERBOSE)
                    {
                        Console.WriteLine(" match=" + content.Substring(charIdx, matchLen) + " @ off=" + charIdx + " repl=" + matchRepl);
                    }
                    output.Append(matchRepl);
                    int minLen = Math.Min(matchLen, matchRepl.Length);
                    // Common part, directly maps back to input offset:
                    for (int outIdx = 0; outIdx < minLen; outIdx++)
                    {
                        inputOffsets.Add(output.Length - matchRepl.Length + outIdx + cumDiff);
                    }
                    cumDiff += matchLen - matchRepl.Length;
                    charIdx += matchLen;
                    if (matchRepl.Length < matchLen)
                    {
                        // Replacement string is shorter than matched input: nothing to do
                    }
                    else if (matchRepl.Length > matchLen)
                    {
                        // Replacement string is longer than matched input: for all the
                        // "extra" chars we map back to a single input offset:
                        for (int outIdx = matchLen; outIdx < matchRepl.Length; outIdx++)
                        {
                            inputOffsets.Add(output.Length + cumDiff - 1);
                        }
                    }
                    else
                    {
                        // Same length: no change to offset
                    }
                    Debug.Assert(inputOffsets.Count == output.Length, "inputOffsets.size()=" + inputOffsets.Count + " vs output.length()=" + output.Length);
                }
                else
                {
                    // no rule matched: pass the character through unchanged
                    inputOffsets.Add(output.Length + cumDiff);
                    output.Append(content[charIdx]);
                    charIdx++;
                }
            }
            string expected = output.ToString();
            if (VERBOSE)
            {
                Console.Write(" expected:");
                for (int charIdx2 = 0; charIdx2 < expected.Length; charIdx2++)
                {
                    Console.Write(" " + expected[charIdx2] + "/" + inputOffsets[charIdx2]);
                }
                Console.WriteLine();
            }
            MappingCharFilter mapFilter = new MappingCharFilter(charMap, new StringReader(content));
            StringBuilder actualBuilder = new StringBuilder();
            IList<int?> actualInputOffsets = new List<int?>();
            // Now consume the actual mapFilter, somewhat randomly:
            while (true)
            {
                // NOTE(review): in .NET random.Next(0, 1) can only return 0 (exclusive
                // upper bound), so this condition is always false and the single-char
                // Read() path is dead — likely a nextBoolean() conversion slip; confirm
                // and consider random.Next(0, 2).
                if (random.Next(0, 1) == 1)
                {
                    int ch = mapFilter.Read();
                    if (ch == -1)
                    {
                        break;
                    }
                    actualBuilder.Append((char)ch);
                }
                else
                {
                    // block read into a random window [off, buffer.Length) of the buffer
                    char[] buffer = new char[TestUtil.NextInt(random, 1, 100)];
                    int off = buffer.Length == 1 ? 0 : random.Next(buffer.Length - 1);
                    int count = mapFilter.Read(buffer, off, buffer.Length - off);
                    if (count == -1)
                    {
                        break;
                    }
                    else
                    {
                        actualBuilder.Append(buffer, off, count);
                    }
                }
                if (random.Next(10) == 7)
                {
                    // Map offsets (sometimes, while still reading)
                    while (actualInputOffsets.Count < actualBuilder.Length)
                    {
                        actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
                    }
                }
            }
            // Finish mapping offsets
            while (actualInputOffsets.Count < actualBuilder.Length)
            {
                actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
            }
            string actual = actualBuilder.ToString();
            // Verify:
            assertEquals(expected, actual);
            assertEquals(inputOffsets, actualInputOffsets);
        }
    }
}
// Verifies offsets when a charfilter expands each input char to two chars
// before a bigramming analyzer runs.
public virtual void TestChangedOffsets()
{
    var builder = new NormalizeCharMap.Builder();
    builder.Add("a", "一二");
    builder.Add("b", "二三");
    NormalizeCharMap norm = builder.Build();

    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, norm);

    AssertAnalyzesTo(analyzer, "ab",
        new string[] { "一二", "二二", "二三" },
        new int[] { 0, 0, 1 },
        new int[] { 1, 1, 2 });

    // note: offsets are strange since this is how the charfilter maps them...
    // before bigramming, the 4 tokens look like:
    // { 0, 0, 1, 1 },
    // { 0, 1, 1, 2 }
}