Builds an NormalizeCharMap.

Call add() until you have added all the mappings, then call build() to get a NormalizeCharMap @lucene.experimental

Example #1
0
        // TODO: this should use inputstreams from the loader, not File!
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public void inform(org.apache.lucene.analysis.util.ResourceLoader loader) throws java.io.IOException
        public virtual void inform(ResourceLoader loader)
        {
            if (mapping != null)
            {
                IList <string> wlist       = null;
                File           mappingFile = new File(mapping);
                if (mappingFile.exists())
                {
                    wlist = getLines(loader, mapping);
                }
                else
                {
                    IList <string> files = splitFileNames(mapping);
                    wlist = new List <>();
                    foreach (string file in files)
                    {
                        IList <string> lines = getLines(loader, file.Trim());
                        wlist.AddRange(lines);
                    }
                }
                NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
                parseRules(wlist, builder);
                normMap = builder.build();
                if (normMap.map == null)
                {
                    // if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
                    // so just set the whole map to null
                    normMap = null;
                }
            }
        }
 // TODO: this should use inputstreams from the loader, not File!
 public virtual void Inform(IResourceLoader loader)
 {
     if (mapping != null)
     {
         IList<string> wlist = null;
         if (File.Exists(mapping))
         {
             wlist = new List<string>(GetLines(loader, mapping));
         }
         else
         {
             var files = SplitFileNames(mapping);
             wlist = new List<string>();
             foreach (string file in files)
             {
                 var lines = GetLines(loader, file.Trim());
                 wlist.AddRange(lines);
             }
         }
         NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
         ParseRules(wlist, builder);
         normMap = builder.Build();
         if (normMap.map == null)
         {
             // if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
             // so just set the whole map to null
             normMap = null;
         }
     }
 }
 // TODO: this should use inputstreams from the loader, not File!
 public virtual void Inform(IResourceLoader loader)
 {
     if (mapping != null)
     {
         IList <string> wlist = null;
         if (File.Exists(mapping))
         {
             wlist = new List <string>(GetLines(loader, mapping));
         }
         else
         {
             var files = SplitFileNames(mapping);
             wlist = new List <string>();
             foreach (string file in files)
             {
                 var lines = GetLines(loader, file.Trim());
                 wlist.AddRange(lines);
             }
         }
         NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
         ParseRules(wlist, builder);
         m_normMap = builder.Build();
         if (m_normMap.map == null)
         {
             // if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
             // so just set the whole map to null
             m_normMap = null;
         }
     }
 }
 protected virtual void ParseRules(IList <string> rules, NormalizeCharMap.Builder builder)
 {
     foreach (string rule in rules)
     {
         Match m = p.Match(rule);
         if (!m.Success)
         {
             throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "], file = " + mapping);
         }
         builder.Add(ParseString(m.Groups[1].Value), ParseString(m.Groups[2].Value));
     }
 }
Example #5
0
 protected internal virtual void parseRules(IList <string> rules, NormalizeCharMap.Builder builder)
 {
     foreach (string rule in rules)
     {
         Matcher m = p.matcher(rule);
         if (!m.find())
         {
             throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "], file = " + mapping);
         }
         builder.add(parseString(m.group(1)), parseString(m.group(2)));
     }
 }
Example #6
0
        public virtual void TestFinalOffsetSpecialCase()
        {
            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            builder.Add("t", "");
            // even though this below rule has no effect, the test passes if you remove it!!
            builder.Add("tmakdbl", "c");

            NormalizeCharMap map = builder.Build();

            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper2(this, map);

            string text = "gzw f quaxot";

            CheckAnalysisConsistency(Random(), analyzer, false, text);
        }
        public virtual void Test()
        {
            CharArraySet cas = new CharArraySet(TEST_VERSION_CURRENT, 3, false);
            cas.add("jjp");
            cas.add("wlmwoknt");
            cas.add("tcgyreo");

            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            builder.Add("mtqlpi", "");
            builder.Add("mwoknt", "jjp");
            builder.Add("tcgyreo", "zpfpajyws");
            NormalizeCharMap map = builder.Build();

            Analyzer a = new AnalyzerAnonymousInnerClassHelper(this, cas, map);
            CheckAnalysisConsistency(Random(), a, false, "wmgddzunizdomqyj");
        }
        public virtual void TestFinalOffsetSpecialCase()
        {
            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            builder.Add("t", "");
            // even though this below rule has no effect, the test passes if you remove it!!
            builder.Add("tmakdbl", "c");

            NormalizeCharMap map = builder.Build();

            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                return(new TokenStreamComponents(tokenizer, tokenizer));
            }, initReader: (fieldName, reader) => new MappingCharFilter(map, reader));

            string text = "gzw f quaxot";

            CheckAnalysisConsistency(Random, analyzer, false, text);
        }
        public virtual void TestOffsetCorrection()
        {
            const string INPUT = "G&uuml;nther G&uuml;nther is here";

            // create MappingCharFilter
            IList<string> mappingRules = new List<string>();
            mappingRules.Add("\"&uuml;\" => \"ü\"");
            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            builder.Add("&uuml;", "ü");
            NormalizeCharMap normMap = builder.Build();
            CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

            // create PatternTokenizer
            TokenStream stream = new PatternTokenizer(charStream, new Regex("[,;/\\s]+", RegexOptions.Compiled), -1);
            AssertTokenStreamContents(stream, new string[] { "Günther", "Günther", "is", "here" }, new int[] { 0, 13, 26, 29 }, new int[] { 12, 25, 28, 33 }, INPUT.Length);

            charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
            stream = new PatternTokenizer(charStream, new Regex("Günther", RegexOptions.Compiled), 0);
            AssertTokenStreamContents(stream, new string[] { "Günther", "Günther" }, new int[] { 0, 13 }, new int[] { 12, 25 }, INPUT.Length);
        }
Example #10
0
        private NormalizeCharMap RandomMap()
        {
            Random random = Random();

            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            // we can't add duplicate keys, or NormalizeCharMap gets angry
            ISet <string> keys = new HashSet <string>();
            int           num  = random.Next(5);

            //System.out.println("NormalizeCharMap=");
            for (int i = 0; i < num; i++)
            {
                string key = TestUtil.RandomSimpleString(random);
                if (!keys.Contains(key) && key.Length != 0)
                {
                    string value = TestUtil.RandomSimpleString(random);
                    builder.Add(key, value);
                    keys.Add(key);
                    //System.out.println("mapping: '" + key + "' => '" + value + "'");
                }
            }
            return(builder.Build());
        }
Example #11
0
        public override void SetUp()
        {
            base.SetUp();
            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();

            builder.Add("aa", "a");
            builder.Add("bbb", "b");
            builder.Add("cccc", "cc");

            builder.Add("h", "i");
            builder.Add("j", "jj");
            builder.Add("k", "kkk");
            builder.Add("ll", "llll");

            builder.Add("empty", "");

            // BMP (surrogate pair):
            builder.Add(UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1), "fclef");

            builder.Add("\uff01", "full-width-exclamation");

            normMap = builder.Build();
        }
        public override void SetUp()
        {
            base.SetUp();
            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();

            builder.Add("aa", "a");
            builder.Add("bbb", "b");
            builder.Add("cccc", "cc");

            builder.Add("h", "i");
            builder.Add("j", "jj");
            builder.Add("k", "kkk");
            builder.Add("ll", "llll");

            builder.Add("empty", "");

            // BMP (surrogate pair):
            builder.Add(UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1), "fclef");

            builder.Add("\uff01", "full-width-exclamation");

            normMap = builder.Build();
        }
 public virtual void TestNormalizeWinDelimToLinuxDelim()
 {
     NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
     builder.Add("\\", "/");
     NormalizeCharMap normMap = builder.Build();
     string path = "c:\\a\\b\\c";
     Reader cs = new MappingCharFilter(normMap, new StringReader(path));
     PathHierarchyTokenizer t = new PathHierarchyTokenizer(cs);
     AssertTokenStreamContents(t, new string[] { "c:", "c:/a", "c:/a/b", "c:/a/b/c" }, new int[] { 0, 0, 0, 0 }, new int[] { 2, 4, 6, 8 }, new int[] { 1, 0, 0, 0 }, path.Length);
 }
 private NormalizeCharMap RandomMap()
 {
     Random random = Random();
     NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
     // we can't add duplicate keys, or NormalizeCharMap gets angry
     ISet<string> keys = new HashSet<string>();
     int num = random.Next(5);
     //System.out.println("NormalizeCharMap=");
     for (int i = 0; i < num; i++)
     {
         string key = TestUtil.RandomSimpleString(random);
         if (!keys.Contains(key) && key.Length != 0)
         {
             string value = TestUtil.RandomSimpleString(random);
             builder.Add(key, value);
             keys.Add(key);
             //System.out.println("mapping: '" + key + "' => '" + value + "'");
         }
     }
     return builder.Build();
 }
        public virtual void TestInvalidOffsets()
        {
            CharArraySet dict = makeDictionary("fall");
            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            builder.Add("ü", "ue");
            NormalizeCharMap normMap = builder.Build();

            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, dict, normMap);

            AssertAnalyzesTo(analyzer, "banküberfall", new string[] { "bankueberfall", "fall" }, new int[] { 0, 0 }, new int[] { 12, 12 });
        }
        public virtual void TestRandomMaps2()
        {
            Random random = Random();
            int numIterations = AtLeast(3);
            for (int iter = 0; iter < numIterations; iter++)
            {

                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST iter=" + iter);
                }

                char endLetter = (char)TestUtil.NextInt(random, 'b', 'z');
                IDictionary<string, string> map = new Dictionary<string, string>();
                NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
                int numMappings = AtLeast(5);
                if (VERBOSE)
                {
                    Console.WriteLine("  mappings:");
                }
                while (map.Count < numMappings)
                {
                    string key = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, 7);
                    if (key.Length != 0 && !map.ContainsKey(key))
                    {
                        string value = TestUtil.RandomSimpleString(random);
                        map[key] = value;
                        builder.Add(key, value);
                        if (VERBOSE)
                        {
                            Console.WriteLine("    " + key + " -> " + value);
                        }
                    }
                }

                NormalizeCharMap charMap = builder.Build();

                if (VERBOSE)
                {
                    Console.WriteLine("  test random documents...");
                }

                for (int iter2 = 0; iter2 < 100; iter2++)
                {
                    string content = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, AtLeast(1000));

                    if (VERBOSE)
                    {
                        Console.WriteLine("  content=" + content);
                    }

                    // Do stupid dog-slow mapping:

                    // Output string:
                    StringBuilder output = new StringBuilder();

                    // Maps output offset to input offset:
                    IList<int?> inputOffsets = new List<int?>();

                    int cumDiff = 0;
                    int charIdx = 0;
                    while (charIdx < content.Length)
                    {

                        int matchLen = -1;
                        string matchRepl = null;

                        foreach (KeyValuePair<string, string> ent in map)
                        {
                            string match = ent.Key;
                            if (charIdx + match.Length <= content.Length)
                            {
                                int limit = charIdx + match.Length;
                                bool matches = true;
                                for (int charIdx2 = charIdx; charIdx2 < limit; charIdx2++)
                                {
                                    if (match[charIdx2 - charIdx] != content[charIdx2])
                                    {
                                        matches = false;
                                        break;
                                    }
                                }

                                if (matches)
                                {
                                    string repl = ent.Value;
                                    if (match.Length > matchLen)
                                    {
                                        // Greedy: longer match wins
                                        matchLen = match.Length;
                                        matchRepl = repl;
                                    }
                                }
                            }
                        }

                        if (matchLen != -1)
                        {
                            // We found a match here!
                            if (VERBOSE)
                            {
                                Console.WriteLine("    match=" + content.Substring(charIdx, matchLen) + " @ off=" + charIdx + " repl=" + matchRepl);
                            }
                            output.Append(matchRepl);
                            int minLen = Math.Min(matchLen, matchRepl.Length);

                            // Common part, directly maps back to input
                            // offset:
                            for (int outIdx = 0; outIdx < minLen; outIdx++)
                            {
                                inputOffsets.Add(output.Length - matchRepl.Length + outIdx + cumDiff);
                            }

                            cumDiff += matchLen - matchRepl.Length;
                            charIdx += matchLen;

                            if (matchRepl.Length < matchLen)
                            {
                                // Replacement string is shorter than matched
                                // input: nothing to do
                            }
                            else if (matchRepl.Length > matchLen)
                            {
                                // Replacement string is longer than matched
                                // input: for all the "extra" chars we map
                                // back to a single input offset:
                                for (int outIdx = matchLen; outIdx < matchRepl.Length; outIdx++)
                                {
                                    inputOffsets.Add(output.Length + cumDiff - 1);
                                }
                            }
                            else
                            {
                                // Same length: no change to offset
                            }

                            Debug.Assert(inputOffsets.Count == output.Length, "inputOffsets.size()=" + inputOffsets.Count + " vs output.length()=" + output.Length);
                        }
                        else
                        {
                            inputOffsets.Add(output.Length + cumDiff);
                            output.Append(content[charIdx]);
                            charIdx++;
                        }
                    }

                    string expected = output.ToString();
                    if (VERBOSE)
                    {
                        Console.Write("    expected:");
                        for (int charIdx2 = 0; charIdx2 < expected.Length; charIdx2++)
                        {
                            Console.Write(" " + expected[charIdx2] + "/" + inputOffsets[charIdx2]);
                        }
                        Console.WriteLine();
                    }

                    MappingCharFilter mapFilter = new MappingCharFilter(charMap, new StringReader(content));
                    StringBuilder actualBuilder = new StringBuilder();
                    IList<int?> actualInputOffsets = new List<int?>();

                    // Now consume the actual mapFilter, somewhat randomly:
                    while (true)
                    {
                        if (random.Next(0, 1) == 1)
                        {
                            int ch = mapFilter.Read();
                            if (ch == -1)
                            {
                                break;
                            }
                            actualBuilder.Append((char)ch);
                        }
                        else
                        {
                            char[] buffer = new char[TestUtil.NextInt(random, 1, 100)];
                            int off = buffer.Length == 1 ? 0 : random.Next(buffer.Length - 1);
                            int count = mapFilter.Read(buffer, off, buffer.Length - off);
                            if (count == -1)
                            {
                                break;
                            }
                            else
                            {
                                actualBuilder.Append(buffer, off, count);
                            }
                        }

                        if (random.Next(10) == 7)
                        {
                            // Map offsets
                            while (actualInputOffsets.Count < actualBuilder.Length)
                            {
                                actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
                            }
                        }
                    }

                    // Finish mappping offsets
                    while (actualInputOffsets.Count < actualBuilder.Length)
                    {
                        actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
                    }

                    string actual = actualBuilder.ToString();

                    // Verify:
                    assertEquals(expected, actual);
                    assertEquals(inputOffsets, actualInputOffsets);
                }
            }
        }
Example #17
0
        public virtual void TestRandomMaps2()
        {
            Random random        = Random();
            int    numIterations = AtLeast(3);

            for (int iter = 0; iter < numIterations; iter++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST iter=" + iter);
                }

                char endLetter = (char)TestUtil.NextInt(random, 'b', 'z');
                IDictionary <string, string> map     = new Dictionary <string, string>();
                NormalizeCharMap.Builder     builder = new NormalizeCharMap.Builder();
                int numMappings = AtLeast(5);
                if (VERBOSE)
                {
                    Console.WriteLine("  mappings:");
                }
                while (map.Count < numMappings)
                {
                    string key = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, 7);
                    if (key.Length != 0 && !map.ContainsKey(key))
                    {
                        string value = TestUtil.RandomSimpleString(random);
                        map[key] = value;
                        builder.Add(key, value);
                        if (VERBOSE)
                        {
                            Console.WriteLine("    " + key + " -> " + value);
                        }
                    }
                }

                NormalizeCharMap charMap = builder.Build();

                if (VERBOSE)
                {
                    Console.WriteLine("  test random documents...");
                }

                for (int iter2 = 0; iter2 < 100; iter2++)
                {
                    string content = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, AtLeast(1000));

                    if (VERBOSE)
                    {
                        Console.WriteLine("  content=" + content);
                    }

                    // Do stupid dog-slow mapping:

                    // Output string:
                    StringBuilder output = new StringBuilder();

                    // Maps output offset to input offset:
                    IList <int?> inputOffsets = new List <int?>();

                    int cumDiff = 0;
                    int charIdx = 0;
                    while (charIdx < content.Length)
                    {
                        int    matchLen  = -1;
                        string matchRepl = null;

                        foreach (KeyValuePair <string, string> ent in map)
                        {
                            string match = ent.Key;
                            if (charIdx + match.Length <= content.Length)
                            {
                                int  limit   = charIdx + match.Length;
                                bool matches = true;
                                for (int charIdx2 = charIdx; charIdx2 < limit; charIdx2++)
                                {
                                    if (match[charIdx2 - charIdx] != content[charIdx2])
                                    {
                                        matches = false;
                                        break;
                                    }
                                }

                                if (matches)
                                {
                                    string repl = ent.Value;
                                    if (match.Length > matchLen)
                                    {
                                        // Greedy: longer match wins
                                        matchLen  = match.Length;
                                        matchRepl = repl;
                                    }
                                }
                            }
                        }

                        if (matchLen != -1)
                        {
                            // We found a match here!
                            if (VERBOSE)
                            {
                                Console.WriteLine("    match=" + content.Substring(charIdx, matchLen) + " @ off=" + charIdx + " repl=" + matchRepl);
                            }
                            output.Append(matchRepl);
                            int minLen = Math.Min(matchLen, matchRepl.Length);

                            // Common part, directly maps back to input
                            // offset:
                            for (int outIdx = 0; outIdx < minLen; outIdx++)
                            {
                                inputOffsets.Add(output.Length - matchRepl.Length + outIdx + cumDiff);
                            }

                            cumDiff += matchLen - matchRepl.Length;
                            charIdx += matchLen;

                            if (matchRepl.Length < matchLen)
                            {
                                // Replacement string is shorter than matched
                                // input: nothing to do
                            }
                            else if (matchRepl.Length > matchLen)
                            {
                                // Replacement string is longer than matched
                                // input: for all the "extra" chars we map
                                // back to a single input offset:
                                for (int outIdx = matchLen; outIdx < matchRepl.Length; outIdx++)
                                {
                                    inputOffsets.Add(output.Length + cumDiff - 1);
                                }
                            }
                            else
                            {
                                // Same length: no change to offset
                            }

                            Debug.Assert(inputOffsets.Count == output.Length, "inputOffsets.size()=" + inputOffsets.Count + " vs output.length()=" + output.Length);
                        }
                        else
                        {
                            inputOffsets.Add(output.Length + cumDiff);
                            output.Append(content[charIdx]);
                            charIdx++;
                        }
                    }

                    string expected = output.ToString();
                    if (VERBOSE)
                    {
                        Console.Write("    expected:");
                        for (int charIdx2 = 0; charIdx2 < expected.Length; charIdx2++)
                        {
                            Console.Write(" " + expected[charIdx2] + "/" + inputOffsets[charIdx2]);
                        }
                        Console.WriteLine();
                    }

                    MappingCharFilter mapFilter          = new MappingCharFilter(charMap, new StringReader(content));
                    StringBuilder     actualBuilder      = new StringBuilder();
                    IList <int?>      actualInputOffsets = new List <int?>();

                    // Now consume the actual mapFilter, somewhat randomly:
                    while (true)
                    {
                        if (random.Next(0, 1) == 1)
                        {
                            int ch = mapFilter.Read();
                            if (ch == -1)
                            {
                                break;
                            }
                            actualBuilder.Append((char)ch);
                        }
                        else
                        {
                            char[] buffer = new char[TestUtil.NextInt(random, 1, 100)];
                            int    off    = buffer.Length == 1 ? 0 : random.Next(buffer.Length - 1);
                            int    count  = mapFilter.Read(buffer, off, buffer.Length - off);
                            if (count == -1)
                            {
                                break;
                            }
                            else
                            {
                                actualBuilder.Append(buffer, off, count);
                            }
                        }

                        if (random.Next(10) == 7)
                        {
                            // Map offsets
                            while (actualInputOffsets.Count < actualBuilder.Length)
                            {
                                actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
                            }
                        }
                    }

                    // Finish mappping offsets
                    while (actualInputOffsets.Count < actualBuilder.Length)
                    {
                        actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
                    }

                    string actual = actualBuilder.ToString();

                    // Verify:
                    assertEquals(expected, actual);
                    assertEquals(inputOffsets, actualInputOffsets);
                }
            }
        }
        public virtual void TestChangedOffsets()
        {
            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            builder.Add("a", "一二");
            builder.Add("b", "二三");
            NormalizeCharMap norm = builder.Build();
            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, norm);

            AssertAnalyzesTo(analyzer, "ab", new string[] { "一二", "二二", "二三" }, new int[] { 0, 0, 1 }, new int[] { 1, 1, 2 });

            // note: offsets are strange since this is how the charfilter maps them... 
            // before bigramming, the 4 tokens look like:
            //   { 0, 0, 1, 1 },
            //   { 0, 1, 1, 2 }
        }
        public virtual void TestFinalOffsetSpecialCase()
        {
            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            builder.Add("t", "");
            // even though this below rule has no effect, the test passes if you remove it!!
            builder.Add("tmakdbl", "c");

            NormalizeCharMap map = builder.Build();

            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper2(this, map);

            string text = "gzw f quaxot";
            CheckAnalysisConsistency(Random(), analyzer, false, text);
        }