A simplistic CharFilter that applies the mappings contained in a NormalizeCharMap to the character stream, correcting the offsets to account for the resulting changes. Matching is greedy (the longest pattern that matches at a given point wins). The replacement is allowed to be the empty string.
Inheritance: Lucene.Net.Analysis.CharFilters.BaseCharFilter
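The examples below are taken from the Lucene.NET test suite and assume a shared normMap field built during test setup. A minimal sketch of that setup follows; the specific rules are illustrative assumptions inferred from the assertions in the tests (e.g. "aa" => "a", "j" => "jj", "empty" => ""), and the required namespaces (Lucene.Net.Analysis, Lucene.Net.Analysis.CharFilters, System.IO) are assumed to be imported.

    // Register match => replacement rules (rules here are inferred from the
    // assertions in the examples below, not copied from the actual fixture):
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.Add("aa", "a");       // many-to-one mapping
    builder.Add("j", "jj");       // one-to-many mapping
    builder.Add("empty", "");     // the replacement may be the empty string
    NormalizeCharMap normMap = builder.Build();

    // Wrap any TextReader; the filter rewrites the stream and corrects offsets.
    CharFilter cs = new MappingCharFilter(normMap, new StringReader("aa j empty"));
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);

MockTokenizer is a test-framework tokenizer; in production code any Tokenizer can consume the filtered reader in the same way.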
Code Example #1
        public virtual void Test4to2()
        {
            CharFilter  cs = new MappingCharFilter(normMap, new StringReader("cccc"));
            TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);

            AssertTokenStreamContents(ts, new string[] { "cc" }, new int[] { 0 }, new int[] { 4 }, 4);
        }
Code Example #2
        public virtual void TestNonBMPChar()
        {
            CharFilter  cs = new MappingCharFilter(normMap, new StringReader(UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1)));
            TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);

            AssertTokenStreamContents(ts, new string[] { "fclef" }, new int[] { 0 }, new int[] { 2 }, 2);
        }
Code Example #3
        public virtual void Test5to0()
        {
            CharFilter  cs = new MappingCharFilter(normMap, new StringReader("empty"));
            TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);

            AssertTokenStreamContents(ts, new string[0], new int[] { }, new int[] { }, 5);
        }
Code Example #4
        public virtual void TestFullWidthChar()
        {
            CharFilter  cs = new MappingCharFilter(normMap, new StringReader("\uff01"));
            TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);

            AssertTokenStreamContents(ts, new string[] { "full-width-exclamation" }, new int[] { 0 }, new int[] { 1 }, 1);
        }
Code Example #5
        public virtual void Test1to3()
        {
            CharFilter  cs = new MappingCharFilter(normMap, new StringReader("k"));
            TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);

            AssertTokenStreamContents(ts, new string[] { "kkk" }, new int[] { 0 }, new int[] { 1 }, 1);
        }
Code Example #6
        public virtual void Test3to1()
        {
            CharFilter  cs = new MappingCharFilter(normMap, new StringReader("bbb"));
            TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);

            AssertTokenStreamContents(ts, new string[] { "b" }, new int[] { 0 }, new int[] { 3 }, 3);
        }
Code Example #7
        public virtual void TestTokenStream()
        {
            string      testString = "h i j k ll cccc bbb aa";
            CharFilter  cs         = new MappingCharFilter(normMap, new StringReader(testString));
            TokenStream ts         = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);

            AssertTokenStreamContents(ts, new string[] { "i", "i", "jj", "kkk", "llll", "cc", "b", "a" }, new int[] { 0, 2, 4, 6, 8, 11, 16, 20 }, new int[] { 1, 3, 5, 7, 10, 15, 19, 22 }, testString.Length);
        }
Code Example #8
        public virtual void TestChained()
        {
            string      testString = "aaaa ll h";
            CharFilter  cs         = new MappingCharFilter(normMap, new MappingCharFilter(normMap, new StringReader(testString)));
            TokenStream ts         = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);

            AssertTokenStreamContents(ts, new string[] { "a", "llllllll", "i" }, new int[] { 0, 5, 8 }, new int[] { 4, 7, 9 }, testString.Length);
        }
Code Example #9
        public virtual void TestReaderReset()
        {
            CharFilter cs = new MappingCharFilter(normMap, new StringReader("x"));
            char[] buf = new char[10];
            int len = cs.Read(buf, 0, 10);
            assertEquals(1, len);
            assertEquals('x', buf[0]);
            len = cs.Read(buf, 0, 10);
            assertEquals(-1, len);

            // rewind
            cs.Reset();
            len = cs.Read(buf, 0, 10);
            assertEquals(1, len);
            assertEquals('x', buf[0]);
        }
Code Example #10
        public virtual void TestOffsetCorrection()
        {
            const string INPUT = "G&uuml;nther G&uuml;nther is here";

            // create MappingCharFilter
            IList<string> mappingRules = new List<string>();
            mappingRules.Add("\"&uuml;\" => \"ü\"");
            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            builder.Add("&uuml;", "ü");
            NormalizeCharMap normMap = builder.Build();
            CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

            // create PatternTokenizer
            TokenStream stream = new PatternTokenizer(charStream, new Regex("[,;/\\s]+", RegexOptions.Compiled), -1);
            AssertTokenStreamContents(stream, new string[] { "Günther", "Günther", "is", "here" }, new int[] { 0, 13, 26, 29 }, new int[] { 12, 25, 28, 33 }, INPUT.Length);

            charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
            stream = new PatternTokenizer(charStream, new Regex("Günther", RegexOptions.Compiled), 0);
            AssertTokenStreamContents(stream, new string[] { "Günther", "Günther" }, new int[] { 0, 13 }, new int[] { 12, 25 }, INPUT.Length);
        }
Code Example #11
 public override TextReader InitReader(string fieldName, TextReader reader)
 {
     reader = new MockCharFilter(reader, 0);
     reader = new MappingCharFilter(map, reader);
     return reader;
 }
Code Example #12
        public virtual void TestRandomMaps2()
        {
            Random random        = Random();
            int    numIterations = AtLeast(3);

            for (int iter = 0; iter < numIterations; iter++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST iter=" + iter);
                }

                char endLetter = (char)TestUtil.NextInt(random, 'b', 'z');
                IDictionary <string, string> map     = new Dictionary <string, string>();
                NormalizeCharMap.Builder     builder = new NormalizeCharMap.Builder();
                int numMappings = AtLeast(5);
                if (VERBOSE)
                {
                    Console.WriteLine("  mappings:");
                }
                while (map.Count < numMappings)
                {
                    string key = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, 7);
                    if (key.Length != 0 && !map.ContainsKey(key))
                    {
                        string value = TestUtil.RandomSimpleString(random);
                        map[key] = value;
                        builder.Add(key, value);
                        if (VERBOSE)
                        {
                            Console.WriteLine("    " + key + " -> " + value);
                        }
                    }
                }

                NormalizeCharMap charMap = builder.Build();

                if (VERBOSE)
                {
                    Console.WriteLine("  test random documents...");
                }

                for (int iter2 = 0; iter2 < 100; iter2++)
                {
                    string content = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, AtLeast(1000));

                    if (VERBOSE)
                    {
                        Console.WriteLine("  content=" + content);
                    }

                    // Do stupid dog-slow mapping:

                    // Output string:
                    StringBuilder output = new StringBuilder();

                    // Maps output offset to input offset:
                    IList <int?> inputOffsets = new List <int?>();

                    int cumDiff = 0;
                    int charIdx = 0;
                    while (charIdx < content.Length)
                    {
                        int    matchLen  = -1;
                        string matchRepl = null;

                        foreach (KeyValuePair <string, string> ent in map)
                        {
                            string match = ent.Key;
                            if (charIdx + match.Length <= content.Length)
                            {
                                int  limit   = charIdx + match.Length;
                                bool matches = true;
                                for (int charIdx2 = charIdx; charIdx2 < limit; charIdx2++)
                                {
                                    if (match[charIdx2 - charIdx] != content[charIdx2])
                                    {
                                        matches = false;
                                        break;
                                    }
                                }

                                if (matches)
                                {
                                    string repl = ent.Value;
                                    if (match.Length > matchLen)
                                    {
                                        // Greedy: longer match wins
                                        matchLen  = match.Length;
                                        matchRepl = repl;
                                    }
                                }
                            }
                        }

                        if (matchLen != -1)
                        {
                            // We found a match here!
                            if (VERBOSE)
                            {
                                Console.WriteLine("    match=" + content.Substring(charIdx, matchLen) + " @ off=" + charIdx + " repl=" + matchRepl);
                            }
                            output.Append(matchRepl);
                            int minLen = Math.Min(matchLen, matchRepl.Length);

                            // Common part, directly maps back to input
                            // offset:
                            for (int outIdx = 0; outIdx < minLen; outIdx++)
                            {
                                inputOffsets.Add(output.Length - matchRepl.Length + outIdx + cumDiff);
                            }

                            cumDiff += matchLen - matchRepl.Length;
                            charIdx += matchLen;

                            if (matchRepl.Length < matchLen)
                            {
                                // Replacement string is shorter than matched
                                // input: nothing to do
                            }
                            else if (matchRepl.Length > matchLen)
                            {
                                // Replacement string is longer than matched
                                // input: for all the "extra" chars we map
                                // back to a single input offset:
                                for (int outIdx = matchLen; outIdx < matchRepl.Length; outIdx++)
                                {
                                    inputOffsets.Add(output.Length + cumDiff - 1);
                                }
                            }
                            else
                            {
                                // Same length: no change to offset
                            }

                            Debug.Assert(inputOffsets.Count == output.Length, "inputOffsets.size()=" + inputOffsets.Count + " vs output.length()=" + output.Length);
                        }
                        else
                        {
                            inputOffsets.Add(output.Length + cumDiff);
                            output.Append(content[charIdx]);
                            charIdx++;
                        }
                    }

                    string expected = output.ToString();
                    if (VERBOSE)
                    {
                        Console.Write("    expected:");
                        for (int charIdx2 = 0; charIdx2 < expected.Length; charIdx2++)
                        {
                            Console.Write(" " + expected[charIdx2] + "/" + inputOffsets[charIdx2]);
                        }
                        Console.WriteLine();
                    }

                    MappingCharFilter mapFilter          = new MappingCharFilter(charMap, new StringReader(content));
                    StringBuilder     actualBuilder      = new StringBuilder();
                    IList <int?>      actualInputOffsets = new List <int?>();

                    // Now consume the actual mapFilter, somewhat randomly:
                    while (true)
                    {
                        // Randomly alternate between single-character and buffered reads:
                        if (random.Next(0, 2) == 1)
                        {
                            int ch = mapFilter.Read();
                            if (ch == -1)
                            {
                                break;
                            }
                            actualBuilder.Append((char)ch);
                        }
                        else
                        {
                            char[] buffer = new char[TestUtil.NextInt(random, 1, 100)];
                            int    off    = buffer.Length == 1 ? 0 : random.Next(buffer.Length - 1);
                            int    count  = mapFilter.Read(buffer, off, buffer.Length - off);
                            if (count == -1)
                            {
                                break;
                            }
                            else
                            {
                                actualBuilder.Append(buffer, off, count);
                            }
                        }

                        if (random.Next(10) == 7)
                        {
                            // Map offsets
                            while (actualInputOffsets.Count < actualBuilder.Length)
                            {
                                actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
                            }
                        }
                    }

                    // Finish mapping offsets
                    while (actualInputOffsets.Count < actualBuilder.Length)
                    {
                        actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
                    }

                    string actual = actualBuilder.ToString();

                    // Verify:
                    assertEquals(expected, actual);
                    assertEquals(inputOffsets, actualInputOffsets);
                }
            }
        }
Code Example #13
 public virtual void TestNormalizeWinDelimToLinuxDelim()
 {
     NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
     builder.Add("\\", "/");
     NormalizeCharMap normMap = builder.Build();
     string path = "c:\\a\\b\\c";
     Reader cs = new MappingCharFilter(normMap, new StringReader(path));
     PathHierarchyTokenizer t = new PathHierarchyTokenizer(cs);
     AssertTokenStreamContents(t, new string[] { "c:", "c:/a", "c:/a/b", "c:/a/b/c" }, new int[] { 0, 0, 0, 0 }, new int[] { 2, 4, 6, 8 }, new int[] { 1, 0, 0, 0 }, path.Length);
 }
Code Example #14
 public virtual void Test1to2()
 {
     CharFilter cs = new MappingCharFilter(normMap, new StringReader("j"));
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     AssertTokenStreamContents(ts, new string[] { "jj" }, new int[] { 0 }, new int[] { 1 }, 1);
 }