Holds a map of String input to String output, to be used with MappingCharFilter. Use the Builder to create this.
 // TODO: this should use inputstreams from the loader, not File!
 /// <summary>
 /// Reads the mapping rules named by <c>mapping</c> (when it is set), feeds them to a
 /// <see cref="NormalizeCharMap.Builder"/> and stores the result in <c>m_normMap</c>.
 /// </summary>
 /// <param name="loader">Resource loader handed to <c>GetLines</c> to read the rule lines.</param>
 public virtual void Inform(IResourceLoader loader)
 {
     if (mapping == null)
     {
         // Nothing configured: leave the current map untouched.
         return;
     }

     IList<string> ruleLines;
     if (!File.Exists(mapping))
     {
         // Not an existing file on disk: split the value into several file names
         // and concatenate the lines of each one.
         var names = SplitFileNames(mapping);
         ruleLines = new List<string>();
         foreach (string name in names)
         {
             ruleLines.AddRange(GetLines(loader, name.Trim()));
         }
     }
     else
     {
         ruleLines = new List<string>(GetLines(loader, mapping));
     }

     var mapBuilder = new NormalizeCharMap.Builder();
     ParseRules(ruleLines, mapBuilder);
     m_normMap = mapBuilder.Build();

     // A null inner FST means the map accepts nothing (e.g. the file is empty),
     // so the whole map is dropped.
     if (m_normMap.map == null)
     {
         m_normMap = null;
     }
 }
 // TODO: this should use inputstreams from the loader, not File!
 /// <summary>
 /// Builds <c>normMap</c> from the rule lines found via <c>mapping</c>; does nothing when
 /// <c>mapping</c> is <c>null</c>.
 /// </summary>
 /// <param name="loader">Resource loader passed through to <c>GetLines</c>.</param>
 public virtual void Inform(IResourceLoader loader)
 {
     if (mapping != null)
     {
         IList<string> collected;
         if (File.Exists(mapping))
         {
             // The value names one existing file: take its lines as-is.
             collected = new List<string>(GetLines(loader, mapping));
         }
         else
         {
             // Otherwise treat the value as several file names and gather all their lines.
             var fileNames = SplitFileNames(mapping);
             collected = new List<string>();
             foreach (string fileName in fileNames)
             {
                 var fileLines = GetLines(loader, fileName.Trim());
                 collected.AddRange(fileLines);
             }
         }

         var charMapBuilder = new NormalizeCharMap.Builder();
         ParseRules(collected, charMapBuilder);
         normMap = charMapBuilder.Build();

         // If the inner FST is null the map accepts nothing (e.g. the file is empty),
         // so the whole map is set to null.
         if (normMap.map == null)
         {
             normMap = null;
         }
     }
 }
Beispiel #3
0
        // TODO: this should use inputstreams from the loader, not File!
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public void inform(org.apache.lucene.analysis.util.ResourceLoader loader) throws java.io.IOException
        /// <summary>
        /// Reads the mapping rules named by <c>mapping</c> (if it is set) and builds
        /// <c>normMap</c> from them.
        /// </summary>
        /// <param name="loader">Resource loader passed to <c>getLines</c>.</param>
        public virtual void inform(ResourceLoader loader)
        {
            if (mapping != null)
            {
                IList<string> wlist = null;
                // System.IO.File is a static class in .NET, so the existence check is a
                // static call rather than the Java-style "new File(path).exists()".
                if (File.Exists(mapping))
                {
                    wlist = getLines(loader, mapping);
                }
                else
                {
                    IList<string> files = splitFileNames(mapping);
                    // C# has no diamond operator; the element type must be spelled out.
                    wlist = new List<string>();
                    foreach (string file in files)
                    {
                        IList<string> lines = getLines(loader, file.Trim());
                        wlist.AddRange(lines);
                    }
                }
                NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
                parseRules(wlist, builder);
                normMap = builder.build();
                if (normMap.map == null)
                {
                    // if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
                    // so just set the whole map to null
                    normMap = null;
                }
            }
        }
Beispiel #4
0
        // Runs the random-data check against analyzers built from several random maps.
        public virtual void TestRandomMaps()
        {
            const int roundsPerMap = 100;
            int remaining = AtLeast(3);

            while (remaining-- > 0)
            {
                // A fresh random map (and an analyzer wrapping it) for every iteration.
                NormalizeCharMap randomMap = RandomMap();
                Analyzer underTest = new AnalyzerAnonymousInnerClassHelper3(this, randomMap);
                CheckRandomData(Random(), underTest, roundsPerMap);
            }
        }
Beispiel #5
0
        // Checks analysis consistency for a fixed text with a two-rule map.
        public virtual void TestFinalOffsetSpecialCase()
        {
            NormalizeCharMap.Builder rules = new NormalizeCharMap.Builder();
            rules.Add("t", "");
            // This second rule has no effect on the text below, yet the test passes if it is removed!!
            rules.Add("tmakdbl", "c");
            NormalizeCharMap charMap = rules.Build();

            Analyzer underTest = new AnalyzerAnonymousInnerClassHelper2(this, charMap);

            CheckAnalysisConsistency(Random(), underTest, false, "gzw f quaxot");
        }
        // Runs the random-data check against analyzers built from several random maps.
        public virtual void TestRandomMaps()
        {
            const int roundsPerMap = 100;
            int remaining = AtLeast(3);

            while (remaining-- > 0)
            {
                NormalizeCharMap randomMap = RandomMap();

                // Whitespace tokenizer whose input reader is wrapped in a MappingCharFilter over the random map.
                Analyzer underTest = Analyzer.NewAnonymous(
                    createComponents: (field, input) =>
                    {
                        Tokenizer source = new MockTokenizer(input, MockTokenizer.WHITESPACE, false);
                        return new TokenStreamComponents(source, source);
                    },
                    initReader: (field, input) => new MappingCharFilter(randomMap, input));

                CheckRandomData(Random, underTest, roundsPerMap);
            }
        }
Beispiel #7
0
        /// <summary>
        /// Creates a filter over the given <see cref="TextReader"/>, backed by the given map.
        /// </summary>
        /// <param name="normMap">Supplies the <c>map</c> and <c>cachedRootArcs</c> stored by this filter.</param>
        /// <param name="in">Reader passed to the base class and used to reset the internal buffer.</param>
        public MappingCharFilter(NormalizeCharMap normMap, TextReader @in)
            : base(@in)
        {
            buffer.Reset(@in);

            map = normMap.map;
            cachedRootArcs = normMap.cachedRootArcs;

            // A reader over the map only exists when there is a map to read.
            fstReader = (map != null) ? map.BytesReader : null;
        }
        /// <summary>
        /// Builds a filter that reads from <paramref name="in"/> and uses the data held by <paramref name="normMap"/>.
        /// </summary>
        /// <param name="normMap">Source of the <c>map</c> and <c>cachedRootArcs</c> copied into this filter.</param>
        /// <param name="in">The <see cref="TextReader"/> given to the base class and to the buffer.</param>
        public MappingCharFilter(NormalizeCharMap normMap, TextReader @in) : base(@in)
        {
            buffer.Reset(@in);

            map = normMap.map;
            cachedRootArcs = normMap.cachedRootArcs;

            if (map == null)
            {
                // No map, hence no reader over it.
                fstReader = null;
            }
            else
            {
                fstReader = map.BytesReader;
            }
        }
        // Checks analysis consistency for a fixed text with a two-rule map.
        public virtual void TestFinalOffsetSpecialCase()
        {
            NormalizeCharMap.Builder rules = new NormalizeCharMap.Builder();
            rules.Add("t", "");
            // This second rule has no effect on the text below, yet the test passes if it is removed!!
            rules.Add("tmakdbl", "c");
            NormalizeCharMap charMap = rules.Build();

            // Whitespace tokenizer whose input reader is wrapped in a MappingCharFilter over the map.
            Analyzer underTest = Analyzer.NewAnonymous(
                createComponents: (field, input) =>
                {
                    Tokenizer source = new MockTokenizer(input, MockTokenizer.WHITESPACE, false);
                    return new TokenStreamComponents(source, source);
                },
                initReader: (field, input) => new MappingCharFilter(charMap, input));

            CheckAnalysisConsistency(Random, underTest, false, "gzw f quaxot");
        }
        /// <summary>
        /// Creates a filter over the given <see cref="TextReader"/>, backed by the given map.
        /// </summary>
        /// <param name="normMap">Supplies the <c>map</c> and <c>cachedRootArcs</c> stored by this filter.</param>
        /// <param name="in">Reader passed to the base class and wrapped via <c>GetBufferedReader</c>.</param>
        public MappingCharFilter(NormalizeCharMap normMap, TextReader @in)
            : base(@in)
        {
            // LUCENENET: wrap the reader and mark it so it can be reset; the buffer is
            // bound to the wrapped reader rather than to @in directly.
            _input = GetBufferedReader(@in);
            _input.Mark(BufferedCharFilter.defaultCharBufferSize);
            buffer.Reset(_input);

            map = normMap.map;
            cachedRootArcs = normMap.cachedRootArcs;

            // A reader over the map only exists when there is a map to read.
            fstReader = (map != null) ? map.BytesReader : null;
        }
Beispiel #11
0
        /// <summary>
        /// Builds a filter that reads from <paramref name="in"/> and uses the data held by <paramref name="normMap"/>.
        /// </summary>
        /// <param name="normMap">Source of the <c>map</c> and <c>cachedRootArcs</c> copied into this filter.</param>
        /// <param name="in">The <see cref="TextReader"/> given to the base class and wrapped via <c>GetBufferedReader</c>.</param>
        public MappingCharFilter(NormalizeCharMap normMap, TextReader @in) : base(@in)
        {
            // LUCENENET: the buffer works on a marked, buffered wrapper of @in so the reader can be reset.
            _input = GetBufferedReader(@in);
            _input.Mark(BufferedCharFilter.defaultCharBufferSize);
            buffer.Reset(_input);

            map = normMap.map;
            cachedRootArcs = normMap.cachedRootArcs;

            if (map == null)
            {
                // No map, hence no reader over it.
                fstReader = null;
            }
            else
            {
                fstReader = map.BytesReader;
            }
        }
Beispiel #12
0
        /// <summary>
        /// Creates a filter over the given <see cref="TextReader"/>, backed by the given map.
        /// </summary>
        /// <param name="normMap">Supplies the <c>map</c> and <c>cachedRootArcs</c> stored by this filter.</param>
        /// <param name="in">Reader passed to the base class and wrapped via <c>GetBufferedReader</c>.</param>
        public MappingCharFilter(NormalizeCharMap normMap, TextReader @in) : base(@in)
        {
            // LUCENENET: wrap and mark the reader so it can be reset, then bind the
            // buffer to the wrapper instead of to @in.
            _input = GetBufferedReader(@in);
            _input.Mark(BufferedCharFilter.DEFAULT_CHAR_BUFFER_SIZE);
            buffer.Reset(_input);

            map = normMap.map;
            cachedRootArcs = normMap.cachedRootArcs;

            // A bytes reader is only requested when a map is present.
            fstReader = (map != null) ? map.GetBytesReader() : null;
        }
        // Builds the shared normMap fixture from a fixed table of input -> output rules.
        public override void SetUp()
        {
            base.SetUp();
            NormalizeCharMap.Builder ruleBuilder = new NormalizeCharMap.Builder();

            // Rules are added in table order; column 0 is the input, column 1 the output.
            string[,] rules =
            {
                { "aa", "a" },
                { "bbb", "b" },
                { "cccc", "cc" },

                { "h", "i" },
                { "j", "jj" },
                { "k", "kkk" },
                { "ll", "llll" },

                { "empty", "" },

                // Code point U+1D122 lies above U+FFFF, i.e. it is a surrogate pair in UTF-16.
                { UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1), "fclef" },

                { "\uff01", "full-width-exclamation" },
            };

            for (int row = 0; row < rules.GetLength(0); row++)
            {
                ruleBuilder.Add(rules[row, 0], rules[row, 1]);
            }

            normMap = ruleBuilder.Build();
        }
Beispiel #14
0
        // Builds the shared normMap fixture used by the tests in this class.
        public override void SetUp()
        {
            base.SetUp();

            var mappings = new NormalizeCharMap.Builder();

            // Replacements shorter than the matched input.
            mappings.Add("aa", "a");
            mappings.Add("bbb", "b");
            mappings.Add("cccc", "cc");

            // Replacements as long as, or longer than, the matched input.
            mappings.Add("h", "i");
            mappings.Add("j", "jj");
            mappings.Add("k", "kkk");
            mappings.Add("ll", "llll");

            // Replacement by the empty string.
            mappings.Add("empty", "");

            // Code point U+1D122 lies above U+FFFF, i.e. it is a surrogate pair in UTF-16.
            mappings.Add(UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1), "fclef");

            mappings.Add("\uff01", "full-width-exclamation");

            normMap = mappings.Build();
        }
Beispiel #15
0
        // Cross-checks MappingCharFilter against a brute-force reference implementation:
        // for random rule sets and random documents, both the mapped text and the
        // output-offset -> input-offset table must match.
        public virtual void TestRandomMaps2()
        {
            Random random = Random();
            int numIterations = AtLeast(3);

            for (int iter = 0; iter < numIterations; iter++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("\nTEST iter=" + iter);
                }

                // Keys and documents are drawn from the alphabet 'a'..endLetter.
                char endLetter = (char)TestUtil.NextInt(random, 'b', 'z');
                IDictionary<string, string> map = new Dictionary<string, string>();
                NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
                int numMappings = AtLeast(5);
                if (VERBOSE)
                {
                    Console.WriteLine("  mappings:");
                }
                while (map.Count < numMappings)
                {
                    string key = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, 7);
                    // Only non-empty keys that were not used yet become rules.
                    if (key.Length != 0 && !map.ContainsKey(key))
                    {
                        string value = TestUtil.RandomSimpleString(random);
                        map[key] = value;
                        builder.Add(key, value);
                        if (VERBOSE)
                        {
                            Console.WriteLine("    " + key + " -> " + value);
                        }
                    }
                }

                NormalizeCharMap charMap = builder.Build();

                if (VERBOSE)
                {
                    Console.WriteLine("  test random documents...");
                }

                for (int iter2 = 0; iter2 < 100; iter2++)
                {
                    string content = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, AtLeast(1000));

                    if (VERBOSE)
                    {
                        Console.WriteLine("  content=" + content);
                    }

                    // Do stupid dog-slow mapping:

                    // Output string:
                    StringBuilder output = new StringBuilder();

                    // Maps output offset to input offset:
                    IList<int?> inputOffsets = new List<int?>();

                    // cumDiff is the running sum of (matched length - replacement length).
                    int cumDiff = 0;
                    int charIdx = 0;
                    while (charIdx < content.Length)
                    {
                        int matchLen = -1;
                        string matchRepl = null;

                        // Try every rule at the current position and keep the longest match.
                        foreach (KeyValuePair<string, string> ent in map)
                        {
                            string match = ent.Key;
                            if (charIdx + match.Length <= content.Length)
                            {
                                int limit = charIdx + match.Length;
                                bool matches = true;
                                for (int charIdx2 = charIdx; charIdx2 < limit; charIdx2++)
                                {
                                    if (match[charIdx2 - charIdx] != content[charIdx2])
                                    {
                                        matches = false;
                                        break;
                                    }
                                }

                                if (matches)
                                {
                                    string repl = ent.Value;
                                    if (match.Length > matchLen)
                                    {
                                        // Greedy: longer match wins
                                        matchLen = match.Length;
                                        matchRepl = repl;
                                    }
                                }
                            }
                        }

                        if (matchLen != -1)
                        {
                            // We found a match here!
                            if (VERBOSE)
                            {
                                Console.WriteLine("    match=" + content.Substring(charIdx, matchLen) + " @ off=" + charIdx + " repl=" + matchRepl);
                            }
                            output.Append(matchRepl);
                            int minLen = Math.Min(matchLen, matchRepl.Length);

                            // Common part, directly maps back to input
                            // offset:
                            for (int outIdx = 0; outIdx < minLen; outIdx++)
                            {
                                inputOffsets.Add(output.Length - matchRepl.Length + outIdx + cumDiff);
                            }

                            cumDiff += matchLen - matchRepl.Length;
                            charIdx += matchLen;

                            if (matchRepl.Length < matchLen)
                            {
                                // Replacement string is shorter than matched
                                // input: nothing to do
                            }
                            else if (matchRepl.Length > matchLen)
                            {
                                // Replacement string is longer than matched
                                // input: for all the "extra" chars we map
                                // back to a single input offset:
                                for (int outIdx = matchLen; outIdx < matchRepl.Length; outIdx++)
                                {
                                    inputOffsets.Add(output.Length + cumDiff - 1);
                                }
                            }
                            else
                            {
                                // Same length: no change to offset
                            }

                            Debug.Assert(inputOffsets.Count == output.Length, "inputOffsets.size()=" + inputOffsets.Count + " vs output.length()=" + output.Length);
                        }
                        else
                        {
                            // No rule applies here: copy the char through unchanged.
                            inputOffsets.Add(output.Length + cumDiff);
                            output.Append(content[charIdx]);
                            charIdx++;
                        }
                    }

                    string expected = output.ToString();
                    if (VERBOSE)
                    {
                        Console.Write("    expected:");
                        for (int charIdx2 = 0; charIdx2 < expected.Length; charIdx2++)
                        {
                            Console.Write(" " + expected[charIdx2] + "/" + inputOffsets[charIdx2]);
                        }
                        Console.WriteLine();
                    }

                    MappingCharFilter mapFilter = new MappingCharFilter(charMap, new StringReader(content));
                    StringBuilder actualBuilder = new StringBuilder();
                    IList<int?> actualInputOffsets = new List<int?>();

                    // Now consume the actual mapFilter, somewhat randomly:
                    while (true)
                    {
                        // Coin flip between the single-char and the bulk read. Next(2) yields
                        // 0 or 1; Next(0, 1) has an exclusive upper bound and would always be 0,
                        // which would leave the single-char Read() path untested.
                        if (random.Next(2) == 1)
                        {
                            int ch = mapFilter.Read();
                            if (ch == -1)
                            {
                                break;
                            }
                            actualBuilder.Append((char)ch);
                        }
                        else
                        {
                            // Bulk read into a random-sized buffer at a random offset.
                            char[] buffer = new char[TestUtil.NextInt(random, 1, 100)];
                            int off = buffer.Length == 1 ? 0 : random.Next(buffer.Length - 1);
                            int count = mapFilter.Read(buffer, off, buffer.Length - off);
                            if (count == -1)
                            {
                                break;
                            }
                            else
                            {
                                actualBuilder.Append(buffer, off, count);
                            }
                        }

                        // Occasionally resolve offsets mid-stream instead of only at the end.
                        if (random.Next(10) == 7)
                        {
                            // Map offsets
                            while (actualInputOffsets.Count < actualBuilder.Length)
                            {
                                actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
                            }
                        }
                    }

                    // Finish mapping offsets
                    while (actualInputOffsets.Count < actualBuilder.Length)
                    {
                        actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
                    }

                    string actual = actualBuilder.ToString();

                    // Verify:
                    assertEquals(expected, actual);
                    assertEquals(inputOffsets, actualInputOffsets);
                }
            }
        }
Beispiel #16
0
 /// <summary>
 /// Stores the enclosing test instance and the map for later use by this helper.
 /// </summary>
 /// <param name="outerInstance">The enclosing <c>TestMappingCharFilter</c> instance.</param>
 /// <param name="map">The map kept in the <c>map</c> field.</param>
 public AnalyzerAnonymousInnerClassHelper3(TestMappingCharFilter outerInstance, NormalizeCharMap map)
 {
     this.map = map;
     this.outerInstance = outerInstance;
 }
 /// <summary>
 /// Matches every rule line against the pattern <c>p</c> and adds the two captured groups,
 /// each passed through <c>parseString</c>, to <paramref name="builder"/>.
 /// </summary>
 /// <param name="rules">Rule lines to parse.</param>
 /// <param name="builder">Builder that receives one entry per rule.</param>
 /// <exception cref="System.ArgumentException">A rule line does not match the pattern.</exception>
 protected internal virtual void parseRules(IList<string> rules, NormalizeCharMap.Builder builder)
 {
     foreach (string line in rules)
     {
         Matcher parsed = p.matcher(line);
         if (parsed.find())
         {
             // Group 1 is the input side, group 2 the output side of the rule.
             builder.add(parseString(parsed.group(1)), parseString(parsed.group(2)));
             continue;
         }

         throw new System.ArgumentException("Invalid Mapping Rule : [" + line + "], file = " + mapping);
     }
 }
 /// <summary>
 /// Keeps references to the enclosing test instance and to the given map.
 /// </summary>
 /// <param name="outerInstance">The enclosing <c>TestMappingCharFilter</c> instance.</param>
 /// <param name="map">The map kept in the <c>map</c> field.</param>
 public AnalyzerAnonymousInnerClassHelper3(TestMappingCharFilter outerInstance, NormalizeCharMap map)
 {
     this.map = map;
     this.outerInstance = outerInstance;
 }
 /// <summary>
 /// Stores the enclosing test instance, the dictionary set and the map in the matching fields.
 /// </summary>
 /// <param name="outerInstance">The enclosing <c>TestCompoundWordTokenFilter</c> instance.</param>
 /// <param name="dict">The set kept in the <c>dict</c> field.</param>
 /// <param name="normMap">The map kept in the <c>normMap</c> field.</param>
 public AnalyzerAnonymousInnerClassHelper(TestCompoundWordTokenFilter outerInstance, CharArraySet dict, NormalizeCharMap normMap)
 {
     this.normMap = normMap;
     this.dict = dict;
     this.outerInstance = outerInstance;
 }
 /// <summary>
 /// Stores the enclosing test instance, the char-array set and the map in the matching fields.
 /// </summary>
 /// <param name="outerInstance">The enclosing <c>TestBugInSomething</c> instance.</param>
 /// <param name="cas">The set kept in the <c>cas</c> field.</param>
 /// <param name="map">The map kept in the <c>map</c> field.</param>
 public AnalyzerAnonymousInnerClassHelper(TestBugInSomething outerInstance, CharArraySet cas, NormalizeCharMap map)
 {
     this.map = map;
     this.cas = cas;
     this.outerInstance = outerInstance;
 }
 /// <summary>
 /// Matches every rule line against the pattern <c>p</c> and adds the two captured groups,
 /// each passed through <c>ParseString</c>, to <paramref name="builder"/>.
 /// </summary>
 /// <param name="rules">Rule lines to parse.</param>
 /// <param name="builder">Builder that receives one entry per rule.</param>
 /// <exception cref="System.ArgumentException">A rule line does not match the pattern.</exception>
 protected internal virtual void ParseRules(IList<string> rules, NormalizeCharMap.Builder builder)
 {
     foreach (string line in rules)
     {
         Match parsed = p.Match(line);
         if (parsed.Success)
         {
             // Group 1 is the input side, group 2 the output side of the rule.
             builder.Add(ParseString(parsed.Groups[1].Value), ParseString(parsed.Groups[2].Value));
             continue;
         }

         throw new System.ArgumentException("Invalid Mapping Rule : [" + line + "], file = " + mapping);
     }
 }
 /// <summary>
 /// Stores the enclosing test instance and the map in the matching fields.
 /// </summary>
 /// <param name="outerInstance">The enclosing <c>TestCJKAnalyzer</c> instance.</param>
 /// <param name="norm">The map kept in the <c>norm</c> field.</param>
 public AnalyzerAnonymousInnerClassHelper(TestCJKAnalyzer outerInstance, NormalizeCharMap norm)
 {
     this.norm = norm;
     this.outerInstance = outerInstance;
 }