Пример #1
0
        /// <summary>
        /// Checks analysis consistency for a fixed input against an analyzer whose reader
        /// is wrapped in a MockCharFilter and then a MappingCharFilter, and whose
        /// MockTokenizer output is passed through a CommonGramsFilter.
        /// </summary>
        public virtual void Test()
        {
            // Character mappings used by the MappingCharFilter; the first key maps to the empty string.
            NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
            mapBuilder.Add("mtqlpi", "");
            mapBuilder.Add("mwoknt", "jjp");
            mapBuilder.Add("tcgyreo", "zpfpajyws");
            NormalizeCharMap charMap = mapBuilder.Build();

            // Word set handed to the CommonGramsFilter.
            CharArraySet commonWords = new CharArraySet(TEST_VERSION_CURRENT, 3, false);
            commonWords.add("jjp");
            commonWords.add("wlmwoknt");
            commonWords.add("tcgyreo");

            Analyzer analyzer = Analyzer.NewAnonymous(
                createComponents: (fieldName, reader) =>
                {
                    // The tokenizer reads through the wrapper from TestRandomChains.
                    var wrappedReader = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader);
                    Tokenizer source = new MockTokenizer(wrappedReader, MockTokenFilter.ENGLISH_STOPSET, false, -65);
                    TokenFilter grams = new CommonGramsFilter(TEST_VERSION_CURRENT, source, commonWords);
                    return new TokenStreamComponents(source, grams);
                },
                // MockCharFilter is applied first, MappingCharFilter wraps it.
                initReader: (fieldName, reader) => new MappingCharFilter(charMap, new MockCharFilter(reader, 0)));

            CheckAnalysisConsistency(Random, analyzer, false, "wmgddzunizdomqyj");
        }
Пример #2
0
        /// <summary>
        /// Analyzes "ab" after a char mapping that expands each letter into two CJK
        /// characters, and asserts the resulting terms and their two offset arrays.
        /// </summary>
        public virtual void TestChangedOffsets()
        {
            // "a" and "b" are each replaced by a two-character string.
            NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
            mapBuilder.Add("a", "一二");
            mapBuilder.Add("b", "二三");
            NormalizeCharMap charMap = mapBuilder.Build();

            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, charMap);

            // note: offsets are strange since this is how the charfilter maps them...
            // before bigramming, the 4 tokens look like:
            //   { 0, 0, 1, 1 },
            //   { 0, 1, 1, 2 }
            string[] expectedTerms = new string[] { "一二", "二二", "二三" };
            int[] expectedStartOffsets = new int[] { 0, 0, 1 };
            int[] expectedEndOffsets = new int[] { 1, 1, 2 };

            AssertAnalyzesTo(analyzer, "ab", expectedTerms, expectedStartOffsets, expectedEndOffsets);
        }
Пример #3
0
        /// <summary>
        /// Checks analysis consistency for a fixed input using an analyzer built by
        /// AnalyzerAnonymousInnerClassHelper from a word set and a char map.
        /// </summary>
        public virtual void Test()
        {
            // Character mappings passed to the helper analyzer; the first key maps to the empty string.
            NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
            mapBuilder.Add("mtqlpi", "");
            mapBuilder.Add("mwoknt", "jjp");
            mapBuilder.Add("tcgyreo", "zpfpajyws");
            NormalizeCharMap charMap = mapBuilder.Build();

            // Word set passed to the helper analyzer.
            CharArraySet commonWords = new CharArraySet(TEST_VERSION_CURRENT, 3, false);
            commonWords.add("jjp");
            commonWords.add("wlmwoknt");
            commonWords.add("tcgyreo");

            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, commonWords, charMap);

            CheckAnalysisConsistency(Random(), analyzer, false, "wmgddzunizdomqyj");
        }
Пример #4
0
        /// <summary>
        /// Analyzes "ab" with a StandardTokenizer + CJKBigramFilter chain whose reader is
        /// wrapped in a MappingCharFilter, and asserts the resulting terms and their two
        /// offset arrays.
        /// </summary>
        public virtual void TestChangedOffsets()
        {
            // "a" and "b" are each replaced by a two-character string.
            NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
            mapBuilder.Add("a", "一二");
            mapBuilder.Add("b", "二三");
            NormalizeCharMap charMap = mapBuilder.Build();

            Analyzer analyzer = Analyzer.NewAnonymous(
                createComponents: (fieldName, reader) =>
                {
                    Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
                    var bigrams = new CJKBigramFilter(source);
                    return new TokenStreamComponents(source, bigrams);
                },
                initReader: (fieldName, reader) => new MappingCharFilter(charMap, reader));

            // note: offsets are strange since this is how the charfilter maps them...
            // before bigramming, the 4 tokens look like:
            //   { 0, 0, 1, 1 },
            //   { 0, 1, 1, 2 }
            string[] expectedTerms = new string[] { "一二", "二二", "二三" };
            int[] expectedStartOffsets = new int[] { 0, 0, 1 };
            int[] expectedEndOffsets = new int[] { 1, 1, 2 };

            AssertAnalyzesTo(analyzer, "ab", expectedTerms, expectedStartOffsets, expectedEndOffsets);
        }
Пример #5
0
        /// <summary>
        /// Maps backslashes to forward slashes with a MappingCharFilter and asserts the
        /// tokens a PathHierarchyTokenizer produces for a backslash-delimited path.
        /// </summary>
        public virtual void TestNormalizeWinDelimToLinuxDelim()
        {
            string path = "c:\\a\\b\\c";

            // Single mapping: backslash -> forward slash.
            NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
            mapBuilder.Add("\\", "/");
            NormalizeCharMap slashMap = mapBuilder.Build();

            Reader mappedInput = new MappingCharFilter(slashMap, new StringReader(path));
            PathHierarchyTokenizer tokenizer = new PathHierarchyTokenizer(mappedInput);

            string[] expectedTerms = new string[] { "c:", "c:/a", "c:/a/b", "c:/a/b/c" };
            int[] expectedStartOffsets = new int[] { 0, 0, 0, 0 };
            int[] expectedEndOffsets = new int[] { 2, 4, 6, 8 };
            int[] expectedPositionIncrements = new int[] { 1, 0, 0, 0 };

            // The last argument is the length of the original (unmapped) path.
            AssertTokenStreamContents(tokenizer, expectedTerms, expectedStartOffsets, expectedEndOffsets, expectedPositionIncrements, path.Length);
        }
        /// <summary>
        /// Analyzes "banküberfall" with a helper analyzer built from a one-word dictionary
        /// and a char map that expands "ü" to "ue", and asserts the resulting terms and
        /// their two offset arrays.
        /// </summary>
        public virtual void TestInvalidOffsets()
        {
            CharArraySet dictionary = makeDictionary("fall");

            // "ü" becomes two characters, so the filtered text is longer than the input.
            NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
            mapBuilder.Add("ü", "ue");
            NormalizeCharMap umlautMap = mapBuilder.Build();

            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, dictionary, umlautMap);

            string[] expectedTerms = new string[] { "bankueberfall", "fall" };
            int[] expectedStartOffsets = new int[] { 0, 0 };
            int[] expectedEndOffsets = new int[] { 12, 12 };

            AssertAnalyzesTo(analyzer, "banküberfall", expectedTerms, expectedStartOffsets, expectedEndOffsets);
        }
        /// <summary>
        /// Wraps <paramref name="reader"/> in a MappingCharFilter that rewrites several
        /// apostrophe-like characters to <c>'</c>, removes two characters by mapping them
        /// to the empty string, and maps "ґ"/"Ґ" to "г"/"Г".
        /// </summary>
        /// <param name="fieldName">Field name; not used by this method.</param>
        /// <param name="reader">Reader to wrap.</param>
        /// <returns>A MappingCharFilter over <paramref name="reader"/>.</returns>
        protected override TextReader InitReader(string fieldName, TextReader reader)
        {
            NormalizeCharMap.Builder mappings = new NormalizeCharMap.Builder();

            // different apostrophes, all rewritten to the plain ASCII one
            string[] apostropheVariants = new string[] { "\u2019", "\u2018", "\u02BC", "`", "´" };
            foreach (string variant in apostropheVariants)
            {
                mappings.Add(variant, "'");
            }

            // ignored characters: mapped to the empty string
            string[] ignoredCharacters = new string[] { "\u0301", "\u00AD" };
            foreach (string ignored in ignoredCharacters)
            {
                mappings.Add(ignored, "");
            }

            mappings.Add("ґ", "г");
            mappings.Add("Ґ", "Г");

            NormalizeCharMap charMap = mappings.Build();
            return new MappingCharFilter(charMap, reader);
        }
Пример #8
0
            /// <summary>
            /// Builds a NormalizeCharMap from up to four randomly generated key/value
            /// pairs, skipping empty keys and keys that were already added.
            /// </summary>
            /// <param name="random">Source of randomness for the pair count and the strings.</param>
            /// <returns>The result of <c>builder.Build()</c>.</returns>
            public object Create(Random random)
            {
                NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
                // we can't add duplicate keys, or NormalizeCharMap gets angry,
                // so remember every key that has been added
                ISet<string> usedKeys = new HashSet<string>();
                int attempts = random.nextInt(5);

                for (int attempt = 0; attempt < attempts; attempt++)
                {
                    string candidateKey = TestUtil.RandomSimpleString(random);
                    if (usedKeys.contains(candidateKey) || candidateKey.Length <= 0)
                    {
                        // Rejected key: no value is drawn for it.
                        continue;
                    }

                    string replacement = TestUtil.RandomSimpleString(random);
                    mapBuilder.Add(candidateKey, replacement);
                    usedKeys.add(candidateKey);
                }

                return mapBuilder.Build();
            }
Пример #9
0
        /// <summary>
        /// Verifies the offsets reported by a PatternTokenizer reading through a
        /// MappingCharFilter that replaces the six-character entity <c>&amp;uuml;</c>
        /// with the single character "ü". The expected offsets are positions in the
        /// original input (for example, the first "Günther" token ends at 12, the length
        /// of "G&amp;uuml;nther"), not positions in the filtered text.
        /// </summary>
        public virtual void TestOffsetCorrection()
        {
            const string INPUT = "G&uuml;nther G&uuml;nther is here";

            // create MappingCharFilter
            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            builder.Add("&uuml;", "ü");
            NormalizeCharMap normMap = builder.Build();
            CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

            // create PatternTokenizer; first pass uses a separator pattern with -1 as the last argument
            TokenStream stream = new PatternTokenizer(charStream, new Regex("[,;/\\s]+", RegexOptions.Compiled), -1);

            AssertTokenStreamContents(stream, new string[] { "Günther", "Günther", "is", "here" }, new int[] { 0, 13, 26, 29 }, new int[] { 12, 25, 28, 33 }, INPUT.Length);

            // second pass: a fresh filter over the same input, with a pattern for the word itself and 0 as the last argument
            charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
            stream = new PatternTokenizer(charStream, new Regex("Günther", RegexOptions.Compiled), 0);
            AssertTokenStreamContents(stream, new string[] { "Günther", "Günther" }, new int[] { 0, 13 }, new int[] { 12, 25 }, INPUT.Length);
        }