public virtual void Test()
{
    // Common-grams vocabulary that overlaps with the char-filter mappings below,
    // so the mapping output feeds back into the grams filter.
    CharArraySet stopWords = new CharArraySet(TEST_VERSION_CURRENT, 3, false);
    stopWords.add("jjp");
    stopWords.add("wlmwoknt");
    stopWords.add("tcgyreo");

    // Char-filter mappings applied before tokenization.
    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.Add("mtqlpi", "");
    mapBuilder.Add("mwoknt", "jjp");
    mapBuilder.Add("tcgyreo", "zpfpajyws");
    NormalizeCharMap charMap = mapBuilder.Build();

    Analyzer analyzer = Analyzer.NewAnonymous(
        createComponents: (fieldName, reader) =>
        {
            Tokenizer tokenizer = new MockTokenizer(
                new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader),
                MockTokenFilter.ENGLISH_STOPSET, false, -65);
            TokenFilter grams = new CommonGramsFilter(TEST_VERSION_CURRENT, tokenizer, stopWords);
            return new TokenStreamComponents(tokenizer, grams);
        },
        // Stack the char filters: MockCharFilter first, then the mapping filter on top.
        initReader: (fieldName, reader) => new MappingCharFilter(charMap, new MockCharFilter(reader, 0)));

    CheckAnalysisConsistency(Random, analyzer, false, "wmgddzunizdomqyj");
}
public virtual void TestNormalizeWinDelimToLinuxDelim()
{
    // Map every backslash to a forward slash before the path tokenizer sees it.
    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.Add("\\", "/");
    NormalizeCharMap delimMap = mapBuilder.Build();

    string path = "c:\\a\\b\\c";
    Reader normalized = new MappingCharFilter(delimMap, new StringReader(path));
    PathHierarchyTokenizer tokenizer = new PathHierarchyTokenizer(normalized);

    // End offsets (2, 4, 6, 8) point back into the original backslash input.
    AssertTokenStreamContents(tokenizer,
        new string[] { "c:", "c:/a", "c:/a/b", "c:/a/b/c" },
        new int[] { 0, 0, 0, 0 },
        new int[] { 2, 4, 6, 8 },
        new int[] { 1, 0, 0, 0 },
        path.Length);
}
public virtual void TestInvalidOffsets()
{
    CharArraySet dict = makeDictionary("fall");

    // "ü" expands to "ue" in the char filter, so token text is longer than
    // the matching span of the original input.
    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.Add("ü", "ue");
    NormalizeCharMap normMap = mapBuilder.Build();

    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, dict, normMap);

    // Both tokens report offsets into the original 12-char input.
    AssertAnalyzesTo(analyzer, "banküberfall",
        new string[] { "bankueberfall", "fall" },
        new int[] { 0, 0 },
        new int[] { 12, 12 });
}
public virtual void TestChangedOffsets()
{
    // Each single input char maps to a two-char replacement.
    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.Add("a", "一二");
    mapBuilder.Add("b", "二三");
    NormalizeCharMap norm = mapBuilder.Build();

    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, norm);

    // note: offsets are strange since this is how the charfilter maps them...
    // before bigramming, the 4 tokens look like:
    //   starts: { 0, 0, 1, 1 }
    //   ends:   { 0, 1, 1, 2 }
    AssertAnalyzesTo(analyzer, "ab",
        new string[] { "一二", "二二", "二三" },
        new int[] { 0, 0, 1 },
        new int[] { 1, 1, 2 });
}
public virtual void Test()
{
    // Common-grams vocabulary that overlaps with the char-filter mappings below.
    CharArraySet stopWords = new CharArraySet(TEST_VERSION_CURRENT, 3, false);
    stopWords.add("jjp");
    stopWords.add("wlmwoknt");
    stopWords.add("tcgyreo");

    // Char-filter mappings applied before tokenization.
    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.Add("mtqlpi", "");
    mapBuilder.Add("mwoknt", "jjp");
    mapBuilder.Add("tcgyreo", "zpfpajyws");
    NormalizeCharMap charMap = mapBuilder.Build();

    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, stopWords, charMap);
    CheckAnalysisConsistency(Random(), analyzer, false, "wmgddzunizdomqyj");
}
public virtual void TestChangedOffsets()
{
    // Each single input char maps to a two-char replacement.
    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.Add("a", "一二");
    mapBuilder.Add("b", "二三");
    NormalizeCharMap norm = mapBuilder.Build();

    Analyzer analyzer = Analyzer.NewAnonymous(
        createComponents: (fieldName, reader) =>
        {
            Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
            var bigrams = new CJKBigramFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, bigrams);
        },
        initReader: (fieldName, reader) => new MappingCharFilter(norm, reader));

    // note: offsets are strange since this is how the charfilter maps them...
    // before bigramming, the 4 tokens look like:
    //   starts: { 0, 0, 1, 1 }
    //   ends:   { 0, 1, 1, 2 }
    AssertAnalyzesTo(analyzer, "ab",
        new string[] { "一二", "二二", "二三" },
        new int[] { 0, 0, 1 },
        new int[] { 1, 1, 2 });
}
protected override TextReader InitReader(string fieldName, TextReader reader)
{
    // NOTE(review): the map is rebuilt on every call; acceptable here, but it
    // could be cached in a static field if this path ever becomes hot.
    NormalizeCharMap.Builder b = new NormalizeCharMap.Builder();

    // Fold the various apostrophe-like characters to a plain ASCII apostrophe.
    foreach (string apostrophe in new string[] { "\u2019", "\u2018", "\u02BC", "`", "´" })
    {
        b.Add(apostrophe, "'");
    }

    // Drop characters that should be ignored entirely:
    // U+0301 combining acute accent, U+00AD soft hyphen.
    b.Add("\u0301", "");
    b.Add("\u00AD", "");

    // Normalize ghe-with-upturn to plain ghe (both cases).
    b.Add("ґ", "г");
    b.Add("Ґ", "Г");

    return new MappingCharFilter(b.Build(), reader);
}
public object Create(Random random)
{
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();

    // NormalizeCharMap rejects duplicate keys, so track what we've already added.
    ISet<string> seen = new HashSet<string>();

    int numMappings = random.nextInt(5);
    for (int i = 0; i < numMappings; i++)
    {
        string key = TestUtil.RandomSimpleString(random);
        if (key.Length == 0 || seen.contains(key))
        {
            continue; // skip empty keys and duplicates
        }
        // Value is only drawn for accepted keys, preserving RNG consumption order.
        builder.Add(key, TestUtil.RandomSimpleString(random));
        seen.add(key);
    }

    return builder.Build();
}
public virtual void TestOffsetCorrection()
{
    const string INPUT = "Günther Günther is here";

    // Map one form of "ü" to another; the asserted end offsets show the spans
    // in the original input are longer than the normalized token text, which is
    // exactly the offset correction under test.
    // (Removed an unused local `mappingRules` list that was built but never read.)
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.Add("ü", "ü");
    NormalizeCharMap normMap = builder.Build();

    CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

    // Group -1: split on the separator pattern; offsets must map back to INPUT.
    TokenStream stream = new PatternTokenizer(charStream, new Regex("[,;/\\s]+", RegexOptions.Compiled), -1);
    AssertTokenStreamContents(stream,
        new string[] { "Günther", "Günther", "is", "here" },
        new int[] { 0, 13, 26, 29 },
        new int[] { 12, 25, 28, 33 },
        INPUT.Length);

    // Group 0: only the pattern matches themselves become tokens.
    charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
    stream = new PatternTokenizer(charStream, new Regex("Günther", RegexOptions.Compiled), 0);
    AssertTokenStreamContents(stream,
        new string[] { "Günther", "Günther" },
        new int[] { 0, 13 },
        new int[] { 12, 25 },
        INPUT.Length);
}