/// <summary>
/// Records a replacement to be applied to the input stream. Whenever
/// <code>singleMatch</code> occurs in the input, it will be replaced with
/// <code>replacement</code>.
/// </summary>
/// <param name="singleMatch">input String to be replaced</param>
/// <param name="replacement">output String</param>
/// <exception cref="System.ArgumentNullException">
/// if <paramref name="singleMatch"/> or <paramref name="replacement"/> is null
/// </exception>
/// <exception cref="System.SystemException">
/// if a mapping for <paramref name="singleMatch"/> has already been recorded
/// </exception>
public virtual void Add(System.String singleMatch, System.String replacement)
{
    // Validate before touching the trie so that a bad call cannot leave
    // half-built nodes (or a null normStr) behind.
    if (singleMatch == null)
    {
        throw new System.ArgumentNullException("singleMatch");
    }
    if (replacement == null)
    {
        throw new System.ArgumentNullException("replacement");
    }

    NormalizeCharMap currMap = this;
    for (int i = 0; i < singleMatch.Length; i++)
    {
        // Use one key object for both the lookup and the store so the entry
        // is always found again through CharacterCache.ValueOf.
        object key = CharacterCache.ValueOf(singleMatch[i]);
        if (currMap.submap == null)
        {
            currMap.submap = new System.Collections.Hashtable(1);
        }
        NormalizeCharMap map = (NormalizeCharMap)currMap.submap[key];
        if (map == null)
        {
            map = new NormalizeCharMap();
            currMap.submap[key] = map;
        }
        currMap = map;
    }

    if (currMap.normStr != null)
    {
        throw new System.SystemException("MappingCharFilter: there is already a mapping for " + singleMatch);
    }
    currMap.normStr = replacement;
    // Length difference between the matched text and its replacement.
    currMap.diff = singleMatch.Length - replacement.Length;
}
// TODO: this should use inputstreams from the loader, not File!
/// <summary>
/// Loads the mapping rules named by <c>mapping</c> and builds <c>normMap</c> from them.
/// If <c>mapping</c> names an existing file it is read directly; otherwise it is
/// split into several file names whose lines are concatenated in order.
/// Does nothing when <c>mapping</c> is null.
/// </summary>
/// <param name="loader">resource loader handed to <c>getLines</c></param>
/// <remarks>
/// The Java original declared <c>throws java.io.IOException</c>; any I/O error raised
/// while reading the rule files is not caught here.
/// </remarks>
public virtual void inform(ResourceLoader loader)
{
    if (mapping == null)
    {
        return;
    }

    IList<string> wlist;
    if (System.IO.File.Exists(mapping))
    {
        wlist = getLines(loader, mapping);
    }
    else
    {
        // Collect into a concrete List<string>: IList<string> has no AddRange.
        List<string> merged = new List<string>();
        foreach (string file in splitFileNames(mapping))
        {
            merged.AddRange(getLines(loader, file.Trim()));
        }
        wlist = merged;
    }

    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    parseRules(wlist, builder);
    normMap = builder.build();
    if (normMap.map == null)
    {
        // if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
        // so just set the whole map to null
        normMap = null;
    }
}
// TODO: this should use inputstreams from the loader, not File!
/// <summary>
/// Reads the mapping rules referenced by <c>mapping</c> and compiles them into <c>normMap</c>.
/// When <c>mapping</c> is an existing file its lines are used as-is; otherwise
/// <c>mapping</c> is treated as a list of file names and their lines are merged in order.
/// A null <c>mapping</c> leaves the current state untouched.
/// </summary>
/// <param name="loader">resource loader passed through to <c>getLines</c></param>
/// <remarks>
/// The Java original declared <c>throws java.io.IOException</c>; I/O failures from
/// reading the rule files propagate to the caller.
/// </remarks>
public virtual void inform(ResourceLoader loader)
{
    if (mapping == null)
    {
        return;
    }

    IList<string> wlist;
    if (System.IO.File.Exists(mapping))
    {
        wlist = getLines(loader, mapping);
    }
    else
    {
        // A concrete List<string> is required here because IList<string>
        // does not expose AddRange.
        List<string> combined = new List<string>();
        foreach (string file in splitFileNames(mapping))
        {
            combined.AddRange(getLines(loader, file.Trim()));
        }
        wlist = combined;
    }

    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    parseRules(wlist, builder);
    normMap = builder.build();
    if (normMap.map == null)
    {
        // if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
        // so just set the whole map to null
        normMap = null;
    }
}
/// <summary>
/// Walks the mapping trie starting at <paramref name="map"/>, consuming input
/// characters while a deeper node matches. A deeper match is preferred; if none
/// is found the consumed character is pushed back and <paramref name="map"/>
/// itself is returned when it carries a replacement string.
/// </summary>
/// <param name="map">trie node to start matching from</param>
/// <returns>the matching node, or null when nothing matches</returns>
private NormalizeCharMap Match(NormalizeCharMap map)
{
    NormalizeCharMap deeper = null;

    if (map.submap != null)
    {
        int next = NextChar();
        if (next != -1)
        {
            NormalizeCharMap child = (NormalizeCharMap)map.submap[CharacterCache.ValueOf((char)next)];
            if (child != null)
            {
                deeper = Match(child);
            }
            if (deeper == null)
            {
                // Nothing matched further down: give the character back.
                PushChar(next);
            }
        }
    }

    if (deeper != null)
    {
        return deeper;
    }
    return map.normStr != null ? map : null;
}
/// <summary>
/// Checks analysis consistency for a fixed input run through
/// MockCharFilter, MappingCharFilter, MockTokenizer and CommonGramsFilter.
/// </summary>
public virtual void Test()
{
    CharArraySet commonWords = new CharArraySet(TEST_VERSION_CURRENT, 3, false);
    commonWords.add("jjp");
    commonWords.add("wlmwoknt");
    commonWords.add("tcgyreo");

    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.Add("mtqlpi", "");
    mapBuilder.Add("mwoknt", "jjp");
    mapBuilder.Add("tcgyreo", "zpfpajyws");
    NormalizeCharMap charMap = mapBuilder.Build();

    Analyzer analyzer = Analyzer.NewAnonymous(
        createComponents: (fieldName, reader) =>
        {
            Tokenizer tokenizer = new MockTokenizer(
                new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader),
                MockTokenFilter.ENGLISH_STOPSET,
                false,
                -65);
            TokenFilter commonGrams = new CommonGramsFilter(TEST_VERSION_CURRENT, tokenizer, commonWords);
            return new TokenStreamComponents(tokenizer, commonGrams);
        },
        initReader: (fieldName, reader) =>
        {
            // Char filters are stacked: mock filter first, mapping filter on top.
            reader = new MockCharFilter(reader, 0);
            return new MappingCharFilter(charMap, reader);
        });

    CheckAnalysisConsistency(Random, analyzer, false, "wmgddzunizdomqyj");
}
/// <summary>
/// Wraps <paramref name="reader"/> in a <c>MappingCharFilter</c> that replaces every
/// entry of <c>_NormalizeChars</c> with a single space.
/// See http://stackoverflow.com/questions/15235126/lucene-4-1-how-split-words-that-contains-dots-when-indexing
/// </summary>
/// <param name="reader">the reader to wrap</param>
/// <returns>a char filter reading from <paramref name="reader"/></returns>
private TextReader InitReader(TextReader reader)
{
    NormalizeCharMap toSpace = new NormalizeCharMap();
    foreach (string token in this._NormalizeChars)
    {
        toSpace.Add(token, " ");
    }

    MappingCharFilter filtered = new MappingCharFilter(toSpace, reader);
    return filtered;
}
/// <summary>
/// Maps backslashes to forward slashes with a MappingCharFilter and checks the
/// terms, offsets and position increments PathHierarchyTokenizer produces for
/// a Windows-style path.
/// </summary>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testNormalizeWinDelimToLinuxDelim() throws Exception
public virtual void testNormalizeWinDelimToLinuxDelim()
{
    string path = "c:\\a\\b\\c";

    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.add("\\", "/");
    NormalizeCharMap slashMap = mapBuilder.build();

    Reader filtered = new MappingCharFilter(slashMap, new StringReader(path));
    PathHierarchyTokenizer tokenizer = new PathHierarchyTokenizer(filtered);

    string[] expectedTerms = new string[] { "c:", "c:/a", "c:/a/b", "c:/a/b/c" };
    int[] expectedStarts = new int[] { 0, 0, 0, 0 };
    int[] expectedEnds = new int[] { 2, 4, 6, 8 };
    int[] expectedIncrements = new int[] { 1, 0, 0, 0 };

    assertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, expectedIncrements, path.Length);
}
/// <summary>
/// Analyzes "banküberfall" with a char map that expands "ü" to "ue" and checks
/// that both resulting tokens report offsets 0..12.
/// </summary>
public virtual void TestInvalidOffsets()
{
    CharArraySet dict = makeDictionary("fall");

    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.Add("ü", "ue");
    NormalizeCharMap umlautMap = mapBuilder.Build();

    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, dict, umlautMap);

    string[] expectedTerms = new string[] { "bankueberfall", "fall" };
    int[] expectedStarts = new int[] { 0, 0 };
    int[] expectedEnds = new int[] { 12, 12 };
    AssertAnalyzesTo(analyzer, "banküberfall", expectedTerms, expectedStarts, expectedEnds);
}
/// <summary>
/// Checks token offsets when a MappingCharFilter has rewritten the input
/// ("a" and "b" each become two CJK characters) before analysis.
/// </summary>
public virtual void TestChangedOffsets()
{
    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.Add("a", "一二");
    mapBuilder.Add("b", "二三");
    NormalizeCharMap charMap = mapBuilder.Build();

    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, charMap);

    string[] expectedTerms = new string[] { "一二", "二二", "二三" };
    int[] expectedStarts = new int[] { 0, 0, 1 };
    int[] expectedEnds = new int[] { 1, 1, 2 };
    AssertAnalyzesTo(analyzer, "ab", expectedTerms, expectedStarts, expectedEnds);

    // note: offsets are strange since this is how the charfilter maps them...
    // before bigramming, the 4 tokens look like:
    //   { 0, 0, 1, 1 },
    //   { 0, 1, 1, 2 }
}
/// <summary>
/// Default constructor that takes a <see cref="TextReader"/>.
/// Resets the internal buffer onto <paramref name="in"/>, copies the map and the
/// cached root arcs from <paramref name="normMap"/>, and obtains a bytes reader
/// from the map when the map is not null.
/// </summary>
/// <param name="normMap">source of the map and cached root arcs</param>
/// <param name="in">reader to filter</param>
public MappingCharFilter(NormalizeCharMap normMap, TextReader @in)
    : base(@in)
{
    buffer.Reset(@in);

    map = normMap.map;
    cachedRootArcs = normMap.cachedRootArcs;

    // A null map has no reader to hand out.
    fstReader = (map == null) ? null : map.BytesReader;
}
/// <summary>
/// Builds a three-word CharArraySet and a three-entry char map, then checks
/// analysis consistency of the helper analyzer on a fixed input.
/// </summary>
public virtual void Test()
{
    CharArraySet commonWords = new CharArraySet(TEST_VERSION_CURRENT, 3, false);
    commonWords.add("jjp");
    commonWords.add("wlmwoknt");
    commonWords.add("tcgyreo");

    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.Add("mtqlpi", "");
    mapBuilder.Add("mwoknt", "jjp");
    mapBuilder.Add("tcgyreo", "zpfpajyws");
    NormalizeCharMap charMap = mapBuilder.Build();

    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, commonWords, charMap);

    CheckAnalysisConsistency(Random(), analyzer, false, "wmgddzunizdomqyj");
}
/// <summary>
/// Checks token offsets produced by StandardTokenizer + CJKBigramFilter when a
/// MappingCharFilter has already rewritten "a" and "b" into CJK character pairs.
/// </summary>
public virtual void TestChangedOffsets()
{
    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.Add("a", "一二");
    mapBuilder.Add("b", "二三");
    NormalizeCharMap charMap = mapBuilder.Build();

    Analyzer analyzer = Analyzer.NewAnonymous(
        createComponents: (fieldName, reader) =>
        {
            Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
            CJKBigramFilter bigrams = new CJKBigramFilter(source);
            return new TokenStreamComponents(source, bigrams);
        },
        initReader: (fieldName, reader) => new MappingCharFilter(charMap, reader));

    string[] expectedTerms = new string[] { "一二", "二二", "二三" };
    int[] expectedStarts = new int[] { 0, 0, 1 };
    int[] expectedEnds = new int[] { 1, 1, 2 };
    AssertAnalyzesTo(analyzer, "ab", expectedTerms, expectedStarts, expectedEnds);

    // note: offsets are strange since this is how the charfilter maps them...
    // before bigramming, the 4 tokens look like:
    //   { 0, 0, 1, 1 },
    //   { 0, 1, 1, 2 }
}
/// <summary>
/// Test that offsets are correct when a MappingCharFilter is previously applied.
/// </summary>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testChangedOffsets() throws java.io.IOException
public virtual void testChangedOffsets()
{
    // Both locals below were 'final' in the original Java source.
    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.add("a", "一二");
    mapBuilder.add("b", "二三");
    NormalizeCharMap charMap = mapBuilder.build();

    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, charMap);

    string[] expectedTerms = new string[] { "一二", "二二", "二三" };
    int[] expectedStarts = new int[] { 0, 0, 1 };
    int[] expectedEnds = new int[] { 1, 1, 2 };
    assertAnalyzesTo(analyzer, "ab", expectedTerms, expectedStarts, expectedEnds);

    // note: offsets are strange since this is how the charfilter maps them...
    // before bigramming, the 4 tokens look like:
    //   { 0, 0, 1, 1 },
    //   { 0, 1, 1, 2 }
}
/// <summary>
/// Returns the next character of the filtered stream, or -1 at end of input.
/// Pending replacement text is emitted first; otherwise the next input character
/// is matched against the mapping trie. On a match the replacement string becomes
/// pending and, when its length differs from the matched text, offset corrections
/// are registered through <c>AddOffCorrectMap</c>.
/// </summary>
/// <returns>the next character as an int, or -1 when the input is exhausted</returns>
public override int Read()
{
    for (;;)
    {
        // Drain any replacement text that is still pending.
        if (replacement != null && charPointer < replacement.Length)
        {
            return replacement[charPointer++];
        }

        int firstChar = NextChar();
        if (firstChar == -1)
        {
            return -1;
        }

        NormalizeCharMap candidate = null;
        if (normMap.submap != null)
        {
            candidate = (NormalizeCharMap)normMap.submap[CharacterCache.ValueOf((char)firstChar)];
        }

        // Only descend into the trie when the first character has an entry.
        NormalizeCharMap matched = (candidate == null) ? null : Match(candidate);
        if (matched == null)
        {
            return firstChar;
        }

        replacement = matched.normStr;
        charPointer = 0;

        if (matched.diff == 0)
        {
            // Same length: no offset correction needed; loop to emit the replacement.
            continue;
        }

        int prevCumulativeDiff = GetLastCumulativeDiff();
        if (matched.diff > 0)
        {
            AddOffCorrectMap(nextCharCounter - matched.diff - prevCumulativeDiff, prevCumulativeDiff + matched.diff);
        }
        else
        {
            // Negative diff: one correction entry per unit of difference.
            for (int i = 0; i < -matched.diff; i++)
            {
                AddOffCorrectMap(nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i);
            }
        }
    }
}
/// <summary>
/// Wraps <paramref name="reader"/> in a <c>MappingCharFilter</c> that unifies several
/// apostrophe-like characters to <c>'</c>, drops U+0301 and U+00AD, and maps
/// ґ/Ґ to г/Г.
/// </summary>
/// <param name="fieldName">field name (not used by this method)</param>
/// <param name="reader">the reader to wrap</param>
/// <returns>the wrapping char filter</returns>
protected override TextReader InitReader(string fieldName, TextReader reader)
{
    // { match, replacement } pairs, registered in this order.
    string[][] rules = new string[][]
    {
        // different apostrophes
        new string[] { "\u2019", "'" },
        new string[] { "\u2018", "'" },
        new string[] { "\u02BC", "'" },
        new string[] { "`", "'" },
        new string[] { "´", "'" },
        // ignored characters
        new string[] { "\u0301", "" },
        new string[] { "\u00AD", "" },
        // letter replacements
        new string[] { "ґ", "г" },
        new string[] { "Ґ", "Г" },
    };

    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    foreach (string[] rule in rules)
    {
        mapBuilder.Add(rule[0], rule[1]);
    }

    return new MappingCharFilter(mapBuilder.Build(), reader);
}
/// <summary>
/// Checks that PatternTokenizer reports offsets relative to the original input
/// when a MappingCharFilter has shortened the text (the 6-character entity
/// <c>&amp;uuml;</c> is replaced by the single character <c>ü</c>).
/// </summary>
public virtual void TestOffsetCorrection()
{
    // The input must contain the un-decoded entity: each "G&uuml;nther" spans
    // 12 source characters and the whole string is 33 characters long, which is
    // what the expected offsets below (0-12, 13-25, 26-28, 29-33) describe.
    const string INPUT = "G&uuml;nther G&uuml;nther is here";

    // create MappingCharFilter (rule: "&uuml;" => "ü")
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.Add("&uuml;", "ü");
    NormalizeCharMap normMap = builder.Build();
    CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

    // create PatternTokenizer (split mode: group -1)
    TokenStream stream = new PatternTokenizer(charStream, new Regex("[,;/\\s]+", RegexOptions.Compiled), -1);
    AssertTokenStreamContents(stream,
        new string[] { "Günther", "Günther", "is", "here" },
        new int[] { 0, 13, 26, 29 },
        new int[] { 12, 25, 28, 33 },
        INPUT.Length);

    // same input, match mode: group 0 of a pattern matching the filtered text
    charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
    stream = new PatternTokenizer(charStream, new Regex("Günther", RegexOptions.Compiled), 0);
    AssertTokenStreamContents(stream,
        new string[] { "Günther", "Günther" },
        new int[] { 0, 13 },
        new int[] { 12, 25 },
        INPUT.Length);
}
/// <summary>
/// Builds a three-word CharArraySet and a three-entry char map, then checks
/// analysis consistency of the helper analyzer on a fixed input.
/// </summary>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void test() throws Exception
public virtual void test()
{
    // All three locals below were 'final' in the original Java source.
    CharArraySet commonWords = new CharArraySet(TEST_VERSION_CURRENT, 3, false);
    commonWords.add("jjp");
    commonWords.add("wlmwoknt");
    commonWords.add("tcgyreo");

    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.add("mtqlpi", "");
    mapBuilder.add("mwoknt", "jjp");
    mapBuilder.add("tcgyreo", "zpfpajyws");
    NormalizeCharMap charMap = mapBuilder.build();

    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, commonWords, charMap);

    checkAnalysisConsistency(random(), analyzer, false, "wmgddzunizdomqyj");
}
/// <summary>
/// Easy-use constructor that takes a <see cref="System.IO.TextReader"/>.
/// The reader is passed through <c>CharReader.Get</c> before being handed to the
/// base class.
/// </summary>
/// <param name="normMap">the mapping to store on this filter</param>
/// <param name="in_Renamed">the reader to filter</param>
public MappingCharFilter(NormalizeCharMap normMap, System.IO.TextReader in_Renamed)
    : base(CharReader.Get(in_Renamed))
{
    this.normMap = normMap;
}
/// <summary>
/// Stores the values this helper analyzer was created with.
/// </summary>
/// <param name="outerInstance">the enclosing test instance</param>
/// <param name="cas">the CharArraySet to keep</param>
/// <param name="map">the char map to keep</param>
public AnalyzerAnonymousInnerClassHelper(TestBugInSomething outerInstance, CharArraySet cas, NormalizeCharMap map)
{
    this.map = map;
    this.cas = cas;
    this.outerInstance = outerInstance;
}
/// <summary>
/// Stores the values this helper analyzer was created with.
/// </summary>
/// <param name="outerInstance">the enclosing test instance</param>
/// <param name="dict">the dictionary set to keep</param>
/// <param name="normMap">the char map to keep</param>
public AnalyzerAnonymousInnerClassHelper(TestCompoundWordTokenFilter outerInstance, CharArraySet dict, NormalizeCharMap normMap)
{
    this.normMap = normMap;
    this.dict = dict;
    this.outerInstance = outerInstance;
}
/// <summary>
/// Parses each rule with the pattern <c>p</c> and registers the two captured
/// groups (after <c>parseString</c>) with <paramref name="builder"/>.
/// </summary>
/// <param name="rules">rule lines to parse</param>
/// <param name="builder">receives one mapping per rule</param>
/// <exception cref="System.ArgumentException">if a rule does not match the pattern</exception>
protected internal virtual void parseRules(IList<string> rules, NormalizeCharMap.Builder builder)
{
    foreach (string rule in rules)
    {
        Matcher ruleMatcher = p.matcher(rule);
        bool matched = ruleMatcher.find();
        if (!matched)
        {
            throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "], file = " + mapping);
        }

        // Group 1 is the text to match, group 2 its replacement.
        string match = parseString(ruleMatcher.group(1));
        string replacement = parseString(ruleMatcher.group(2));
        builder.add(match, replacement);
    }
}
/// <summary>
/// Stores the values this helper analyzer was created with.
/// </summary>
/// <param name="outerInstance">the enclosing test instance</param>
/// <param name="norm">the char map to keep</param>
public AnalyzerAnonymousInnerClassHelper(TestCJKAnalyzer outerInstance, NormalizeCharMap norm)
{
    this.norm = norm;
    this.outerInstance = outerInstance;
}
/// <summary>
/// Default constructor that takes a <c>CharStream</c>, which is passed to the
/// base class unchanged.
/// </summary>
/// <param name="normMap">the mapping to store on this filter</param>
/// <param name="in_Renamed">the character stream to filter</param>
public MappingCharFilter(NormalizeCharMap normMap, CharStream in_Renamed)
    : base(in_Renamed)
{
    this.normMap = normMap;
}