// TODO: this should use inputstreams from the loader, not File!
/// <summary>
/// Reads the mapping rules named by <c>mapping</c> via <c>GetLines</c>, parses them
/// with <c>ParseRules</c> and stores the built <see cref="NormalizeCharMap"/> in
/// <c>m_normMap</c>. Does nothing when <c>mapping</c> is <c>null</c>.
/// </summary>
/// <param name="loader">Resource loader passed to <c>GetLines</c> for every rule file.</param>
public virtual void Inform(IResourceLoader loader)
{
    // No mapping configured: leave the current state untouched.
    if (mapping == null)
    {
        return;
    }

    List<string> rules;
    if (!File.Exists(mapping))
    {
        // Not an existing file on disk: split the value into individual file names
        // and concatenate the lines of each one.
        rules = new List<string>();
        foreach (string name in SplitFileNames(mapping))
        {
            rules.AddRange(GetLines(loader, name.Trim()));
        }
    }
    else
    {
        rules = new List<string>(GetLines(loader, mapping));
    }

    var mapBuilder = new NormalizeCharMap.Builder();
    ParseRules(rules, mapBuilder);
    m_normMap = mapBuilder.Build();

    // A null inner FST means the map accepts nothing (e.g. the file is empty),
    // so discard the whole map.
    if (m_normMap.map == null)
    {
        m_normMap = null;
    }
}
// TODO: this should use inputstreams from the loader, not File!
/// <summary>
/// Builds <c>normMap</c> from the rule file(s) named by <c>mapping</c>.
/// When <c>mapping</c> is <c>null</c> the method returns without doing anything.
/// </summary>
/// <param name="loader">Resource loader forwarded to <c>GetLines</c>.</param>
public virtual void Inform(IResourceLoader loader)
{
    if (mapping == null)
    {
        return;
    }

    var collected = new List<string>();
    if (File.Exists(mapping))
    {
        // The value names one existing file: take its lines as they are.
        collected.AddRange(GetLines(loader, mapping));
    }
    else
    {
        // Otherwise the value is split into several file names whose lines are appended in order.
        foreach (string fileName in SplitFileNames(mapping))
        {
            collected.AddRange(GetLines(loader, fileName.Trim()));
        }
    }

    var charMapBuilder = new NormalizeCharMap.Builder();
    ParseRules(collected, charMapBuilder);
    normMap = charMapBuilder.Build();

    // If the inner FST is null the map accepts nothing (e.g. the file is empty),
    // so the whole map is set to null.
    if (normMap.map == null)
    {
        normMap = null;
    }
}
// TODO: this should use inputstreams from the loader, not File!
/// <summary>
/// Reads the mapping rules named by <c>mapping</c>, parses them with <c>parseRules</c>
/// and stores the resulting <see cref="NormalizeCharMap"/> in <c>normMap</c>.
/// Does nothing when <c>mapping</c> is <c>null</c>.
/// </summary>
/// <remarks>
/// The Java original declared <c>throws java.io.IOException</c>. .NET has no checked
/// exceptions and this method catches nothing, so any exception raised by the helpers
/// propagates to the caller.
/// </remarks>
/// <param name="loader">Resource loader passed to <c>getLines</c> for every rule file.</param>
public virtual void inform(ResourceLoader loader)
{
    if (mapping == null)
    {
        return;
    }

    IList<string> wlist;
    // System.IO.File is a static class; the converter's "new File(mapping).exists()"
    // was java.io API and is replaced by the equivalent static call.
    if (System.IO.File.Exists(mapping))
    {
        wlist = getLines(loader, mapping);
    }
    else
    {
        // Concrete List<string> so that AddRange is available without any extension
        // method; the Java diamond "new List <>()" is not valid C#.
        List<string> merged = new List<string>();
        foreach (string file in splitFileNames(mapping))
        {
            merged.AddRange(getLines(loader, file.Trim()));
        }
        wlist = merged;
    }

    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    parseRules(wlist, builder);
    normMap = builder.build();

    // If the inner FST is null, the map accepts nothing (e.g. the file is empty),
    // so just set the whole map to null.
    if (normMap.map == null)
    {
        normMap = null;
    }
}
/// <summary>
/// For at least three random <see cref="NormalizeCharMap"/>s, wraps the map in an
/// analyzer and runs <c>CheckRandomData</c> against it for 100 rounds.
/// </summary>
public virtual void TestRandomMaps()
{
    // The loop index is never read, so simply count the remaining iterations down.
    for (int remaining = AtLeast(3); remaining > 0; remaining--)
    {
        NormalizeCharMap randomMap = RandomMap();
        Analyzer mappingAnalyzer = new AnalyzerAnonymousInnerClassHelper3(this, randomMap);
        CheckRandomData(Random(), mappingAnalyzer, 100);
    }
}
/// <summary>
/// Runs <c>CheckAnalysisConsistency</c> on a fixed text with a map that deletes "t"
/// and also contains a longer rule starting with "t".
/// </summary>
public virtual void TestFinalOffsetSpecialCase()
{
    var rules = new NormalizeCharMap.Builder();
    rules.Add("t", "");
    // Original author's note: even though the rule below has no effect,
    // the test passes if you remove it.
    rules.Add("tmakdbl", "c");

    Analyzer mappingAnalyzer = new AnalyzerAnonymousInnerClassHelper2(this, rules.Build());
    CheckAnalysisConsistency(Random(), mappingAnalyzer, false, "gzw f quaxot");
}
/// <summary>
/// For at least three random <see cref="NormalizeCharMap"/>s, builds an analyzer whose
/// reader is wrapped in a <see cref="MappingCharFilter"/> and runs
/// <c>CheckRandomData</c> against it for 100 rounds.
/// </summary>
public virtual void TestRandomMaps()
{
    // The loop index is never read, so simply count the remaining iterations down.
    for (int remaining = AtLeast(3); remaining > 0; remaining--)
    {
        // Declared inside the loop so each analyzer captures its own map.
        NormalizeCharMap charMap = RandomMap();

        Analyzer mappingAnalyzer = Analyzer.NewAnonymous(
            createComponents: (field, input) =>
            {
                Tokenizer whitespaceTokenizer = new MockTokenizer(input, MockTokenizer.WHITESPACE, false);
                return new TokenStreamComponents(whitespaceTokenizer, whitespaceTokenizer);
            },
            initReader: (field, input) => new MappingCharFilter(charMap, input));

        CheckRandomData(Random, mappingAnalyzer, 100);
    }
}
/// <summary>
/// Creates a filter over <paramref name="in"/> that takes its FST and cached root arcs
/// from <paramref name="normMap"/>.
/// </summary>
/// <param name="normMap">Source of the <c>map</c> and <c>cachedRootArcs</c> fields.</param>
/// <param name="in">Reader passed to the base class and to the rolling buffer.</param>
public MappingCharFilter(NormalizeCharMap normMap, TextReader @in)
    : base(@in)
{
    buffer.Reset(@in);

    cachedRootArcs = normMap.cachedRootArcs;
    map = normMap.map;

    // Without an FST there is nothing to read from.
    fstReader = (map == null) ? null : map.BytesReader;
}
/// <summary>
/// Runs <c>CheckAnalysisConsistency</c> on a fixed text through a
/// <see cref="MappingCharFilter"/> whose map deletes "t" and also contains a longer
/// rule starting with "t".
/// </summary>
public virtual void TestFinalOffsetSpecialCase()
{
    var rules = new NormalizeCharMap.Builder();
    rules.Add("t", "");
    // Original author's note: even though the rule below has no effect,
    // the test passes if you remove it.
    rules.Add("tmakdbl", "c");
    NormalizeCharMap charMap = rules.Build();

    Analyzer mappingAnalyzer = Analyzer.NewAnonymous(
        createComponents: (field, input) =>
        {
            Tokenizer whitespaceTokenizer = new MockTokenizer(input, MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(whitespaceTokenizer, whitespaceTokenizer);
        },
        initReader: (field, input) => new MappingCharFilter(charMap, input));

    CheckAnalysisConsistency(Random, mappingAnalyzer, false, "gzw f quaxot");
}
/// <summary>
/// Creates a filter over <paramref name="in"/> that takes its FST and cached root arcs
/// from <paramref name="normMap"/>.
/// </summary>
/// <param name="normMap">Source of the <c>map</c> and <c>cachedRootArcs</c> fields.</param>
/// <param name="in">Reader passed to the base class and wrapped via <c>GetBufferedReader</c>.</param>
public MappingCharFilter(NormalizeCharMap normMap, TextReader @in)
    : base(@in)
{
    // LUCENENET: wrap the reader and mark it so that it can be reset;
    // the rolling buffer reads from the wrapped reader rather than from @in directly.
    _input = GetBufferedReader(@in);
    _input.Mark(BufferedCharFilter.defaultCharBufferSize);
    buffer.Reset(_input);

    cachedRootArcs = normMap.cachedRootArcs;
    map = normMap.map;

    // Without an FST there is nothing to read from.
    fstReader = (map == null) ? null : map.BytesReader;
}
/// <summary>
/// Creates a filter over <paramref name="in"/> that takes its FST and cached root arcs
/// from <paramref name="normMap"/>.
/// </summary>
/// <param name="normMap">Source of the <c>map</c> and <c>cachedRootArcs</c> fields.</param>
/// <param name="in">Reader passed to the base class and wrapped via <c>GetBufferedReader</c>.</param>
public MappingCharFilter(NormalizeCharMap normMap, TextReader @in)
    : base(@in)
{
    // LUCENENET: wrap the reader and mark it so that it can be reset;
    // the rolling buffer reads from the wrapped reader rather than from @in directly.
    _input = GetBufferedReader(@in);
    _input.Mark(BufferedCharFilter.DEFAULT_CHAR_BUFFER_SIZE);
    buffer.Reset(_input);

    cachedRootArcs = normMap.cachedRootArcs;
    map = normMap.map;

    // Without an FST there is nothing to read from.
    fstReader = (map == null) ? null : map.GetBytesReader();
}
/// <summary>
/// Runs the base set-up and then builds the shared <c>normMap</c> fixture from a
/// fixed list of (match, replacement) rules.
/// </summary>
public override void SetUp()
{
    base.SetUp();

    var builder = new NormalizeCharMap.Builder();

    // Each entry is { match, replacement }; entries are registered in this order.
    string[][] rules =
    {
        new[] { "aa", "a" },
        new[] { "bbb", "b" },
        new[] { "cccc", "cc" },
        new[] { "h", "i" },
        new[] { "j", "jj" },
        new[] { "k", "kkk" },
        new[] { "ll", "llll" },
        new[] { "empty", "" },
        // Code point U+1D122 lies above U+FFFF, i.e. outside the BMP (surrogate pair in UTF-16):
        new[] { UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1), "fclef" },
        new[] { "\uff01", "full-width-exclamation" },
    };

    foreach (string[] rule in rules)
    {
        builder.Add(rule[0], rule[1]);
    }

    normMap = builder.Build();
}
/// <summary>
/// Compares <see cref="MappingCharFilter"/> against a deliberately simple reference
/// implementation. For random rule sets and random documents it checks both the mapped
/// text and the output-offset to input-offset correction.
/// </summary>
public virtual void TestRandomMaps2()
{
    Random random = Random();
    int numIterations = AtLeast(3);
    for (int iter = 0; iter < numIterations; iter++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST iter=" + iter);
        }

        char endLetter = (char)TestUtil.NextInt(random, 'b', 'z');

        // Keep the rules in a dictionary as well, for the reference mapping below.
        IDictionary<string, string> map = new Dictionary<string, string>();
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        int numMappings = AtLeast(5);
        if (VERBOSE)
        {
            Console.WriteLine(" mappings:");
        }
        while (map.Count < numMappings)
        {
            string key = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, 7);
            if (key.Length != 0 && !map.ContainsKey(key))
            {
                string value = TestUtil.RandomSimpleString(random);
                map[key] = value;
                builder.Add(key, value);
                if (VERBOSE)
                {
                    Console.WriteLine(" " + key + " -> " + value);
                }
            }
        }

        NormalizeCharMap charMap = builder.Build();

        if (VERBOSE)
        {
            Console.WriteLine(" test random documents...");
        }

        for (int iter2 = 0; iter2 < 100; iter2++)
        {
            string content = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, AtLeast(1000));

            if (VERBOSE)
            {
                Console.WriteLine(" content=" + content);
            }

            // Reference implementation: slow but simple mapping.

            // Output string:
            StringBuilder output = new StringBuilder();

            // Maps output offset to input offset:
            IList<int?> inputOffsets = new List<int?>();

            int cumDiff = 0;
            int charIdx = 0;
            while (charIdx < content.Length)
            {
                int matchLen = -1;
                string matchRepl = null;

                // Try every rule at the current position.
                foreach (KeyValuePair<string, string> ent in map)
                {
                    string match = ent.Key;
                    if (charIdx + match.Length <= content.Length)
                    {
                        int limit = charIdx + match.Length;
                        bool matches = true;
                        for (int charIdx2 = charIdx; charIdx2 < limit; charIdx2++)
                        {
                            if (match[charIdx2 - charIdx] != content[charIdx2])
                            {
                                matches = false;
                                break;
                            }
                        }

                        if (matches)
                        {
                            string repl = ent.Value;
                            if (match.Length > matchLen)
                            {
                                // Greedy: longer match wins
                                matchLen = match.Length;
                                matchRepl = repl;
                            }
                        }
                    }
                }

                if (matchLen != -1)
                {
                    // We found a match here!
                    if (VERBOSE)
                    {
                        Console.WriteLine(" match=" + content.Substring(charIdx, matchLen) + " @ off=" + charIdx + " repl=" + matchRepl);
                    }
                    output.Append(matchRepl);
                    int minLen = Math.Min(matchLen, matchRepl.Length);

                    // Common part, directly maps back to input offset:
                    for (int outIdx = 0; outIdx < minLen; outIdx++)
                    {
                        inputOffsets.Add(output.Length - matchRepl.Length + outIdx + cumDiff);
                    }

                    cumDiff += matchLen - matchRepl.Length;
                    charIdx += matchLen;

                    if (matchRepl.Length < matchLen)
                    {
                        // Replacement string is shorter than matched input: nothing to do
                    }
                    else if (matchRepl.Length > matchLen)
                    {
                        // Replacement string is longer than matched input: all the
                        // "extra" chars map back to a single input offset:
                        for (int outIdx = matchLen; outIdx < matchRepl.Length; outIdx++)
                        {
                            inputOffsets.Add(output.Length + cumDiff - 1);
                        }
                    }
                    else
                    {
                        // Same length: no change to offset
                    }

                    Debug.Assert(inputOffsets.Count == output.Length, "inputOffsets.size()=" + inputOffsets.Count + " vs output.length()=" + output.Length);
                }
                else
                {
                    // No rule matched: copy the character through unchanged.
                    inputOffsets.Add(output.Length + cumDiff);
                    output.Append(content[charIdx]);
                    charIdx++;
                }
            }

            string expected = output.ToString();
            if (VERBOSE)
            {
                Console.Write(" expected:");
                for (int charIdx2 = 0; charIdx2 < expected.Length; charIdx2++)
                {
                    Console.Write(" " + expected[charIdx2] + "/" + inputOffsets[charIdx2]);
                }
                Console.WriteLine();
            }

            MappingCharFilter mapFilter = new MappingCharFilter(charMap, new StringReader(content));

            StringBuilder actualBuilder = new StringBuilder();
            IList<int?> actualInputOffsets = new List<int?>();

            // Now consume the actual mapFilter, somewhat randomly:
            while (true)
            {
                // Coin flip between the single-char and the bulk read path. The upper
                // bound of Random.Next is exclusive, so Next(2) yields 0 or 1; the
                // previous Next(0, 1) always returned 0 and never took this branch.
                if (random.Next(2) == 1)
                {
                    int ch = mapFilter.Read();
                    if (ch == -1)
                    {
                        break;
                    }
                    actualBuilder.Append((char)ch);
                }
                else
                {
                    char[] buffer = new char[TestUtil.NextInt(random, 1, 100)];
                    int off = buffer.Length == 1 ? 0 : random.Next(buffer.Length - 1);
                    int count = mapFilter.Read(buffer, off, buffer.Length - off);
                    if (count == -1)
                    {
                        break;
                    }
                    else
                    {
                        actualBuilder.Append(buffer, off, count);
                    }
                }

                if (random.Next(10) == 7)
                {
                    // Map offsets
                    while (actualInputOffsets.Count < actualBuilder.Length)
                    {
                        actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
                    }
                }
            }

            // Finish mapping offsets
            while (actualInputOffsets.Count < actualBuilder.Length)
            {
                actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
            }

            string actual = actualBuilder.ToString();

            // Verify:
            assertEquals(expected, actual);
            assertEquals(inputOffsets, actualInputOffsets);
        }
    }
}
/// <summary>
/// Stores the enclosing test instance and the char map in the fields of the same names.
/// </summary>
/// <param name="outerInstance">The enclosing <c>TestMappingCharFilter</c> instance.</param>
/// <param name="map">The <see cref="NormalizeCharMap"/> kept for later use by this helper.</param>
public AnalyzerAnonymousInnerClassHelper3(TestMappingCharFilter outerInstance, NormalizeCharMap map)
{
    this.map = map;
    this.outerInstance = outerInstance;
}
/// <summary>
/// Matches every rule line against the pattern <c>p</c> and registers the two captured
/// groups, each passed through <c>parseString</c>, with <paramref name="builder"/>.
/// </summary>
/// <param name="rules">Rule lines to parse, processed in order.</param>
/// <param name="builder">Builder that receives one mapping per rule.</param>
/// <exception cref="System.ArgumentException">A line does not match the pattern <c>p</c>.</exception>
protected internal virtual void parseRules(IList<string> rules, NormalizeCharMap.Builder builder)
{
    foreach (string line in rules)
    {
        Matcher matcher = p.matcher(line);
        if (matcher.find() == false)
        {
            string message = "Invalid Mapping Rule : [" + line + "], file = " + mapping;
            throw new System.ArgumentException(message);
        }

        // Group 1 is the text to match, group 2 its replacement.
        var source = parseString(matcher.group(1));
        var target = parseString(matcher.group(2));
        builder.add(source, target);
    }
}
/// <summary>
/// Stores the enclosing test instance, the dictionary and the char map in the fields
/// of the same names.
/// </summary>
/// <param name="outerInstance">The enclosing <c>TestCompoundWordTokenFilter</c> instance.</param>
/// <param name="dict">The <c>CharArraySet</c> kept for later use by this helper.</param>
/// <param name="normMap">The <see cref="NormalizeCharMap"/> kept for later use by this helper.</param>
public AnalyzerAnonymousInnerClassHelper(TestCompoundWordTokenFilter outerInstance, CharArraySet dict, NormalizeCharMap normMap)
{
    this.normMap = normMap;
    this.dict = dict;
    this.outerInstance = outerInstance;
}
/// <summary>
/// Stores the enclosing test instance, the char-array set and the char map in the
/// fields of the same names.
/// </summary>
/// <param name="outerInstance">The enclosing <c>TestBugInSomething</c> instance.</param>
/// <param name="cas">The <c>CharArraySet</c> kept for later use by this helper.</param>
/// <param name="map">The <see cref="NormalizeCharMap"/> kept for later use by this helper.</param>
public AnalyzerAnonymousInnerClassHelper(TestBugInSomething outerInstance, CharArraySet cas, NormalizeCharMap map)
{
    this.map = map;
    this.cas = cas;
    this.outerInstance = outerInstance;
}
/// <summary>
/// Matches every rule line against the regular expression <c>p</c> and registers the
/// two captured groups, each passed through <c>ParseString</c>, with
/// <paramref name="builder"/>.
/// </summary>
/// <param name="rules">Rule lines to parse, processed in order.</param>
/// <param name="builder">Builder that receives one mapping per rule.</param>
/// <exception cref="System.ArgumentException">A line does not match <c>p</c>.</exception>
protected internal virtual void ParseRules(IList<string> rules, NormalizeCharMap.Builder builder)
{
    foreach (string line in rules)
    {
        Match parsed = p.Match(line);
        if (parsed.Success == false)
        {
            string message = "Invalid Mapping Rule : [" + line + "], file = " + mapping;
            throw new System.ArgumentException(message);
        }

        // Group 1 is the text to match, group 2 its replacement.
        var source = ParseString(parsed.Groups[1].Value);
        var target = ParseString(parsed.Groups[2].Value);
        builder.Add(source, target);
    }
}
/// <summary>
/// Stores the enclosing test instance and the char map in the fields of the same names.
/// </summary>
/// <param name="outerInstance">The enclosing <c>TestCJKAnalyzer</c> instance.</param>
/// <param name="norm">The <see cref="NormalizeCharMap"/> kept for later use by this helper.</param>
public AnalyzerAnonymousInnerClassHelper(TestCJKAnalyzer outerInstance, NormalizeCharMap norm)
{
    this.norm = norm;
    this.outerInstance = outerInstance;
}